diff options
| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-08-23 00:29:47 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-08-23 00:29:47 -0700 |
| commit | 9c9dada1d5cfd06f06b00ef84ef01839b9ac65ed (patch) | |
| tree | cd290f9c5c159122223c050da7db4fe54f5932b2 | |
| parent | 27b5be2921179dacff74114a80aae95087ad90ab (diff) | |
| parent | ad9b7f31e119e43028061beaaca8664e57202670 (diff) | |
Merge bk://kernel.bkbits.net/davem/net-2.6
into ppc970.osdl.org:/home/torvalds/v2.6/linux
163 files changed, 4810 insertions, 2216 deletions
diff --git a/crypto/Kconfig b/crypto/Kconfig index 12429ef28b39..f5bee997bee9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -182,7 +182,7 @@ config CRYPTO_TEA many rounds for security. It is very fast and uses little memory. - Xtendend Tiny Encryption Algorithm is a modifcation to + Xtendend Tiny Encryption Algorithm is a modification to the TEA algorithm to address a potential key weakness in the TEA algorithm. diff --git a/crypto/aes.c b/crypto/aes.c index 13b6daa37089..3a26f9c99aee 100644 --- a/crypto/aes.c +++ b/crypto/aes.c @@ -160,7 +160,7 @@ gen_tabs (void) u8 p, q; /* log and power tables for GF(2**8) finite field with - 0x011b as modular polynomial - the simplest prmitive + 0x011b as modular polynomial - the simplest primitive root is 0x03, used here to generate the tables */ for (i = 0, p = 1; i < 256; ++i) { diff --git a/crypto/arc4.c b/crypto/arc4.c index b66eb1f0d001..9efbcaae88a1 100644 --- a/crypto/arc4.c +++ b/crypto/arc4.c @@ -3,7 +3,7 @@ * * ARC4 Cipher Algorithm * - * Jon Oberheide <jon@focalhost.com> + * Jon Oberheide <jon@oberheide.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -100,4 +100,4 @@ module_exit(arc4_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); -MODULE_AUTHOR("Jon Oberheide <jon@focalhost.com>"); +MODULE_AUTHOR("Jon Oberheide <jon@oberheide.org>"); diff --git a/crypto/blowfish.c b/crypto/blowfish.c index 8fd18dcdb679..b6bea821c05f 100644 --- a/crypto/blowfish.c +++ b/crypto/blowfish.c @@ -3,9 +3,9 @@ * * Blowfish Cipher Algorithm, by Bruce Schneier. * http://www.counterpane.com/blowfish.html - * - * Adapated from Kerneli implementation. - * + * + * Adapted from Kerneli implementation. 
+ * * Copyright (c) Herbert Valerio Riedel <hvr@hvrlab.org> * Copyright (c) Kyle McMartin <kyle@debian.org> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c index 540dd9ac1504..f6a5c9e5b2e0 100644 --- a/crypto/scatterwalk.c +++ b/crypto/scatterwalk.c @@ -70,7 +70,7 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out, { /* walk->data may be pointing the first byte of the next page; however, we know we transfered at least one byte. So, - walk->data - 1 will be a virutual address in the mapped page. */ + walk->data - 1 will be a virtual address in the mapped page. */ if (out) flush_dcache_page(walk->page); diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h index 56a825d6539b..81791c1c0142 100644 --- a/crypto/tcrypt.h +++ b/crypto/tcrypt.h @@ -1186,7 +1186,7 @@ struct cipher_testvec tf_cbc_dec_tv_template[] = { /* * Serpent test vectors. These are backwards because Serpent writes - * octect sequences in right-to-left mode. + * octet sequences in right-to-left mode. */ #define SERPENT_ENC_TEST_VECTORS 4 #define SERPENT_DEC_TEST_VECTORS 4 diff --git a/crypto/twofish.c b/crypto/twofish.c index 5d6d02c1a95d..4efff8cf9958 100644 --- a/crypto/twofish.c +++ b/crypto/twofish.c @@ -1,7 +1,7 @@ /* * Twofish for CryptoAPI * - * Originaly Twofish for GPG + * Originally Twofish for GPG * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 * 256-bit key length added March 20, 1999 * Some modifications to reduce the text size by Werner Koch, April, 1998 @@ -514,7 +514,7 @@ static const u8 calc_sb_tbl[512] = { * preprocessed through q0 and q1 respectively; for longer keys they are the * output of previous stages. j is the index of the first key byte to use. * CALC_K computes a pair of subkeys for 128-bit Twofish, by calling CALC_K_2 - * twice, doing the Psuedo-Hadamard Transform, and doing the necessary + * twice, doing the Pseudo-Hadamard Transform, and doing the necessary * rotations. 
Its parameters are: a, the array to write the results into, * j, the index of the first output entry, k and l, the preprocessed indices * for index 2i, and m and n, the preprocessed indices for index 2i+1. diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index e48ec3421328..72138612d2fb 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -808,16 +808,16 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (gem_status & GREG_STAT_ABNORMAL) { if (gem_abnormal_irq(dev, gp, gem_status)) - goto out; + goto out_unlock; } if (gem_status & (GREG_STAT_TXALL | GREG_STAT_TXINTME)) gem_tx(dev, gp, gem_status); if (gem_status & GREG_STAT_RXDONE) gem_rx(gp); -out: +out_unlock: spin_unlock(&gp->lock); - +out: return IRQ_HANDLED; } diff --git a/drivers/net/wan/syncppp.c b/drivers/net/wan/syncppp.c index f7442d52dabe..2329c23af83e 100644 --- a/drivers/net/wan/syncppp.c +++ b/drivers/net/wan/syncppp.c @@ -50,6 +50,7 @@ #include <linux/random.h> #include <linux/pkt_sched.h> #include <linux/spinlock.h> +#include <linux/rcupdate.h> #include <net/syncppp.h> @@ -767,9 +768,9 @@ static void sppp_cisco_input (struct sppp *sp, struct sk_buff *skb) struct in_ifaddr *ifa; u32 addr = 0, mask = ~0; /* FIXME: is the mask correct? 
*/ #ifdef CONFIG_INET - if ((in_dev=in_dev_get(dev)) != NULL) + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) != NULL) { - read_lock(&in_dev->lock); for (ifa=in_dev->ifa_list; ifa != NULL; ifa=ifa->ifa_next) { if (strcmp(dev->name, ifa->ifa_label) == 0) @@ -779,9 +780,8 @@ static void sppp_cisco_input (struct sppp *sp, struct sk_buff *skb) break; } } - read_unlock(&in_dev->lock); - in_dev_put(in_dev); } + rcu_read_unlock(); #endif /* I hope both addr and mask are in the net order */ sppp_cisco_send (sp, CISCO_ADDR_REPLY, addr, mask); diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c index 98cee21f7d84..c9331f589645 100644 --- a/drivers/net/wireless/strip.c +++ b/drivers/net/wireless/strip.c @@ -106,6 +106,7 @@ static const char StripVersion[] = "1.3A-STUART.CHESHIRE"; #include <linux/seq_file.h> #include <linux/serial.h> #include <linux/serialP.h> +#include <linux/rcupdate.h> #include <net/arp.h> #include <linux/ip.h> @@ -1348,14 +1349,17 @@ static unsigned char *strip_make_packet(unsigned char *buffer, */ if (haddr.c[0] == 0xFF) { u32 brd = 0; - struct in_device *in_dev = in_dev_get(strip_info->dev); - if (in_dev == NULL) + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(strip_info->dev); + if (in_dev == NULL) { + rcu_read_unlock(); return NULL; - read_lock(&in_dev->lock); + } if (in_dev->ifa_list) brd = in_dev->ifa_list->ifa_broadcast; - read_unlock(&in_dev->lock); - in_dev_put(in_dev); + rcu_read_unlock(); /* arp_query returns 1 if it succeeds in looking up the address, 0 if it fails */ if (!arp_query(haddr.c, brd, strip_info->dev)) { @@ -1500,17 +1504,18 @@ static void strip_send(struct strip *strip_info, struct sk_buff *skb) } if (1) { - struct in_device *in_dev = in_dev_get(strip_info->dev); + struct in_device *in_dev; + brd = addr = 0; + rcu_read_lock(); + in_dev = __in_dev_get(strip_info->dev); if (in_dev) { - read_lock(&in_dev->lock); if (in_dev->ifa_list) { brd = in_dev->ifa_list->ifa_broadcast; addr = 
in_dev->ifa_list->ifa_local; } - read_unlock(&in_dev->lock); - in_dev_put(in_dev); } + rcu_read_unlock(); } diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index cac57951ae28..e4b7bbbeff17 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -1002,7 +1002,7 @@ lcs_register_mc_addresses(void *data) in4_dev = in_dev_get(card->dev); if (in4_dev == NULL) return 0; - read_lock(&in4_dev->lock); + read_lock(&in4_dev->mc_list_lock); spin_lock(&card->ipm_lock); /* Check for multicast addresses to be removed. */ list_for_each(l, &card->ipm_list) { @@ -1046,7 +1046,7 @@ lcs_register_mc_addresses(void *data) list_add(&ipm->list, &card->ipm_list); } spin_unlock(&card->ipm_lock); - read_unlock(&in4_dev->lock); + read_unlock(&in4_dev->mc_list_lock); in_dev_put(in4_dev); lcs_fix_multicast_list(card); return 0; diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index 17f0f67d8e55..378906eba48f 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -73,6 +73,7 @@ qeth_eyecatcher(void) #include <linux/reboot.h> #include <asm/qeth.h> #include <linux/mii.h> +#include <linux/rcupdate.h> #include "qeth.h" #include "qeth_mpc.h" @@ -4733,9 +4734,10 @@ qeth_free_vlan_addresses4(struct qeth_card *card, unsigned short vid) QETH_DBF_TEXT(trace, 4, "frvaddr4"); if (!card->vlangrp) return; - in_dev = in_dev_get(card->vlangrp->vlan_devices[vid]); + rcu_read_lock(); + in_dev = __in_dev_get(card->vlangrp->vlan_devices[vid]); if (!in_dev) - return; + goto out; for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next){ addr = qeth_get_addr_buffer(QETH_PROT_IPV4); if (addr){ @@ -4746,7 +4748,8 @@ qeth_free_vlan_addresses4(struct qeth_card *card, unsigned short vid) kfree(addr); } } - in_dev_put(in_dev); +out: + rcu_read_unlock(); } static void @@ -4918,9 +4921,9 @@ qeth_add_vlan_mc(struct qeth_card *card) in_dev = in_dev_get(vg->vlan_devices[i]); if (!in_dev) continue; - read_lock(&in_dev->lock); + 
read_lock(&in_dev->mc_list_lock); qeth_add_mc(card,in_dev); - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); in_dev_put(in_dev); } #endif @@ -4935,10 +4938,10 @@ qeth_add_multicast_ipv4(struct qeth_card *card) in4_dev = in_dev_get(card->dev); if (in4_dev == NULL) return; - read_lock(&in4_dev->lock); + read_lock(&in4_dev->mc_list_lock); qeth_add_mc(card, in4_dev); qeth_add_vlan_mc(card); - read_unlock(&in4_dev->lock); + read_unlock(&in4_dev->mc_list_lock); in_dev_put(in4_dev); } diff --git a/include/linux/atalk.h b/include/linux/atalk.h index c4d2a0949177..2a9b82002591 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -191,10 +191,13 @@ extern int aarp_send_ddp(struct net_device *dev, extern void aarp_send_probe(struct net_device *dev, struct atalk_addr *addr); extern void aarp_device_down(struct net_device *dev); +extern void aarp_probe_network(struct atalk_iface *atif); +extern int aarp_proxy_probe_network(struct atalk_iface *atif, + struct atalk_addr *sa); +extern void aarp_proxy_remove(struct net_device *dev, + struct atalk_addr *sa); -#ifdef MODULE -extern void aarp_cleanup_module(void); -#endif /* MODULE */ +extern void aarp_cleanup_module(void); #define at_sk(__sk) ((struct atalk_sock *)(__sk)->sk_protinfo) @@ -209,8 +212,28 @@ extern rwlock_t atalk_interfaces_lock; extern struct atalk_route atrtr_default; +extern struct file_operations atalk_seq_arp_fops; + +extern int sysctl_aarp_expiry_time; +extern int sysctl_aarp_tick_time; +extern int sysctl_aarp_retransmit_limit; +extern int sysctl_aarp_resolve_time; + +#ifdef CONFIG_SYSCTL +extern void atalk_register_sysctl(void); +extern void atalk_unregister_sysctl(void); +#else +#define atalk_register_sysctl() do { } while(0) +#define atalk_unregister_sysctl() do { } while(0) +#endif + +#ifdef CONFIG_PROC_FS extern int atalk_proc_init(void); extern void atalk_proc_exit(void); +#else +#define atalk_proc_init() 0 +#define atalk_proc_exit() do { } while(0) +#endif /* CONFIG_PROC_FS 
*/ #endif /* __KERNEL__ */ #endif /* __LINUX_ATALK_H__ */ diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 420767fcb3c9..529c401a9a86 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -152,7 +152,7 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb, skb->real_dev = skb->dev; skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK]; if (skb->dev == NULL) { - kfree_skb(skb); + dev_kfree_skb_any(skb); /* Not NET_RX_DROP, this is not being dropped * due to congestion. diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 27a5d0a97dbc..ec751e9fb1c2 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include <linux/rcupdate.h> + struct ipv4_devconf { int accept_redirects; @@ -31,13 +33,13 @@ extern struct ipv4_devconf ipv4_devconf; struct in_device { - struct net_device *dev; + struct net_device *dev; atomic_t refcnt; - rwlock_t lock; int dead; struct in_ifaddr *ifa_list; /* IP ifaddr chain */ + rwlock_t mc_list_lock; struct ip_mc_list *mc_list; /* IP multicast filter chain */ - rwlock_t mc_lock; /* for mc_tomb */ + spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; unsigned long mr_v2_seen; @@ -50,6 +52,7 @@ struct in_device struct neigh_parms *arp_parms; struct ipv4_devconf cnf; + struct rcu_head rcu_head; }; #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding) @@ -80,6 +83,7 @@ struct in_ifaddr { struct in_ifaddr *ifa_next; struct in_device *ifa_dev; + struct rcu_head rcu_head; u32 ifa_local; u32 ifa_address; u32 ifa_mask; @@ -133,19 +137,16 @@ static __inline__ int bad_mask(u32 mask, u32 addr) #define endfor_ifa(in_dev) } -extern rwlock_t inetdev_lock; - - static __inline__ struct in_device * in_dev_get(const struct net_device *dev) { struct in_device *in_dev; - read_lock(&inetdev_lock); + rcu_read_lock(); in_dev = dev->ip_ptr; if (in_dev) atomic_inc(&in_dev->refcnt); - read_unlock(&inetdev_lock); + 
rcu_read_unlock(); return in_dev; } @@ -157,8 +158,7 @@ __in_dev_get(const struct net_device *dev) extern void in_dev_finish_destroy(struct in_device *idev); -static __inline__ void -in_dev_put(struct in_device *idev) +static inline void in_dev_put(struct in_device *idev) { if (atomic_dec_and_test(&idev->refcnt)) in_dev_finish_destroy(idev); diff --git a/include/linux/net.h b/include/linux/net.h index cec1482f28e2..80e7fec727e3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -169,6 +169,7 @@ extern struct socket *sockfd_lookup(int fd, int *err); extern int net_ratelimit(void); extern unsigned long net_random(void); extern void net_srandom(unsigned long); +extern void net_random_init(void); extern int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 1974f162f5a0..28b61a71bce9 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -51,10 +51,12 @@ enum ip_conntrack_status { #include <linux/netfilter_ipv4/ip_conntrack_tcp.h> #include <linux/netfilter_ipv4/ip_conntrack_icmp.h> +#include <linux/netfilter_ipv4/ip_conntrack_sctp.h> /* per conntrack: protocol private data */ union ip_conntrack_proto { /* insert conntrack proto private data here */ + struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct ip_ct_icmp icmp; }; @@ -156,6 +158,12 @@ struct ip_conntrack_expect union ip_conntrack_expect_help help; }; +struct ip_conntrack_counter +{ + u_int64_t packets; + u_int64_t bytes; +}; + struct ip_conntrack_helper; struct ip_conntrack @@ -173,6 +181,11 @@ struct ip_conntrack /* Timer function; drops refcnt when it goes off. 
*/ struct timer_list timeout; +#ifdef CONFIG_IP_NF_CT_ACCT + /* Accounting Information (same cache line as other written members) */ + struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; +#endif + /* If we're expecting another related connection, this will be in expected linked list */ struct list_head sibling_list; @@ -245,10 +258,17 @@ extern int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig); /* Refresh conntrack for this many jiffies */ -extern void ip_ct_refresh(struct ip_conntrack *ct, - unsigned long extra_jiffies); +extern void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies); /* These are for NAT. Icky. */ +/* Update TCP window tracking data when NAT mangles the packet */ +extern int ip_conntrack_tcp_update(struct sk_buff *skb, + struct ip_conntrack *conntrack, + int dir); + /* Call me when a conntrack is destroyed. */ extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack); @@ -271,6 +291,26 @@ static inline int is_confirmed(struct ip_conntrack *ct) } extern unsigned int ip_conntrack_htable_size; + +struct ip_conntrack_stat +{ + unsigned int searched; + unsigned int found; + unsigned int new; + unsigned int invalid; + unsigned int ignore; + unsigned int delete; + unsigned int delete_list; + unsigned int insert; + unsigned int insert_failed; + unsigned int drop; + unsigned int early_drop; + unsigned int icmp_error; + unsigned int expect_new; + unsigned int expect_create; + unsigned int expect_delete; +}; + /* eg. 
PROVIDES_CONNTRACK(ftp); */ #define PROVIDES_CONNTRACK(name) \ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index 4c8b5d189089..9a31e96b7ab7 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -21,15 +21,17 @@ extern struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol); extern struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol); extern struct list_head protocol_list; -/* Returns conntrack if it dealt with ICMP, and filled in skb->nfct */ -extern struct ip_conntrack *icmp_error_track(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum); -extern int get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol); +extern int +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol); + +extern int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol); /* Find a connection corresponding to a tuple. */ struct ip_conntrack_tuple_hash * diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index 56e37ef255b7..55d57404acb8 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -3,6 +3,11 @@ #define _IP_CONNTRACK_PROTOCOL_H #include <linux/netfilter_ipv4/ip_conntrack.h> +/* length of buffer to which print_tuple/print_conntrack members are + * writing */ + +#define IP_CT_PRINT_BUFLEN 100 + struct ip_conntrack_protocol { /* Next pointer. 
*/ @@ -50,6 +55,9 @@ struct ip_conntrack_protocol int (*exp_matches_pkt)(struct ip_conntrack_expect *exp, const struct sk_buff *skb); + int (*error)(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum); + /* Module (if any) which this is connected to. */ struct module *me; }; @@ -63,4 +71,17 @@ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; extern struct ip_conntrack_protocol ip_conntrack_protocol_icmp; extern int ip_conntrack_protocol_tcp_init(void); + +/* Log invalid packets */ +extern unsigned int ip_ct_log_invalid; + +#ifdef DEBUG_INVALID_PACKETS +#define LOG_INVALID(proto) \ + (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) +#else +#define LOG_INVALID(proto) \ + ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ + && net_ratelimit()) +#endif + #endif /*_IP_CONNTRACK_PROTOCOL_H*/ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sctp.h b/include/linux/netfilter_ipv4/ip_conntrack_sctp.h new file mode 100644 index 000000000000..7a8d869321f7 --- /dev/null +++ b/include/linux/netfilter_ipv4/ip_conntrack_sctp.h @@ -0,0 +1,25 @@ +#ifndef _IP_CONNTRACK_SCTP_H +#define _IP_CONNTRACK_SCTP_H +/* SCTP tracking. 
*/ + +enum sctp_conntrack { + SCTP_CONNTRACK_NONE, + SCTP_CONNTRACK_CLOSED, + SCTP_CONNTRACK_COOKIE_WAIT, + SCTP_CONNTRACK_COOKIE_ECHOED, + SCTP_CONNTRACK_ESTABLISHED, + SCTP_CONNTRACK_SHUTDOWN_SENT, + SCTP_CONNTRACK_SHUTDOWN_RECD, + SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, + SCTP_CONNTRACK_MAX +}; + +struct ip_ct_sctp +{ + enum sctp_conntrack state; + + u_int32_t vtag[IP_CT_DIR_MAX]; + u_int32_t ttag[IP_CT_DIR_MAX]; +}; + +#endif /* _IP_CONNTRACK_SCTP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tcp.h b/include/linux/netfilter_ipv4/ip_conntrack_tcp.h index d6698c911e11..0ab4590a0b16 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tcp.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tcp.h @@ -4,25 +4,44 @@ enum tcp_conntrack { TCP_CONNTRACK_NONE, - TCP_CONNTRACK_ESTABLISHED, TCP_CONNTRACK_SYN_SENT, TCP_CONNTRACK_SYN_RECV, + TCP_CONNTRACK_ESTABLISHED, TCP_CONNTRACK_FIN_WAIT, - TCP_CONNTRACK_TIME_WAIT, - TCP_CONNTRACK_CLOSE, TCP_CONNTRACK_CLOSE_WAIT, TCP_CONNTRACK_LAST_ACK, + TCP_CONNTRACK_TIME_WAIT, + TCP_CONNTRACK_CLOSE, TCP_CONNTRACK_LISTEN, - TCP_CONNTRACK_MAX + TCP_CONNTRACK_MAX, + TCP_CONNTRACK_IGNORE +}; + +/* Window scaling is advertised by the sender */ +#define IP_CT_TCP_STATE_FLAG_WINDOW_SCALE 0x01 + +/* SACK is permitted by the sender */ +#define IP_CT_TCP_FLAG_SACK_PERM 0x02 + +struct ip_ct_tcp_state { + u_int32_t td_end; /* max of seq + len */ + u_int32_t td_maxend; /* max of ack + max(win, 1) */ + u_int32_t td_maxwin; /* max(win) */ + u_int8_t td_scale; /* window scale factor */ + u_int8_t loose; /* used when connection picked up from the middle */ + u_int8_t flags; /* per direction state flags */ }; struct ip_ct_tcp { - enum tcp_conntrack state; - - /* Poor man's window tracking: sequence number of valid ACK - handshake completion packet */ - u_int32_t handshake_ack; + struct ip_ct_tcp_state seen[2]; /* connection parameters per direction */ + u_int8_t state; /* state of the connection (enum tcp_conntrack) */ + /* For detecting stale 
connections */ + u_int8_t last_dir; /* Direction of the last packet (enum ip_conntrack_dir) */ + u_int8_t retrans; /* Number of retransmitted packets */ + u_int8_t last_index; /* Index of the last packet */ + u_int32_t last_seq; /* Last sequence number seen in dir */ + u_int32_t last_end; /* Last seq + len */ }; #endif /* _IP_CONNTRACK_TCP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h index 1e7691189c67..3a71176e2060 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h @@ -25,6 +25,9 @@ union ip_conntrack_manip_proto struct { u_int16_t id; } icmp; + struct { + u_int16_t port; + } sctp; }; /* The manipulable part of the tuple. */ @@ -55,6 +58,9 @@ struct ip_conntrack_tuple struct { u_int8_t type, code; } icmp; + struct { + u_int16_t port; + } sctp; } u; /* The protocol. */ diff --git a/include/linux/netfilter_ipv4/ip_nat_helper.h b/include/linux/netfilter_ipv4/ip_nat_helper.h index 185a24a6a047..be6bb082d0ba 100644 --- a/include/linux/netfilter_ipv4/ip_nat_helper.h +++ b/include/linux/netfilter_ipv4/ip_nat_helper.h @@ -38,11 +38,12 @@ struct ip_nat_helper struct ip_nat_info *info); }; -extern struct list_head helpers; - extern int ip_nat_helper_register(struct ip_nat_helper *me); extern void ip_nat_helper_unregister(struct ip_nat_helper *me); +extern struct ip_nat_helper * +ip_nat_find_helper(const struct ip_conntrack_tuple *tuple); + /* These return true or false. */ extern int ip_nat_mangle_tcp_packet(struct sk_buff **skb, struct ip_conntrack *ct, diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index b4c2b2b381c1..02a006f17ac4 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -336,7 +336,6 @@ ipt_get_target(struct ipt_entry *e) * Main firewall chains definitions and global var's definitions. 
*/ #ifdef __KERNEL__ -static DECLARE_MUTEX(ipt_mutex); #include <linux/init.h> extern void ipt_init(void) __init; diff --git a/include/linux/netfilter_ipv4/ipt_sctp.h b/include/linux/netfilter_ipv4/ipt_sctp.h new file mode 100644 index 000000000000..e93a9ec99fc2 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_sctp.h @@ -0,0 +1,107 @@ +#ifndef _IPT_SCTP_H_ +#define _IPT_SCTP_H_ + +#define IPT_SCTP_SRC_PORTS 0x01 +#define IPT_SCTP_DEST_PORTS 0x02 +#define IPT_SCTP_CHUNK_TYPES 0x04 + +#define IPT_SCTP_VALID_FLAGS 0x07 + +#define ELEMCOUNT(x) (sizeof(x)/sizeof(x[0])) + + +struct ipt_sctp_flag_info { + u_int8_t chunktype; + u_int8_t flag; + u_int8_t flag_mask; +}; + +#define IPT_NUM_SCTP_FLAGS 4 + +struct ipt_sctp_info { + u_int16_t dpts[2]; /* Min, Max */ + u_int16_t spts[2]; /* Min, Max */ + + u_int32_t chunkmap[256 / sizeof (u_int32_t)]; /* Bit mask of chunks to be matched according to RFC 2960 */ + +#define SCTP_CHUNK_MATCH_ANY 0x01 /* Match if any of the chunk types are present */ +#define SCTP_CHUNK_MATCH_ALL 0x02 /* Match if all of the chunk types are present */ +#define SCTP_CHUNK_MATCH_ONLY 0x04 /* Match if these are the only chunk types present */ + + u_int32_t chunk_match_type; + struct ipt_sctp_flag_info flag_info[IPT_NUM_SCTP_FLAGS]; + int flag_count; + + u_int32_t flags; + u_int32_t invflags; +}; + +#define bytes(type) (sizeof(type) * 8) + +#define SCTP_CHUNKMAP_SET(chunkmap, type) \ + do { \ + chunkmap[type / bytes(u_int32_t)] |= \ + 1 << (type % bytes(u_int32_t)); \ + } while (0) + +#define SCTP_CHUNKMAP_CLEAR(chunkmap, type) \ + do { \ + chunkmap[type / bytes(u_int32_t)] &= \ + ~(1 << (type % bytes(u_int32_t))); \ + } while (0) + +#define SCTP_CHUNKMAP_IS_SET(chunkmap, type) \ +({ \ + (chunkmap[type / bytes (u_int32_t)] & \ + (1 << (type % bytes (u_int32_t)))) ? 
1: 0; \ +}) + +#define SCTP_CHUNKMAP_RESET(chunkmap) \ + do { \ + int i; \ + for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + chunkmap[i] = 0; \ + } while (0) + +#define SCTP_CHUNKMAP_SET_ALL(chunkmap) \ + do { \ + int i; \ + for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + chunkmap[i] = ~0; \ + } while (0) + +#define SCTP_CHUNKMAP_COPY(destmap, srcmap) \ + do { \ + int i; \ + for (i = 0; i < ELEMCOUNT(chunkmap); i++) \ + destmap[i] = srcmap[i]; \ + } while (0) + +#define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) \ +({ \ + int i; \ + int flag = 1; \ + for (i = 0; i < ELEMCOUNT(chunkmap); i++) { \ + if (chunkmap[i]) { \ + flag = 0; \ + break; \ + } \ + } \ + flag; \ +}) + +#define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) \ +({ \ + int i; \ + int flag = 1; \ + for (i = 0; i < ELEMCOUNT(chunkmap); i++) { \ + if (chunkmap[i] != ~0) { \ + flag = 0; \ + break; \ + } \ + } \ + flag; \ +}) + +#endif /* _IPT_SCTP_H_ */ + diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index f9983d16cc1c..6f70cf3df39a 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -107,10 +107,6 @@ struct ip6t_counters u_int64_t pcnt, bcnt; /* Packet and byte counters */ }; -#ifdef __KERNEL__ -static DECLARE_MUTEX(ip6t_mutex); -#endif - /* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). 
*/ #define IP6T_F_PROTO 0x01 /* Set if rule cares about upper protocols */ diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 670558170bbd..ee61b0f31174 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -99,6 +99,7 @@ enum { #ifdef __KERNEL__ #include <linux/capability.h> +#include <linux/skbuff.h> struct netlink_skb_parms { diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index fb2088f0bd4a..3ae0c6e140af 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -123,6 +123,7 @@ extern void rb_erase(struct rb_node *, struct rb_root *); extern struct rb_node *rb_next(struct rb_node *); extern struct rb_node *rb_prev(struct rb_node *); extern struct rb_node *rb_first(struct rb_root *); +extern struct rb_node *rb_last(struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 724d6841d0ae..256c05c11298 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -89,6 +89,8 @@ #define NET_CALLER(arg) __builtin_return_address(0) #endif +struct net_device; + #ifdef CONFIG_NETFILTER struct nf_conntrack { atomic_t use; @@ -1105,6 +1107,20 @@ extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + int hlen = skb_headlen(skb); + + if (offset + len <= hlen) + return skb->data + offset; + + if (skb_copy_bits(skb, offset, buffer, len) < 0) + return NULL; + + return buffer; +} + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 2ae5058a051d..afd89be7c193 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -411,6 +411,18 @@ enum 
NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT=12, NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT=13, NET_IPV4_NF_CONNTRACK_BUCKETS=14, + NET_IPV4_NF_CONNTRACK_LOG_INVALID=15, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS=16, + NET_IPV4_NF_CONNTRACK_TCP_LOOSE=17, + NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL=18, + NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS=19, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED=20, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT=21, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED=22, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED=23, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT=24, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, }; /* /proc/sys/net/ipv6 */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 48280b138cb9..76ce5f8b6c1e 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -51,6 +51,7 @@ struct inet6_ifaddr struct timer_list timer; struct inet6_dev *idev; + struct rt6_info *rt; struct inet6_ifaddr *lst_next; /* next addr in addr_lst */ struct inet6_ifaddr *if_next; /* next addr in inet6_dev */ @@ -133,6 +134,7 @@ struct ifacaddr6 { struct in6_addr aca_addr; struct inet6_dev *aca_idev; + struct rt6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; atomic_t aca_refcnt; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 14d41c4baa99..319904518194 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -20,6 +20,7 @@ #include <net/dst.h> #include <net/flow.h> #include <linux/rtnetlink.h> +#include <linux/spinlock.h> struct rt6_info; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 75d503dc1b5e..dbfe1d6923fd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -11,8 +11,10 @@ #include <net/flow.h> #include <net/ip6_fib.h> +#include <net/sock.h> #include <linux/tcp.h> #include <linux/ip.h> +#include <linux/ipv6.h> struct pol_chain { int type; @@ -40,6 +42,9 @@ extern int 
ipv6_route_ioctl(unsigned int cmd, void __user *arg); extern int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *, void *rtattr); +extern int ip6_ins_rt(struct rt6_info *, + struct nlmsghdr *, + void *rtattr); extern int ip6_del_rt(struct rt6_info *, struct nlmsghdr *, void *rtattr); @@ -69,6 +74,10 @@ extern struct dst_entry *ndisc_dst_alloc(struct net_device *dev, extern int ndisc_dst_gc(int *more); extern void fib6_force_start_gc(void); +extern struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + int anycast); + /* * support functions for ND * diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f14edafeabbd..59825c399e15 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -242,7 +242,6 @@ extern u32 fib_rules_map_destination(u32 daddr, struct fib_result *res); #ifdef CONFIG_NET_CLS_ROUTE extern u32 fib_rules_tclass(struct fib_result *res); #endif -extern u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags); extern void fib_rules_init(void); #endif diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 464203b2abac..2f1c3783f7ba 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -45,6 +45,7 @@ #include <asm/atomic.h> #include <linux/skbuff.h> +#include <linux/netdevice.h> #include <linux/err.h> #include <linux/sysctl.h> @@ -53,6 +54,8 @@ #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) +struct neighbour; + struct neigh_parms { struct neigh_parms *next; diff --git a/include/net/route.h b/include/net/route.h index a5e9c575ea3e..5e0100185d95 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -73,11 +73,6 @@ struct rtable /* Miscellaneous cached information */ __u32 rt_spec_dst; /* RFC1122 specific destination */ struct inet_peer *peer; /* long-living peer info */ - -#ifdef CONFIG_IP_ROUTE_NAT - __u32 rt_src_map; - __u32 rt_dst_map; 
-#endif }; struct ip_rt_acct diff --git a/include/net/xfrm.h b/include/net/xfrm.h index aaf74999a1f3..756c2016e4a1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -216,7 +216,7 @@ struct xfrm_type void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb); int (*post_input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb); - int (*output)(struct sk_buff **pskb); + int (*output)(struct sk_buff *pskb); /* Estimate maximal size of result of transformation of a dgram */ u32 (*get_max_size)(struct xfrm_state *, int size); }; @@ -304,47 +304,6 @@ extern int xfrm_register_km(struct xfrm_mgr *km); extern int xfrm_unregister_km(struct xfrm_mgr *km); -#define XFRM_FLOWCACHE_HASH_SIZE 1024 - -static inline u32 __flow_hash4(struct flowi *fl) -{ - u32 hash = fl->fl4_src ^ fl->fl_ip_sport; - - hash = ((hash & 0xF0F0F0F0) >> 4) | ((hash & 0x0F0F0F0F) << 4); - - hash ^= fl->fl4_dst ^ fl->fl_ip_dport; - hash ^= (hash >> 10); - hash ^= (hash >> 20); - return hash & (XFRM_FLOWCACHE_HASH_SIZE-1); -} - -static inline u32 __flow_hash6(struct flowi *fl) -{ - u32 hash = fl->fl6_src.s6_addr32[2] ^ - fl->fl6_src.s6_addr32[3] ^ - fl->fl_ip_sport; - - hash = ((hash & 0xF0F0F0F0) >> 4) | ((hash & 0x0F0F0F0F) << 4); - - hash ^= fl->fl6_dst.s6_addr32[2] ^ - fl->fl6_dst.s6_addr32[3] ^ - fl->fl_ip_dport; - hash ^= (hash >> 10); - hash ^= (hash >> 20); - return hash & (XFRM_FLOWCACHE_HASH_SIZE-1); -} - -static inline u32 flow_hash(struct flowi *fl, unsigned short family) -{ - switch (family) { - case AF_INET: - return __flow_hash4(fl); - case AF_INET6: - return __flow_hash6(fl); - } - return 0; /*XXX*/ -} - extern struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2]; static inline void xfrm_pol_hold(struct xfrm_policy *policy) @@ -462,13 +421,51 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) return 1; } +static __inline__ +u16 xfrm_flowi_sport(struct flowi *fl) +{ + u16 
port; + switch(fl->proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + port = fl->fl_ip_sport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + port = htons(fl->fl_icmp_type); + break; + default: + port = 0; /*XXX*/ + } + return port; +} + +static __inline__ +u16 xfrm_flowi_dport(struct flowi *fl) +{ + u16 port; + switch(fl->proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + port = fl->fl_ip_dport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + port = htons(fl->fl_icmp_code); + break; + default: + port = 0; /*XXX*/ + } + return port; +} + static inline int __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl) { return addr_match(&fl->fl4_dst, &sel->daddr, sel->prefixlen_d) && addr_match(&fl->fl4_src, &sel->saddr, sel->prefixlen_s) && - !((fl->fl_ip_dport^sel->dport)&sel->dport_mask) && - !((fl->fl_ip_sport^sel->sport)&sel->sport_mask) && + !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) && + !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) && (fl->proto == sel->proto || !sel->proto) && (fl->oif == sel->ifindex || !sel->ifindex); } @@ -478,8 +475,8 @@ __xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl) { return addr_match(&fl->fl6_dst, &sel->daddr, sel->prefixlen_d) && addr_match(&fl->fl6_src, &sel->saddr, sel->prefixlen_s) && - !((fl->fl_ip_dport^sel->dport)&sel->dport_mask) && - !((fl->fl_ip_sport^sel->sport)&sel->sport_mask) && + !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) && + !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) && (fl->proto == sel->proto || !sel->proto) && (fl->oif == sel->ifindex || !sel->ifindex); } @@ -795,8 +792,6 @@ extern void xfrm4_state_init(void); extern void xfrm4_state_fini(void); extern void xfrm6_state_init(void); extern void xfrm6_state_fini(void); -extern void xfrm6_tunnel_init(void); -extern void xfrm6_tunnel_fini(void); extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); extern struct xfrm_state 
*xfrm_state_alloc(void); @@ -821,6 +816,7 @@ extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff **pskb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); +extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi); extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler); extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler); @@ -852,8 +848,6 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig #endif void xfrm_policy_init(void); -void xfrm4_policy_init(void); -void xfrm6_policy_init(void); struct xfrm_policy *xfrm_policy_alloc(int gfp); extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); diff --git a/include/rxrpc/rxrpc.h b/include/rxrpc/rxrpc.h index df6595c32c37..e9c690964cea 100644 --- a/include/rxrpc/rxrpc.h +++ b/include/rxrpc/rxrpc.h @@ -16,10 +16,17 @@ extern uint32_t rxrpc_epoch; +#ifdef CONFIG_SYSCTL extern int rxrpc_ktrace; extern int rxrpc_kdebug; extern int rxrpc_kproto; extern int rxrpc_knet; +#else +#define rxrpc_ktrace 0 +#define rxrpc_kdebug 0 +#define rxrpc_kproto 0 +#define rxrpc_knet 0 +#endif extern int rxrpc_sysctl_init(void); extern void rxrpc_sysctl_cleanup(void); diff --git a/lib/rbtree.c b/lib/rbtree.c index 621552c344e7..14b791ac5089 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -312,6 +312,19 @@ struct rb_node *rb_first(struct rb_root *root) } EXPORT_SYMBOL(rb_first); +struct rb_node *rb_last(struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} +EXPORT_SYMBOL(rb_last); + struct rb_node *rb_next(struct rb_node *node) { /* If we have a right-hand child, go down and then left as far diff --git 
a/net/Kconfig b/net/Kconfig index 37ee31e7d2a9..cae135013a82 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -324,6 +324,19 @@ config BRIDGE config VLAN_8021Q tristate "802.1Q VLAN Support" + ---help--- + Select this and you will be able to create 802.1Q VLAN interfaces + on your ethernet interfaces. 802.1Q VLAN supports almost + everything a regular ethernet interface does, including + firewalling, bridging, and of course IP traffic. You will need + the 'vconfig' tool from the VLAN project in order to effectively + use VLANs. See the VLAN web page for more information: + <http://www.candelatech.com/~greear/vlan.html> + + To compile this code as a module, choose M here: the module + will be called 8021q. + + If unsure, say N. config DECNET tristate "DECnet Support" diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile index cbe8a6a0bef0..d179728ad522 100644 --- a/net/appletalk/Makefile +++ b/net/appletalk/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_ATALK) += appletalk.o -appletalk-y := aarp.o ddp.o atalk_proc.o +appletalk-y := aarp.o ddp.o +appletalk-$(CONFIG_PROC_FS) += atalk_proc.o appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 1e00a582277d..4d20501fad77 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -15,8 +15,6 @@ #include <net/sock.h> #include <linux/atalk.h> -#ifdef CONFIG_PROC_FS -extern struct file_operations atalk_seq_arp_fops; static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos) { @@ -321,14 +319,3 @@ void __exit atalk_proc_exit(void) remove_proc_entry("arp", atalk_proc_dir); remove_proc_entry("atalk", proc_net); } - -#else /* CONFIG_PROC_FS */ -int __init atalk_proc_init(void) -{ - return 0; -} - -void __exit atalk_proc_exit(void) -{ -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 4185d7b8ed02..588cbe1ec16f 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -61,16 +61,6 @@ 
#include <net/route.h> #include <linux/atalk.h> -extern void aarp_cleanup_module(void); - -extern void aarp_probe_network(struct atalk_iface *atif); -extern int aarp_proxy_probe_network(struct atalk_iface *atif, - struct atalk_addr *sa); -extern void aarp_proxy_remove(struct net_device *dev, struct atalk_addr *sa); - -extern void atalk_register_sysctl(void); -extern void atalk_unregister_sysctl(void); - struct datalink_proto *ddp_dl, *aarp_dl; static struct proto_ops atalk_dgram_ops; diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c index 25b33f670499..af7f0604395d 100644 --- a/net/appletalk/sysctl_net_atalk.c +++ b/net/appletalk/sysctl_net_atalk.c @@ -7,13 +7,9 @@ */ #include <linux/config.h> - -#ifdef CONFIG_SYSCTL #include <linux/sysctl.h> -extern int sysctl_aarp_expiry_time; -extern int sysctl_aarp_tick_time; -extern int sysctl_aarp_retransmit_limit; -extern int sysctl_aarp_resolve_time; +#include <net/sock.h> +#include <linux/atalk.h> static struct ctl_table atalk_table[] = { { @@ -85,13 +81,3 @@ void atalk_unregister_sysctl(void) { unregister_sysctl_table(atalk_table_header); } - -#else /* CONFIG_PROC_FS */ -void atalk_register_sysctl(void) -{ -} - -void atalk_unregister_sysctl(void) -{ -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/atm/clip.c b/net/atm/clip.c index 4417df3fafa6..5de7c1fd73b5 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -47,8 +47,8 @@ #endif -struct net_device *clip_devs = NULL; -struct atm_vcc *atmarpd = NULL; +static struct net_device *clip_devs; +static struct atm_vcc *atmarpd; static struct neigh_table clip_tbl; static struct timer_list idle_timer; static int start_timer = 1; diff --git a/net/atm/ipcommon.h b/net/atm/ipcommon.h index bc1675eca081..d72165f60939 100644 --- a/net/atm/ipcommon.h +++ b/net/atm/ipcommon.h @@ -12,9 +12,6 @@ #include <linux/netdevice.h> #include <linux/atmdev.h> - -extern struct net_device *clip_devs; - /* * Appends all skbs from "from" to "to". 
The operation is atomic with respect * to all other skb operations on "from" or "to". diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 727311dfd884..78aa491ceb90 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -73,20 +73,27 @@ static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash static int get_ip_dst(const struct sk_buff *skb, uint32_t *addr) { if (skb->mac.ethernet->h_proto == __constant_htons(ETH_P_IP)) { - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, 0, &iph, sizeof(iph))) + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) return -1; - *addr = iph.daddr; + *addr = ih->daddr; } else if (skb->mac.ethernet->h_proto == __constant_htons(ETH_P_ARP)) { - struct arphdr arph; + struct arphdr _arph, *ah; + uint32_t buf, *bp; - if (skb_copy_bits(skb, 0, &arph, sizeof(arph)) || - arph.ar_pln != sizeof(uint32_t) || arph.ar_hln != ETH_ALEN) + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(uint32_t) || + ah->ar_hln != ETH_ALEN) return -1; - if (skb_copy_bits(skb, sizeof(struct arphdr) + - 2 * ETH_ALEN + sizeof(uint32_t), addr, sizeof(uint32_t))) + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + 2 * ETH_ALEN + sizeof(uint32_t), + sizeof(uint32_t), &buf); + if (bp == NULL) return -1; + *addr = *bp; } return 0; } @@ -94,20 +101,26 @@ static int get_ip_dst(const struct sk_buff *skb, uint32_t *addr) static int get_ip_src(const struct sk_buff *skb, uint32_t *addr) { if (skb->mac.ethernet->h_proto == __constant_htons(ETH_P_IP)) { - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, 0, &iph, sizeof(iph))) + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) return -1; - *addr = iph.saddr; + *addr = ih->saddr; } else if (skb->mac.ethernet->h_proto == __constant_htons(ETH_P_ARP)) { - struct arphdr arph; + struct arphdr _arph, *ah; + uint32_t buf, 
*bp; - if (skb_copy_bits(skb, 0, &arph, sizeof(arph)) || - arph.ar_pln != sizeof(uint32_t) || arph.ar_hln != ETH_ALEN) + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(uint32_t) || + ah->ar_hln != ETH_ALEN) return -1; - if (skb_copy_bits(skb, sizeof(struct arphdr) + - ETH_ALEN, addr, sizeof(uint32_t))) + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + ETH_ALEN, sizeof(uint32_t), &buf); + if (bp == NULL) return -1; + *addr = *bp; } return 0; } diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index eb675848fbc3..e913cac50066 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -19,72 +19,79 @@ static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in const struct net_device *out, const void *data, unsigned int datalen) { struct ebt_arp_info *info = (struct ebt_arp_info *)data; - struct arphdr arph; + struct arphdr _arph, *ah; - if (skb_copy_bits(skb, 0, &arph, sizeof(arph))) + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode != - arph.ar_op, EBT_ARP_OPCODE)) + ah->ar_op, EBT_ARP_OPCODE)) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype != - arph.ar_hrd, EBT_ARP_HTYPE)) + ah->ar_hrd, EBT_ARP_HTYPE)) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype != - arph.ar_pro, EBT_ARP_PTYPE)) + ah->ar_pro, EBT_ARP_PTYPE)) return EBT_NOMATCH; if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP)) { - uint32_t addr; + uint32_t _addr, *ap; /* IPv4 addresses are always 4 bytes */ - if (arph.ar_pln != sizeof(uint32_t)) + if (ah->ar_pln != sizeof(uint32_t)) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_SRC_IP) { - if (skb_copy_bits(skb, sizeof(struct arphdr) + - arph.ar_hln, &addr, sizeof(addr))) + ap = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln, sizeof(_addr), + &_addr); + if 
(ap == NULL) return EBT_NOMATCH; - if (FWINV(info->saddr != (addr & info->smsk), + if (FWINV(info->saddr != (*ap & info->smsk), EBT_ARP_SRC_IP)) return EBT_NOMATCH; } if (info->bitmask & EBT_ARP_DST_IP) { - if (skb_copy_bits(skb, sizeof(struct arphdr) + - 2*arph.ar_hln + sizeof(uint32_t), &addr, - sizeof(addr))) + ap = skb_header_pointer(skb, sizeof(struct arphdr) + + 2*ah->ar_hln+sizeof(uint32_t), + sizeof(_addr), &_addr); + if (ap == NULL) return EBT_NOMATCH; - if (FWINV(info->daddr != (addr & info->dmsk), + if (FWINV(info->daddr != (*ap & info->dmsk), EBT_ARP_DST_IP)) return EBT_NOMATCH; } } if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { - unsigned char mac[ETH_ALEN]; + unsigned char _mac[ETH_ALEN], *mp; uint8_t verdict, i; /* MAC addresses are 6 bytes */ - if (arph.ar_hln != ETH_ALEN) + if (ah->ar_hln != ETH_ALEN) return EBT_NOMATCH; if (info->bitmask & EBT_ARP_SRC_MAC) { - if (skb_copy_bits(skb, sizeof(struct arphdr), &mac, - ETH_ALEN)) + mp = skb_header_pointer(skb, sizeof(struct arphdr), + sizeof(_mac), &_mac); + if (mp == NULL) return EBT_NOMATCH; verdict = 0; for (i = 0; i < 6; i++) - verdict |= (mac[i] ^ info->smaddr[i]) & + verdict |= (mp[i] ^ info->smaddr[i]) & info->smmsk[i]; if (FWINV(verdict != 0, EBT_ARP_SRC_MAC)) return EBT_NOMATCH; } if (info->bitmask & EBT_ARP_DST_MAC) { - if (skb_copy_bits(skb, sizeof(struct arphdr) + - arph.ar_hln + arph.ar_pln, &mac, ETH_ALEN)) + mp = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln + ah->ar_pln, + sizeof(_mac), &_mac); + if (mp == NULL) return EBT_NOMATCH; verdict = 0; for (i = 0; i < 6; i++) - verdict |= (mac[i] ^ info->dmaddr[i]) & + verdict |= (mp[i] ^ info->dmaddr[i]) & info->dmmsk[i]; if (FWINV(verdict != 0, EBT_ARP_DST_MAC)) return EBT_NOMATCH; diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 87ba30dd090f..95189f02fcc0 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -20,30 +20,38 @@ static 
int ebt_target_reply(struct sk_buff **pskb, unsigned int hooknr, const void *data, unsigned int datalen) { struct ebt_arpreply_info *info = (struct ebt_arpreply_info *)data; - u32 sip, dip; - struct arphdr ah; - unsigned char sha[ETH_ALEN]; + u32 _sip, *siptr, _dip, *diptr; + struct arphdr _ah, *ap; + unsigned char _sha[ETH_ALEN], *shp; struct sk_buff *skb = *pskb; - if (skb_copy_bits(skb, 0, &ah, sizeof(ah))) + ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); + if (ap == NULL) return EBT_DROP; - if (ah.ar_op != __constant_htons(ARPOP_REQUEST) || ah.ar_hln != ETH_ALEN - || ah.ar_pro != __constant_htons(ETH_P_IP) || ah.ar_pln != 4) + if (ap->ar_op != __constant_htons(ARPOP_REQUEST) || + ap->ar_hln != ETH_ALEN || + ap->ar_pro != __constant_htons(ETH_P_IP) || + ap->ar_pln != 4) return EBT_CONTINUE; - if (skb_copy_bits(skb, sizeof(ah), &sha, ETH_ALEN)) + shp = skb_header_pointer(skb, sizeof(_ah), ETH_ALEN, &_sha); + if (shp == NULL) return EBT_DROP; - if (skb_copy_bits(skb, sizeof(ah) + ETH_ALEN, &sip, sizeof(sip))) + siptr = skb_header_pointer(skb, sizeof(_ah) + ETH_ALEN, + sizeof(_sip), &_sip); + if (siptr == NULL) return EBT_DROP; - if (skb_copy_bits(skb, sizeof(ah) + 2 * ETH_ALEN + sizeof(sip), - &dip, sizeof(dip))) + diptr = skb_header_pointer(skb, + sizeof(_ah) + 2 * ETH_ALEN + sizeof(_sip), + sizeof(_dip), &_dip); + if (diptr == NULL) return EBT_DROP; - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, (struct net_device *)in, - dip, sha, info->mac, sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)in, + *diptr, shp, info->mac, shp); return info->target; } diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 7bab7d065bd3..0b2f19943dac 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -28,41 +28,44 @@ static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in, unsigned int datalen) { struct ebt_ip_info *info = (struct ebt_ip_info *)data; - union {struct iphdr iph; struct 
tcpudphdr ports;} u; + struct iphdr _iph, *ih; + struct tcpudphdr _ports, *pptr; - if (skb_copy_bits(skb, 0, &u.iph, sizeof(u.iph))) + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) return EBT_NOMATCH; if (info->bitmask & EBT_IP_TOS && - FWINV(info->tos != u.iph.tos, EBT_IP_TOS)) + FWINV(info->tos != ih->tos, EBT_IP_TOS)) return EBT_NOMATCH; if (info->bitmask & EBT_IP_SOURCE && - FWINV((u.iph.saddr & info->smsk) != + FWINV((ih->saddr & info->smsk) != info->saddr, EBT_IP_SOURCE)) return EBT_NOMATCH; if ((info->bitmask & EBT_IP_DEST) && - FWINV((u.iph.daddr & info->dmsk) != + FWINV((ih->daddr & info->dmsk) != info->daddr, EBT_IP_DEST)) return EBT_NOMATCH; if (info->bitmask & EBT_IP_PROTO) { - if (FWINV(info->protocol != u.iph.protocol, EBT_IP_PROTO)) + if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO)) return EBT_NOMATCH; if (!(info->bitmask & EBT_IP_DPORT) && !(info->bitmask & EBT_IP_SPORT)) return EBT_MATCH; - if (skb_copy_bits(skb, u.iph.ihl*4, &u.ports, - sizeof(u.ports))) + pptr = skb_header_pointer(skb, ih->ihl*4, + sizeof(_ports), &_ports); + if (pptr == NULL) return EBT_NOMATCH; if (info->bitmask & EBT_IP_DPORT) { - u.ports.dst = ntohs(u.ports.dst); - if (FWINV(u.ports.dst < info->dport[0] || - u.ports.dst > info->dport[1], + u32 dst = ntohs(pptr->dst); + if (FWINV(dst < info->dport[0] || + dst > info->dport[1], EBT_IP_DPORT)) return EBT_NOMATCH; } if (info->bitmask & EBT_IP_SPORT) { - u.ports.src = ntohs(u.ports.src); - if (FWINV(u.ports.src < info->sport[0] || - u.ports.src > info->sport[1], + u32 src = ntohs(pptr->src); + if (FWINV(src < info->sport[0] || + src > info->sport[1], EBT_IP_SPORT)) return EBT_NOMATCH; } diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 2da7c682744d..407dfdbaf688 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -78,23 +78,29 @@ static void ebt_log(const struct sk_buff *skb, const struct net_device *in, if ((info->bitmask & 
EBT_LOG_IP) && skb->mac.ethernet->h_proto == htons(ETH_P_IP)){ - if (skb_copy_bits(skb, 0, &u.iph, sizeof(u.iph))) { + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) { printk(" INCOMPLETE IP header"); goto out; } printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", - NIPQUAD(u.iph.saddr), NIPQUAD(u.iph.daddr)); + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, - u.iph.protocol); - if (u.iph.protocol == IPPROTO_TCP || - u.iph.protocol == IPPROTO_UDP) { - if (skb_copy_bits(skb, u.iph.ihl*4, &u.ports, - sizeof(u.ports))) { + ih->protocol); + if (ih->protocol == IPPROTO_TCP || + ih->protocol == IPPROTO_UDP) { + struct tcpudphdr _ports, *pptr; + + pptr = skb_header_pointer(skb, ih->ihl*4, + sizeof(_ports), &_ports); + if (pptr == NULL) { printk(" INCOMPLETE TCP/UDP header"); goto out; } - printk(" SPT=%u DPT=%u", ntohs(u.ports.src), - ntohs(u.ports.dst)); + printk(" SPT=%u DPT=%u", ntohs(pptr->src), + ntohs(pptr->dst)); } goto out; } @@ -102,32 +108,38 @@ static void ebt_log(const struct sk_buff *skb, const struct net_device *in, if ((info->bitmask & EBT_LOG_ARP) && ((skb->mac.ethernet->h_proto == __constant_htons(ETH_P_ARP)) || (skb->mac.ethernet->h_proto == __constant_htons(ETH_P_RARP)))) { - if (skb_copy_bits(skb, 0, &u.arph, sizeof(u.arph))) { + struct arphdr _arph, *ah; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { printk(" INCOMPLETE ARP header"); goto out; } printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d", - ntohs(u.arph.ar_hrd), ntohs(u.arph.ar_pro), - ntohs(u.arph.ar_op)); + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), + ntohs(ah->ar_op)); /* If it's for Ethernet and the lengths are OK, * then log the ARP payload */ - if (u.arph.ar_hrd == __constant_htons(1) && - u.arph.ar_hln == ETH_ALEN && - u.arph.ar_pln == sizeof(uint32_t)) { - if (skb_copy_bits(skb, sizeof(u.arph), &u.arpp, - sizeof(u.arpp))) { + if (ah->ar_hrd == __constant_htons(1) && + 
ah->ar_hln == ETH_ALEN && + ah->ar_pln == sizeof(uint32_t)) { + struct arppayload _arpp, *ap; + + ap = skb_header_pointer(skb, sizeof(u.arph), + sizeof(_arpp), &_arpp); + if (ap == NULL) { printk(" INCOMPLETE ARP payload"); goto out; } printk(" ARP MAC SRC="); - print_MAC(u.arpp.mac_src); + print_MAC(ap->mac_src); printk(" ARP IP SRC=%u.%u.%u.%u", - myNIPQUAD(u.arpp.ip_src)); + myNIPQUAD(ap->ip_src)); printk(" ARP MAC DST="); - print_MAC(u.arpp.mac_dst); + print_MAC(ap->mac_dst); printk(" ARP IP DST=%u.%u.%u.%u", - myNIPQUAD(u.arpp.ip_dst)); + myNIPQUAD(ap->ip_dst)); } } out: diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index d0299efa1001..f8a8cdec16ee 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -122,26 +122,30 @@ static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in const struct net_device *out, const void *data, unsigned int datalen) { struct ebt_stp_info *info = (struct ebt_stp_info *)data; - struct stp_header stph; + struct stp_header _stph, *sp; uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; - if (skb_copy_bits(skb, 0, &stph, sizeof(stph))) + + sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph); + if (sp == NULL) return EBT_NOMATCH; /* The stp code only considers these */ - if (memcmp(&stph, header, sizeof(header))) + if (memcmp(sp, header, sizeof(header))) return EBT_NOMATCH; if (info->bitmask & EBT_STP_TYPE - && FWINV(info->type != stph.type, EBT_STP_TYPE)) + && FWINV(info->type != sp->type, EBT_STP_TYPE)) return EBT_NOMATCH; - if (stph.type == BPDU_TYPE_CONFIG && + if (sp->type == BPDU_TYPE_CONFIG && info->bitmask & EBT_STP_CONFIG_MASK) { - struct stp_config_pdu stpc; + struct stp_config_pdu _stpc, *st; - if (skb_copy_bits(skb, sizeof(stph), &stpc, sizeof(stpc))) - return EBT_NOMATCH; - return ebt_filter_config(info, &stpc); + st = skb_header_pointer(skb, sizeof(_stph), + sizeof(_stpc), &_stpc); + if (st == NULL) + return EBT_NOMATCH; + return 
ebt_filter_config(info, st); } return EBT_MATCH; } diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index ec111772bbe9..da30941db2ef 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -21,13 +21,14 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_vlan.h> -static unsigned char debug; +static int debug; #define MODULE_VERS "0.6" -MODULE_PARM(debug, "0-1b"); +module_param(debug, int, 0); MODULE_PARM_DESC(debug, "debug=1 is turn on debug messages"); MODULE_AUTHOR("Nick Fedchik <nick@fedchik.org.ua>"); MODULE_DESCRIPTION("802.1Q match module (ebtables extension), v" @@ -48,7 +49,7 @@ ebt_filter_vlan(const struct sk_buff *skb, const void *data, unsigned int datalen) { struct ebt_vlan_info *info = (struct ebt_vlan_info *) data; - struct vlan_hdr frame; + struct vlan_hdr _frame, *fp; unsigned short TCI; /* Whole TCI, given from parsed frame */ unsigned short id; /* VLAN ID, given from frame TCI */ @@ -56,7 +57,8 @@ ebt_filter_vlan(const struct sk_buff *skb, /* VLAN encapsulated Type/Length field, given from orig frame */ unsigned short encap; - if (skb_copy_bits(skb, 0, &frame, sizeof(frame))) + fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); + if (fp == NULL) return EBT_NOMATCH; /* Tag Control Information (TCI) consists of the following elements: @@ -66,10 +68,10 @@ ebt_filter_vlan(const struct sk_buff *skb, * (CFI) is a single bit flag value. Currently ignored. * - VLAN Identifier (VID). The VID is encoded as * an unsigned binary number. 
*/ - TCI = ntohs(frame.h_vlan_TCI); + TCI = ntohs(fp->h_vlan_TCI); id = TCI & VLAN_VID_MASK; prio = (TCI >> 13) & 0x7; - encap = frame.h_vlan_encapsulated_proto; + encap = fp->h_vlan_encapsulated_proto; /* Checking VLAN Identifier (VID) */ if (GET_BITMASK(EBT_VLAN_ID)) diff --git a/net/core/dev.c b/net/core/dev.c index a67e65a0f267..da7fabc7aa26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3279,6 +3279,8 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_random_init(); + if (dev_proc_init()) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 9c2a95080768..f3b88205ace2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -183,9 +183,10 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) continue; } } else { - u32 tmp; - if (!skb_copy_bits(skb, k, &tmp, 4)) { - A = ntohl(tmp); + u32 _tmp, *p; + p = skb_header_pointer(skb, k, 4, &_tmp); + if (p != NULL) { + A = ntohl(*p); continue; } } @@ -208,9 +209,10 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) continue; } } else { - u16 tmp; - if (!skb_copy_bits(skb, k, &tmp, 2)) { - A = ntohs(tmp); + u16 _tmp, *p; + p = skb_header_pointer(skb, k, 2, &_tmp); + if (p != NULL) { + A = ntohs(*p); continue; } } @@ -233,9 +235,10 @@ load_b: continue; } } else { - u8 tmp; - if (!skb_copy_bits(skb, k, &tmp, 1)) { - A = tmp; + u8 _tmp, *p; + p = skb_header_pointer(skb, k, 1, &_tmp); + if (p != NULL) { + A = *p; continue; } } diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 09d10722632b..a81816a57827 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -695,11 +695,12 @@ int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) /* DaveM says protocol headers are also modifiable. 
*/ switch ((*pskb)->nh.iph->protocol) { case IPPROTO_TCP: { - struct tcphdr hdr; - if (skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4, - &hdr, sizeof(hdr)) != 0) + struct tcphdr _hdr, *hp; + hp = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_hdr), &_hdr); + if (hp == NULL) goto copy_skb; - if (writable_len <= (*pskb)->nh.iph->ihl*4 + hdr.doff*4) + if (writable_len <= (*pskb)->nh.iph->ihl*4 + hp->doff*4) goto pull_skb; goto copy_skb; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index ec4dea2d7f7e..5276e4134f87 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> +#include <linux/rcupdate.h> #include <net/tcp.h> #include <net/udp.h> @@ -572,16 +573,18 @@ int netpoll_setup(struct netpoll *np) memcpy(np->local_mac, ndev->dev_addr, 6); if (!np->local_ip) { - in_dev = in_dev_get(ndev); + rcu_read_lock(); + in_dev = __in_dev_get(ndev); if (!in_dev) { + rcu_read_unlock(); printk(KERN_ERR "%s: no IP address for %s, aborting\n", np->name, np->dev_name); goto release; } np->local_ip = ntohl(in_dev->ifa_list->ifa_local); - in_dev_put(in_dev); + rcu_read_unlock(); printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", np->name, HIPQUAD(np->local_ip)); } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d1a62cddb31a..d3ba2c75e530 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -70,6 +70,7 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/inet.h> +#include <linux/rcupdate.h> #include <asm/byteorder.h> #include <asm/bitops.h> #include <asm/io.h> @@ -263,14 +264,17 @@ static struct net_device *setup_inject(struct pktgen_info* info) info->saddr_min = 0; info->saddr_max = 0; if (strlen(info->src_min) == 0) { - struct in_device *in_dev = in_dev_get(odev); + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(odev); if (in_dev) { if (in_dev->ifa_list) { info->saddr_min = in_dev->ifa_list->ifa_address; info->saddr_max = 
info->saddr_min; } - in_dev_put(in_dev); } + rcu_read_unlock(); } else { info->saddr_min = in_aton(info->src_min); diff --git a/net/core/utils.c b/net/core/utils.c index 8058d9c5e236..6093174581fd 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -19,22 +19,116 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/random.h> +#include <linux/percpu.h> #include <asm/system.h> #include <asm/uaccess.h> -static unsigned long net_rand_seed = 152L; + +/* + This is a maximally equidistributed combined Tausworthe generator + based on code from GNU Scientific Library 1.5 (30 Jun 2004) + + x_n = (s1_n ^ s2_n ^ s3_n) + + s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) + s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) + s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + + The period of this generator is about 2^88. + + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + + This is available on the net from L'Ecuyer's home page, + + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + + There is an erratum in the paper "Tables of Maximally + Equidistributed Combined LFSR Generators", Mathematics of + Computation, 68, 225 (1999), 261--269: + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + + ... the k_j most significant bits of z_j must be non- + zero, for each j. (Note: this restriction also applies to the + computer code given in [4], but was mistakenly not mentioned in + that paper.) + + This affects the seeding procedure by imposing the requirement + s1 > 1, s2 > 7, s3 > 15. 
+ +*/ +struct nrnd_state { + u32 s1, s2, s3; +}; + +static DEFINE_PER_CPU(struct nrnd_state, net_rand_state); + +static u32 __net_random(struct nrnd_state *state) +{ +#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b) + + state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); + state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); + state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + + return (state->s1 ^ state->s2 ^ state->s3); +} + +static void __net_srandom(struct nrnd_state *state, unsigned long entropy) +{ + u32 s = state->s1 ^ entropy; + + if (s == 0) + s = 1; /* default seed is 1 */ + +#define LCG(n) (69069 * n) + state->s1 = LCG(s); + state->s2 = LCG(state->s1); + state->s3 = LCG(state->s2); + + /* "warm it up" */ + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); +} + unsigned long net_random(void) { - net_rand_seed=net_rand_seed*69069L+1; - return net_rand_seed^jiffies; + unsigned long r; + struct nrnd_state *state = &get_cpu_var(net_rand_state); + r = __net_random(state); + put_cpu_var(state); + return r; } + void net_srandom(unsigned long entropy) { - net_rand_seed ^= entropy; - net_random(); + struct nrnd_state *state = &get_cpu_var(net_rand_state); + __net_srandom(state, entropy); + put_cpu_var(state); +} + +void __init net_random_init(void) +{ + int i; + unsigned long seed[NR_CPUS]; + + get_random_bytes(seed, sizeof(seed)); + + for (i = 0; i < NR_CPUS; i++) { + struct nrnd_state *state = &per_cpu(net_rand_state,i); + + memset(state, 0, sizeof(*state)); + __net_srandom(state, seed[i]); + } } int net_msg_cost = 5*HZ; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 11b0f0c6d45c..5a05efb83092 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -247,21 +247,6 @@ static struct dn_dev_sysctl_table { }, {0}} }; -static inline __u16 mtu2blksize(struct net_device *dev) -{ - u32 blksize = dev->mtu; - if (blksize > 0xffff) 
- blksize = 0xffff; - - if (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_PPP || - dev->type == ARPHRD_IPGRE || - dev->type == ARPHRD_LOOPBACK) - blksize -= 2; - - return (__u16)blksize; -} - static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) { struct dn_dev_sysctl_table *t; @@ -314,52 +299,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } } -struct net_device *dn_dev_get_default(void) -{ - struct net_device *dev; - read_lock(&dndev_lock); - dev = decnet_default_device; - if (dev) { - if (dev->dn_ptr) - dev_hold(dev); - else - dev = NULL; - } - read_unlock(&dndev_lock); - return dev; -} - -int dn_dev_set_default(struct net_device *dev, int force) -{ - struct net_device *old = NULL; - int rv = -EBUSY; - if (!dev->dn_ptr) - return -ENODEV; - write_lock(&dndev_lock); - if (force || decnet_default_device == NULL) { - old = decnet_default_device; - decnet_default_device = dev; - rv = 0; - } - write_unlock(&dndev_lock); - if (old) - dev_put(dev); - return rv; -} - -static void dn_dev_check_default(struct net_device *dev) -{ - write_lock(&dndev_lock); - if (dev == decnet_default_device) { - decnet_default_device = NULL; - } else { - dev = NULL; - } - write_unlock(&dndev_lock); - if (dev) - dev_put(dev); -} - static int dn_forwarding_proc(ctl_table *table, int write, struct file *filep, void __user *buffer, @@ -454,6 +393,21 @@ static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms * #endif /* CONFIG_SYSCTL */ +static inline __u16 mtu2blksize(struct net_device *dev) +{ + u32 blksize = dev->mtu; + if (blksize > 0xffff) + blksize = 0xffff; + + if (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_PPP || + dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_LOOPBACK) + blksize -= 2; + + return (__u16)blksize; +} + static struct dn_ifaddr *dn_dev_alloc_ifa(void) { struct dn_ifaddr *ifa; @@ -635,6 +589,52 @@ rarok: goto done; } +struct net_device *dn_dev_get_default(void) +{ + struct 
net_device *dev; + read_lock(&dndev_lock); + dev = decnet_default_device; + if (dev) { + if (dev->dn_ptr) + dev_hold(dev); + else + dev = NULL; + } + read_unlock(&dndev_lock); + return dev; +} + +int dn_dev_set_default(struct net_device *dev, int force) +{ + struct net_device *old = NULL; + int rv = -EBUSY; + if (!dev->dn_ptr) + return -ENODEV; + write_lock(&dndev_lock); + if (force || decnet_default_device == NULL) { + old = decnet_default_device; + decnet_default_device = dev; + rv = 0; + } + write_unlock(&dndev_lock); + if (old) + dev_put(dev); + return rv; +} + +static void dn_dev_check_default(struct net_device *dev) +{ + write_lock(&dndev_lock); + if (dev == decnet_default_device) { + decnet_default_device = NULL; + } else { + dev = NULL; + } + write_unlock(&dndev_lock); + if (dev) + dev_put(dev); +} + static struct dn_dev *dn_dev_by_index(int ifindex) { struct net_device *dev; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 07b4cff2f44d..fc31ae1209d1 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -39,6 +39,7 @@ #include <net/udp.h> #include <net/ip.h> #include <linux/spinlock.h> +#include <linux/rcupdate.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -401,16 +402,17 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, y.x maps to IP a.b.c.x. This should be replaced with something more flexible and more aware of subnet masks. */ { - struct in_device *idev = in_dev_get(dev); + struct in_device *idev; unsigned long network = 0; + + rcu_read_lock(); + idev = __in_dev_get(dev); if (idev) { - read_lock(&idev->lock); if (idev->ifa_list) network = ntohl(idev->ifa_list->ifa_address) & 0xffffff00; /* !!! 
*/ - read_unlock(&idev->lock); - in_dev_put(idev); } + rcu_read_unlock(); udpdest.sin_addr.s_addr = htonl(network | addr.station); } diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index de00c668c98c..fc370970ba83 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -82,16 +82,6 @@ config IP_ROUTE_FWMARK If you say Y here, you will be able to specify different routes for packets with different mark values (see iptables(8), MARK target). -config IP_ROUTE_NAT - bool "IP: fast network address translation" - depends on IP_MULTIPLE_TABLES - help - If you say Y here, your router will be able to modify source and - destination addresses of packets that pass through it, in a manner - you specify. General information about Network Address Translation - can be gotten from the document - <http://www.hasenstein.com/linux-ip-nat/diplom/nat.html>. - config IP_ROUTE_MULTIPATH bool "IP: equal cost multipath" depends on IP_ADVANCED_ROUTER @@ -187,7 +177,7 @@ config IP_PNP_RARP config NET_IPIP tristate "IP: tunneling" depends on INET - select XFRM + select INET_TUNNEL ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -351,6 +341,7 @@ config INET_ESP config INET_IPCOMP tristate "IP: IPComp transformation" select XFRM + select INET_TUNNEL select CRYPTO select CRYPTO_DEFLATE ---help--- @@ -359,5 +350,14 @@ config INET_IPCOMP If unsure, say Y. +config INET_TUNNEL + tristate "IP: tunnel transformation" + select XFRM + ---help--- + Support for generic IP tunnel transformation, which is required by + the IP tunneling module as well as tunnel mode IPComp. + + If unsure, say Y. 
+ source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 0fe409afa094..a7a7a35574d4 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -19,9 +19,10 @@ obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o +obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ - xfrm4_tunnel.o xfrm4_output.o + xfrm4_output.o diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 9784f0376980..970fe58b4880 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -53,10 +53,10 @@ static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr) return 0; } -static int ah_output(struct sk_buff **pskb) +static int ah_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; @@ -66,7 +66,7 @@ static int ah_output(struct sk_buff **pskb) char buf[60]; } tmp_iph; - top_iph = (*pskb)->nh.iph; + top_iph = skb->nh.iph; iph = &tmp_iph.iph; iph->tos = top_iph->tos; @@ -85,7 +85,7 @@ static int ah_output(struct sk_buff **pskb) ah->nexthdr = top_iph->protocol; top_iph->tos = 0; - top_iph->tot_len = htons((*pskb)->len); + top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; top_iph->protocol = IPPROTO_AH; @@ -98,7 +98,7 @@ static int ah_output(struct sk_buff **pskb) ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(++x->replay.oseq); - ahp->icv(ahp, *pskb, ah->auth_data); + ahp->icv(ahp, skb, ah->auth_data); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; @@ -116,7 +116,7 @@ error: return err; } -int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +static int ah_input(struct xfrm_state *x, struct xfrm_decap_state 
*decap, struct sk_buff *skb) { int ah_hlen; struct iphdr *iph; @@ -184,7 +184,7 @@ out: return -EINVAL; } -void ah4_err(struct sk_buff *skb, u32 info) +static void ah4_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr*)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); @@ -214,6 +214,9 @@ static int ah_init_state(struct xfrm_state *x, void *args) if (x->aalg->alg_key_len > 512) goto error; + if (x->encap) + goto error; + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); if (ahp == NULL) return -ENOMEM; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4781dea42dfe..fc9930460864 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -88,31 +88,31 @@ static void devinet_sysctl_register(struct in_device *in_dev, static void devinet_sysctl_unregister(struct ipv4_devconf *p); #endif -int inet_ifa_count; -int inet_dev_count; - /* Locks all the inet devices. */ -rwlock_t inetdev_lock = RW_LOCK_UNLOCKED; - static struct in_ifaddr *inet_alloc_ifa(void) { struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); if (ifa) { memset(ifa, 0, sizeof(*ifa)); - inet_ifa_count++; + INIT_RCU_HEAD(&ifa->rcu_head); } return ifa; } -static __inline__ void inet_free_ifa(struct in_ifaddr *ifa) +static void inet_rcu_free_ifa(struct rcu_head *head) { + struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); if (ifa->ifa_dev) - __in_dev_put(ifa->ifa_dev); + in_dev_put(ifa->ifa_dev); kfree(ifa); - inet_ifa_count--; +} + +static inline void inet_free_ifa(struct in_ifaddr *ifa) +{ + call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); } void in_dev_finish_destroy(struct in_device *idev) @@ -129,7 +129,6 @@ void in_dev_finish_destroy(struct in_device *idev) if (!idev->dead) printk("Freeing alive in_device %p\n", idev); else { - inet_dev_count--; kfree(idev); } } @@ -144,24 +143,24 @@ struct in_device *inetdev_init(struct net_device *dev) if (!in_dev) goto out; memset(in_dev, 0, sizeof(*in_dev)); - in_dev->lock = 
RW_LOCK_UNLOCKED; + INIT_RCU_HEAD(&in_dev->rcu_head); memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; - inet_dev_count++; /* Reference in_dev->dev */ dev_hold(dev); #ifdef CONFIG_SYSCTL neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4", NULL); #endif - write_lock_bh(&inetdev_lock); - dev->ip_ptr = in_dev; + /* Account for reference dev->ip_ptr */ in_dev_hold(in_dev); - write_unlock_bh(&inetdev_lock); + smp_wmb(); + dev->ip_ptr = in_dev; + #ifdef CONFIG_SYSCTL devinet_sysctl_register(in_dev, &in_dev->cnf); #endif @@ -176,6 +175,12 @@ out_kfree: goto out; } +static void in_dev_rcu_put(struct rcu_head *head) +{ + struct in_device *idev = container_of(head, struct in_device, rcu_head); + in_dev_put(idev); +} + static void inetdev_destroy(struct in_device *in_dev) { struct in_ifaddr *ifa; @@ -194,30 +199,28 @@ static void inetdev_destroy(struct in_device *in_dev) #ifdef CONFIG_SYSCTL devinet_sysctl_unregister(&in_dev->cnf); #endif - write_lock_bh(&inetdev_lock); + in_dev->dev->ip_ptr = NULL; - /* in_dev_put following below will kill the in_device */ - write_unlock_bh(&inetdev_lock); #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(in_dev->arp_parms); #endif neigh_parms_release(&arp_tbl, in_dev->arp_parms); - in_dev_put(in_dev); + call_rcu(&in_dev->rcu_head, in_dev_rcu_put); } int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) { - read_lock(&in_dev->lock); + rcu_read_lock(); for_primary_ifa(in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { - read_unlock(&in_dev->lock); + rcu_read_unlock(); return 1; } } } endfor_ifa(in_dev); - read_unlock(&in_dev->lock); + rcu_read_unlock(); return 0; } @@ -241,9 +244,8 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, ifap1 = &ifa->ifa_next; continue; } - write_lock_bh(&in_dev->lock); + *ifap1 = 
ifa->ifa_next; - write_unlock_bh(&in_dev->lock); rtmsg_ifa(RTM_DELADDR, ifa); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); @@ -253,9 +255,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, /* 2. Unlink it */ - write_lock_bh(&in_dev->lock); *ifap = ifa1->ifa_next; - write_unlock_bh(&in_dev->lock); /* 3. Announce address deletion */ @@ -317,9 +317,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa) } ifa->ifa_next = *ifap; - write_lock_bh(&in_dev->lock); *ifap = ifa; - write_unlock_bh(&in_dev->lock); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -771,12 +769,11 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope) u32 addr = 0; struct in_device *in_dev; - read_lock(&inetdev_lock); + rcu_read_lock(); in_dev = __in_dev_get(dev); if (!in_dev) - goto out_unlock_inetdev; + goto no_in_dev; - read_lock(&in_dev->lock); for_primary_ifa(in_dev) { if (ifa->ifa_scope > scope) continue; @@ -787,8 +784,8 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope) if (!addr) addr = ifa->ifa_local; } endfor_ifa(in_dev); - read_unlock(&in_dev->lock); - read_unlock(&inetdev_lock); +no_in_dev: + rcu_read_unlock(); if (addr) goto out; @@ -798,30 +795,24 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope) in dev_base list. 
*/ read_lock(&dev_base_lock); - read_lock(&inetdev_lock); + rcu_read_lock(); for (dev = dev_base; dev; dev = dev->next) { if ((in_dev = __in_dev_get(dev)) == NULL) continue; - read_lock(&in_dev->lock); for_primary_ifa(in_dev) { if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { - read_unlock(&in_dev->lock); addr = ifa->ifa_local; goto out_unlock_both; } } endfor_ifa(in_dev); - read_unlock(&in_dev->lock); } out_unlock_both: - read_unlock(&inetdev_lock); read_unlock(&dev_base_lock); + rcu_read_unlock(); out: return addr; -out_unlock_inetdev: - read_unlock(&inetdev_lock); - goto out; } static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst, @@ -874,29 +865,24 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop struct in_device *in_dev; if (dev) { - read_lock(&inetdev_lock); - if ((in_dev = __in_dev_get(dev))) { - read_lock(&in_dev->lock); + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev))) addr = confirm_addr_indev(in_dev, dst, local, scope); - read_unlock(&in_dev->lock); - } - read_unlock(&inetdev_lock); + rcu_read_unlock(); return addr; } read_lock(&dev_base_lock); - read_lock(&inetdev_lock); + rcu_read_lock(); for (dev = dev_base; dev; dev = dev->next) { if ((in_dev = __in_dev_get(dev))) { - read_lock(&in_dev->lock); addr = confirm_addr_indev(in_dev, dst, local, scope); - read_unlock(&in_dev->lock); if (addr) break; } } - read_unlock(&inetdev_lock); + rcu_read_unlock(); read_unlock(&dev_base_lock); return addr; @@ -1065,12 +1051,12 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) continue; if (idx > s_idx) s_ip_idx = 0; - read_lock(&inetdev_lock); + rcu_read_lock(); if ((in_dev = __in_dev_get(dev)) == NULL) { - read_unlock(&inetdev_lock); + rcu_read_unlock(); continue; } - read_lock(&in_dev->lock); + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = ifa->ifa_next, ip_idx++) { if (ip_idx < s_ip_idx) @@ -1078,13 +1064,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, 
struct netlink_callback *cb) if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) { - read_unlock(&in_dev->lock); - read_unlock(&inetdev_lock); + rcu_read_unlock(); goto done; } } - read_unlock(&in_dev->lock); - read_unlock(&inetdev_lock); + rcu_read_unlock(); } done: @@ -1138,11 +1122,11 @@ void inet_forward_change(void) read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev; - read_lock(&inetdev_lock); + rcu_read_lock(); in_dev = __in_dev_get(dev); if (in_dev) in_dev->cnf.forwarding = on; - read_unlock(&inetdev_lock); + rcu_read_unlock(); } read_unlock(&dev_base_lock); @@ -1508,6 +1492,5 @@ EXPORT_SYMBOL(devinet_ioctl); EXPORT_SYMBOL(in_dev_finish_destroy); EXPORT_SYMBOL(inet_select_addr); EXPORT_SYMBOL(inetdev_by_index); -EXPORT_SYMBOL(inetdev_lock); EXPORT_SYMBOL(register_inetaddr_notifier); EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 07a594b831d2..27d8f808bad4 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -17,10 +17,10 @@ struct esp_decap_data { __u8 proto; }; -int esp_output(struct sk_buff **pskb) +static int esp_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct iphdr *top_iph; struct ip_esp_hdr *esph; @@ -33,13 +33,13 @@ int esp_output(struct sk_buff **pskb) int nfrags; /* Strip IP+ESP header. 
*/ - __skb_pull(*pskb, (*pskb)->h.raw - (*pskb)->data); + __skb_pull(skb, skb->h.raw - skb->data); /* Now skb is pure payload to encrypt */ err = -ENOMEM; /* Round to block size */ - clen = (*pskb)->len; + clen = skb->len; esp = x->data; alen = esp->auth.icv_trunc_len; @@ -49,22 +49,22 @@ int esp_output(struct sk_buff **pskb) if (esp->conf.padlen) clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); - if ((nfrags = skb_cow_data(*pskb, clen-(*pskb)->len+alen, &trailer)) < 0) + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) goto error; /* Fill padding... */ do { int i; - for (i=0; i<clen-(*pskb)->len - 2; i++) + for (i=0; i<clen-skb->len - 2; i++) *(u8*)(trailer->tail + i) = i+1; } while (0); - *(u8*)(trailer->tail + clen-(*pskb)->len - 2) = (clen - (*pskb)->len)-2; - pskb_put(*pskb, trailer, clen - (*pskb)->len); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); - __skb_push(*pskb, (*pskb)->data - (*pskb)->nh.raw); - top_iph = (*pskb)->nh.iph; - esph = (struct ip_esp_hdr *)((*pskb)->nh.raw + top_iph->ihl*4); - top_iph->tot_len = htons((*pskb)->len + alen); + __skb_push(skb, skb->data - skb->nh.raw); + top_iph = skb->nh.iph; + esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4); + top_iph->tot_len = htons(skb->len + alen); *(u8*)(trailer->tail - 1) = top_iph->protocol; /* this is non-NULL only with UDP Encapsulation */ @@ -76,7 +76,7 @@ int esp_output(struct sk_buff **pskb) uh = (struct udphdr *)esph; uh->source = encap->encap_sport; uh->dest = encap->encap_dport; - uh->len = htons((*pskb)->len + alen - top_iph->ihl*4); + uh->len = htons(skb->len + alen - top_iph->ihl*4); uh->check = 0; switch (encap->encap_type) { @@ -109,7 +109,7 @@ int esp_output(struct sk_buff **pskb) if (!sg) goto error; } - skb_to_sgvec(*pskb, sg, esph->enc_data+esp->conf.ivlen-(*pskb)->data, clen); + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); crypto_cipher_encrypt(tfm, sg, 
sg, clen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); @@ -121,9 +121,9 @@ int esp_output(struct sk_buff **pskb) } if (esp->auth.icv_full_len) { - esp->auth.icv(esp, *pskb, (u8*)esph-(*pskb)->data, + esp->auth.icv(esp, skb, (u8*)esph-skb->data, sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); - pskb_put(*pskb, trailer, alen); + pskb_put(skb, trailer, alen); } ip_send_check(top_iph); @@ -139,7 +139,7 @@ error: * expensive, so we only support truncated data, which is the recommended * and common case. */ -int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) { struct iphdr *iph; struct ip_esp_hdr *esph; @@ -246,7 +246,7 @@ out: return -EINVAL; } -int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) { if (x->encap) { @@ -320,7 +320,7 @@ static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) return mtu + x->props.header_len + esp->auth.icv_trunc_len; } -void esp4_err(struct sk_buff *skb, u32 info) +static void esp4_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr*)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); @@ -338,7 +338,7 @@ void esp4_err(struct sk_buff *skb, u32 info) xfrm_state_put(x); } -void esp_destroy(struct xfrm_state *x) +static void esp_destroy(struct xfrm_state *x) { struct esp_data *esp = x->data; @@ -364,7 +364,7 @@ void esp_destroy(struct xfrm_state *x) kfree(esp); } -int esp_init_state(struct xfrm_state *x, void *args) +static int esp_init_state(struct xfrm_state *x, void *args) { struct esp_data *esp = NULL; @@ -436,6 +436,7 @@ int esp_init_state(struct xfrm_state *x, void *args) switch (encap->encap_type) { default: + goto error; case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct 
udphdr); break; @@ -449,15 +450,9 @@ int esp_init_state(struct xfrm_state *x, void *args) return 0; error: - if (esp) { - if (esp->auth.tfm) - crypto_free_tfm(esp->auth.tfm); - if (esp->auth.work_icv) - kfree(esp->auth.work_icv); - if (esp->conf.tfm) - crypto_free_tfm(esp->conf.tfm); - kfree(esp); - } + x->data = esp; + esp_destroy(x); + x->data = NULL; return -EINVAL; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f5b008a9d7d0..f13e797c32e8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -172,13 +172,13 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, int ret; no_addr = rpf = 0; - read_lock(&inetdev_lock); + rcu_read_lock(); in_dev = __in_dev_get(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); } - read_unlock(&inetdev_lock); + rcu_read_unlock(); if (in_dev == NULL) goto e_inval; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 66e78bb4e2d4..ad2481f8fa68 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -176,7 +176,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) table_id = rtm->rtm_table; if (table_id == RT_TABLE_UNSPEC) { struct fib_table *table; - if (rtm->rtm_type == RTN_UNICAST || rtm->rtm_type == RTN_NAT) { + if (rtm->rtm_type == RTN_UNICAST) { if ((table = fib_empty_table()) == NULL) return -ENOBUFS; table_id = table->tb_id; @@ -251,26 +251,6 @@ u32 fib_rules_map_destination(u32 daddr, struct fib_result *res) return (daddr&~mask)|res->fi->fib_nh->nh_gw; } -u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags) -{ - struct fib_rule *r = res->r; - - if (r->r_action == RTN_NAT) { - int addrtype = inet_addr_type(r->r_srcmap); - - if (addrtype == RTN_NAT) { - /* Packet is from translated source; remember it */ - saddr = (saddr&~r->r_srcmask)|r->r_srcmap; - *flags |= RTCF_SNAT; - } else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) { - /* Packet is from masqueraded source; remember it */ - 
saddr = r->r_srcmap; - *flags |= RTCF_MASQ; - } - } - return saddr; -} - #ifdef CONFIG_NET_CLS_ROUTE u32 fib_rules_tclass(struct fib_result *res) { @@ -334,7 +314,6 @@ FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ", FRprintk("tb %d r %d ", r->r_table, r->r_action); switch (r->r_action) { case RTN_UNICAST: - case RTN_NAT: policy = r; break; case RTN_UNREACHABLE: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c1b6060df3f1..51191971eb12 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -124,17 +124,10 @@ static struct .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, }, /* RTN_THROW */ -#ifdef CONFIG_IP_ROUTE_NAT - { - .error = 0, - .scope = RT_SCOPE_HOST, - }, /* RTN_NAT */ -#else { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, /* RTN_NAT */ -#endif { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, @@ -543,15 +536,6 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, #endif } -#ifdef CONFIG_IP_ROUTE_NAT - if (r->rtm_type == RTN_NAT) { - if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) - goto err_inval; - memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 4); - goto link_it; - } -#endif - if (fib_props[r->rtm_type].error) { if (rta->rta_gw || rta->rta_oif || rta->rta_mp) goto err_inval; @@ -629,12 +613,6 @@ fib_semantic_match(int type, struct fib_info *fi, const struct flowi *flp, struc res->fi = fi; switch (type) { -#ifdef CONFIG_IP_ROUTE_NAT - case RTN_NAT: - FIB_RES_RESET(*res); - atomic_inc(&fi->fib_clntref); - return 0; -#endif case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index be40431b73cf..062589289b4f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -478,20 +478,25 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info) * ICMP error */ if (iph->protocol == IPPROTO_ICMP) { - u8 inner_type; - - if (skb_copy_bits(skb_in, - skb_in->nh.raw + (iph->ihl << 2) + - offsetof(struct icmphdr, type) - - skb_in->data, &inner_type, 1)) + u8 _inner_type, 
*itp; + + itp = skb_header_pointer(skb_in, + skb_in->nh.raw + + (iph->ihl << 2) + + offsetof(struct icmphdr, + type) - + skb_in->data, + sizeof(_inner_type), + &_inner_type); + if (itp == NULL) goto out; /* * Assume any unknown ICMP type is an error. This * isn't specified by the RFC, but think about it.. */ - if (inner_type > NR_ICMP_TYPES || - icmp_pointers[inner_type].error) + if (*itp > NR_ICMP_TYPES || + icmp_pointers[*itp].error) goto out; } } @@ -503,16 +508,6 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info) * Construct source address and options. */ -#ifdef CONFIG_IP_ROUTE_NAT - /* - * Restore original addresses if packet has been translated. - */ - if (rt->rt_flags & RTCF_NAT && IPCB(skb_in)->flags & IPSKB_TRANSLATED) { - iph->daddr = rt->fl.fl4_dst; - iph->saddr = rt->fl.fl4_src; - } -#endif - saddr = iph->daddr; if (!(rt->rt_flags & RTCF_LOCAL)) saddr = 0; @@ -879,7 +874,6 @@ static void icmp_address_reply(struct sk_buff *skb) struct net_device *dev = skb->dev; struct in_device *in_dev; struct in_ifaddr *ifa; - u32 mask; if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) goto out; @@ -887,24 +881,27 @@ static void icmp_address_reply(struct sk_buff *skb) in_dev = in_dev_get(dev); if (!in_dev) goto out; - read_lock(&in_dev->lock); + rcu_read_lock(); if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) { - if (skb_copy_bits(skb, 0, &mask, 4)) + u32 _mask, *mp; + + mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); + if (mp == NULL) BUG(); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - if (mask == ifa->ifa_mask && + if (*mp == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) break; } if (!ifa && net_ratelimit()) { printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from " "%s/%u.%u.%u.%u\n", - NIPQUAD(mask), dev->name, NIPQUAD(rt->rt_src)); + NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); } } - read_unlock(&in_dev->lock); + rcu_read_unlock(); in_dev_put(in_dev); out:; } diff --git 
a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 01db76123d88..d1815d3efd6c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -487,7 +487,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) int type; if (!pmc) { - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; @@ -499,7 +499,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -541,8 +541,8 @@ static void igmpv3_send_cr(struct in_device *in_dev) struct sk_buff *skb = NULL; int type, dtype; - read_lock(&in_dev->lock); - write_lock_bh(&in_dev->mc_lock); + read_lock(&in_dev->mc_list_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; @@ -575,7 +575,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) } else pmc_prev = pmc; } - write_unlock_bh(&in_dev->mc_lock); + spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { @@ -601,7 +601,8 @@ static void igmpv3_send_cr(struct in_device *in_dev) } spin_unlock_bh(&pmc->lock); } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); + if (!skb) return; (void) igmpv3_sendpack(skb); @@ -759,14 +760,14 @@ static void igmp_heard_report(struct in_device *in_dev, u32 group) if (group == IGMP_ALL_HOSTS) return; - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); } static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, @@ -840,7 +841,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct 
sk_buff *skb, * - Use the igmp->igmp_code field as the maximum * delay possible */ - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (group && group != im->multiaddr) continue; @@ -856,7 +857,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, spin_unlock_bh(&im->lock); igmp_mod_timer(im, max_delay); } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); } int igmp_rcv(struct sk_buff *skb) @@ -982,10 +983,10 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) } spin_unlock_bh(&im->lock); - write_lock_bh(&in_dev->mc_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; - write_unlock_bh(&in_dev->mc_lock); + spin_unlock_bh(&in_dev->mc_tomb_lock); } static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr) @@ -993,7 +994,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr) struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf, *psf_next; - write_lock_bh(&in_dev->mc_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { if (pmc->multiaddr == multiaddr) @@ -1006,7 +1007,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr) else in_dev->mc_tomb = pmc->next; } - write_unlock_bh(&in_dev->mc_lock); + spin_unlock_bh(&in_dev->mc_tomb_lock); if (pmc) { for (psf=pmc->tomb; psf; psf=psf_next) { psf_next = psf->sf_next; @@ -1021,10 +1022,10 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; - write_lock_bh(&in_dev->mc_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; - write_unlock_bh(&in_dev->mc_lock); + spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; @@ -1033,7 +1034,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) kfree(pmc); } /* 
clear dead sources, too */ - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { struct ip_sf_list *psf, *psf_next; @@ -1046,7 +1047,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) kfree(psf); } } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); } #endif @@ -1167,10 +1168,10 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im->gsquery = 0; #endif im->loaded = 0; - write_lock_bh(&in_dev->lock); + write_lock_bh(&in_dev->mc_list_lock); im->next=in_dev->mc_list; in_dev->mc_list=im; - write_unlock_bh(&in_dev->lock); + write_unlock_bh(&in_dev->mc_list_lock); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im->multiaddr); #endif @@ -1194,9 +1195,9 @@ void ip_mc_dec_group(struct in_device *in_dev, u32 addr) for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { - write_lock_bh(&in_dev->lock); + write_lock_bh(&in_dev->mc_list_lock); *ip = i->next; - write_unlock_bh(&in_dev->lock); + write_unlock_bh(&in_dev->mc_list_lock); igmp_group_dropped(i); if (!in_dev->dead) @@ -1251,7 +1252,8 @@ void ip_mc_init_dev(struct in_device *in_dev) in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; #endif - in_dev->mc_lock = RW_LOCK_UNLOCKED; + in_dev->mc_list_lock = RW_LOCK_UNLOCKED; + in_dev->mc_tomb_lock = SPIN_LOCK_UNLOCKED; } /* Device going up */ @@ -1281,17 +1283,17 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* Deactivate timers */ ip_mc_down(in_dev); - write_lock_bh(&in_dev->lock); + write_lock_bh(&in_dev->mc_list_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; - write_unlock_bh(&in_dev->lock); + write_unlock_bh(&in_dev->mc_list_lock); igmp_group_dropped(i); ip_ma_put(i); - write_lock_bh(&in_dev->lock); + write_lock_bh(&in_dev->mc_list_lock); } - write_unlock_bh(&in_dev->lock); + write_unlock_bh(&in_dev->mc_list_lock); } static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) @@ -1391,18 
+1393,18 @@ int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode, if (!in_dev) return -ENODEV; - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); return -ESRCH; } spin_lock_bh(&pmc->lock); - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif @@ -1527,18 +1529,18 @@ int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode, if (!in_dev) return -ENODEV; - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); return -ESRCH; } spin_lock_bh(&pmc->lock); - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); @@ -2095,7 +2097,7 @@ int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto) struct ip_sf_list *psf; int rv = 0; - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); for (im=in_dev->mc_list; im; im=im->next) { if (im->multiaddr == mc_addr) break; @@ -2117,7 +2119,7 @@ int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto) } else rv = 1; /* unspecified source; tentatively allow */ } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); return rv; } @@ -2141,13 +2143,13 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) in_dev = in_dev_get(state->dev); if (!in_dev) continue; - read_lock(&in_dev->lock); + read_lock(&in_dev->mc_list_lock); im = in_dev->mc_list; if (im) { state->in_dev = in_dev; break; } - read_unlock(&in_dev->lock); + read_unlock(&in_dev->mc_list_lock); in_dev_put(in_dev); } return im; @@ -2159,7 +2161,7 @@ 
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li im = im->next; while (!im) { if (likely(state->in_dev != NULL)) { - read_unlock(&state->in_dev->lock); + read_unlock(&state->in_dev->mc_list_lock); in_dev_put(state->in_dev); } state->dev = state->dev->next; @@ -2170,7 +2172,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li state->in_dev = in_dev_get(state->dev); if (!state->in_dev) continue; - read_lock(&state->in_dev->lock); + read_lock(&state->in_dev->mc_list_lock); im = state->in_dev->mc_list; } return im; @@ -2206,7 +2208,7 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { - read_unlock(&state->in_dev->lock); + read_unlock(&state->in_dev->mc_list_lock); in_dev_put(state->in_dev); state->in_dev = NULL; } @@ -2304,7 +2306,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) continue; - read_lock_bh(&idev->lock); + read_lock(&idev->mc_list_lock); im = idev->mc_list; if (likely(im != NULL)) { spin_lock_bh(&im->lock); @@ -2316,7 +2318,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) } spin_unlock_bh(&im->lock); } - read_unlock_bh(&idev->lock); + read_unlock(&idev->mc_list_lock); in_dev_put(idev); } return psf; @@ -2332,7 +2334,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l state->im = state->im->next; while (!state->im) { if (likely(state->idev != NULL)) { - read_unlock_bh(&state->idev->lock); + read_unlock(&state->idev->mc_list_lock); in_dev_put(state->idev); } state->dev = state->dev->next; @@ -2343,7 +2345,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l state->idev = in_dev_get(state->dev); if (!state->idev) continue; - read_lock_bh(&state->idev->lock); + read_lock(&state->idev->mc_list_lock); 
state->im = state->idev->mc_list; } if (!state->im) @@ -2389,7 +2391,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) state->im = NULL; } if (likely(state->idev != NULL)) { - read_unlock_bh(&state->idev->lock); + read_unlock(&state->idev->mc_list_lock); in_dev_put(state->idev); state->idev = NULL; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 6fd69feffce4..b9f1586ae455 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -169,14 +169,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct sk_buff *skb) +static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) { + if (work) + *work -= skb->truesize; atomic_sub(skb->truesize, &ip_frag_mem); kfree_skb(skb); } -static __inline__ void frag_free_queue(struct ipq *qp) +static __inline__ void frag_free_queue(struct ipq *qp, int *work) { + if (work) + *work -= sizeof(struct ipq); atomic_sub(sizeof(struct ipq), &ip_frag_mem); kfree(qp); } @@ -195,7 +199,7 @@ static __inline__ struct ipq *frag_alloc_queue(void) /* Destruction primitives. */ /* Complete destruction of ipq. */ -static void ip_frag_destroy(struct ipq *qp) +static void ip_frag_destroy(struct ipq *qp, int *work) { struct sk_buff *fp; @@ -207,18 +211,18 @@ static void ip_frag_destroy(struct ipq *qp) while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp); + frag_kfree_skb(fp, work); fp = xp; } /* Finally, release the queue descriptor itself. */ - frag_free_queue(qp); + frag_free_queue(qp, work); } -static __inline__ void ipq_put(struct ipq *ipq) +static __inline__ void ipq_put(struct ipq *ipq, int *work) { if (atomic_dec_and_test(&ipq->refcnt)) - ip_frag_destroy(ipq); + ip_frag_destroy(ipq, work); } /* Kill ipq entry. 
It is not destroyed immediately, @@ -243,10 +247,13 @@ static void ip_evictor(void) { struct ipq *qp; struct list_head *tmp; + int work; - for(;;) { - if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) - return; + work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh; + if (work <= 0) + return; + + while (work > 0) { read_lock(&ipfrag_lock); if (list_empty(&ipq_lru_list)) { read_unlock(&ipfrag_lock); @@ -262,7 +269,7 @@ static void ip_evictor(void) ipq_kill(qp); spin_unlock(&qp->lock); - ipq_put(qp); + ipq_put(qp, &work); IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); } } @@ -294,7 +301,7 @@ static void ip_expire(unsigned long arg) } out: spin_unlock(&qp->lock); - ipq_put(qp); + ipq_put(qp, NULL); } /* Creation primitives. */ @@ -317,7 +324,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); write_unlock(&ipfrag_lock); qp_in->last_in |= COMPLETE; - ipq_put(qp_in); + ipq_put(qp_in, NULL); return qp; } } @@ -506,7 +513,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->fragments = next; qp->meat -= free_it->len; - frag_kfree_skb(free_it); + frag_kfree_skb(free_it, NULL); } } @@ -657,7 +664,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ret = ip_frag_reasm(qp, dev); spin_unlock(&qp->lock); - ipq_put(qp); + ipq_put(qp, NULL); return ret; } diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c deleted file mode 100644 index b58b5e22d019..000000000000 --- a/net/ipv4/ip_nat_dumb.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Dumb Network Address Translation. 
- * - * Version: $Id: ip_nat_dumb.c,v 1.11 2000/12/13 18:31:48 davem Exp $ - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes: - * Rani Assaf : A zero checksum is a special case - * only in UDP - * Rani Assaf : Added ICMP messages rewriting - * Rani Assaf : Repaired wrong changes, made by ANK. - * - * - * NOTE: It is just working model of real NAT. - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/icmp.h> -#include <linux/netdevice.h> -#include <net/sock.h> -#include <net/ip.h> -#include <net/icmp.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <linux/route.h> -#include <net/route.h> -#include <net/ip_fib.h> - - -int -ip_do_nat(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable*)skb->dst; - struct iphdr *iph = skb->nh.iph; - u32 odaddr = iph->daddr; - u32 osaddr = iph->saddr; - u16 check; - - IPCB(skb)->flags |= IPSKB_TRANSLATED; - - /* Rewrite IP header */ - iph->daddr = rt->rt_dst_map; - iph->saddr = rt->rt_src_map; - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - /* If it is the first fragment, rewrite protocol headers */ - - if (!(iph->frag_off & htons(IP_OFFSET))) { - u16 *cksum; - - switch(iph->protocol) { - case IPPROTO_TCP: - cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check; - if ((u8*)(cksum+1) > skb->tail) - goto truncated; - check = *cksum; - if (skb->ip_summed != CHECKSUM_HW) - check = ~check; - check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, check); - check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); - if (skb->ip_summed == 
CHECKSUM_HW) - check = ~check; - *cksum = check; - break; - case IPPROTO_UDP: - cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check; - if ((u8*)(cksum+1) > skb->tail) - goto truncated; - if ((check = *cksum) != 0) { - check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); - check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); - *cksum = check ? : 0xFFFF; - } - break; - case IPPROTO_ICMP: - { - struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2)); - struct iphdr *ciph; - u32 idaddr, isaddr; - int updated; - - if ((icmph->type != ICMP_DEST_UNREACH) && - (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) - break; - - ciph = (struct iphdr *) (icmph + 1); - - if ((u8*)(ciph+1) > skb->tail) - goto truncated; - - isaddr = ciph->saddr; - idaddr = ciph->daddr; - updated = 0; - - if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) { - ciph->saddr = iph->daddr; - updated = 1; - } - if (rt->rt_flags&RTCF_SNAT) { - if (ciph->daddr != osaddr) { - struct fib_result res; - unsigned flags = 0; - struct flowi fl = { - .iif = skb->dev->ifindex, - .nl_u = - { .ip4_u = - { .daddr = ciph->saddr, - .saddr = ciph->daddr, -#ifdef CONFIG_IP_ROUTE_TOS - .tos = RT_TOS(ciph->tos) -#endif - } }, - .proto = ciph->protocol }; - - /* Use fib_lookup() until we get our own - * hash table of NATed hosts -- Rani - */ - if (fib_lookup(&fl, &res) == 0) { - if (res.r) { - ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags); - if (ciph->daddr != idaddr) - updated = 1; - } - fib_res_put(&res); - } - } else { - ciph->daddr = iph->saddr; - updated = 1; - } - } - if (updated) { - cksum = &icmph->checksum; - /* Using tcpudp primitive. Why not? 
*/ - check = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum)); - *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check); - } - break; - } - default: - break; - } - } - return NET_RX_SUCCESS; - -truncated: - /* should be return NET_RX_BAD; */ - return -EINVAL; -} diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ce7469a3c04..095028111e64 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -120,20 +120,20 @@ out: return err; } -static int ipcomp_output(struct sk_buff **pskb) +static int ipcomp_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct iphdr *iph; struct ip_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; int hdr_len = 0; - iph = (*pskb)->nh.iph; - iph->tot_len = htons((*pskb)->len); + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); hdr_len = iph->ihl * 4; - if (((*pskb)->len - hdr_len) < ipcd->threshold) { + if ((skb->len - hdr_len) < ipcd->threshold) { /* Don't bother compressing */ if (x->props.mode) { ip_send_check(iph); @@ -141,17 +141,17 @@ static int ipcomp_output(struct sk_buff **pskb) goto out_ok; } - if ((skb_is_nonlinear(*pskb) || skb_cloned(*pskb)) && - skb_linearize(*pskb, GFP_ATOMIC) != 0) { + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { err = -ENOMEM; goto error; } - err = ipcomp_compress(x, *pskb); + err = ipcomp_compress(x, skb); if (err) { if (err == -EMSGSIZE) { if (x->props.mode) { - iph = (*pskb)->nh.iph; + iph = skb->nh.iph; ip_send_check(iph); } goto out_ok; @@ -160,8 +160,8 @@ static int ipcomp_output(struct sk_buff **pskb) } /* Install ipcomp header, convert into ipcomp datagram. 
*/ - iph = (*pskb)->nh.iph; - iph->tot_len = htons((*pskb)->len); + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); ipch->nexthdr = iph->protocol; ipch->flags = 0; @@ -288,6 +288,9 @@ static int ipcomp_init_state(struct xfrm_state *x, void *args) if (!x->calg) goto out; + if (x->encap) + goto out; + err = -ENOMEM; ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 9f4b94f0768d..323a1e7746b8 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -383,21 +383,23 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_conn *cp = NULL; struct iphdr *iph = skb->nh.iph; struct ip_vs_dest *dest; - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) return NULL; /* * Persistent service */ if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, ports); + return ip_vs_sched_persist(svc, skb, pptr); /* * Non-persistent service */ - if (!svc->fwmark && ports[1] != svc->port) { + if (!svc->fwmark && pptr[1] != svc->port) { if (!svc->port) IP_VS_ERR("Schedule: port zero only supported " "in persistent services, " @@ -415,9 +417,9 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) * Create a connection entry. 
*/ cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], - dest->addr, dest->port?dest->port:ports[1], + iph->saddr, pptr[0], + iph->daddr, pptr[1], + dest->addr, dest->port?dest->port:pptr[1], 0, dest); if (cp == NULL) @@ -444,10 +446,12 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp) { - __u16 ports[2]; + __u16 _ports[2], *pptr; struct iphdr *iph = skb->nh.iph; - if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0) { + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) { ip_vs_service_put(svc); return NF_DROP; } @@ -465,8 +469,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], + iph->saddr, pptr[0], + iph->daddr, pptr[1], 0, 0, IP_VS_CONN_F_BYPASS, NULL); @@ -494,7 +498,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * listed in the ipvs table), pass the packets, because it is * not ipvs job to decide to drop the packets. 
*/ - if ((svc->port == FTPPORT) && (ports[1] != FTPPORT)) { + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { ip_vs_service_put(svc); return NF_ACCEPT; } @@ -607,8 +611,8 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) { struct sk_buff *skb = *pskb; struct iphdr *iph; - struct icmphdr icmph; - struct iphdr ciph; /* The ip header contained within the ICMP */ + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -625,11 +629,12 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) iph = skb->nh.iph; offset = ihl = iph->ihl * 4; - if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0) + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - icmph.type, ntohs(icmp_id(&icmph)), + ic->type, ntohs(icmp_id(ic)), NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); /* @@ -639,33 +644,34 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((icmph.type != ICMP_DEST_UNREACH) && - (icmph.type != ICMP_SOURCE_QUENCH) && - (icmph.type != ICMP_TIME_EXCEEDED)) { + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ - offset += sizeof(icmph); - if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0) + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(ciph.protocol); + pp = ip_vs_proto_get(cih->protocol); if (!pp) return NF_ACCEPT; /* Is the embedded protocol header present? 
*/ - if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - offset += ciph.ihl * 4; + offset += cih->ihl * 4; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(skb, pp, cih, offset, 1); if (!cp) return NF_ACCEPT; @@ -685,7 +691,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) goto out; } - if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol) + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) offset += 2 * sizeof(__u16); if (!ip_vs_make_skb_writable(pskb, offset)) goto out; @@ -707,11 +713,13 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) static inline int is_tcp_reset(const struct sk_buff *skb) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) < 0) + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) return 0; - return tcph.rst; + return th->rst; } /* @@ -777,12 +785,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, if (sysctl_ip_vs_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP)) { - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, ihl, ports, sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, ihl, + sizeof(_ports), _ports); + if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_lookup_real_service(iph->protocol, - iph->saddr, ports[0])) { + iph->saddr, pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST @@ -866,8 +876,8 @@ static int ip_vs_in_icmp(struct sk_buff **pskb, int *related) { struct sk_buff *skb = *pskb; struct iphdr *iph; - struct icmphdr icmph; - struct iphdr ciph; /* The ip header contained within the ICMP 
*/ + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -884,11 +894,12 @@ static int ip_vs_in_icmp(struct sk_buff **pskb, int *related) iph = skb->nh.iph; offset = ihl = iph->ihl * 4; - if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0) + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) return NF_DROP; IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - icmph.type, ntohs(icmp_id(&icmph)), + ic->type, ntohs(icmp_id(ic)), NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); /* @@ -898,33 +909,34 @@ static int ip_vs_in_icmp(struct sk_buff **pskb, int *related) * this means that some packets will manage to get a long way * down this stack and then be rejected, but that's life. */ - if ((icmph.type != ICMP_DEST_UNREACH) && - (icmph.type != ICMP_SOURCE_QUENCH) && - (icmph.type != ICMP_TIME_EXCEEDED)) { + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { *related = 0; return NF_ACCEPT; } /* Now find the contained IP header */ - offset += sizeof(icmph); - if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0) + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(ciph.protocol); + pp = ip_vs_proto_get(cih->protocol); if (!pp) return NF_ACCEPT; /* Is the embedded protocol header present? 
*/ - if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) && + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && pp->dont_defrag)) return NF_ACCEPT; IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - offset += ciph.ihl * 4; + offset += cih->ihl * 4; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(skb, pp, &ciph, offset, 1); + cp = pp->conn_in_get(skb, pp, cih, offset, 1); if (!cp) return NF_ACCEPT; @@ -941,7 +953,7 @@ static int ip_vs_in_icmp(struct sk_buff **pskb, int *related) /* do the statistics and put it back */ ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol) + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); /* do not touch skb anymore */ diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index caf24e3754c6..dfd0a7dd3b75 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -166,27 +166,33 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const char *msg) { char buf[128]; - __u16 ports[2]; - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0) + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); - else if (iph.frag_off & __constant_htons(IP_OFFSET)) + else if (ih->frag_off & __constant_htons(IP_OFFSET)) sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr)); - else if (skb_copy_bits(skb, offset + iph.ihl*4, ports, sizeof(ports)) < 0) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(iph.saddr), - ntohs(ports[0]), - NIPQUAD(iph.daddr), - ntohs(ports[1])); + pp->name, NIPQUAD(ih->saddr), + 
NIPQUAD(ih->daddr)); + else { + __u16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index ffea536b09a8..453e94a0bbd7 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -129,14 +129,15 @@ ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[256]; - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0) + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); else sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr)); + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 1922388327b8..478e5c7c7e8e 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -128,14 +128,15 @@ esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[256]; - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0) + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); else sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr)); + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } 
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c index d611b5a36d48..747e0333f5de 100644 --- a/net/ipv4/ipvs/ip_vs_proto_icmp.c +++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c @@ -104,24 +104,29 @@ icmp_debug_packet(struct ip_vs_protocol *pp, const char *msg) { char buf[256]; - struct iphdr iph; - struct icmphdr icmph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0) + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); - else if (iph.frag_off & __constant_htons(IP_OFFSET)) + else if (ih->frag_off & __constant_htons(IP_OFFSET)) sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr)); - else if (skb_copy_bits(skb, offset + iph.ihl*4, &icmph, sizeof(icmph)) < 0) - sprintf(buf, "%s TRUNCATED to %u bytes\n", - pp->name, skb->len - offset); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", - pp->name, NIPQUAD(iph.saddr), - NIPQUAD(iph.daddr), - icmph.type, icmph.code); - + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + struct icmphdr _icmph, *ic; + + ic = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) + sprintf(buf, "%s TRUNCATED to %u bytes\n", + pp->name, skb->len - offset); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), + ic->type, ic->code); + } printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 2f00e914288a..bd8f898bfe19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -29,19 +29,20 @@ static struct ip_vs_conn * tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse) { - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, proto_off, ports, 
sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) return NULL; if (likely(!inverse)) { return ip_vs_conn_in_get(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1]); + iph->saddr, pptr[0], + iph->daddr, pptr[1]); } else { return ip_vs_conn_in_get(iph->protocol, - iph->daddr, ports[1], - iph->saddr, ports[0]); + iph->daddr, pptr[1], + iph->saddr, pptr[0]); } } @@ -49,19 +50,20 @@ static struct ip_vs_conn * tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse) { - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) return NULL; if (likely(!inverse)) { return ip_vs_conn_out_get(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1]); + iph->saddr, pptr[0], + iph->daddr, pptr[1]); } else { return ip_vs_conn_out_get(iph->protocol, - iph->daddr, ports[1], - iph->saddr, ports[0]); + iph->daddr, pptr[1], + iph->saddr, pptr[0]); } } @@ -72,16 +74,18 @@ tcp_conn_schedule(struct sk_buff *skb, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) { + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { *verdict = NF_DROP; return 0; } - if (tcph.syn && + if (th->syn && (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, - skb->nh.iph->daddr, tcph.dest))) { + skb->nh.iph->daddr, th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. 
@@ -483,13 +487,15 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_protocol *pp) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return 0; spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, &tcph); + set_tcp_state(pp, cp, direction, th); spin_unlock(&cp->lock); return 1; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 81501c938605..443ec4578d40 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -26,19 +26,20 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) return NULL; if (likely(!inverse)) { cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1]); + iph->saddr, pptr[0], + iph->daddr, pptr[1]); } else { cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr, ports[1], - iph->saddr, ports[0]); + iph->daddr, pptr[1], + iph->saddr, pptr[0]); } return cp; @@ -50,19 +51,21 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; - __u16 ports[2]; + __u16 _ports[2], *pptr; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, ports, sizeof(ports)) < 0) + pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) return NULL; if (likely(!inverse)) { cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1]); + iph->saddr, pptr[0], + iph->daddr, pptr[1]); } else { cp = 
ip_vs_conn_out_get(iph->protocol, - iph->daddr, ports[1], - iph->saddr, ports[0]); + iph->daddr, pptr[1], + iph->saddr, pptr[0]); } return cp; @@ -74,15 +77,17 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; - struct udphdr udph; + struct udphdr _udph, *uh; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &udph, sizeof(udph)) < 0) { + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { *verdict = NF_DROP; return 0; } if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, - skb->nh.iph->daddr, udph.dest))) { + skb->nh.iph->daddr, uh->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -230,13 +235,14 @@ udp_dnat_handler(struct sk_buff **pskb, static int udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) { - struct udphdr udph; + struct udphdr _udph, *uh; unsigned int udphoff = skb->nh.iph->ihl*4; - if (skb_copy_bits(skb, udphoff, &udph, sizeof(udph)) < 0) + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) return 0; - if (udph.check != 0) { + if (uh->check != 0) { switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, udphoff, diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 204767be411e..3a85f7a8d02a 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -234,11 +234,12 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __u16 pt; - if (skb_copy_bits(skb, iph->ihl*4, &pt, sizeof(pt)) < 0) + __u16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) goto tx_error; - ip_vs_conn_fill_cport(cp, pt); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(pt)); + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } if (!(rt = 
__ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index b58141ead442..07c0fb9044b8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -628,5 +628,17 @@ config IP_NF_MATCH_REALM If you want to compile it as a module, say M here and read Documentation/modules.txt. If unsure, say `N'. +config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + +config IP_NF_MATCH_SCTP + tristate 'SCTP protocol match support' + depends on IP_NF_IPTABLES + +config IP_NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index bdb23fde133f..164f4332a72d 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,6 +19,9 @@ ipchains-objs := $(ip_nf_compat-objs) ipchains_core.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# SCTP protocol connection tracking +obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o + # connection tracking helpers obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o @@ -43,6 +46,7 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o +obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 4e8f4d83baf2..a54ef782f8b5 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/netfilter.h> #include <linux/ip.h> +#include <linux/moduleparam.h> #include 
<net/checksum.h> #include <net/udp.h> @@ -34,7 +35,7 @@ static unsigned int master_timeout = 300; MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); -MODULE_PARM(master_timeout, "i"); +module_param(master_timeout, int, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); static char *conns[] = { "DATA ", "MESG ", "INDEX " }; @@ -48,7 +49,7 @@ static int help(struct sk_buff *skb, { struct ip_conntrack_expect *exp; struct ip_ct_amanda_expect *exp_amanda_info; - char *data, *data_limit, *tmp; + char *amp, *data, *data_limit, *tmp; unsigned int dataoff, i; u_int16_t port, len; @@ -58,7 +59,7 @@ static int help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - ip_ct_refresh(ct, master_timeout * HZ); + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); /* No data? */ dataoff = skb->nh.iph->ihl*4 + sizeof(struct udphdr); @@ -69,9 +70,11 @@ static int help(struct sk_buff *skb, } LOCK_BH(&amanda_buffer_lock); - skb_copy_bits(skb, dataoff, amanda_buffer, skb->len - dataoff); - data = amanda_buffer; - data_limit = amanda_buffer + skb->len - dataoff; + amp = skb_header_pointer(skb, dataoff, + skb->len - dataoff, amanda_buffer); + BUG_ON(amp == NULL); + data = amp; + data_limit = amp + skb->len - dataoff; *data_limit = '\0'; /* Search for the CONNECT string */ @@ -107,7 +110,7 @@ static int help(struct sk_buff *skb, exp->mask.dst.u.tcp.port = 0xFFFF; exp_amanda_info = &exp->help.exp_amanda_info; - exp_amanda_info->offset = tmp - amanda_buffer; + exp_amanda_info->offset = tmp - amp; exp_amanda_info->port = port; exp_amanda_info->len = len; diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 00a89f4f8d8b..f6def5a4b491 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ 
-34,8 +34,9 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/jhash.h> -/* For ERR_PTR(). Yeah, I know... --RR */ -#include <linux/fs.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> /* This rwlock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -59,17 +60,23 @@ DECLARE_RWLOCK(ip_conntrack_lock); DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock); +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); +EXPORT_SYMBOL(ip_conntrack_count); + void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; LIST_HEAD(ip_conntrack_expect_list); LIST_HEAD(protocol_list); static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; -static atomic_t ip_conntrack_count = ATOMIC_INIT(0); struct list_head *ip_conntrack_hash; static kmem_cache_t *ip_conntrack_cachep; +static kmem_cache_t *ip_conntrack_expect_cachep; struct ip_conntrack ip_conntrack_untracked; +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr, @@ -127,11 +134,11 @@ hash_conntrack(const struct ip_conntrack_tuple *tuple) } int -get_tuple(const struct iphdr *iph, - const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_protocol *protocol) +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) { /* Never happen */ if (iph->frag_off & htons(IP_OFFSET)) { @@ -147,10 +154,10 @@ get_tuple(const struct iphdr *iph, return protocol->pkt_to_tuple(skb, dataoff, tuple); } -static int -invert_tuple(struct ip_conntrack_tuple *inverse, - const struct ip_conntrack_tuple *orig, - const struct ip_conntrack_protocol *protocol) +int 
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) { inverse->src.ip = orig->dst.ip; inverse->dst.ip = orig->src.ip; @@ -177,7 +184,8 @@ destroy_expect(struct ip_conntrack_expect *exp) IP_NF_ASSERT(atomic_read(&exp->use) == 0); IP_NF_ASSERT(!timer_pending(&exp->timeout)); - kfree(exp); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + __get_cpu_var(ip_conntrack_stat).expect_delete++; } inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) @@ -336,7 +344,7 @@ destroy_conntrack(struct nf_conntrack *nfct) list_del(&ct->master->expected_list); master = ct->master->expectant; } - kfree(ct->master); + kmem_cache_free(ip_conntrack_expect_cachep, ct->master); } WRITE_UNLOCK(&ip_conntrack_lock); @@ -346,12 +354,15 @@ destroy_conntrack(struct nf_conntrack *nfct) DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); kmem_cache_free(ip_conntrack_cachep, ct); atomic_dec(&ip_conntrack_count); + __get_cpu_var(ip_conntrack_stat).delete++; } static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; + __get_cpu_var(ip_conntrack_stat).delete_list++; + WRITE_LOCK(&ip_conntrack_lock); clean_from_lists(ct); WRITE_UNLOCK(&ip_conntrack_lock); @@ -374,13 +385,19 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, { struct ip_conntrack_tuple_hash *h; unsigned int hash = hash_conntrack(tuple); + /* use per_cpu() to avoid multiple calls to smp_processor_id() */ + unsigned int cpu = smp_processor_id(); MUST_BE_READ_LOCKED(&ip_conntrack_lock); - h = LIST_FIND(&ip_conntrack_hash[hash], - conntrack_tuple_cmp, - struct ip_conntrack_tuple_hash *, - tuple, ignored_conntrack); - return h; + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + per_cpu(ip_conntrack_stat, cpu).found++; + return h; + } + per_cpu(ip_conntrack_stat, cpu).searched++; + } + + return 
NULL; } /* Find a connection corresponding to a tuple. */ @@ -474,10 +491,12 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); WRITE_UNLOCK(&ip_conntrack_lock); + __get_cpu_var(ip_conntrack_stat).insert++; return NF_ACCEPT; } WRITE_UNLOCK(&ip_conntrack_lock); + __get_cpu_var(ip_conntrack_stat).insert_failed++; return NF_DROP; } @@ -496,83 +515,6 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, return h != NULL; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -struct ip_conntrack * -icmp_error_track(struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, - unsigned int hooknum) -{ - struct ip_conntrack_tuple innertuple, origtuple; - struct { - struct icmphdr icmp; - struct iphdr ip; - } inside; - struct ip_conntrack_protocol *innerproto; - struct ip_conntrack_tuple_hash *h; - int dataoff; - - IP_NF_ASSERT(skb->nfct == NULL); - - /* Not enough header? */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0) - return NULL; - - if (inside.icmp.type != ICMP_DEST_UNREACH - && inside.icmp.type != ICMP_SOURCE_QUENCH - && inside.icmp.type != ICMP_TIME_EXCEEDED - && inside.icmp.type != ICMP_PARAMETERPROB - && inside.icmp.type != ICMP_REDIRECT) - return NULL; - - /* Ignore ICMP's containing fragments (shouldn't happen) */ - if (inside.ip.frag_off & htons(IP_OFFSET)) { - DEBUGP("icmp_error_track: fragment of proto %u\n", - inside.ip.protocol); - return NULL; - } - - innerproto = ip_ct_find_proto(inside.ip.protocol); - dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4; - /* Are they talking about one of our connections? */ - if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) { - DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol); - return NULL; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. 
*/ - if (!invert_tuple(&innertuple, &origtuple, innerproto)) { - DEBUGP("icmp_error_track: Can't invert tuple\n"); - return NULL; - } - - *ctinfo = IP_CT_RELATED; - - h = ip_conntrack_find_get(&innertuple, NULL); - if (!h) { - /* Locally generated ICMPs will match inverted if they - haven't been SNAT'ed yet */ - /* FIXME: NAT code has to handle half-done double NAT --RR */ - if (hooknum == NF_IP_LOCAL_OUT) - h = ip_conntrack_find_get(&origtuple, NULL); - - if (!h) { - DEBUGP("icmp_error_track: no match\n"); - return NULL; - } - /* Reverse direction from that found */ - if (DIRECTION(h) != IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } else { - if (DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - skb->nfct = &h->ctrack->infos[*ctinfo]; - return h->ctrack; -} - /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ static inline int unreplied(const struct ip_conntrack_tuple_hash *i) @@ -598,6 +540,7 @@ static int early_drop(struct list_head *chain) if (del_timer(&h->ctrack->timeout)) { death_by_timeout((unsigned long)h->ctrack); dropped = 1; + __get_cpu_var(ip_conntrack_stat).early_drop++; } ip_conntrack_put(h->ctrack); return dropped; @@ -654,7 +597,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!invert_tuple(&repl_tuple, tuple, protocol)) { + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { DEBUGP("Can't invert tuple.\n"); return NULL; } @@ -693,41 +636,53 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, struct ip_conntrack_expect *, tuple); READ_UNLOCK(&ip_conntrack_expect_tuple_lock); - /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if - master ct never got confirmed, we'd hold a reference to it - and weird things would happen to future packets). 
*/ - if (expected && !is_confirmed(expected->expectant)) - expected = NULL; - - /* Look up the conntrack helper for master connections only */ - if (!expected) - conntrack->helper = ip_ct_find_helper(&repl_tuple); + if (expected) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (!is_confirmed(expected->expectant)) { + conntrack->helper = ip_ct_find_helper(&repl_tuple); + goto end; + } - /* If the expectation is dying, then this is a loser. */ - if (expected - && expected->expectant->helper->timeout - && ! del_timer(&expected->timeout)) - expected = NULL; + /* Expectation is dying... */ + if (expected->expectant->helper->timeout + && !del_timer(&expected->timeout)) + goto end; - if (expected) { DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", conntrack, expected); /* Welcome, Mr. Bond. We've been expecting you... */ + IP_NF_ASSERT(master_ct(conntrack)); __set_bit(IPS_EXPECTED_BIT, &conntrack->status); conntrack->master = expected; expected->sibling = conntrack; LIST_DELETE(&ip_conntrack_expect_list, expected); expected->expectant->expecting--; nf_conntrack_get(&master_ct(conntrack)->infos[0]); + + /* this is a braindead... 
--pablo */ + atomic_inc(&ip_conntrack_count); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (expected->expectfn) + expected->expectfn(conntrack); + + __get_cpu_var(ip_conntrack_stat).expect_new++; + + goto ret; + } else { + conntrack->helper = ip_ct_find_helper(&repl_tuple); + + __get_cpu_var(ip_conntrack_stat).new++; } - atomic_inc(&ip_conntrack_count); + +end: atomic_inc(&ip_conntrack_count); WRITE_UNLOCK(&ip_conntrack_lock); - if (expected && expected->expectfn) - expected->expectfn(conntrack); - return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +ret: return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; } /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ @@ -743,7 +698,8 @@ resolve_normal_ct(struct sk_buff *skb, IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); - if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto)) + if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, + &tuple,proto)) return NULL; /* look for tuple match */ @@ -823,38 +779,51 @@ unsigned int ip_conntrack_in(unsigned int hooknum, #endif /* Previously seen (loopback or untracked)? Ignore. */ - if ((*pskb)->nfct) + if ((*pskb)->nfct) { + __get_cpu_var(ip_conntrack_stat).ignore++; return NF_ACCEPT; + } proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); - /* It may be an icmp error... */ - if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP - && icmp_error_track(*pskb, &ctinfo, hooknum)) - return NF_ACCEPT; + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. 
*/ + if (proto->error != NULL + && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { + __get_cpu_var(ip_conntrack_stat).icmp_error++; + return -ret; + } - if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { /* Not valid part of a connection */ + __get_cpu_var(ip_conntrack_stat).invalid++; return NF_ACCEPT; + } - if (IS_ERR(ct)) + if (IS_ERR(ct)) { /* Too stressed to deal. */ + __get_cpu_var(ip_conntrack_stat).drop++; return NF_DROP; + } IP_NF_ASSERT((*pskb)->nfct); ret = proto->packet(ct, *pskb, ctinfo); - if (ret == -1) { - /* Invalid */ + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do*/ nf_conntrack_put((*pskb)->nfct); (*pskb)->nfct = NULL; - return NF_ACCEPT; + __get_cpu_var(ip_conntrack_stat).invalid++; + return -ret; } if (ret != NF_DROP && ct->helper) { ret = ct->helper->help(*pskb, ct, ctinfo); if (ret == -1) { /* Invalid */ + __get_cpu_var(ip_conntrack_stat).invalid++; nf_conntrack_put((*pskb)->nfct); (*pskb)->nfct = NULL; return NF_ACCEPT; @@ -869,7 +838,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { - return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum)); + return ip_ct_invert_tuple(inverse, orig, + ip_ct_find_proto(orig->dst.protonum)); } static inline int resent_expect(const struct ip_conntrack_expect *i, @@ -923,9 +893,8 @@ struct ip_conntrack_expect * ip_conntrack_expect_alloc(void) { struct ip_conntrack_expect *new; - - new = (struct ip_conntrack_expect *) - kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC); + + new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); if (!new) { DEBUGP("expect_related: OOM allocating expect\n"); return NULL; @@ -933,6 +902,7 @@ ip_conntrack_expect_alloc(void) /* tuple_cmp compares whole union, we have to initialized cleanly */ 
memset(new, 0, sizeof(struct ip_conntrack_expect)); + atomic_set(&new->use, 1); return new; } @@ -944,7 +914,6 @@ ip_conntrack_expect_insert(struct ip_conntrack_expect *new, DEBUGP("new expectation %p of conntrack %p\n", new, related_to); new->expectant = related_to; new->sibling = NULL; - atomic_set(&new->use, 1); /* add to expected list for this connection */ list_add_tail(&new->expected_list, &related_to->sibling_list); @@ -997,7 +966,8 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, } WRITE_UNLOCK(&ip_conntrack_lock); - kfree(expect); + /* This expectation is not inserted so no need to lock */ + kmem_cache_free(ip_conntrack_expect_cachep, expect); return -EEXIST; } else if (related_to->helper->max_expected && @@ -1015,7 +985,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, related_to->helper->name, NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip), NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip)); - kfree(expect); + kmem_cache_free(ip_conntrack_expect_cachep, expect); return -EPERM; } DEBUGP("ip_conntrack: max number of expected " @@ -1049,7 +1019,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, WRITE_UNLOCK(&ip_conntrack_lock); DEBUGP("expect_related: busy!\n"); - kfree(expect); + kmem_cache_free(ip_conntrack_expect_cachep, expect); return -EBUSY; } @@ -1057,6 +1027,8 @@ out: ip_conntrack_expect_insert(expect, related_to); WRITE_UNLOCK(&ip_conntrack_lock); + __get_cpu_var(ip_conntrack_stat).expect_create++; + return ret; } @@ -1164,21 +1136,39 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) synchronize_net(); } -/* Refresh conntrack for this many jiffies. 
*/ -void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) { IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) + if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - else { + ct_add_counters(ct, ctinfo, skb); + } else { WRITE_LOCK(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). */ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); } + ct_add_counters(ct, ctinfo, skb); WRITE_UNLOCK(&ip_conntrack_lock); } } @@ -1368,12 +1358,13 @@ void ip_conntrack_cleanup(void) } kmem_cache_destroy(ip_conntrack_cachep); + kmem_cache_destroy(ip_conntrack_expect_cachep); vfree(ip_conntrack_hash); nf_unregister_sockopt(&so_getorigdst); } static int hashsize; -MODULE_PARM(hashsize, "i"); +module_param(hashsize, int, 0400); int __init ip_conntrack_init(void) { @@ -1420,6 +1411,15 @@ int __init ip_conntrack_init(void) printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); goto err_free_hash; } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + /* Don't NEED lock here, but good form anyway. 
*/ WRITE_LOCK(&ip_conntrack_lock); /* Sew in builtin protocols. */ @@ -1447,6 +1447,8 @@ int __init ip_conntrack_init(void) return ret; +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); err_free_hash: vfree(ip_conntrack_hash); err_unreg_sockopt: diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index b9c27d5e458b..c1403a0cc7d4 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -19,6 +19,7 @@ #include <linux/netfilter_ipv4/lockhelp.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +#include <linux/moduleparam.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); @@ -33,10 +34,10 @@ struct module *ip_conntrack_ftp = THIS_MODULE; #define MAX_PORTS 8 static int ports[MAX_PORTS]; static int ports_c; -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); static int loose; -MODULE_PARM(loose, "i"); +module_param(loose, int, 0600); #if 0 #define DEBUGP printk @@ -247,7 +248,8 @@ static int help(struct sk_buff *skb, enum ip_conntrack_info ctinfo) { unsigned int dataoff, datalen; - struct tcphdr tcph; + struct tcphdr _tcph, *th; + char *fb_ptr; u_int32_t old_seq_aft_nl; int old_seq_aft_nl_set, ret; u_int32_t array[6] = { 0 }; @@ -267,10 +269,12 @@ static int help(struct sk_buff *skb, return NF_ACCEPT; } - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) != 0) + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return NF_ACCEPT; - dataoff = skb->nh.iph->ihl*4 + tcph.doff*4; + dataoff = skb->nh.iph->ihl*4 + th->doff*4; /* No data? 
*/ if (dataoff >= skb->len) { DEBUGP("ftp: skblen = %u\n", skb->len); @@ -279,26 +283,28 @@ static int help(struct sk_buff *skb, datalen = skb->len - dataoff; LOCK_BH(&ip_ftp_lock); - skb_copy_bits(skb, dataoff, ftp_buffer, skb->len - dataoff); + fb_ptr = skb_header_pointer(skb, dataoff, + skb->len - dataoff, ftp_buffer); + BUG_ON(fb_ptr == NULL); old_seq_aft_nl_set = ct_ftp_info->seq_aft_nl_set[dir]; old_seq_aft_nl = ct_ftp_info->seq_aft_nl[dir]; DEBUGP("conntrack_ftp: datalen %u\n", datalen); - if (ftp_buffer[datalen - 1] == '\n') { + if (fb_ptr[datalen - 1] == '\n') { DEBUGP("conntrack_ftp: datalen %u ends in \\n\n", datalen); if (!old_seq_aft_nl_set - || after(ntohl(tcph.seq) + datalen, old_seq_aft_nl)) { + || after(ntohl(th->seq) + datalen, old_seq_aft_nl)) { DEBUGP("conntrack_ftp: updating nl to %u\n", - ntohl(tcph.seq) + datalen); + ntohl(th->seq) + datalen); ct_ftp_info->seq_aft_nl[dir] = - ntohl(tcph.seq) + datalen; + ntohl(th->seq) + datalen; ct_ftp_info->seq_aft_nl_set[dir] = 1; } } if(!old_seq_aft_nl_set || - (ntohl(tcph.seq) != old_seq_aft_nl)) { + (ntohl(th->seq) != old_seq_aft_nl)) { DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u)\n", old_seq_aft_nl_set ? 
"":"(UNSET) ", old_seq_aft_nl); ret = NF_ACCEPT; @@ -315,7 +321,7 @@ static int help(struct sk_buff *skb, for (i = 0; i < ARRAY_SIZE(search); i++) { if (search[i].dir != dir) continue; - found = find_pattern(ftp_buffer, skb->len - dataoff, + found = find_pattern(fb_ptr, skb->len - dataoff, search[i].pattern, search[i].plen, search[i].skip, @@ -333,7 +339,7 @@ static int help(struct sk_buff *skb, if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", search[i].pattern, - ntohl(tcph.seq), datalen); + ntohl(th->seq), datalen); ret = NF_DROP; goto out; } else if (found == 0) { /* No match */ @@ -343,7 +349,7 @@ static int help(struct sk_buff *skb, DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", (int)matchlen, data + matchoff, - matchlen, ntohl(tcph.seq) + matchoff); + matchlen, ntohl(th->seq) + matchoff); /* Allocate expectation which will be inserted */ exp = ip_conntrack_expect_alloc(); @@ -357,7 +363,7 @@ static int help(struct sk_buff *skb, /* Update the ftp info */ if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) == ct->tuplehash[dir].tuple.src.ip) { - exp->seq = ntohl(tcph.seq) + matchoff; + exp->seq = ntohl(th->seq) + matchoff; exp_ftp_info->len = matchlen; exp_ftp_info->ftptype = search[i].ftptype; exp_ftp_info->port = array[4] << 8 | array[5]; @@ -420,10 +426,10 @@ static int __init init(void) int i, ret; char *tmpname; - if (ports[0] == 0) - ports[0] = FTP_PORT; + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; - for (i = 0; (i < MAX_PORTS) && ports[i]; i++) { + for (i = 0; i < ports_c; i++) { ftp[i].tuple.src.u.tcp.port = htons(ports[i]); ftp[i].tuple.dst.protonum = IPPROTO_TCP; ftp[i].mask.src.u.tcp.port = 0xFFFF; @@ -449,7 +455,6 @@ static int __init init(void) fini(); return ret; } - ports_c++; } return 0; } diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 32b5daee81c0..0d0afe131e4e 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ 
b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -32,6 +32,7 @@ #include <linux/netfilter_ipv4/lockhelp.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_conntrack_irc.h> +#include <linux/moduleparam.h> #define MAX_PORTS 8 static int ports[MAX_PORTS]; @@ -44,11 +45,11 @@ static char irc_buffer[65536]; MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); -MODULE_PARM(max_dcc_channels, "i"); +module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); -MODULE_PARM(dcc_timeout, "i"); +module_param(dcc_timeout, int, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; @@ -101,8 +102,8 @@ static int help(struct sk_buff *skb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { unsigned int dataoff; - struct tcphdr tcph; - char *data, *data_limit; + struct tcphdr _tcph, *th; + char *data, *data_limit, *ib_ptr; int dir = CTINFO2DIR(ctinfo); struct ip_conntrack_expect *exp; struct ip_ct_irc_expect *exp_irc_info = NULL; @@ -126,19 +127,23 @@ static int help(struct sk_buff *skb, } /* Not a full tcp header? */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) != 0) + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return NF_ACCEPT; /* No data? 
*/ - dataoff = skb->nh.iph->ihl*4 + tcph.doff*4; + dataoff = skb->nh.iph->ihl*4 + th->doff*4; if (dataoff >= skb->len) return NF_ACCEPT; LOCK_BH(&ip_irc_lock); - skb_copy_bits(skb, dataoff, irc_buffer, skb->len - dataoff); + ib_ptr = skb_header_pointer(skb, dataoff, + skb->len - dataoff, irc_buffer); + BUG_ON(ib_ptr == NULL); - data = irc_buffer; - data_limit = irc_buffer + skb->len - dataoff; + data = ib_ptr; + data_limit = ib_ptr + skb->len - dataoff; /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ @@ -152,8 +157,8 @@ static int help(struct sk_buff *skb, /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", - NIPQUAD(iph->saddr), ntohs(tcph.source), - NIPQUAD(iph->daddr), ntohs(tcph.dest)); + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest)); for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { @@ -197,8 +202,8 @@ static int help(struct sk_buff *skb, /* save position of address in dcc string, * necessary for NAT */ - DEBUGP("tcph->seq = %u\n", tcph.seq); - exp->seq = ntohl(tcph.seq) + (addr_beg_p - irc_buffer); + DEBUGP("tcph->seq = %u\n", th->seq); + exp->seq = ntohl(th->seq) + (addr_beg_p - ib_ptr); exp_irc_info->len = (addr_end_p - addr_beg_p); exp_irc_info->port = dcc_port; DEBUGP("wrote info seq=%u (ofs=%u), len=%d\n", @@ -252,10 +257,10 @@ static int __init init(void) } /* If no port given, default to standard irc port */ - if (ports[0] == 0) - ports[0] = IRC_PORT; + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; - for (i = 0; (i < MAX_PORTS) && ports[i]; i++) { + for (i = 0; i < ports_c; i++) { hlpr = &irc_helpers[i]; hlpr->tuple.src.u.tcp.port = htons(ports[i]); hlpr->tuple.dst.protonum = IPPROTO_TCP; @@ -284,7 +289,6 @@ static int __init init(void) fini(); return -EBUSY; } - ports_c++; } return 0; } diff --git 
a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c index 0df558a58020..c7a913149b8e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -50,9 +50,9 @@ static unsigned int generic_print_conntrack(char *buffer, /* Returns verdict for packet, or -1 for invalid. */ static int packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { - ip_ct_refresh(conntrack, ip_ct_generic_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); return NF_ACCEPT; } @@ -62,8 +62,14 @@ static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) return 1; } -struct ip_conntrack_protocol ip_conntrack_generic_protocol -= { { NULL, NULL }, 0, "unknown", - generic_pkt_to_tuple, generic_invert_tuple, generic_print_tuple, - generic_print_conntrack, packet, new, NULL, NULL, NULL }; - +struct ip_conntrack_protocol ip_conntrack_generic_protocol = +{ + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 47114840fa84..b2f0dee33f2a 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -12,6 +12,11 @@ #include <linux/netfilter.h> #include <linux/in.h> #include <linux/icmp.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> unsigned long ip_ct_icmp_timeout = 30*HZ; @@ -26,14 +31,15 @@ static int icmp_pkt_to_tuple(const struct sk_buff *skb, 
unsigned int dataoff, struct ip_conntrack_tuple *tuple) { - struct icmphdr hdr; + struct icmphdr _hdr, *hp; - if (skb_copy_bits(skb, dataoff, &hdr, sizeof(hdr)) != 0) + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) return 0; - tuple->dst.u.icmp.type = hdr.type; - tuple->src.u.icmp.id = hdr.un.echo.id; - tuple->dst.u.icmp.code = hdr.code; + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; return 1; } @@ -94,7 +100,7 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); - ip_ct_refresh(ct, ip_ct_icmp_timeout); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } return NF_ACCEPT; @@ -122,7 +128,147 @@ static int icmp_new(struct ip_conntrack *conntrack, return 1; } -struct ip_conntrack_protocol ip_conntrack_protocol_icmp -= { { NULL, NULL }, IPPROTO_ICMP, "icmp", - icmp_pkt_to_tuple, icmp_invert_tuple, icmp_print_tuple, - icmp_print_conntrack, icmp_packet, icmp_new, NULL, NULL, NULL }; +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct ip_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } inside; + struct ip_conntrack_protocol *innerproto; + struct ip_conntrack_tuple_hash *h; + int dataoff; + + IP_NF_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0) + return NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside.ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_track: fragment of proto %u\n", + inside.ip.protocol); + return NF_ACCEPT; + } + + innerproto = ip_ct_find_proto(inside.ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4; + /* Are they talking about one of our connections? 
*/ + if (!ip_ct_get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) { + DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol); + return NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + DEBUGP("icmp_error_track: Can't invert tuple\n"); + return NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = ip_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = ip_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_track: no match\n"); + return NF_ACCEPT; + } + /* Reverse direction from that found */ + if (DIRECTION(h) != IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &h->ctrack->infos[*ctinfo]; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct icmphdr icmph; + + /* Not enough header? 
*/ + if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &icmph, sizeof(icmph))!=0) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph.type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? 
*/ + if (icmph.type != ICMP_DEST_UNREACH + && icmph.type != ICMP_SOURCE_QUENCH + && icmph.type != ICMP_TIME_EXCEEDED + && icmph.type != ICMP_PARAMETERPROB + && icmph.type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct ip_conntrack_protocol ip_conntrack_protocol_icmp = +{ + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c new file mode 100644 index 000000000000..8296e7c52cf8 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -0,0 +1,650 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DECLARE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. 
--RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long ip_ct_sctp_timeout_closed = 10 SECS; +unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS; +unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS; +unsigned long ip_ct_sctp_timeout_established = 5 DAYS; +unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { 0, /* SCTP_CONNTRACK_NONE */ + &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. 
-Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, 
sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + sctp_sctphdr_t hdr; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + if (skb_copy_bits(skb, dataoff, &hdr, 8) != 0) + return 0; + + tuple->src.u.sctp.port = hdr.source; + tuple->dst.u.sctp.port = hdr.dest; + + return 1; +} + +static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static unsigned int sctp_print_tuple(char *buffer, + const struct ip_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return sprintf(buffer, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. 
*/ +static unsigned int sctp_print_conntrack(char *buffer, + const struct ip_conntrack *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + READ_LOCK(&sctp_lock); + state = conntrack->proto.sctp.state; + READ_UNLOCK(&sctp_lock); + + return sprintf(buffer, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, offset, count) \ +for (offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t), count = 0; \ + offset < skb->len && !skb_copy_bits(skb, offset, &sch, sizeof(sch)); \ + offset += (htons(sch.length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, offset, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch.type); + + if (sch.type == SCTP_CID_INIT + || sch.type == SCTP_CID_INIT_ACK + || sch.type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch.type == SCTP_CID_COOKIE_ACK + || sch.type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit (sch.type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 
3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int sctp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + sctp_sctphdr_t sctph; + sctp_chunkhdr_t sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &sctph, sizeof(sctph)) != 0) + return -1; + + if (do_basic_checks(conntrack, skb, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sctph.vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, offset, count) { + WRITE_LOCK(&sctp_lock); + + /* Special cases of Verification tag check 
(Sec 8.5.1) */ + if (sch.type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sctph.vtag != 0) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch.type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sctph.vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sctph.vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch.type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sctph.vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sctph.vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch.flags & 1))) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch.type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sctph.vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch.type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch.type, oldsctpstate); + WRITE_UNLOCK(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch.type == SCTP_CID_INIT + || sch.type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t inithdr; + + if (skb_copy_bits(skb, offset + sizeof (sctp_chunkhdr_t), + &inithdr, sizeof(inithdr)) != 0) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + inithdr.init_tag, CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = inithdr.init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + WRITE_UNLOCK(&sctp_lock); + } + + ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + 
DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int sctp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum sctp_conntrack newconntrack; + sctp_sctphdr_t sctph; + sctp_chunkhdr_t sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &sctph, sizeof(sctph)) != 0) + return -1; + + if (do_basic_checks(conntrack, skb, map) != 0) + return -1; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return -1; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, offset, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state (IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch.type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch.type == SCTP_CID_INIT) { + if (sctph.vtag == 0) { + sctp_inithdr_t inithdr; + + if (skb_copy_bits(skb, offset + sizeof (sctp_chunkhdr_t), + &inithdr, sizeof(inithdr)) != 0) { + return -1; + } + + DEBUGP("Setting vtag %x for new conn\n", + inithdr.init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + inithdr.init_tag; + } else { + /* Sec 8.5.1 (A) */ + return -1; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sctph.vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sctph.vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 
1; +} + +static int sctp_exp_matches_pkt(struct ip_conntrack_expect *exp, + const struct sk_buff *skb) +{ + /* To be implemented */ + return 0; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { + .list = { NULL, NULL }, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .exp_matches_pkt = sctp_exp_matches_pkt, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &ip_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &ip_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &ip_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_sctp_timeout_established", + .data = &ip_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &ip_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + 
.procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &ip_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &ip_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ip_ct_sysctl_header; +#endif + +int __init init(void) +{ + int ret; + + ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); + if (ret) { + printk("ip_conntrack_proto_sctp: protocol register failed\n"); + goto out; + } + +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#endif + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? 
"failed": "succeeded"); + return ret; +} + +void __exit fini(void) +{ + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 463cafa6692a..64c7538c4b18 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -4,8 +4,22 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * version 2.2 */ +#include <linux/config.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/timer.h> @@ -14,16 +28,18 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/tcp.h> -#include <linux/string.h> +#include <linux/spinlock.h> #include <net/tcp.h> +#include <linux/netfilter.h> #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> #include <linux/netfilter_ipv4/lockhelp.h> #if 0 #define DEBUGP printk +#define DEBUGP_VARS #else #define DEBUGP(format, args...) #endif @@ -31,28 +47,40 @@ /* Protects conntrack->proto.tcp */ static DECLARE_RWLOCK(tcp_lock); -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more - closely. They're more complex. 
--RR */ +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int ip_ct_tcp_be_liberal = 0; -/* Actually, I believe that neither ipmasq (where this code is stolen - from) nor ipfilter do it exactly right. A new conntrack machine taking - into account packet loss (which creates uncertainty as to exactly - the conntrack of the connection) is required. RSN. --RR */ +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int ip_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int ip_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ static const char *tcp_conntrack_names[] = { "NONE", - "ESTABLISHED", "SYN_SENT", "SYN_RECV", + "ESTABLISHED", "FIN_WAIT", - "TIME_WAIT", - "CLOSE", "CLOSE_WAIT", "LAST_ACK", + "TIME_WAIT", + "CLOSE", "LISTEN" }; - -#define SECS *HZ + +#define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS @@ -66,64 +94,214 @@ unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; unsigned long ip_ct_tcp_timeout_close = 10 SECS; +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. 
*/ +unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; + static unsigned long * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ - &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ NULL, /* TCP_CONNTRACK_LISTEN */ }; #define sNO TCP_CONNTRACK_NONE -#define sES TCP_CONNTRACK_ESTABLISHED #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT -#define sTW TCP_CONNTRACK_TIME_WAIT -#define sCL TCP_CONNTRACK_CLOSE #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE #define sLI TCP_CONNTRACK_LISTEN #define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE -static enum tcp_conntrack tcp_conntracks[2][5][TCP_CONNTRACK_MAX] = { +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. 
We always + * try to guess the state of the current sender. + * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { -/* ORIGINAL */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */ -/*syn*/ {sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI }, -/*fin*/ {sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI }, -/*ack*/ {sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES }, -/*rst*/ {sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL }, -/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). 
+ * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. 
+ * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { -/* REPLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */ -/*syn*/ {sSR, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }, -/*fin*/ {sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI }, -/*ack*/ {sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI }, -/*rst*/ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sLA, sLI }, -/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } - } +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sIV Simultaneous open. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. 
+ * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } }; static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) { - struct tcphdr hdr; + struct tcphdr _hdr, *hp; /* Actually only need first 8 bytes. */ - if (skb_copy_bits(skb, dataoff, &hdr, 8) != 0) + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) return 0; - tuple->src.u.tcp.port = hdr.source; - tuple->dst.u.tcp.port = hdr.dest; + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; return 1; } @@ -160,11 +338,488 @@ static unsigned int tcp_print_conntrack(char *buffer, static unsigned int get_conntrack_index(const struct tcphdr *tcph) { - if (tcph->rst) return 3; - else if (tcph->syn) return 0; - else if (tcph->fin) return 1; - else if (tcph->ack) return 2; - else return 4; + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are slightly changed: + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq + len <= sender.td_maxend + II. Lower bound for valid data: seq >= sender.td_end - receiver.td_maxwin + III. 
Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + struct iphdr *iph, + struct tcphdr *tcph) + { + return (seq + len - (iph->ihl + tcph->doff)*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_STATE_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(struct 
tcphdr *tcph, __u32 *sack) +{ + __u32 tmp; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)(tcph + 1) == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + ptr = (unsigned char *)(tcph + 1); + while (length > 0) { + int opcode=*ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = ntohl(*((u_int32_t *)(ptr+i)+1)); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int *index, + const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. 
+ */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, iph, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_STATE_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 
1 : win); + + tcp_options(skb, iph, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end, + after(end, sender->td_maxend) && before(seq, sender->td_maxend) + ? sender->td_maxend : end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + /* Ignore data over the right edge of the receiver's window. 
*/ + if (after(end, sender->td_maxend) && + before(seq, sender->td_maxend)) { + end = sender->td_maxend; + if (*index == TCP_FIN_SET) + *index = TCP_ACK_SET; + } + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(end, sender->td_maxend + 1) + || before(seq, sender->td_maxend + 1), + after(seq, sender->td_end - receiver->td_maxwin - 1) + || after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(end, sender->td_maxend + 1) && + after(seq, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (*index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: %s ", + before(end, sender->td_maxend + 1) ? + after(seq, sender->td_end - receiver->td_maxwin - 1) ? + before(ack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? 
"BUG" + : "ACK is under the lower bound (possibly overly delayed ACK)" + : "ACK is over the upper bound (ACKed data has never seen yet)" + : "SEQ is under the lower bound (retransmitted already ACKed data)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = ip_ct_tcp_be_liberal && !tcph->rst; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +int ip_conntrack_tcp_update(struct sk_buff *skb, + struct ip_conntrack *conntrack, + int dir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); + + WRITE_LOCK(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... 
+ */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + WRITE_UNLOCK(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + return 1; +} + +EXPORT_SYMBOL(ip_conntrack_tcp_update); +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - iph->ihl * 4; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? 
*/ + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; } /* Returns verdict for packet, or -1 for invalid. */ @@ -172,103 +827,260 @@ static int tcp_packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, enum ip_conntrack_info ctinfo) { - enum tcp_conntrack newconntrack, oldtcpstate; - struct tcphdr tcph; - - if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) != 0) - return -1; - if (skb->len < skb->nh.iph->ihl * 4 + tcph.doff * 4) - return -1; - - /* If only reply is a RST, we can consider ourselves not to - have an established connection: this is a fairly common - problem case, so we can delete the conntrack - immediately. 
--RR */ - if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) && tcph.rst) { - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long)conntrack); + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + WRITE_LOCK(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL, or SYN/ACK in REPLY direction. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && after(ntohl(th->ack_seq), + conntrack->proto.tcp.last_seq)) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. 
+ */ + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid SYN (ignored) "); return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state >= TCP_CONNTRACK_TIME_WAIT) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. */ + WRITE_UNLOCK(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } + break; + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index <= TCP_SYNACK_SET + && after(ntohl(th->ack_seq), + conntrack->proto.tcp.last_seq)) { + /* Ignore RST closing down invalid SYN + we had let trough. */ + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid RST (ignored) "); + return NF_ACCEPT; + } + /* Just fall trough */ + default: + /* Keep compilers happy. 
*/ + break; } - WRITE_LOCK(&tcp_lock); - oldtcpstate = conntrack->proto.tcp.state; - newconntrack - = tcp_conntracks - [CTINFO2DIR(ctinfo)] - [get_conntrack_index(&tcph)][oldtcpstate]; - - /* Invalid */ - if (newconntrack == TCP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_tcp: Invalid dir=%i index=%u conntrack=%u\n", - CTINFO2DIR(ctinfo), get_conntrack_index(&tcph), - conntrack->proto.tcp.state); + if (!tcp_in_window(&conntrack->proto.tcp, dir, &index, + skb, iph, th)) { WRITE_UNLOCK(&tcp_lock); - return -1; + return -NF_ACCEPT; } + /* From now on we have got in-window packets */ + + /* If FIN was trimmed off, we don't change state. */ + conntrack->proto.tcp.last_index = index; + new_state = tcp_conntracks[dir][index][old_state]; - conntrack->proto.tcp.state = newconntrack; - - /* Poor man's window tracking: record SYN/ACK for handshake check */ - if (oldtcpstate == TCP_CONNTRACK_SYN_SENT - && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY - && tcph.syn && tcph.ack) { - conntrack->proto.tcp.handshake_ack - = htonl(ntohl(tcph.seq) + 1); - goto out; - } + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); - /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV */ - if (oldtcpstate == TCP_CONNTRACK_SYN_RECV - && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL - && tcph.ack && !tcph.syn - && tcph.ack_seq == conntrack->proto.tcp.handshake_ack) - set_bit(IPS_ASSURED_BIT, &conntrack->status); + conntrack->proto.tcp.state = new_state; + timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans + ? 
ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + WRITE_UNLOCK(&tcp_lock); -out: WRITE_UNLOCK(&tcp_lock); - ip_ct_refresh(conntrack, *tcp_timeouts[newconntrack]); + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); return NF_ACCEPT; } - -/* Called when a new connection for this protocol found. */ -static int tcp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) + + /* Called when a new connection for this protocol found. 
*/ +static int tcp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) { - enum tcp_conntrack newconntrack; - struct tcphdr tcph; - - if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) != 0) - return -1; + enum tcp_conntrack new_state; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + /* Don't need lock here: this conntrack not in circulation yet */ - newconntrack - = tcp_conntracks[0][get_conntrack_index(&tcph)] + new_state + = tcp_conntracks[0][get_conntrack_index(th)] [TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ - if (newconntrack == TCP_CONNTRACK_MAX) { - DEBUGP("ip_conntrack_tcp: invalid new deleting.\n"); + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("ip_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. 
+ */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? */ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; - conntrack->proto.tcp.state = newconntrack; + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); return 1; } - + static int tcp_exp_matches_pkt(struct ip_conntrack_expect *exp, const struct sk_buff *skb) { const struct iphdr *iph = skb->nh.iph; - struct tcphdr tcph; + struct tcphdr *th, _tcph; unsigned int datalen; - if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) != 0) + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) return 0; - datalen = skb->len - iph->ihl*4 - tcph.doff*4; + datalen = skb->len - iph->ihl*4 - th->doff*4; - return between(exp->seq, ntohl(tcph.seq), ntohl(tcph.seq) + datalen); + return between(exp->seq, ntohl(th->seq), ntohl(th->seq) + datalen); } -struct 
ip_conntrack_protocol ip_conntrack_protocol_tcp -= { { NULL, NULL }, IPPROTO_TCP, "tcp", - tcp_pkt_to_tuple, tcp_invert_tuple, tcp_print_tuple, tcp_print_conntrack, - tcp_packet, tcp_new, NULL, tcp_exp_matches_pkt, NULL }; +struct ip_conntrack_protocol ip_conntrack_protocol_tcp = +{ + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .exp_matches_pkt = tcp_exp_matches_pkt, + .error = tcp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index a63c32d1840e..0fe9e9188fdf 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -12,6 +12,8 @@ #include <linux/netfilter.h> #include <linux/in.h> #include <linux/udp.h> +#include <net/checksum.h> +#include <linux/netfilter.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> unsigned long ip_ct_udp_timeout = 30*HZ; @@ -21,14 +23,15 @@ static int udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct ip_conntrack_tuple *tuple) { - struct udphdr hdr; + struct udphdr _hdr, *hp; /* Actually only need first 8 bytes. */ - if (skb_copy_bits(skb, dataoff, &hdr, 8) != 0) + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) return 0; - tuple->src.u.udp.port = hdr.source; - tuple->dst.u.udp.port = hdr.dest; + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; return 1; } @@ -60,16 +63,17 @@ static unsigned int udp_print_conntrack(char *buffer, /* Returns verdict for packet, and may modify conntracktype */ static int udp_packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. 
*/ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh(conntrack, ip_ct_udp_timeout_stream); + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ set_bit(IPS_ASSURED_BIT, &conntrack->status); } else - ip_ct_refresh(conntrack, ip_ct_udp_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); return NF_ACCEPT; } @@ -80,7 +84,60 @@ static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) return 1; } -struct ip_conntrack_protocol ip_conntrack_protocol_udp -= { { NULL, NULL }, IPPROTO_UDP, "udp", - udp_pkt_to_tuple, udp_invert_tuple, udp_print_tuple, udp_print_conntrack, - udp_packet, udp_new, NULL, NULL, NULL }; +static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int udplen = skb->len - iph->ihl * 4; + struct udphdr hdr; + + /* Header is too small? */ + if (skb_copy_bits(skb, iph->ihl*4, &hdr, sizeof(hdr)) != 0) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr.len) > udplen || ntohs(hdr.len) < sizeof(hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr.check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? 
skb->csum + : skb_checksum(skb, iph->ihl*4, udplen, 0))) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_udp = +{ + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 80edac904188..86010ea65de1 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -20,6 +20,8 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -43,6 +45,9 @@ MODULE_LICENSE("GPL"); +extern atomic_t ip_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + static int kill_proto(const struct ip_conntrack *i, void *data) { return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == @@ -63,128 +68,305 @@ print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple, return len; } -/* FIXME: Don't print source proto part. --RR */ +#ifdef CONFIG_IP_NF_CT_ACCT static unsigned int -print_expect(char *buffer, const struct ip_conntrack_expect *expect) +seq_print_counters(struct seq_file *s, struct ip_conntrack_counter *counter) { - unsigned int len; - - if (expect->expectant->helper->timeout) - len = sprintf(buffer, "EXPECTING: %lu ", - timer_pending(&expect->timeout) - ? 
(expect->timeout.expires - jiffies)/HZ : 0); - else - len = sprintf(buffer, "EXPECTING: - "); - len += sprintf(buffer + len, "use=%u proto=%u ", - atomic_read(&expect->use), expect->tuple.dst.protonum); - len += print_tuple(buffer + len, &expect->tuple, - __ip_ct_find_proto(expect->tuple.dst.protonum)); - len += sprintf(buffer + len, "\n"); - return len; + return seq_printf(s, "packets=%llu bytes=%llu ", + counter->packets, counter->bytes); } +#else +#define seq_print_counters(x, y) 0 +#endif -static unsigned int -print_conntrack(char *buffer, struct ip_conntrack *conntrack) +static void *ct_seq_start(struct seq_file *s, loff_t *pos) { - unsigned int len; - struct ip_conntrack_protocol *proto - = __ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + unsigned int *bucket; - len = sprintf(buffer, "%-8s %u %lu ", - proto->name, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum, - timer_pending(&conntrack->timeout) - ? (conntrack->timeout.expires - jiffies)/HZ : 0); + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + READ_LOCK(&ip_conntrack_lock); + + if (*pos >= ip_conntrack_htable_size) + return NULL; - len += proto->print_conntrack(buffer + len, conntrack); - len += print_tuple(buffer + len, - &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - proto); - if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) - len += sprintf(buffer + len, "[UNREPLIED] "); - len += print_tuple(buffer + len, - &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, - proto); - if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) - len += sprintf(buffer + len, "[ASSURED] "); - len += sprintf(buffer + len, "use=%u ", - atomic_read(&conntrack->ct_general.use)); - len += sprintf(buffer + len, "\n"); + bucket = kmalloc(sizeof(unsigned int), GFP_KERNEL); + if (!bucket) { + return ERR_PTR(-ENOMEM); + } + + *bucket = *pos; + return bucket; +} + +static void *ct_seq_next(struct seq_file *s, void *v, 
loff_t *pos) +{ + unsigned int *bucket = (unsigned int *) v; - return len; + *pos = ++(*bucket); + if (*pos >= ip_conntrack_htable_size) { + kfree(v); + return NULL; + } + return bucket; +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); } -/* Returns true when finished. */ -static inline int -conntrack_iterate(const struct ip_conntrack_tuple_hash *hash, - char *buffer, off_t offset, off_t *upto, - unsigned int *len, unsigned int maxlen) +/* return 0 on success, 1 in case of error */ +static int ct_seq_real_show(const struct ip_conntrack_tuple_hash *hash, + struct seq_file *s) { - unsigned int newlen; - IP_NF_ASSERT(hash->ctrack); + struct ip_conntrack *conntrack = hash->ctrack; + struct ip_conntrack_protocol *proto; + char buffer[IP_CT_PRINT_BUFLEN]; MUST_BE_READ_LOCKED(&ip_conntrack_lock); - /* Only count originals */ + IP_NF_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ if (DIRECTION(hash)) return 0; - if ((*upto)++ < offset) - return 0; + proto = __ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + IP_NF_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %lu ", + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? 
(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) + return 1; + + proto->print_conntrack(buffer, conntrack); + if (seq_puts(s, buffer)) + return 1; + + print_tuple(buffer, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + proto); + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return 1; - newlen = print_conntrack(buffer + *len, hash->ctrack); - if (*len + newlen > maxlen) + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return 1; + + print_tuple(buffer, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + proto); + if (seq_puts(s, buffer)) + return 1; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return 1; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return 1; + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return 1; - else *len += newlen; return 0; } -static int -list_conntracks(char *buffer, char **start, off_t offset, int length) + +static int ct_seq_show(struct seq_file *s, void *v) { - unsigned int i; - unsigned int len = 0; - off_t upto = 0; - struct list_head *e; + unsigned int *bucket = (unsigned int *) v; - READ_LOCK(&ip_conntrack_lock); - /* Traverse hash; print originals then reply. */ - for (i = 0; i < ip_conntrack_htable_size; i++) { - if (LIST_FIND(&ip_conntrack_hash[i], conntrack_iterate, - struct ip_conntrack_tuple_hash *, - buffer, offset, &upto, &len, length)) - goto finished; + if (LIST_FIND(&ip_conntrack_hash[*bucket], ct_seq_real_show, + struct ip_conntrack_tuple_hash *, s)) { + /* buffer was filled and unable to print that tuple */ + return 1; } + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_seq_ops); +} - /* Now iterate through expecteds. 
*/ +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + READ_LOCK(&ip_conntrack_lock); READ_LOCK(&ip_conntrack_expect_tuple_lock); - list_for_each(e, &ip_conntrack_expect_list) { - unsigned int last_len; - struct ip_conntrack_expect *expect - = (struct ip_conntrack_expect *)e; - if (upto++ < offset) continue; - - last_len = len; - len += print_expect(buffer + len, expect); - if (len > length) { - len = last_len; - goto finished_expects; - } + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &ip_conntrack_expect_list) + return NULL; } + return e; +} - finished_expects: +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + e = e->next; + + if (e == &ip_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ READ_UNLOCK(&ip_conntrack_expect_tuple_lock); - finished: READ_UNLOCK(&ip_conntrack_lock); +} - /* `start' hack - see fs/proc/generic.c line ~165 */ - *start = (char *)((unsigned int)upto - offset); - return len; +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct ip_conntrack_expect *expect = v; + char buffer[IP_CT_PRINT_BUFLEN]; + + if (expect->expectant->helper->timeout) + seq_printf(s, "%lu ", timer_pending(&expect->timeout) + ? 
(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + + seq_printf(s, "use=%u proto=%u ", atomic_read(&expect->use), + expect->tuple.dst.protonum); + + print_tuple(buffer, &expect->tuple, + __ip_ct_find_proto(expect->tuple.dst.protonum)); + return seq_printf(s, "%s\n", buffer); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->icmp_error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + 
.show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); } +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static unsigned int ip_confirm(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, @@ -323,6 +505,10 @@ extern unsigned long ip_ct_tcp_timeout_close_wait; extern unsigned long ip_ct_tcp_timeout_last_ack; extern unsigned long ip_ct_tcp_timeout_time_wait; extern unsigned long ip_ct_tcp_timeout_close; +extern unsigned long ip_ct_tcp_timeout_max_retrans; +extern int ip_ct_tcp_loose; +extern int ip_ct_tcp_be_liberal; +extern int ip_ct_tcp_max_retrans; /* From ip_conntrack_proto_udp.c */ extern unsigned long ip_ct_udp_timeout; @@ -334,6 +520,11 @@ extern unsigned long ip_ct_icmp_timeout; /* From ip_conntrack_proto_icmp.c */ extern unsigned long ip_ct_generic_timeout; +/* Log invalid packets of a given protocol */ +unsigned int ip_ct_log_invalid = 0; +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { @@ -449,6 +640,49 @@ static ctl_table ip_ct_sysctl_table[] = { .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, + .procname = "ip_conntrack_log_invalid", + .data = &ip_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &ip_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + 
.ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, + .procname = "ip_conntrack_tcp_loose", + .data = &ip_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "ip_conntrack_tcp_be_liberal", + .data = &ip_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "ip_conntrack_tcp_max_retrans", + .data = &ip_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; @@ -494,7 +728,7 @@ static ctl_table ip_ct_net_table[] = { #endif static int init_or_cleanup(int init) { - struct proc_dir_entry *proc; + struct proc_dir_entry *proc, *proc_exp, *proc_stat; int ret = 0; if (!init) goto cleanup; @@ -503,14 +737,24 @@ static int init_or_cleanup(int init) if (ret < 0) goto cleanup_nothing; - proc = proc_net_create("ip_conntrack", 0440, list_conntracks); + proc = proc_net_create("ip_conntrack", 0440, NULL); if (!proc) goto cleanup_init; - proc->owner = THIS_MODULE; + proc->proc_fops = &ct_file_ops; + + proc_exp = proc_net_create("ip_conntrack_expect", 0440, NULL); + if (!proc_exp) goto cleanup_proc; + proc_exp->proc_fops = &exp_file_ops; + + proc_stat = proc_net_fops_create("ip_conntrack_stat", S_IRUGO, + &ct_cpu_seq_fops); + if (!proc_stat) + goto cleanup_proc_exp; + proc_stat->owner = THIS_MODULE; ret = nf_register_hook(&ip_conntrack_defrag_ops); if (ret < 0) { printk("ip_conntrack: can't register pre-routing defrag hook.\n"); - goto cleanup_proc; + goto cleanup_proc_stat; } ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); if (ret < 0) { @@ -562,6 +806,10 @@ static int init_or_cleanup(int init) nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); cleanup_defragops: nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: + 
proc_net_remove("ip_conntrack_stat"); +cleanup_proc_exp: + proc_net_remove("ip_conntrack_exp"); cleanup_proc: proc_net_remove("ip_conntrack"); cleanup_init: @@ -638,7 +886,7 @@ EXPORT_SYMBOL(need_ip_conntrack); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_selective_cleanup); -EXPORT_SYMBOL(ip_ct_refresh); +EXPORT_SYMBOL(ip_ct_refresh_acct); EXPORT_SYMBOL(ip_ct_find_proto); EXPORT_SYMBOL(__ip_ct_find_proto); EXPORT_SYMBOL(ip_ct_find_helper); @@ -657,3 +905,4 @@ EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_put); +EXPORT_SYMBOL(ip_ct_log_invalid); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index ddee7378209d..d132a3c48d8d 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -19,6 +19,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_conntrack_tftp.h> +#include <linux/moduleparam.h> MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); MODULE_DESCRIPTION("tftp connection tracking helper"); @@ -27,7 +28,7 @@ MODULE_LICENSE("GPL"); #define MAX_PORTS 8 static int ports[MAX_PORTS]; static int ports_c; -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 @@ -41,14 +42,16 @@ static int tftp_help(struct sk_buff *skb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { - struct tftphdr tftph; + struct tftphdr _tftph, *tfh; struct ip_conntrack_expect *exp; - if (skb_copy_bits(skb, skb->nh.iph->ihl * 4 + sizeof(struct udphdr), - &tftph, sizeof(tftph)) != 0) + tfh = skb_header_pointer(skb, + skb->nh.iph->ihl * 4 + sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) return NF_ACCEPT; - switch 
(ntohs(tftph.opcode)) { + switch (ntohs(tfh->opcode)) { /* RRQ and WRQ works the same way */ case TFTP_OPCODE_READ: case TFTP_OPCODE_WRITE: @@ -104,10 +107,10 @@ static int __init init(void) int i, ret; char *tmpname; - if (!ports[0]) - ports[0]=TFTP_PORT; + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; - for (i = 0 ; (i < MAX_PORTS) && ports[i] ; i++) { + for (i = 0; i < ports_c; i++) { /* Create helper structure */ memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper)); @@ -137,7 +140,6 @@ static int __init init(void) fini(); return(ret); } - ports_c++; } return(0); } diff --git a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c index fbd8b9bfeb28..48227fe19dc2 100644 --- a/net/ipv4/netfilter/ip_fw_compat_masq.c +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c @@ -31,6 +31,7 @@ #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> #include <linux/netfilter_ipv4/ip_nat.h> #include <linux/netfilter_ipv4/ip_nat_core.h> #include <linux/netfilter_ipv4/listhelp.h> @@ -144,7 +145,8 @@ check_for_demasq(struct sk_buff **pskb) switch ((*pskb)->nh.iph->protocol) { case IPPROTO_ICMP: /* ICMP errors. */ - ct = icmp_error_track(*pskb, &ctinfo, NF_IP_PRE_ROUTING); + protocol->error(*pskb, &ctinfo, NF_IP_PRE_ROUTING); + ct = (struct ip_conntrack *)(*pskb)->nfct->master; if (ct) { /* We only do SNAT in the compatibility layer. 
So we can manipulate ICMP errors from @@ -165,7 +167,8 @@ check_for_demasq(struct sk_buff **pskb) case IPPROTO_UDP: IP_NF_ASSERT(((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) == 0); - if (!get_tuple((*pskb)->nh.iph, *pskb, (*pskb)->nh.iph->ihl*4, &tuple, protocol)) { + if (!ip_ct_get_tuple((*pskb)->nh.iph, *pskb, + (*pskb)->nh.iph->ihl*4, &tuple, protocol)) { if (net_ratelimit()) printk("ip_fw_compat_masq: Can't get tuple\n"); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 1c6b7810655a..d350134dacb1 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -49,7 +49,6 @@ static unsigned int ip_nat_htable_size; static struct list_head *bysource; static struct list_head *byipsproto; LIST_HEAD(protos); -LIST_HEAD(helpers); extern struct ip_nat_protocol unknown_nat_protocol; @@ -498,13 +497,6 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, return ret; } -static inline int -helper_cmp(const struct ip_nat_helper *helper, - const struct ip_conntrack_tuple *tuple) -{ - return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask); -} - /* Where to manip the reply packets (will be reverse manip). */ static unsigned int opposite_hook[NF_IP_NUMHOOKS] = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING, @@ -643,8 +635,7 @@ ip_nat_setup_info(struct ip_conntrack *conntrack, /* If there's a helper, assign it; based on new tuple. */ if (!conntrack->master) - info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, - &reply); + info->helper = ip_nat_find_helper(&reply); /* It's done. 
*/ info->initialized |= (1 << HOOK2MANIP(hooknum)); diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index 946ca05bb90f..ae15f410bc4b 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -12,6 +12,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/ip.h> #include <linux/tcp.h> +#include <linux/moduleparam.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_nat.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> @@ -33,7 +34,7 @@ MODULE_DESCRIPTION("ftp NAT helper"); static int ports[MAX_PORTS]; static int ports_c; -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); DECLARE_LOCK_EXTERN(ip_ftp_lock); @@ -313,10 +314,10 @@ static int __init init(void) int i, ret = 0; char *tmpname; - if (ports[0] == 0) - ports[0] = FTP_PORT; + if (ports_c == 0) + ports[ports_c] = FTP_PORT; - for (i = 0; (i < MAX_PORTS) && ports[i]; i++) { + for (i = 0; i < ports_c; i++) { ftp[i].tuple.dst.protonum = IPPROTO_TCP; ftp[i].tuple.src.u.tcp.port = htons(ports[i]); ftp[i].mask.dst.protonum = 0xFFFF; @@ -343,7 +344,6 @@ static int __init init(void) fini(); return ret; } - ports_c++; } return ret; diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index a49c722adbc1..2e8d021aff44 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -47,6 +47,7 @@ #define DUMP_OFFSET(x) #endif +static LIST_HEAD(helpers); DECLARE_LOCK(ip_nat_seqofs_lock); /* Setup TCP sequence correction given this change at this sequence */ @@ -419,6 +420,18 @@ int ip_nat_helper_register(struct ip_nat_helper *me) return ret; } +struct ip_nat_helper * +ip_nat_find_helper(const struct ip_conntrack_tuple *tuple) +{ + struct ip_nat_helper *h; + + READ_LOCK(&ip_nat_lock); + h = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, tuple); + READ_UNLOCK(&ip_nat_lock); + + return h; +} + static int kill_helper(const struct ip_conntrack 
*i, void *helper) { diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c index dc778dd4ab2c..06555b44e49b 100644 --- a/net/ipv4/netfilter/ip_nat_irc.c +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -27,6 +27,7 @@ #include <linux/netfilter_ipv4/ip_nat_rule.h> #include <linux/netfilter_ipv4/ip_conntrack_irc.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/moduleparam.h> #if 0 #define DEBUGP printk @@ -41,7 +42,7 @@ static int ports_c; MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("IRC (DCC) NAT helper"); MODULE_LICENSE("GPL"); -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); /* protects irc part of conntracks */ @@ -235,11 +236,10 @@ static int __init init(void) struct ip_nat_helper *hlpr; char *tmpname; - if (ports[0] == 0) { - ports[0] = IRC_PORT; - } + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; - for (i = 0; (i < MAX_PORTS) && ports[i] != 0; i++) { + for (i = 0; i < ports_c; i++) { hlpr = &ip_nat_irc_helpers[i]; hlpr->tuple.dst.protonum = IPPROTO_TCP; hlpr->tuple.src.u.tcp.port = htons(ports[i]); @@ -269,7 +269,6 @@ static int __init init(void) fini(); return 1; } - ports_c++; } return ret; } diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index 145b2c57368a..8ee96d10449a 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -132,7 +132,8 @@ static unsigned int ipt_snat_target(struct sk_buff **pskb, ct = ip_conntrack_get(*pskb, &ctinfo); /* Connection must be valid and new. 
*/ - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); IP_NF_ASSERT(out); return ip_nat_setup_info(ct, targinfo, hooknum); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 539ad18f8131..32d6e966aa13 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -47,6 +47,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/moduleparam.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_nat.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> @@ -1252,6 +1253,9 @@ static unsigned int nat_help(struct ip_conntrack *ct, int dir = CTINFO2DIR(ctinfo); struct iphdr *iph = (*pskb)->nh.iph; struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; spin_lock_bh(&snmp_lock); @@ -1357,4 +1361,4 @@ static void __exit fini(void) module_init(init); module_exit(fini); -MODULE_PARM(debug, "i"); +module_param(debug, bool, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 9765fd2d5cf1..62ef0d1f7554 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -99,11 +99,13 @@ ip_nat_fn(unsigned int hooknum, hash table yet). We must not let this through, in case we're doing NAT to the same network. 
*/ if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { - struct icmphdr hdr; + struct icmphdr _hdr, *hp; - if (skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4, - &hdr, sizeof(hdr)) == 0 - && hdr.type == ICMP_REDIRECT) + hp = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_hdr), &_hdr); + if (hp != NULL && + hp->type == ICMP_REDIRECT) return NF_DROP; } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index a2097bfbefb3..cacaab6f768c 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -32,6 +32,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_tftp.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/moduleparam.h> MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); MODULE_DESCRIPTION("tftp NAT helper"); @@ -41,7 +42,7 @@ MODULE_LICENSE("GPL"); static int ports[MAX_PORTS]; static int ports_c = 0; -MODULE_PARM(ports,"1-" __MODULE_STRING(MAX_PORTS) "i"); +module_param_array(ports, int, ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 @@ -59,7 +60,7 @@ tftp_nat_help(struct ip_conntrack *ct, struct sk_buff **pskb) { int dir = CTINFO2DIR(ctinfo); - struct tftphdr tftph; + struct tftphdr _tftph, *tfh; struct ip_conntrack_tuple repl; if (!((hooknum == NF_IP_POST_ROUTING && dir == IP_CT_DIR_ORIGINAL) @@ -71,11 +72,13 @@ tftp_nat_help(struct ip_conntrack *ct, return NF_ACCEPT; } - if (skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), - &tftph, sizeof(tftph)) != 0) + tfh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) return NF_DROP; - switch (ntohs(tftph.opcode)) { + switch (ntohs(tfh->opcode)) { /* RRQ and WRQ works the same way */ case TFTP_OPCODE_READ: case TFTP_OPCODE_WRITE: @@ -108,9 +111,12 @@ tftp_nat_expected(struct sk_buff **pskb, #if 0 const struct ip_conntrack_tuple *repl = 
&master->tuplehash[IP_CT_DIR_REPLY].tuple; - struct udphdr udph; + struct udphdr _udph, *uh; - if (skb_copy_bits(*pskb,(*pskb)->nh.iph->ihl*4,&udph,sizeof(udph))!=0) + uh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) return NF_DROP; #endif @@ -125,8 +131,8 @@ tftp_nat_expected(struct sk_buff **pskb, mr.range[0].min_ip = mr.range[0].max_ip = orig->dst.ip; DEBUGP("orig: %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u " "newsrc: %u.%u.%u.%u\n", - NIPQUAD((*pskb)->nh.iph->saddr), ntohs(udph.source), - NIPQUAD((*pskb)->nh.iph->daddr), ntohs(udph.dest), + NIPQUAD((*pskb)->nh.iph->saddr), ntohs(uh->source), + NIPQUAD((*pskb)->nh.iph->daddr), ntohs(uh->dest), NIPQUAD(orig->dst.ip)); } else { mr.range[0].min_ip = mr.range[0].max_ip = orig->src.ip; @@ -136,8 +142,8 @@ tftp_nat_expected(struct sk_buff **pskb, DEBUGP("orig: %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u " "newdst: %u.%u.%u.%u:%u\n", - NIPQUAD((*pskb)->nh.iph->saddr), ntohs(udph.source), - NIPQUAD((*pskb)->nh.iph->daddr), ntohs(udph.dest), + NIPQUAD((*pskb)->nh.iph->saddr), ntohs(uh->source), + NIPQUAD((*pskb)->nh.iph->daddr), ntohs(uh->dest), NIPQUAD(orig->src.ip), ntohs(orig->src.u.udp.port)); } @@ -162,10 +168,10 @@ static int __init init(void) int i, ret = 0; char *tmpname; - if (!ports[0]) - ports[0] = TFTP_PORT; + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; - for (i = 0 ; (i < MAX_PORTS) && ports[i] ; i++) { + for (i = 0; i < ports_c; i++) { memset(&tftp[i], 0, sizeof(struct ip_nat_helper)); tftp[i].tuple.dst.protonum = IPPROTO_UDP; @@ -194,7 +200,6 @@ static int __init init(void) fini(); return ret; } - ports_c++; } return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f24f17b8e03e..bd2e13211a7f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -61,6 +61,8 @@ do { \ #endif #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +static DECLARE_MUTEX(ipt_mutex); + /* Must have mutex 
*/ #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) @@ -1458,21 +1460,24 @@ tcp_find_option(u_int8_t option, int *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u_int8_t opt[60 - sizeof(struct tcphdr)]; + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; unsigned int i; duprintf("tcp_match: finding option\n"); /* If we don't have the whole header, drop packet. */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4 + sizeof(struct tcphdr), - opt, optlen) < 0) { + BUG_ON(!optlen); + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { *hotdrop = 1; return 0; } for (i = 0; i < optlen; ) { - if (opt[i] == option) return !invert; - if (opt[i] < 2) i++; - else i += opt[i+1]?:1; + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; } return invert; @@ -1486,7 +1491,7 @@ tcp_match(const struct sk_buff *skb, int offset, int *hotdrop) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; const struct ipt_tcp *tcpinfo = matchinfo; if (offset) { @@ -1506,7 +1511,9 @@ tcp_match(const struct sk_buff *skb, #define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) { + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. 
*/ duprintf("Dropping evil TCP offset=0 tinygram.\n"); @@ -1515,23 +1522,24 @@ tcp_match(const struct sk_buff *skb, } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], - ntohs(tcph.source), + ntohs(th->source), !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))) return 0; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], - ntohs(tcph.dest), + ntohs(th->dest), !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))) return 0; - if (!FWINVTCP((((unsigned char *)&tcph)[13] & tcpinfo->flg_mask) + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp, IPT_TCP_INV_FLAGS)) return 0; if (tcpinfo->option) { - if (tcph.doff * 4 < sizeof(tcph)) { + if (th->doff * 4 < sizeof(_tcph)) { *hotdrop = 1; return 0; } - if (!tcp_find_option(tcpinfo->option, skb, tcph.doff*4 - sizeof(tcph), + if (!tcp_find_option(tcpinfo->option, skb, + th->doff*4 - sizeof(_tcph), tcpinfo->invflags & IPT_TCP_INV_OPTION, hotdrop)) return 0; @@ -1564,14 +1572,16 @@ udp_match(const struct sk_buff *skb, int offset, int *hotdrop) { - struct udphdr udph; + struct udphdr _udph, *uh; const struct ipt_udp *udpinfo = matchinfo; /* Must not be a fragment. */ if (offset) return 0; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &udph, sizeof(udph)) < 0) { + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. 
*/ duprintf("Dropping evil UDP tinygram.\n"); @@ -1580,10 +1590,10 @@ udp_match(const struct sk_buff *skb, } return port_match(udpinfo->spts[0], udpinfo->spts[1], - ntohs(udph.source), + ntohs(uh->source), !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) && port_match(udpinfo->dpts[0], udpinfo->dpts[1], - ntohs(udph.dest), + ntohs(uh->dest), !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); } @@ -1635,16 +1645,19 @@ icmp_match(const struct sk_buff *skb, int offset, int *hotdrop) { - struct icmphdr icmph; + struct icmphdr _icmph, *ic; const struct ipt_icmp *icmpinfo = matchinfo; /* Must not be a fragment. */ if (offset) return 0; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &icmph, sizeof(icmph)) < 0){ + ic = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) { /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ + * can't. Hence, no choice but to drop. + */ duprintf("Dropping evil ICMP tinygram.\n"); *hotdrop = 1; return 0; @@ -1653,7 +1666,7 @@ icmp_match(const struct sk_buff *skb, return icmp_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], - icmph.type, icmph.code, + ic->type, ic->code, !!(icmpinfo->invflags&IPT_ICMP_INV)); } diff --git a/net/ipv4/netfilter/ipchains_core.c b/net/ipv4/netfilter/ipchains_core.c index 97b5401ef1ad..1360222c2537 100644 --- a/net/ipv4/netfilter/ipchains_core.c +++ b/net/ipv4/netfilter/ipchains_core.c @@ -679,49 +679,53 @@ ip_fw_check(const char *rif, case IPPROTO_TCP: dprintf("TCP "); if (!offset) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &tcph, sizeof(tcph))) + th = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return FW_BLOCK; - src_port = ntohs(tcph.source); - dst_port = ntohs(tcph.dest); + src_port = ntohs(th->source); + dst_port = ntohs(th->dest); /* Connection initilisation can only * be made when the syn bit is set and * 
neither of the ack or reset is * set. */ - if (tcph.syn && !(tcph.ack || tcph.rst)) + if (th->syn && !(th->ack || th->rst)) tcpsyn = 1; } break; case IPPROTO_UDP: dprintf("UDP "); if (!offset) { - struct udphdr udph; + struct udphdr _udph, *uh; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &udph, sizeof(udph))) + uh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) return FW_BLOCK; - src_port = ntohs(udph.source); - dst_port = ntohs(udph.dest); + src_port = ntohs(uh->source); + dst_port = ntohs(uh->dest); } break; case IPPROTO_ICMP: if (!offset) { - struct icmphdr icmph; + struct icmphdr _icmph, *ic; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &icmph, sizeof(icmph))) + ic = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_icmph), + &_icmph); + if (ic == NULL) return FW_BLOCK; - src_port = (__u16) icmph.type; - dst_port = (__u16) icmph.code; + src_port = (__u16) ic->type; + dst_port = (__u16) ic->code; } dprintf("ICMP "); break; diff --git a/net/ipv4/netfilter/ipfwadm_core.c b/net/ipv4/netfilter/ipfwadm_core.c index 424a9034fa27..c38a6887722d 100644 --- a/net/ipv4/netfilter/ipfwadm_core.c +++ b/net/ipv4/netfilter/ipfwadm_core.c @@ -410,20 +410,21 @@ int ip_fw_chk(struct sk_buff **pskb, dprintf1("TCP "); /* ports stay 0xFFFF if it is not the first fragment */ if (!offset) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &tcph, sizeof(tcph))) + th = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return FW_BLOCK; - src_port = ntohs(tcph.source); - dst_port = ntohs(tcph.dest); + src_port = ntohs(th->source); + dst_port = ntohs(th->dest); - if(!tcph.ack && !tcph.rst) + if(!th->ack && !th->rst) /* We do NOT have ACK, value TRUE */ notcpack = 1; - if(!tcph.syn || !notcpack) + if(!th->syn || !notcpack) /* We do NOT have SYN, value TRUE */ notcpsyn = 1; } @@ -433,29 +434,32 @@ int 
ip_fw_chk(struct sk_buff **pskb, dprintf1("UDP "); /* ports stay 0xFFFF if it is not the first fragment */ if (!offset) { - struct udphdr udph; + struct udphdr _udph, *uh; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &udph, sizeof(udph))) + uh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) return FW_BLOCK; - src_port = ntohs(udph.source); - dst_port = ntohs(udph.dest); + src_port = ntohs(uh->source); + dst_port = ntohs(uh->dest); } prt = IP_FW_F_UDP; break; case IPPROTO_ICMP: /* icmp_type stays 255 if it is not the first fragment */ if (!offset) { - struct icmphdr icmph; + struct icmphdr _icmph, *ic; - if (skb_copy_bits(*pskb, - (*pskb)->nh.iph->ihl * 4, - &icmph, sizeof(icmph))) + ic = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_icmph), + &_icmph); + if (ic == NULL) return FW_BLOCK; - icmp_type = (__u16) icmph.type; + icmp_type = (__u16) ic->type; } dprintf2("ICMP:%d ", icmp_type); prt = IP_FW_F_ICMP; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 8ca402564f5e..120109cd294d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -52,34 +52,39 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) static inline int set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; u_int16_t diffs[2]; /* Not enought header? 
*/ - if (skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4, &tcph, sizeof(tcph)) - < 0) + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) return 0; - diffs[0] = ((u_int16_t *)&tcph)[6]; + diffs[0] = ((u_int16_t *)th)[6]; if (einfo->operation & IPT_ECN_OP_SET_ECE) - tcph.ece = einfo->proto.tcp.ece; + th->ece = einfo->proto.tcp.ece; if (einfo->operation & IPT_ECN_OP_SET_CWR) - tcph.cwr = einfo->proto.tcp.cwr; - diffs[1] = ((u_int16_t *)&tcph)[6]; + th->cwr = einfo->proto.tcp.cwr; + diffs[1] = ((u_int16_t *)&th)[6]; /* Only mangle if it's changed. */ if (diffs[0] != diffs[1]) { diffs[0] = diffs[0] ^ 0xFFFF; if (!skb_ip_make_writable(pskb, - (*pskb)->nh.iph->ihl*4+sizeof(tcph))) + (*pskb)->nh.iph->ihl*4+sizeof(_tcph))) return 0; + + if (th != &_tcph) + memcpy(&_tcph, th, sizeof(_tcph)); + if ((*pskb)->ip_summed != CHECKSUM_HW) - tcph.check = csum_fold(csum_partial((char *)diffs, - sizeof(diffs), - tcph.check^0xFFFF)); + _tcph.check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + _tcph.check^0xFFFF)); memcpy((*pskb)->data + (*pskb)->nh.iph->ihl*4, - &tcph, sizeof(tcph)); + &_tcph, sizeof(_tcph)); if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(pskb, inward)) return 0; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index b79962e225f7..2a3e3eb424e3 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -28,7 +28,7 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables syslog logging module"); static unsigned int nflog = 1; -MODULE_PARM(nflog, "i"); +module_param(nflog, int, 0400); MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); #if 0 @@ -45,9 +45,10 @@ static void dump_packet(const struct ipt_log_info *info, const struct sk_buff *skb, unsigned int iphoff) { - struct iphdr iph; + struct iphdr _iph, *ih; - if (skb_copy_bits(skb, iphoff, &iph, sizeof(iph)) < 0) { + ih = 
skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { printk("TRUNCATED"); return; } @@ -56,32 +57,34 @@ static void dump_packet(const struct ipt_log_info *info, * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", - NIPQUAD(iph.saddr), NIPQUAD(iph.daddr)); + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(iph.tot_len), iph.tos & IPTOS_TOS_MASK, - iph.tos & IPTOS_PREC_MASK, iph.ttl, ntohs(iph.id)); + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ - if (ntohs(iph.frag_off) & IP_CE) + if (ntohs(ih->frag_off) & IP_CE) printk("CE "); - if (ntohs(iph.frag_off) & IP_DF) + if (ntohs(ih->frag_off) & IP_DF) printk("DF "); - if (ntohs(iph.frag_off) & IP_MF) + if (ntohs(ih->frag_off) & IP_MF) printk("MF "); /* Max length: 11 "FRAG:65535 " */ - if (ntohs(iph.frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(iph.frag_off) & IP_OFFSET); + if (ntohs(ih->frag_off) & IP_OFFSET) + printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((info->logflags & IPT_LOG_IPOPT) - && iph.ihl * 4 > sizeof(struct iphdr)) { - unsigned char opt[4 * 15 - sizeof(struct iphdr)]; + && ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; unsigned int i, optsize; - optsize = iph.ihl * 4 - sizeof(struct iphdr); - if (skb_copy_bits(skb, iphoff+sizeof(iph), opt, optsize) < 0) { + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { printk("TRUNCATED"); return; } @@ -89,67 +92,71 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 127 "OPT (" 15*4*2chars ") " */ printk("OPT ("); for (i = 0; i < optsize; i++) - 
printk("%02X", opt[i]); + printk("%02X", op[i]); printk(") "); } - switch (iph.protocol) { + switch (ih->protocol) { case IPPROTO_TCP: { - struct tcphdr tcph; + struct tcphdr _tcph, *th; /* Max length: 10 "PROTO=TCP " */ printk("PROTO=TCP "); - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (skb_copy_bits(skb, iphoff+iph.ihl*4, &tcph, sizeof(tcph)) - < 0) { + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ printk("SPT=%u DPT=%u ", - ntohs(tcph.source), ntohs(tcph.dest)); + ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (info->logflags & IPT_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", - ntohl(tcph.seq), ntohl(tcph.ack_seq)); + ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(tcph.window)); + printk("WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(&tcph) & TCP_RESERVED_BITS) >> 22)); + printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (tcph.cwr) + if (th->cwr) printk("CWR "); - if (tcph.ece) + if (th->ece) printk("ECE "); - if (tcph.urg) + if (th->urg) printk("URG "); - if (tcph.ack) + if (th->ack) printk("ACK "); - if (tcph.psh) + if (th->psh) printk("PSH "); - if (tcph.rst) + if (th->rst) printk("RST "); - if (tcph.syn) + if (th->syn) printk("SYN "); - if (tcph.fin) + if (th->fin) printk("FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(tcph.urg_ptr)); + printk("URGP=%u ", ntohs(th->urg_ptr)); if ((info->logflags & IPT_LOG_TCPOPT) - && tcph.doff * 4 > sizeof(struct tcphdr)) { - unsigned char opt[4 * 15 - 
sizeof(struct tcphdr)]; + && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; + unsigned char *op; unsigned int i, optsize; - optsize = tcph.doff * 4 - sizeof(struct tcphdr); - if (skb_copy_bits(skb, iphoff+iph.ihl*4 + sizeof(tcph), - opt, optsize) < 0) { + optsize = th->doff * 4 - sizeof(struct tcphdr); + op = skb_header_pointer(skb, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { printk("TRUNCATED"); return; } @@ -157,36 +164,37 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 127 "OPT (" 15*4*2chars ") " */ printk("OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", opt[i]); + printk("%02X", op[i]); printk(") "); } break; } case IPPROTO_UDP: { - struct udphdr udph; + struct udphdr _udph, *uh; /* Max length: 10 "PROTO=UDP " */ printk("PROTO=UDP "); - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (skb_copy_bits(skb, iphoff+iph.ihl*4, &udph, sizeof(udph)) - < 0) { + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ printk("SPT=%u DPT=%u LEN=%u ", - ntohs(udph.source), ntohs(udph.dest), - ntohs(udph.len)); + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); break; } case IPPROTO_ICMP: { - struct icmphdr icmph; + struct icmphdr _icmph, *ich; static size_t required_len[NR_ICMP_TYPES+1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] @@ -208,47 +216,48 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 11 "PROTO=ICMP " */ printk("PROTO=ICMP "); - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (skb_copy_bits(skb, iphoff+iph.ihl*4, &icmph, sizeof(icmph)) - < 0) { + ich = 
skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", icmph.type, icmph.code); + printk("TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (icmph.type <= NR_ICMP_TYPES - && required_len[icmph.type] - && skb->len-iphoff-iph.ihl*4 < required_len[icmph.type]) { + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } - switch (icmph.type) { + switch (ich->type) { case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ printk("ID=%u SEQ=%u ", - ntohs(icmph.un.echo.id), - ntohs(icmph.un.echo.sequence)); + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ printk("PARAMETER=%u ", - ntohl(icmph.un.gateway) >> 24); + ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ printk("GATEWAY=%u.%u.%u.%u ", - NIPQUAD(icmph.un.gateway)); + NIPQUAD(ich->un.gateway)); /* Fall through */ case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: @@ -257,62 +266,65 @@ static void dump_packet(const struct ipt_log_info *info, if (!iphoff) { /* Only recurse once. 
*/ printk("["); dump_packet(info, skb, - iphoff + iph.ihl*4+sizeof(icmph)); + iphoff + ih->ihl*4+sizeof(_icmph)); printk("] "); } /* Max length: 10 "MTU=65535 " */ - if (icmph.type == ICMP_DEST_UNREACH - && icmph.code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(icmph.un.frag.mtu)); + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) + printk("MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } /* Max Length */ case IPPROTO_AH: { - struct ip_auth_hdr ah; + struct ip_auth_hdr _ahdr, *ah; - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 9 "PROTO=AH " */ printk("PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (skb_copy_bits(skb, iphoff+iph.ihl*4, &ah, sizeof(ah)) < 0) { + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah.spi)); + printk("SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { - struct ip_esp_hdr esph; + struct ip_esp_hdr _esph, *eh; /* Max length: 10 "PROTO=ESP " */ printk("PROTO=ESP "); - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (skb_copy_bits(skb, iphoff+iph.ihl*4, &esph, sizeof(esph)) - < 0) { + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { printk("INCOMPLETE [%u bytes] ", - skb->len - iphoff - iph.ihl*4); + skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(esph.spi)); + printk("SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", iph.protocol); + printk("PROTO=%u ", ih->protocol); } /* Proto Max log string length */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c 
index 54bc4684cc9d..ea02a12d7625 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -92,8 +92,8 @@ masquerade_target(struct sk_buff **pskb, return NF_ACCEPT; ct = ip_conntrack_get(*pskb, &ctinfo); - IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW - || ctinfo == IP_CT_RELATED)); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); mr = targinfo; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b8018cb023ff..9637b75fd71e 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -103,7 +103,7 @@ static inline struct rtable *route_reverse(struct sk_buff *skb, int hook) static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct tcphdr otcph, *tcph; + struct tcphdr _otcph, *oth, *tcph; struct rtable *rt; u_int16_t tmp_port; u_int32_t tmp_addr; @@ -114,12 +114,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) return; - if (skb_copy_bits(oldskb, oldskb->nh.iph->ihl*4, - &otcph, sizeof(otcph)) < 0) + oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, + sizeof(_otcph), &_otcph); + if (oth == NULL) return; /* No RST for RST. 
*/ - if (otcph.rst) + if (oth->rst) return; /* FIXME: Check checksum --RR */ @@ -167,13 +168,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (tcph->ack) { needs_ack = 0; - tcph->seq = otcph.ack_seq; + tcph->seq = oth->ack_seq; tcph->ack_seq = 0; } else { needs_ack = 1; - tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - oldskb->nh.iph->ihl*4 - - (otcph.doff<<2)); + - (oth->doff<<2)); tcph->seq = 0; } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index c352df54aa1f..51d16d33bcbd 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -34,8 +34,8 @@ * by that factor. * * flushtimeout: - * Specify, after how many clock ticks (intel: 100 per second) the queue - * should be flushed even if it is not full yet. + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. * * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp */ @@ -50,6 +50,7 @@ #include <linux/netlink.h> #include <linux/netdevice.h> #include <linux/mm.h> +#include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_ULOG.h> @@ -74,15 +75,15 @@ MODULE_DESCRIPTION("iptables userspace logging module"); #define PRINTR(format, args...) 
do { if (net_ratelimit()) printk(format , ## args); } while (0) static unsigned int nlbufsiz = 4096; -MODULE_PARM(nlbufsiz, "i"); +module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */ MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); -static unsigned int flushtimeout = 10 * HZ; -MODULE_PARM(flushtimeout, "i"); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout"); +static unsigned int flushtimeout = 10; +module_param(flushtimeout, int, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); static unsigned int nflog = 1; -MODULE_PARM(nflog, "i"); +module_param(nflog, int, 0400); MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); /* global data structures */ @@ -97,7 +98,6 @@ typedef struct { static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ static struct sock *nflognl; /* our socket */ -static size_t qlen; /* current length of multipart-nlmsg */ DECLARE_LOCK(ulog_lock); /* spinlock */ /* send one ulog_buff_t to userspace */ @@ -116,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum) NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", - ub->qlen, nlgroup); + ub->qlen, nlgroupnum); netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); ub->qlen = 0; @@ -126,7 +126,7 @@ static void ulog_send(unsigned int nlgroupnum) } -/* timer function to flush queue in ULOG_FLUSH_INTERVAL time */ +/* timer function to flush queue in flushtimeout time */ static void ulog_timer(unsigned long data) { DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n"); @@ -261,20 +261,21 @@ static void ipt_ulog_packet(unsigned int hooknum, ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; } - /* if threshold is reached, send message to userspace */ - if (qlen >= loginfo->qthreshold) { - if (loginfo->qthreshold > 1) - nlh->nlmsg_type = NLMSG_DONE; - } - ub->lastnlh = nlh; /* if timer isn't already running, start it */ 
if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout; + ub->timer.expires = jiffies + flushtimeout * HZ / 100; add_timer(&ub->timer); } + /* if threshold is reached, send message to userspace */ + if (ub->qlen >= loginfo->qthreshold) { + if (loginfo->qthreshold > 1) + nlh->nlmsg_type = NLMSG_DONE; + ulog_send(groupnum); + } + UNLOCK_BH(&ulog_lock); return; diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 1f0d7652f6dc..a0fea847cb72 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -43,23 +43,26 @@ match(const struct sk_buff *skb, int offset, int *hotdrop) { - struct ip_auth_hdr ah; + struct ip_auth_hdr _ahdr, *ah; const struct ipt_ah *ahinfo = matchinfo; /* Must not be a fragment. */ if (offset) return 0; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &ah, sizeof(ah)) < 0) { + ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ + * can't. Hence, no choice but to drop. + */ duprintf("Dropping evil AH tinygram.\n"); *hotdrop = 1; return 0; } return spi_match(ahinfo->spis[0], ahinfo->spis[1], - ntohl(ah.spi), + ntohl(ah->spi), !!(ahinfo->invflags & IPT_AH_INV_SPI)); } diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 0e1efd764fc6..b6f7181e89cc 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -30,31 +30,34 @@ static inline int match_tcp(const struct sk_buff *skb, const struct ipt_ecn_info *einfo, int *hotdrop) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; /* In practice, TCP match does this, so can't fail. But let's - be good citizens. */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) { + * be good citizens. 
+ */ + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { *hotdrop = 0; return 0; } if (einfo->operation & IPT_ECN_OP_MATCH_ECE) { if (einfo->invert & IPT_ECN_OP_MATCH_ECE) { - if (tcph.ece == 1) + if (th->ece == 1) return 0; } else { - if (tcph.ece == 0) + if (th->ece == 0) return 0; } } if (einfo->operation & IPT_ECN_OP_MATCH_CWR) { if (einfo->invert & IPT_ECN_OP_MATCH_CWR) { - if (tcph.cwr == 1) + if (th->cwr == 1) return 0; } else { - if (tcph.cwr == 0) + if (th->cwr == 0) return 0; } } diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c index c3b889378844..e1d0dd31e117 100644 --- a/net/ipv4/netfilter/ipt_esp.c +++ b/net/ipv4/netfilter/ipt_esp.c @@ -44,23 +44,26 @@ match(const struct sk_buff *skb, int offset, int *hotdrop) { - struct ip_esp_hdr esp; + struct ip_esp_hdr _esp, *eh; const struct ipt_esp *espinfo = matchinfo; /* Must not be a fragment. */ if (offset) return 0; - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &esp, sizeof(esp)) < 0) { + eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_esp), &_esp); + if (eh == NULL) { /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ + * can't. Hence, no choice but to drop. + */ duprintf("Dropping evil ESP tinygram.\n"); *hotdrop = 1; return 0; } return spi_match(espinfo->spis[0], espinfo->spis[1], - ntohl(esp.spi), + ntohl(eh->spi), !!(espinfo->invflags & IPT_ESP_INV_SPI)); } diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c index 64e7999b049c..7fdf41e22c86 100644 --- a/net/ipv4/netfilter/ipt_multiport.c +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -54,7 +54,7 @@ match(const struct sk_buff *skb, int offset, int *hotdrop) { - u16 ports[2]; + u16 _ports[2], *pptr; const struct ipt_multiport *multiinfo = matchinfo; /* Must not be a fragment. 
*/ @@ -63,18 +63,21 @@ match(const struct sk_buff *skb, /* Must be big enough to read ports (both UDP and TCP have them at the start). */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, ports, sizeof(ports)) < 0) { + pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ports), &_ports[0]); + if (pptr == NULL) { /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ - duprintf("ipt_multiport:" - " Dropping evil offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; + * can't. Hence, no choice but to drop. + */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; } return ports_match(multiinfo->ports, multiinfo->flags, multiinfo->count, - ntohs(ports[0]), ntohs(ports[1])); + ntohs(pptr[0]), ntohs(pptr[1])); } /* Called when user tries to insert an entry of this type. */ diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 08b786ac34dd..15472b3e9e56 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/ip.h> #include <linux/vmalloc.h> +#include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_recent.h> @@ -37,12 +38,12 @@ KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. 
htt MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); MODULE_LICENSE("GPL"); -MODULE_PARM(ip_list_tot,"i"); -MODULE_PARM(ip_pkt_list_tot,"i"); -MODULE_PARM(ip_list_hash_size,"i"); -MODULE_PARM(ip_list_perms,"i"); +module_param(ip_list_tot, int, 0400); +module_param(ip_pkt_list_tot, int, 0400); +module_param(ip_list_hash_size, int, 0400); +module_param(ip_list_perms, int, 0400); #ifdef DEBUG -MODULE_PARM(debug,"i"); +module_param(debug, int, 0600); MODULE_PARM_DESC(debug,"debugging level, defaults to 1"); #endif MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c new file mode 100644 index 000000000000..8f875940b8dc --- /dev/null +++ b/net/ipv4/netfilter/ipt_sctp.c @@ -0,0 +1,201 @@ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/sctp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_sctp.h> + +#ifdef DEBUG_SCTP +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static int +match_flags(const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) { + if (flag_info[i].chunktype == chunktype) { + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + } + } + + return 1; +} + +static int +match_packet(const struct sk_buff *skb, + const u_int32_t *chunkmap, + int chunk_match_type, + const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + int *hotdrop) +{ + int offset; + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + sctp_chunkhdr_t sch; + +#ifdef DEBUG_SCTP + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) { + SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); + } + + offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t); + do { + if (skb_copy_bits(skb, offset, &sch, sizeof(sch)) < 0) { + duprintf("Dropping invalid SCTP packet.\n"); + *hotdrop = 1; + return 0; + } + + duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", + ++i, offset, sch.type, htons(sch.length), sch.flags); + + offset += (htons(sch.length) + 3) & ~3; + + duprintf("skb->len: %d\toffset: %d\n", skb->len, offset); + + if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch.type)) { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch.type, sch.flags)) { + return 1; + } + break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch.type, sch.flags)) { + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch.type); + } + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch.type, sch.flags)) { + return 0; + } + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return 0; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: 
+ return SCTP_CHUNKMAP_IS_CLEAR(chunkmap); + case SCTP_CHUNK_MATCH_ANY: + return 0; + case SCTP_CHUNK_MATCH_ONLY: + return 1; + } + + /* This will never be reached, but required to stop compiler whine */ + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_sctp_info *info; + sctp_sctphdr_t sh; + + info = (const struct ipt_sctp_info *)matchinfo; + + if (offset) { + duprintf("Dropping non-first fragment.. FIXME\n"); + return 0; + } + + if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &sh, sizeof(sh)) < 0) { + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + duprintf("spt: %d\tdpt: %d\n", ntohs(sh.source), ntohs(sh.dest)); + + return SCCHECK(((ntohs(sh.source) >= info->spts[0]) + && (ntohs(sh.source) <= info->spts[1])), + IPT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(((ntohs(sh.dest) >= info->dpts[0]) + && (ntohs(sh.dest) <= info->dpts[1])), + IPT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type, + info->flag_info, info->flag_count, + hotdrop), + IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_sctp_info *info; + + info = (const struct ipt_sctp_info *)matchinfo; + + return ip->proto == IPPROTO_SCTP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info)) + && !(info->flags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~info->flags) + && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || + (info->chunk_match_type & + (SCTP_CHUNK_MATCH_ALL + | SCTP_CHUNK_MATCH_ANY + | SCTP_CHUNK_MATCH_ONLY))); +} + +static struct ipt_match sctp_match = +{ + .list = { NULL, 
NULL}, + .name = "sctp", + .match = &match, + .checkentry = &checkentry, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&sctp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&sctp_match); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Match for SCTP protocol packets"); + diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c index c7cb62ade3f4..5cda547e011e 100644 --- a/net/ipv4/netfilter/ipt_tcpmss.c +++ b/net/ipv4/netfilter/ipt_tcpmss.c @@ -27,37 +27,45 @@ mssoption_match(u_int16_t min, u_int16_t max, int invert, int *hotdrop) { - struct tcphdr tcph; + struct tcphdr _tcph, *th; /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u8 opt[15 * 4 - sizeof(tcph)]; + u8 _opt[15 * 4 - sizeof(_tcph)], *op; unsigned int i, optlen; /* If we don't have the whole header, drop packet. */ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) goto dropit; /* Malformed. */ - if (tcph.doff*4 < sizeof(tcph)) + if (th->doff*4 < sizeof(*th)) goto dropit; - optlen = tcph.doff*4 - sizeof(tcph); + optlen = th->doff*4 - sizeof(*th); + if (!optlen) + goto out; + /* Truncated options. 
*/ - if (skb_copy_bits(skb, skb->nh.iph->ihl*4+sizeof(tcph), opt, optlen)<0) + op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th), + optlen, _opt); + if (op == NULL) goto dropit; for (i = 0; i < optlen; ) { - if (opt[i] == TCPOPT_MSS + if (op[i] == TCPOPT_MSS && (optlen - i) >= TCPOLEN_MSS - && opt[i+1] == TCPOLEN_MSS) { + && op[i+1] == TCPOLEN_MSS) { u_int16_t mssval; - mssval = (opt[i+2] << 8) | opt[i+3]; + mssval = (op[i+2] << 8) | op[i+3]; return (mssval >= min && mssval <= max) ^ invert; } - if (opt[i] < 2) i++; - else i += opt[i+1]?:1; + if (op[i] < 2) i++; + else i += op[i+1]?:1; } +out: return invert; dropit: diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 8fb2ed9d1f9a..6b291da92656 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -11,6 +11,7 @@ */ #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> MODULE_LICENSE("GPL"); @@ -155,7 +156,7 @@ static struct nf_hook_ops ipt_ops[] = { /* Default to forward because I got too much mail already. */ static int forward = NF_ACCEPT; -MODULE_PARM(forward, "i"); +module_param(forward, bool, 0000); static int __init init(void) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1cfd749d651e..c17f8716ecdd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -323,6 +323,51 @@ error: return err; } +static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMP: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. 
*/ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -429,6 +474,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .proto = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, }; + if (!inet->hdrincl) + raw_probe_proto_opt(&fl, msg); + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); } if (err) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 92c79da5f297..aa8581413cc6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1339,9 +1339,12 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, int how) { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (idev) { - rt->idev = NULL; - in_dev_put(idev); + if (idev && idev->dev != &loopback_dev) { + struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (loopback_idev) { + rt->idev = loopback_idev; + in_dev_put(idev); + } } } @@ -1384,13 +1387,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; else if (fib_lookup(&rt->fl, &res) == 0) { -#ifdef CONFIG_IP_ROUTE_NAT - if (res.type == RTN_NAT) - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); - else -#endif - src = FIB_RES_PREFSRC(res); + src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, @@ -1494,10 +1491,6 @@ static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, #endif rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_IP_ROUTE_NAT - rth->rt_dst_map = daddr; - rth->rt_src_map = saddr; -#endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif @@ -1607,31 +1600,6 @@ static int 
ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, RT_CACHE_STAT_INC(in_slow_tot); -#ifdef CONFIG_IP_ROUTE_NAT - /* Policy is applied before mapping destination, - but rerouting after map should be made with old source. - */ - - if (1) { - u32 src_map = saddr; - if (res.r) - src_map = fib_rules_policy(saddr, &res, &flags); - - if (res.type == RTN_NAT) { - fl.fl4_dst = fib_rules_map_destination(daddr, &res); - fib_res_put(&res); - free_res = 0; - if (fib_lookup(&fl, &res)) - goto e_inval; - free_res = 1; - if (res.type != RTN_UNICAST) - goto e_inval; - flags |= RTCF_DNAT; - } - fl.fl4_src = src_map; - } -#endif - if (res.type == RTN_BROADCAST) goto brd_input; @@ -1705,12 +1673,6 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; -#ifdef CONFIG_IP_ROUTE_NAT - rth->rt_src_map = fl.fl4_src; - rth->rt_dst_map = fl.fl4_dst; - if (flags&RTCF_DNAT) - rth->rt_gateway = fl.fl4_dst; -#endif rth->rt_iif = rth->fl.iif = dev->ifindex; rth->u.dst.dev = out_dev->dev; @@ -1773,10 +1735,6 @@ local_input: #endif rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_IP_ROUTE_NAT - rth->rt_dst_map = fl.fl4_dst; - rth->rt_src_map = fl.fl4_src; -#endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif @@ -1897,7 +1855,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, if (MULTICAST(daddr)) { struct in_device *in_dev; - read_lock(&inetdev_lock); + rcu_read_lock(); if ((in_dev = __in_dev_get(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, skb->nh.iph->protocol); @@ -1906,12 +1864,12 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { - read_unlock(&inetdev_lock); + rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } } - read_unlock(&inetdev_lock); + rcu_read_unlock(); return -EINVAL; } return ip_route_input_slow(skb, daddr, saddr, 
tos, dev); @@ -2069,9 +2027,6 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) } free_res = 1; - if (res.type == RTN_NAT) - goto e_inval; - if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; @@ -2161,10 +2116,6 @@ make_route: #endif rth->rt_dst = fl.fl4_dst; rth->rt_src = fl.fl4_src; -#ifdef CONFIG_IP_ROUTE_NAT - rth->rt_dst_map = fl.fl4_dst; - rth->rt_src_map = fl.fl4_src; -#endif rth->rt_iif = oldflp->oif ? : dev_out->ifindex; rth->u.dst.dev = dev_out; dev_hold(dev_out); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5e7f70f1c940..85643472b84d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -852,8 +852,10 @@ static void tcp_init_metrics(struct sock *sk) * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - if (dst_metric(dst, RTAX_RTT) > tp->srtt) + if (dst_metric(dst, RTAX_RTT) > tp->srtt) { tp->srtt = dst_metric(dst, RTAX_RTT); + tp->rtt_seq = tp->snd_nxt; + } if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 191cec718e95..1bf740e71c64 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -119,8 +119,7 @@ int xfrm4_output(struct sk_buff **pskb) xfrm4_encap(skb); - err = x->type->output(pskb); - skb = *pskb; + err = x->type->output(skb); if (err) goto error; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 3aacce604561..3ce69883bcc4 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -12,8 +12,8 @@ #include <net/xfrm.h> #include <net/ip.h> -extern struct dst_ops xfrm4_dst_ops; -extern struct xfrm_policy_afinfo xfrm4_policy_afinfo; +static struct dst_ops xfrm4_dst_ops; +static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; @@ -183,6 +183,15 @@ 
_decode_session4(struct sk_buff *skb, struct flowi *fl) } break; + case IPPROTO_ICMP: + if (pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp = xprth; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + break; + case IPPROTO_ESP: if (pskb_may_pull(skb, xprth + 4 - skb->data)) { u32 *ehdr = (u32 *)xprth; @@ -234,7 +243,7 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } -struct dst_ops xfrm4_dst_ops = { +static struct dst_ops xfrm4_dst_ops = { .family = AF_INET, .protocol = __constant_htons(ETH_P_IP), .gc = xfrm4_garbage_collect, @@ -243,7 +252,7 @@ struct dst_ops xfrm4_dst_ops = { .entry_size = sizeof(struct xfrm_dst), }; -struct xfrm_policy_afinfo xfrm4_policy_afinfo = { +static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, .lock = RW_LOCK_UNLOCKED, .type_map = &xfrm4_type_map, @@ -254,12 +263,12 @@ struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .decode_session = _decode_session4, }; -void __init xfrm4_policy_init(void) +static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); } -void __exit xfrm4_policy_fini(void) +static void __exit xfrm4_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); } diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 0d1a0b0c7901..dcc04644ccec 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -4,13 +4,13 @@ */ #include <linux/skbuff.h> +#include <linux/module.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/protocol.h> -static int ipip_output(struct sk_buff **pskb) +static int ipip_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; struct iphdr *iph; iph = skb->nh.iph; @@ -43,6 +43,8 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler) return ret; } +EXPORT_SYMBOL(xfrm4_tunnel_register); + int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler) { int ret; @@ -60,6 +62,8 @@ int xfrm4_tunnel_deregister(struct 
xfrm_tunnel *handler) return ret; } +EXPORT_SYMBOL(xfrm4_tunnel_deregister); + static int ipip_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler = ipip_handler; @@ -68,7 +72,7 @@ static int ipip_rcv(struct sk_buff *skb) if (handler && handler->handler(skb) == 0) return 0; - return xfrm4_rcv_encap(skb, 0); + return xfrm4_rcv(skb); } static void ipip_err(struct sk_buff *skb, u32 info) @@ -84,6 +88,10 @@ static int ipip_init_state(struct xfrm_state *x, void *args) { if (!x->props.mode) return -EINVAL; + + if (x->encap) + return -EINVAL; + x->props.header_len = sizeof(struct iphdr); return 0; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 23c5759c022d..fd87a5a192da 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -48,6 +48,7 @@ config INET6_IPCOMP tristate "IPv6: IPComp transformation" depends on IPV6 select XFRM + select INET6_TUNNEL select CRYPTO select CRYPTO_DEFLATE ---help--- @@ -56,9 +57,21 @@ config INET6_IPCOMP If unsure, say Y. +config INET6_TUNNEL + tristate "IPv6: tunnel transformation" + depends on IPV6 + select XFRM + ---help--- + Support for generic IPv6-in-IPv6 tunnel transformation, which is + required by the IPv6-in-IPv6 tunneling module as well as tunnel mode + IPComp. + + If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" depends on IPV6 + select INET6_TUNNEL ---help--- Support for IPv6-in-IPv6 tunnels described in RFC 2473. 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index d9e309fe8490..b39e04940590 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -11,12 +11,13 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ ip6_flowlabel.o ipv6_syms.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ - xfrm6_tunnel.o xfrm6_output.o + xfrm6_output.o ipv6-objs += $(ipv6-y) obj-$(CONFIG_INET6_AH) += ah6.o obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o +obj-$(CONFIG_INET6_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 40ad73c5cbb7..7150375908a8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -472,6 +472,8 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) printk("Freeing alive inet6 address %p\n", ifp); return; } + dst_release(&ifp->rt->u.dst); + inet6_ifa_count--; kfree(ifp); } @@ -482,25 +484,33 @@ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, int scope, unsigned flags) { - struct inet6_ifaddr *ifa; + struct inet6_ifaddr *ifa = NULL; + struct rt6_info *rt; int hash; static spinlock_t lock = SPIN_LOCK_UNLOCKED; + int err = 0; spin_lock_bh(&lock); /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(addr, idev->dev)) { - spin_unlock_bh(&lock); ADBG(("ipv6_add_addr: already assigned\n")); - return ERR_PTR(-EEXIST); + err = -EEXIST; + goto out; } ifa = kmalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); if (ifa == NULL) { - spin_unlock_bh(&lock); ADBG(("ipv6_add_addr: malloc failed\n")); - return ERR_PTR(-ENOBUFS); + err = -ENOBUFS; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 0); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; } memset(ifa, 0, sizeof(struct inet6_ifaddr)); @@ -517,9 +527,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, 
read_lock(&addrconf_lock); if (idev->dead) { read_unlock(&addrconf_lock); - spin_unlock_bh(&lock); - kfree(ifa); - return ERR_PTR(-ENODEV); /*XXX*/ + err = -ENODEV; /*XXX*/ + goto out; } inet6_ifa_count++; @@ -553,12 +562,20 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, } #endif + ifa->rt = rt; + in6_ifa_hold(ifa); write_unlock_bh(&idev->lock); read_unlock(&addrconf_lock); +out: spin_unlock_bh(&lock); - notifier_call_chain(&inet6addr_chain,NETDEV_UP,ifa); + if (unlikely(err == 0)) + notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + else { + kfree(ifa); + ifa = ERR_PTR(err); + } return ifa; } @@ -1457,8 +1474,7 @@ ok: spin_unlock(&ifp->lock); if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify((flags&IFA_F_DEPRECATED) ? - 0 : RTM_NEWADDR, ifp); + ipv6_ifa_notify(0, ifp); } else spin_unlock(&ifp->lock); @@ -2982,7 +2998,9 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) switch (event) { case RTM_NEWADDR: - ip6_rt_addr_add(&ifp->addr, ifp->idev->dev, 0); + dst_hold(&ifp->rt->u.dst); + if (ip6_ins_rt(ifp->rt, NULL, NULL)) + dst_release(&ifp->rt->u.dst); break; case RTM_DELADDR: addrconf_leave_solict(ifp->idev->dev, &ifp->addr); @@ -2993,8 +3011,11 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (!ipv6_addr_any(&addr)) ipv6_dev_ac_dec(ifp->idev->dev, &addr); } - if (!ipv6_chk_addr(&ifp->addr, ifp->idev->dev, 1)) - ip6_rt_addr_del(&ifp->addr, ifp->idev->dev); + dst_hold(&ifp->rt->u.dst); + if (ip6_del_rt(ifp->rt, NULL, NULL)) + dst_free(&ifp->rt->u.dst); + else + dst_release(&ifp->rt->u.dst); break; } } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index eda2737e572b..32ecedb04abd 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -154,11 +154,11 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) return 0; } -int ah6_output(struct sk_buff **pskb) +static int ah6_output(struct sk_buff *skb) { int err; int extlen; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry 
*dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct ipv6hdr *top_iph; struct ip_auth_hdr *ah; @@ -170,11 +170,11 @@ int ah6_output(struct sk_buff **pskb) char hdrs[0]; } *tmp_ext; - top_iph = (struct ipv6hdr *)(*pskb)->data; - top_iph->payload_len = htons((*pskb)->len - sizeof(*top_iph)); + top_iph = (struct ipv6hdr *)skb->data; + top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); - nexthdr = *(*pskb)->nh.raw; - *(*pskb)->nh.raw = IPPROTO_AH; + nexthdr = *skb->nh.raw; + *skb->nh.raw = IPPROTO_AH; /* When there are no extension headers, we only need to save the first * 8 bytes of the base IP header. @@ -182,7 +182,7 @@ int ah6_output(struct sk_buff **pskb) memcpy(tmp_base, top_iph, sizeof(tmp_base)); tmp_ext = NULL; - extlen = (*pskb)->h.raw - (unsigned char *)(top_iph + 1); + extlen = skb->h.raw - (unsigned char *)(top_iph + 1); if (extlen) { extlen += sizeof(*tmp_ext); tmp_ext = kmalloc(extlen, GFP_ATOMIC); @@ -198,7 +198,7 @@ int ah6_output(struct sk_buff **pskb) goto error_free_iph; } - ah = (struct ip_auth_hdr *)(*pskb)->h.raw; + ah = (struct ip_auth_hdr *)skb->h.raw; ah->nexthdr = nexthdr; top_iph->priority = 0; @@ -214,7 +214,7 @@ int ah6_output(struct sk_buff **pskb) ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(++x->replay.oseq); - ahp->icv(ahp, *pskb, ah->auth_data); + ahp->icv(ahp, skb, ah->auth_data); err = 0; @@ -229,7 +229,7 @@ error: return err; } -int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) { /* * Before process AH @@ -319,8 +319,8 @@ out: return -EINVAL; } -void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __u32 info) +static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) { struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; struct ip_auth_hdr *ah = (struct 
ip_auth_hdr*)(skb->data+offset); @@ -353,6 +353,9 @@ static int ah6_init_state(struct xfrm_state *x, void *args) if (x->aalg->alg_key_len > 512) goto error; + if (x->encap) + goto error; + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); if (ahp == NULL) return -ENOMEM; @@ -445,7 +448,7 @@ static struct inet6_protocol ah6_protocol = { .flags = INET6_PROTO_NOPOLICY, }; -int __init ah6_init(void) +static int __init ah6_init(void) { if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 5b1e4d959f4a..537dc37be239 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -293,6 +293,7 @@ static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); + dst_release(&ac->aca_rt->u.dst); kfree(ac); } } @@ -304,6 +305,8 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) { struct ifacaddr6 *aca; struct inet6_dev *idev; + struct rt6_info *rt; + int err; idev = in6_dev_get(dev); @@ -312,17 +315,15 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) write_lock_bh(&idev->lock); if (idev->dead) { - write_unlock_bh(&idev->lock); - in6_dev_put(idev); - return -ENODEV; + err = -ENODEV; + goto out; } for (aca = idev->ac_list; aca; aca = aca->aca_next) { if (ipv6_addr_cmp(&aca->aca_addr, addr) == 0) { aca->aca_users++; - write_unlock_bh(&idev->lock); - in6_dev_put(idev); - return 0; + err = 0; + goto out; } } @@ -333,15 +334,22 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) aca = kmalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); if (aca == NULL) { - write_unlock_bh(&idev->lock); - in6_dev_put(idev); - return -ENOMEM; + err = -ENOMEM; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 1); + if (IS_ERR(rt)) { + kfree(aca); + err = PTR_ERR(rt); + goto out; } memset(aca, 0, sizeof(struct ifacaddr6)); ipv6_addr_copy(&aca->aca_addr, addr); aca->aca_idev = idev; + aca->aca_rt 
= rt; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -352,12 +360,18 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) idev->ac_list = aca; write_unlock_bh(&idev->lock); - ip6_rt_addr_add(&aca->aca_addr, dev, 1); + dst_hold(&rt->u.dst); + if (ip6_ins_rt(rt, NULL, NULL)) + dst_release(&rt->u.dst); addrconf_join_solict(dev, &aca->aca_addr); aca_put(aca); return 0; +out: + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return err; } /* @@ -396,7 +410,11 @@ int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(dev, &aca->aca_addr); - ip6_rt_addr_del(&aca->aca_addr, dev); + dst_hold(&aca->aca_rt->u.dst); + if (ip6_del_rt(aca->aca_rt, NULL, NULL)) + dst_free(&aca->aca_rt->u.dst); + else + dst_release(&aca->aca_rt->u.dst); aca_put(aca); in6_dev_put(idev); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 497727195c98..77b4ba6f8016 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -37,11 +37,11 @@ #include <net/ipv6.h> #include <linux/icmpv6.h> -int esp6_output(struct sk_buff **pskb) +static int esp6_output(struct sk_buff *skb) { int err; int hdr_len; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct ipv6hdr *top_iph; struct ipv6_esp_hdr *esph; @@ -54,17 +54,17 @@ int esp6_output(struct sk_buff **pskb) int nfrags; esp = x->data; - hdr_len = (*pskb)->h.raw - (*pskb)->data + + hdr_len = skb->h.raw - skb->data + sizeof(*esph) + esp->conf.ivlen; /* Strip IP+ESP header. 
*/ - __skb_pull(*pskb, hdr_len); + __skb_pull(skb, hdr_len); /* Now skb is pure payload to encrypt */ err = -ENOMEM; /* Round to block size */ - clen = (*pskb)->len; + clen = skb->len; alen = esp->auth.icv_trunc_len; tfm = esp->conf.tfm; @@ -73,24 +73,24 @@ int esp6_output(struct sk_buff **pskb) if (esp->conf.padlen) clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); - if ((nfrags = skb_cow_data(*pskb, clen-(*pskb)->len+alen, &trailer)) < 0) { + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) { goto error; } /* Fill padding... */ do { int i; - for (i=0; i<clen-(*pskb)->len - 2; i++) + for (i=0; i<clen-skb->len - 2; i++) *(u8*)(trailer->tail + i) = i+1; } while (0); - *(u8*)(trailer->tail + clen-(*pskb)->len - 2) = (clen - (*pskb)->len)-2; - pskb_put(*pskb, trailer, clen - (*pskb)->len); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); - top_iph = (struct ipv6hdr *)__skb_push(*pskb, hdr_len); - esph = (struct ipv6_esp_hdr *)(*pskb)->h.raw; - top_iph->payload_len = htons((*pskb)->len + alen - sizeof(*top_iph)); - *(u8*)(trailer->tail - 1) = *(*pskb)->nh.raw; - *(*pskb)->nh.raw = IPPROTO_ESP; + top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len); + esph = (struct ipv6_esp_hdr *)skb->h.raw; + top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph)); + *(u8*)(trailer->tail - 1) = *skb->nh.raw; + *skb->nh.raw = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(++x->replay.oseq); @@ -106,7 +106,7 @@ int esp6_output(struct sk_buff **pskb) if (!sg) goto error; } - skb_to_sgvec(*pskb, sg, esph->enc_data+esp->conf.ivlen-(*pskb)->data, clen); + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); crypto_cipher_encrypt(tfm, sg, sg, clen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); @@ -118,9 +118,9 @@ int esp6_output(struct sk_buff **pskb) } if (esp->auth.icv_full_len) { - esp->auth.icv(esp, *pskb, (u8*)esph-(*pskb)->data, + esp->auth.icv(esp, skb, 
(u8*)esph-skb->data, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); - pskb_put(*pskb, trailer, alen); + pskb_put(skb, trailer, alen); } err = 0; @@ -129,7 +129,7 @@ error: return err; } -int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) { struct ipv6hdr *iph; struct ipv6_esp_hdr *esph; @@ -252,8 +252,8 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) return mtu + x->props.header_len + esp->auth.icv_full_len; } -void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __u32 info) +static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) { struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset); @@ -272,7 +272,7 @@ void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, xfrm_state_put(x); } -void esp6_destroy(struct xfrm_state *x) +static void esp6_destroy(struct xfrm_state *x) { struct esp_data *esp = x->data; @@ -298,17 +298,21 @@ void esp6_destroy(struct xfrm_state *x) kfree(esp); } -int esp6_init_state(struct xfrm_state *x, void *args) +static int esp6_init_state(struct xfrm_state *x, void *args) { struct esp_data *esp = NULL; + /* null auth and encryption can have zero length keys */ if (x->aalg) { - if (x->aalg->alg_key_len == 0 || x->aalg->alg_key_len > 512) + if (x->aalg->alg_key_len > 512) goto error; } if (x->ealg == NULL) goto error; + if (x->encap) + goto error; + esp = kmalloc(sizeof(*esp), GFP_KERNEL); if (esp == NULL) return -ENOMEM; @@ -398,7 +402,7 @@ static struct inet6_protocol esp6_protocol = { .flags = INET6_PROTO_NOPOLICY, }; -int __init esp6_init(void) +static int __init esp6_init(void) { if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); diff --git 
a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 07151a6c354d..6dda815c013f 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -68,34 +68,35 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len u8 nexthdr = *nexthdrp; while (ipv6_ext_hdr(nexthdr)) { - struct ipv6_opt_hdr hdr; + struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (len < (int)sizeof(struct ipv6_opt_hdr)) return -1; if (nexthdr == NEXTHDR_NONE) return -1; - if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) BUG(); if (nexthdr == NEXTHDR_FRAGMENT) { - unsigned short frag_off; - if (skb_copy_bits(skb, - start+offsetof(struct frag_hdr, - frag_off), - &frag_off, - sizeof(frag_off))) { + unsigned short _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) return -1; - } - if (ntohs(frag_off) & ~0x7) + if (ntohs(*fp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hdr.hdrlen+2)<<2; + hdrlen = (hp->hdrlen+2)<<2; else - hdrlen = ipv6_optlen(&hdr); + hdrlen = ipv6_optlen(hp); - nexthdr = hdr.nexthdr; + nexthdr = hp->nexthdr; len -= hdrlen; start += hdrlen; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9a676aaf6184..a6d435f4f2e3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -139,10 +139,12 @@ static int is_ineligible(struct sk_buff *skb) if (ptr < 0) return 0; if (nexthdr == IPPROTO_ICMPV6) { - u8 type; - if (skb_copy_bits(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), - &type, 1) - || !(type & ICMPV6_INFOMSG_MASK)) + u8 _type, *tp; + tp = skb_header_pointer(skb, + ptr+offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + if (tp == NULL || + !(*tp & ICMPV6_INFOMSG_MASK)) return 1; } return 0; @@ -200,12 +202,13 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, static __inline__ int opt_unrec(struct sk_buff *skb, __u32 
offset) { - u8 optval; + u8 _optval, *op; offset += skb->nh.raw - skb->data; - if (skb_copy_bits(skb, offset, &optval, 1)) + op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); + if (op == NULL) return 1; - return (optval&0xC0) == 0x80; + return (*op & 0xC0) == 0x80; } int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 65a137241777..ff6bd80f7b1b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -449,9 +449,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, * Same priority level */ - if ((iter->rt6i_dev == rt->rt6i_dev) && - (ipv6_addr_cmp(&iter->rt6i_gateway, - &rt->rt6i_gateway) == 0)) { + if (iter->rt6i_dev == rt->rt6i_dev && + iter->rt6i_idev == rt->rt6i_idev && + ipv6_addr_cmp(&iter->rt6i_gateway, + &rt->rt6i_gateway) == 0) { if (!(iter->rt6i_flags&RTF_EXPIRES)) return -EEXIST; iter->rt6i_expires = rt->rt6i_expires; @@ -514,7 +515,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, int err = -ENOMEM; fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, (u8*) &rt->rt6i_dst - (u8*) rt); + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); if (fn == NULL) goto out; @@ -550,7 +551,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, sn = fib6_add_1(sfn, &rt->rt6i_src.addr, sizeof(struct in6_addr), rt->rt6i_src.plen, - (u8*) &rt->rt6i_src - (u8*) rt); + offsetof(struct rt6_info, rt6i_src)); if (sn == NULL) { /* If it is failed, discard just allocated @@ -571,7 +572,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, sizeof(struct in6_addr), rt->rt6i_src.plen, - (u8*) &rt->rt6i_src - (u8*) rt); + offsetof(struct rt6_info, rt6i_src)); if (sn == NULL) goto st_failure; @@ -680,14 +681,13 @@ struct fib6_node * 
fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, struct in6_addr *saddr) { struct lookup_args args[2]; - struct rt6_info *rt = NULL; struct fib6_node *fn; - args[0].offset = (u8*) &rt->rt6i_dst - (u8*) rt; + args[0].offset = offsetof(struct rt6_info, rt6i_dst); args[0].addr = daddr; #ifdef CONFIG_IPV6_SUBTREES - args[1].offset = (u8*) &rt->rt6i_src - (u8*) rt; + args[1].offset = offsetof(struct rt6_info, rt6i_src); args[1].addr = saddr; #endif @@ -739,11 +739,10 @@ struct fib6_node * fib6_locate(struct fib6_node *root, struct in6_addr *daddr, int dst_len, struct in6_addr *saddr, int src_len) { - struct rt6_info *rt = NULL; struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - (u8*) &rt->rt6i_dst - (u8*) rt); + offsetof(struct rt6_info, rt6i_dst)); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { @@ -752,7 +751,7 @@ struct fib6_node * fib6_locate(struct fib6_node *root, fn = fn->subtree; if (fn) fn = fib6_locate_1(fn, saddr, src_len, - (u8*) &rt->rt6i_src - (u8*) rt); + offsetof(struct rt6_info, rt6i_src)); } #endif diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 2c5aab0894d2..15f341adc74c 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -538,7 +538,8 @@ release: /* Do not check for fault */ if (!freq.flr_label) - copy_to_user(optval + ((u8*)&freq.flr_label - (u8*)&freq), &fl->label, sizeof(fl->label)); + copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label)); sfl1->fl = fl; sfl1->next = np->ipv6_fl_list; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 04303769d36b..8f5296e3f9d0 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -114,10 +114,10 @@ error_out: return err; } -static int ipcomp6_output(struct sk_buff **pskb) +static int ipcomp6_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = (*pskb)->dst; + struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; struct ipv6hdr *top_iph; int hdr_len; @@ -126,23 
+126,23 @@ static int ipcomp6_output(struct sk_buff **pskb) int plen, dlen; u8 *start, *scratch = ipcd->scratch; - hdr_len = (*pskb)->h.raw - (*pskb)->data; + hdr_len = skb->h.raw - skb->data; /* check whether datagram len is larger than threshold */ - if (((*pskb)->len - hdr_len) < ipcd->threshold) { + if ((skb->len - hdr_len) < ipcd->threshold) { goto out_ok; } - if ((skb_is_nonlinear(*pskb) || skb_cloned(*pskb)) && - skb_linearize(*pskb, GFP_ATOMIC) != 0) { + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { err = -ENOMEM; goto error; } /* compression */ - plen = (*pskb)->len - hdr_len; + plen = skb->len - hdr_len; dlen = IPCOMP_SCRATCH_SIZE; - start = (*pskb)->h.raw; + start = skb->h.raw; err = crypto_comp_compress(ipcd->tfm, start, plen, scratch, &dlen); if (err) { @@ -152,18 +152,18 @@ static int ipcomp6_output(struct sk_buff **pskb) goto out_ok; } memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); - pskb_trim(*pskb, hdr_len + dlen + sizeof(struct ip_comp_hdr)); + pskb_trim(skb, hdr_len + dlen + sizeof(struct ip_comp_hdr)); /* insert ipcomp header and replace datagram */ - top_iph = (struct ipv6hdr *)(*pskb)->data; + top_iph = (struct ipv6hdr *)skb->data; - top_iph->payload_len = htons((*pskb)->len - sizeof(struct ipv6hdr)); + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ipch = (struct ipv6_comp_hdr *)start; - ipch->nexthdr = *(*pskb)->nh.raw; + ipch->nexthdr = *skb->nh.raw; ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); - *(*pskb)->nh.raw = IPPROTO_COMP; + *skb->nh.raw = IPPROTO_COMP; out_ok: err = 0; @@ -284,6 +284,9 @@ static int ipcomp6_init_state(struct xfrm_state *x, void *args) if (!x->calg) goto out; + if (x->encap) + goto out; + err = -ENOMEM; ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d2ce00d81d4c..0cef15b866f5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ 
b/net/ipv6/netfilter/ip6_tables.c @@ -66,6 +66,7 @@ do { \ #endif #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +static DECLARE_MUTEX(ip6t_mutex); /* Must have mutex */ #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index bb8590bdd605..acc673ce9591 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -11,6 +11,7 @@ */ #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/spinlock.h> @@ -26,7 +27,7 @@ MODULE_DESCRIPTION("IP6 tables LOG target module"); MODULE_LICENSE("GPL"); static unsigned int nflog = 1; -MODULE_PARM(nflog, "i"); +module_param(nflog, int, 0400); MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); struct in_device; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 46daa79051d1..aca6d21cc588 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -10,6 +10,7 @@ */ #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); @@ -156,7 +157,7 @@ static struct nf_hook_ops ip6t_ops[] = { /* Default to forward because I got too much mail already. 
*/ static int forward = NF_ACCEPT; -MODULE_PARM(forward, "i"); +module_param(forward, bool, 0000); static int __init init(void) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 031989611932..91fbcfee87ac 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -555,6 +555,52 @@ error: IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); return err; } + +static void rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMPV6: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -674,6 +720,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, opt = fl6_merge_options(&opt_space, flowlabel, opt); fl.proto = proto; + rawv6_probe_proto_opt(&fl, msg); + ipv6_addr_copy(&fl.fl6_dst, daddr); if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) ipv6_addr_copy(&fl.fl6_src, &np->saddr); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 836d2ae8464e..e07da9ee8990 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -195,14 +195,18 @@ static void ip6_frag_secret_rebuild(unsigned long dummy) atomic_t ip6_frag_mem = ATOMIC_INIT(0); /* Memory Tracking Functions. 
*/ -static inline void frag_kfree_skb(struct sk_buff *skb) +static inline void frag_kfree_skb(struct sk_buff *skb, int *work) { + if (work) + *work -= skb->truesize; atomic_sub(skb->truesize, &ip6_frag_mem); kfree_skb(skb); } -static inline void frag_free_queue(struct frag_queue *fq) +static inline void frag_free_queue(struct frag_queue *fq, int *work) { + if (work) + *work -= sizeof(struct frag_queue); atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem); kfree(fq); } @@ -220,7 +224,7 @@ static inline struct frag_queue *frag_alloc_queue(void) /* Destruction primitives. */ /* Complete destruction of fq. */ -static void ip6_frag_destroy(struct frag_queue *fq) +static void ip6_frag_destroy(struct frag_queue *fq, int *work) { struct sk_buff *fp; @@ -232,17 +236,17 @@ static void ip6_frag_destroy(struct frag_queue *fq) while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp); + frag_kfree_skb(fp, work); fp = xp; } - frag_free_queue(fq); + frag_free_queue(fq, work); } -static __inline__ void fq_put(struct frag_queue *fq) +static __inline__ void fq_put(struct frag_queue *fq, int *work) { if (atomic_dec_and_test(&fq->refcnt)) - ip6_frag_destroy(fq); + ip6_frag_destroy(fq, work); } /* Kill fq entry. It is not destroyed immediately, @@ -264,10 +268,13 @@ static void ip6_evictor(void) { struct frag_queue *fq; struct list_head *tmp; + int work; - for(;;) { - if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) - return; + work = atomic_read(&ip6_frag_mem) - sysctl_ip6frag_low_thresh; + if (work <= 0) + return; + + while(work > 0) { read_lock(&ip6_frag_lock); if (list_empty(&ip6_frag_lru_list)) { read_unlock(&ip6_frag_lock); @@ -283,7 +290,7 @@ static void ip6_evictor(void) fq_kill(fq); spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, &work); IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); } } @@ -320,7 +327,7 @@ static void ip6_frag_expire(unsigned long data) } out: spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); } /* Creation primitives. 
*/ @@ -340,7 +347,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, atomic_inc(&fq->refcnt); write_unlock(&ip6_frag_lock); fq_in->last_in |= COMPLETE; - fq_put(fq_in); + fq_put(fq_in, NULL); return fq; } } @@ -539,7 +546,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->fragments = next; fq->meat -= free_it->len; - frag_kfree_skb(free_it); + frag_kfree_skb(free_it, NULL); } } @@ -734,7 +741,7 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); spin_unlock(&fq->lock); - fq_put(fq); + fq_put(fq, NULL); return ret; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2017c69dc9f3..77e9de707e96 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,7 +155,16 @@ static void ip6_dst_destroy(struct dst_entry *dst) static void ip6_dst_ifdown(struct dst_entry *dst, int how) { - ip6_dst_destroy(dst); + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + + if (idev != NULL && idev->dev != &loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev); + if (loopback_idev != NULL) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } + } } /* @@ -174,8 +183,16 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, struct net_device *dev = sprt->rt6i_dev; if (dev->ifindex == oif) return sprt; - if (dev->flags&IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) { + if (sprt->rt6i_idev->dev->ifindex != oif) { + if (strict && oif) + continue; + if (local && (!oif || + local->rt6i_idev->dev->ifindex == oif)) + continue; + } local = sprt; + } } if (local) @@ -336,13 +353,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, return NULL; } -/* rt6_ins is called with FREE rt6_lock. +/* ip6_ins_rt is called with FREE rt6_lock. It takes new route entry, the addition fails by any reason the route is freed. 
In any case, if caller does not hold it, it may be destroyed. */ -static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) { int err; @@ -390,7 +407,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, dst_hold(&rt->u.dst); - err = rt6_ins(rt, NULL, NULL); + err = ip6_ins_rt(rt, NULL, NULL); if (err == 0) return rt; @@ -608,8 +625,13 @@ struct dst_entry *ndisc_dst_alloc(struct net_device *dev, struct in6_addr *addr, int (*output)(struct sk_buff **)) { - struct rt6_info *rt = ip6_dst_alloc(); + struct rt6_info *rt; + struct inet6_dev *idev = in6_dev_get(dev); + if (unlikely(idev == NULL)) + return NULL; + + rt = ip6_dst_alloc(); if (unlikely(rt == NULL)) goto out; @@ -620,7 +642,7 @@ struct dst_entry *ndisc_dst_alloc(struct net_device *dev, neigh = ndisc_get_neigh(dev, addr); rt->rt6i_dev = dev; - rt->rt6i_idev = in6_dev_get(dev); + rt->rt6i_idev = idev; rt->rt6i_nexthop = neigh; atomic_set(&rt->u.dst.__refcnt, 1); rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; @@ -731,8 +753,9 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) int err; struct rtmsg *r; struct rtattr **rta; - struct rt6_info *rt; + struct rt6_info *rt = NULL; struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; int addr_type; rta = (struct rtattr **) _rtattr; @@ -744,9 +767,13 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) return -EINVAL; #endif if (rtmsg->rtmsg_ifindex) { + err = -ENODEV; dev = dev_get_by_index(rtmsg->rtmsg_ifindex); if (!dev) - return -ENODEV; + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; } if (rtmsg->rtmsg_metric == 0) @@ -793,10 +820,17 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) */ if ((rtmsg->rtmsg_flags&RTF_REJECT) || (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { - if (dev) + if (dev && dev != 
&loopback_dev) { dev_put(dev); - dev = &loopback_dev; - dev_hold(dev); + in6_dev_put(idev); + dev = &loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } rt->u.dst.output = ip6_pkt_discard_out; rt->u.dst.input = ip6_pkt_discard; rt->u.dst.error = -ENETUNREACH; @@ -838,7 +872,9 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) } } else { dev = grt->rt6i_dev; + idev = grt->rt6i_idev; dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); } if (!(grt->rt6i_flags&RTF_GATEWAY)) err = 0; @@ -900,8 +936,8 @@ install_route: if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); rt->u.dst.dev = dev; - rt->rt6i_idev = in6_dev_get(dev); - return rt6_ins(rt, nlh, _rtattr); + rt->rt6i_idev = idev; + return ip6_ins_rt(rt, nlh, _rtattr); out: if (dev) @@ -1054,7 +1090,7 @@ source_ok: nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst)); - if (rt6_ins(nrt, NULL, NULL)) + if (ip6_ins_rt(nrt, NULL, NULL)) goto out; if (rt->rt6i_flags&RTF_CACHE) { @@ -1144,7 +1180,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES; nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; - rt6_ins(nrt, NULL, NULL); + ip6_ins_rt(nrt, NULL, NULL); } out: @@ -1303,23 +1339,26 @@ int ip6_pkt_discard_out(struct sk_buff **pskb) } /* - * Add address + * Allocate a dst for local (unicast / anycast) address. 
*/ -int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev, int anycast) +struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + int anycast) { struct rt6_info *rt = ip6_dst_alloc(); if (rt == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); dev_hold(&loopback_dev); + in6_dev_hold(idev); rt->u.dst.flags = DST_HOST; rt->u.dst.input = ip6_input; rt->u.dst.output = ip6_output; rt->rt6i_dev = &loopback_dev; - rt->rt6i_idev = in6_dev_get(&loopback_dev); + rt->rt6i_idev = idev; rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ipv6_get_hoplimit(rt->rt6i_dev); @@ -1331,34 +1370,15 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev, int anycast) rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); if (rt->rt6i_nexthop == NULL) { dst_free((struct dst_entry *) rt); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - rt6_ins(rt, NULL, NULL); - - return 0; -} - -/* Delete address. Warning: you should check that this address - disappeared before calling this function. 
- */ - -int ip6_rt_addr_del(struct in6_addr *addr, struct net_device *dev) -{ - struct rt6_info *rt; - int err = -ENOENT; - rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1); - if (rt) { - if (rt->rt6i_dst.plen == 128) - err = ip6_del_rt(rt, NULL, NULL); - else - dst_release(&rt->u.dst); - } + atomic_set(&rt->u.dst.__refcnt, 1); - return err; + return rt; } static int fib6_ifdown(struct rt6_info *rt, void *arg) diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 0791594f8878..92e74233fcdb 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -9,6 +9,7 @@ * IPv6 support */ +#include <linux/module.h> #include <linux/string.h> #include <net/inet_ecn.h> #include <net/ip.h> @@ -25,11 +26,11 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) IP6_ECN_set_ce(inner_iph); } -int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) { struct sk_buff *skb = *pskb; int err; - u32 spi, seq; + u32 seq; struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; struct xfrm_state *x; int xfrm_nr = 0; @@ -40,7 +41,8 @@ int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) nhoff = *nhoffp; nexthdr = skb->nh.raw[nhoff]; - if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) + seq = 0; + if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) goto drop; do { @@ -137,3 +139,10 @@ drop: kfree_skb(skb); return -1; } + +EXPORT_SYMBOL(xfrm6_rcv_spi); + +int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + return xfrm6_rcv_spi(pskb, nhoffp, 0); +} diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 712856f0f356..786de7d912bb 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -113,8 +113,7 @@ int xfrm6_output(struct sk_buff **pskb) xfrm6_encap(skb); - err = x->type->output(pskb); - skb = *pskb; + err = x->type->output(skb); if (err) goto error; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 
ab4e40b0ab76..a0715e2f05d7 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -17,12 +17,12 @@ #include <net/ipv6.h> #include <net/ip6_route.h> -extern struct dst_ops xfrm6_dst_ops; -extern struct xfrm_policy_afinfo xfrm6_policy_afinfo; +static struct dst_ops xfrm6_dst_ops; +static struct xfrm_policy_afinfo xfrm6_policy_afinfo; static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; -int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { int err = 0; *dst = (struct xfrm_dst*)ip6_route_output(NULL, fl); @@ -213,6 +213,16 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) fl->proto = nexthdr; return; + case IPPROTO_ICMPV6: + if (pskb_may_pull(skb, skb->nh.raw + offset + 2 - skb->data)) { + u8 *icmp = (u8 *)exthdr; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + fl->proto = nexthdr; + return; + /* XXX Why are there these headers? */ case IPPROTO_AH: case IPPROTO_ESP: @@ -243,7 +253,7 @@ static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) return; } -struct dst_ops xfrm6_dst_ops = { +static struct dst_ops xfrm6_dst_ops = { .family = AF_INET6, .protocol = __constant_htons(ETH_P_IPV6), .gc = xfrm6_garbage_collect, @@ -252,7 +262,7 @@ struct dst_ops xfrm6_dst_ops = { .entry_size = sizeof(struct xfrm_dst), }; -struct xfrm_policy_afinfo xfrm6_policy_afinfo = { +static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, .lock = RW_LOCK_UNLOCKED, .type_map = &xfrm6_type_map, @@ -263,12 +273,12 @@ struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .decode_session = _decode_session6, }; -void __init xfrm6_policy_init(void) +static void __init xfrm6_policy_init(void) { xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); } -void __exit xfrm6_policy_fini(void) +static void __exit xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } @@ -277,12 +287,10 @@ void __init xfrm6_init(void) { 
xfrm6_policy_init(); xfrm6_state_init(); - xfrm6_tunnel_init(); } void __exit xfrm6_fini(void) { - xfrm6_tunnel_fini(); //xfrm6_input_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5766a133411a..9616a63cc431 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -343,9 +343,8 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) EXPORT_SYMBOL(xfrm6_tunnel_free_spi); -static int xfrm6_tunnel_output(struct sk_buff **pskb) +static int xfrm6_tunnel_output(struct sk_buff *skb) { - struct sk_buff *skb = *pskb; struct ipv6hdr *top_iph; top_iph = (struct ipv6hdr *)skb->data; @@ -356,17 +355,6 @@ static int xfrm6_tunnel_output(struct sk_buff **pskb) static int xfrm6_tunnel_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) { - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return -EINVAL; - - skb->mac.raw = skb->nh.raw; - skb->nh.raw = skb->data; - dst_release(skb->dst); - skb->dst = NULL; - skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; - netif_rx(skb); - return 0; } @@ -413,49 +401,15 @@ static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) { struct sk_buff *skb = *pskb; struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; - struct xfrm_state *x = NULL; struct ipv6hdr *iph = skb->nh.ipv6h; - int err = 0; u32 spi; /* device-like_ip6ip6_handler() */ - if (handler) { - err = handler->handler(pskb, nhoffp); - if (!err) - goto out; - } + if (handler && handler->handler(pskb, nhoffp) == 0) + return 0; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, - spi, - IPPROTO_IPV6, AF_INET6); - - if (!x) - goto drop; - - spin_lock(&x->lock); - - if (unlikely(x->km.state != XFRM_STATE_VALID)) - goto drop_unlock; - - err = xfrm6_tunnel_input(x, NULL, skb); - if (err) - goto drop_unlock; - - x->curlft.bytes += skb->len; - x->curlft.packets++; - spin_unlock(&x->lock); - 
xfrm_state_put(x); - -out: - return 0; - -drop_unlock: - spin_unlock(&x->lock); - xfrm_state_put(x); -drop: - kfree_skb(skb); - return -1; + return xfrm6_rcv_spi(pskb, nhoffp, spi); } static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -517,6 +471,9 @@ static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) if (!x->props.mode) return -EINVAL; + if (x->encap) + return -EINVAL; + x->props.header_len = sizeof(struct ipv6hdr); return 0; @@ -543,31 +500,32 @@ static struct inet6_protocol xfrm6_tunnel_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -void __init xfrm6_tunnel_init(void) +static int __init xfrm6_tunnel_init(void) { X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) { X6TPRINTK1(KERN_ERR "xfrm6_tunnel init: can't add xfrm type\n"); - return; + return -EAGAIN; } if (inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) { X6TPRINTK1(KERN_ERR "xfrm6_tunnel init(): can't add protocol\n"); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); - return; + return -EAGAIN; } if (xfrm6_tunnel_spi_init() < 0) { X6TPRINTK1(KERN_ERR "xfrm6_tunnel init: failed to initialize spi\n"); inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); - return; + return -EAGAIN; } + return 0; } -void __exit xfrm6_tunnel_fini(void) +static void __exit xfrm6_tunnel_fini(void) { X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); @@ -579,3 +537,7 @@ void __exit xfrm6_tunnel_fini(void) X6TPRINTK1(KERN_ERR "xfrm6_tunnel close: can't remove xfrm type\n"); } + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_fini); +MODULE_LICENSE("GPL"); diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 11a2955f8e09..04bb8925ac04 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -303,10 +303,10 @@ void irlan_eth_send_gratuitous_arp(struct net_device *dev) */ #ifdef CONFIG_INET 
IRDA_DEBUG(4, "IrLAN: Sending gratuitous ARP\n"); - in_dev = in_dev_get(dev); + rcu_read_lock(); + in_dev = __in_dev_get(dev); if (in_dev == NULL) - return; - read_lock(&in_dev->lock); + goto out; if (in_dev->ifa_list) arp_send(ARPOP_REQUEST, ETH_P_ARP, @@ -314,8 +314,8 @@ void irlan_eth_send_gratuitous_arp(struct net_device *dev) dev, in_dev->ifa_list->ifa_address, NULL, dev->dev_addr, NULL); - read_unlock(&in_dev->lock); - in_dev_put(in_dev); +out: + rcu_read_unlock(); #endif /* CONFIG_INET */ } diff --git a/net/key/af_key.c b/net/key/af_key.c index fdf75a1ba801..8ca25fd7efe7 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1075,15 +1075,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; - switch (natt->encap_type) { - case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: - break; - default: - err = -ENOPROTOOPT; - goto out; - } - if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { struct sadb_x_nat_t_port* n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c index 6c87fcaeca6a..1f0d51a341f3 100644 --- a/net/rxrpc/call.c +++ b/net/rxrpc/call.c @@ -929,7 +929,6 @@ static void rxrpc_call_receive_packet(struct rxrpc_call *call) { struct rxrpc_message *msg; struct list_head *_p; - uint32_t data32; _enter("%p", call); @@ -986,22 +985,21 @@ static void rxrpc_call_receive_packet(struct rxrpc_call *call) break; /* deal with abort packets */ - case RXRPC_PACKET_TYPE_ABORT: - data32 = 0; - if (skb_copy_bits(msg->pkt, msg->offset, - &data32, sizeof(data32)) < 0) { + case RXRPC_PACKET_TYPE_ABORT: { + uint32_t _dbuf, *dp; + + dp = skb_header_pointer(msg->pkt, msg->offset, + sizeof(_dbuf), &_dbuf); + if (dp == NULL) printk("Rx Received short ABORT packet\n"); - } - else { - data32 = ntohl(data32); - } - _proto("Rx Received Call ABORT { data=%d }", data32); + _proto("Rx Received Call ABORT { data=%d }", + (dp ? 
ntohl(*dp) : 0)); spin_lock(&call->lock); call->app_call_state = RXRPC_CSTATE_ERROR; call->app_err_state = RXRPC_ESTATE_PEER_ABORT; - call->app_abort_code = data32; + call->app_abort_code = (dp ? ntohl(*dp) : 0); call->app_errno = -ECONNABORTED; call->app_mark = RXRPC_APP_MARK_EOF; call->app_read_buf = NULL; @@ -1013,7 +1011,7 @@ static void rxrpc_call_receive_packet(struct rxrpc_call *call) spin_unlock(&call->lock); call->app_error_func(call); break; - + } default: /* deal with other packet types */ _proto("Rx Unsupported packet type %u (#%u)", @@ -1271,7 +1269,7 @@ static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, struct rxrpc_message *msg) { - struct rxrpc_ackpacket ack; + struct rxrpc_ackpacket _ack, *ap; rxrpc_serial_t serial; rxrpc_seq_t seq; int ret; @@ -1279,33 +1277,34 @@ static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, _enter("%p{%u},%p{%u}", call, ntohl(call->call_id), msg, msg->seq); /* extract the basic ACK record */ - if (skb_copy_bits(msg->pkt, msg->offset, &ack, sizeof(ack)) < 0) { + ap = skb_header_pointer(msg->pkt, msg->offset, sizeof(_ack), &_ack); + if (ap == NULL) { printk("Rx Received short ACK packet\n"); return; } - msg->offset += sizeof(ack); + msg->offset += sizeof(_ack); - serial = ack.serial; - seq = ntohl(ack.firstPacket); + serial = ap->serial; + seq = ntohl(ap->firstPacket); _proto("Rx Received ACK %%%d { b=%hu m=%hu f=%u p=%u s=%u r=%s n=%u }", ntohl(msg->hdr.serial), - ntohs(ack.bufferSpace), - ntohs(ack.maxSkew), + ntohs(ap->bufferSpace), + ntohs(ap->maxSkew), seq, - ntohl(ack.previousPacket), + ntohl(ap->previousPacket), ntohl(serial), - rxrpc_acks[ack.reason], + rxrpc_acks[ap->reason], call->ackr.nAcks ); /* check the other side isn't ACK'ing a sequence number I haven't sent * yet */ - if (ack.nAcks > 0 && + if (ap->nAcks > 0 && (seq > call->snd_seq_count || - seq + ack.nAcks - 1 > call->snd_seq_count)) { + seq + ap->nAcks - 1 > 
call->snd_seq_count)) { printk("Received ACK (#%u-#%u) for unsent packet\n", - seq, seq + ack.nAcks - 1); + seq, seq + ap->nAcks - 1); rxrpc_call_abort(call, -EINVAL); _leave(""); return; @@ -1354,7 +1353,7 @@ static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, } } - switch (ack.reason) { + switch (ap->reason) { /* deal with negative/positive acknowledgement of data * packets */ case RXRPC_ACK_REQUESTED: @@ -1366,14 +1365,14 @@ static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, case RXRPC_ACK_OUT_OF_SEQUENCE: case RXRPC_ACK_EXCEEDS_WINDOW: call->snd_resend_cnt = 0; - ret = rxrpc_call_record_ACK(call, msg, seq, ack.nAcks); + ret = rxrpc_call_record_ACK(call, msg, seq, ap->nAcks); if (ret < 0) rxrpc_call_abort(call, ret); break; /* respond to ping packets immediately */ case RXRPC_ACK_PING: - rxrpc_call_generate_ACK(call, &msg->hdr, &ack); + rxrpc_call_generate_ACK(call, &msg->hdr, ap); break; /* only record RTT on ping response packets */ @@ -1386,7 +1385,7 @@ static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, rttmsg = NULL; spin_lock(&call->lock); if (call->snd_ping && - call->snd_ping->hdr.serial == ack.serial) { + call->snd_ping->hdr.serial == ap->serial) { rttmsg = call->snd_ping; call->snd_ping = NULL; } @@ -1402,7 +1401,7 @@ static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, break; default: - printk("Unsupported ACK reason %u\n", ack.reason); + printk("Unsupported ACK reason %u\n", ap->reason); break; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1f9bf9d0834c..ebb9935ab4ca 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -389,7 +389,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) { int err; struct rtattr *kind = tca[TCA_KIND-1]; - struct Qdisc *sch = NULL; + void *p = NULL; + struct Qdisc *sch; struct Qdisc_ops *ops; int size; @@ -406,21 +407,22 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) err 
= -EINVAL; if (ops == NULL) goto err_out; - - size = sizeof(*sch) + ops->priv_size; - - sch = kmalloc(size, GFP_KERNEL); - err = -ENOBUFS; - if (!sch) + err = -EBUSY; + if (!try_module_get(ops->owner)) goto err_out; - /* Grrr... Resolve race condition with module unload */ - - err = -EINVAL; - if (ops != qdisc_lookup_ops(kind)) - goto err_out; + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); + size += ops->priv_size + QDISC_ALIGN_CONST; - memset(sch, 0, size); + p = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!p) + goto err_out2; + memset(p, 0, size); + sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) + & ~QDISC_ALIGN_CONST); + sch->padded = (char *)sch - (char *)p; INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); @@ -439,7 +441,7 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) handle = qdisc_alloc_handle(dev); err = -ENOMEM; if (handle == 0) - goto err_out; + goto err_out3; } if (handle == TC_H_INGRESS) @@ -447,10 +449,6 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) else sch->handle = handle; - err = -EBUSY; - if (!try_module_get(ops->owner)) - goto err_out; - /* enqueue is accessed locklessly - make sure it's visible * before we set a netdevice's qdisc pointer to sch */ smp_wmb(); @@ -466,12 +464,14 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) #endif return sch; } +err_out3: + dev_put(dev); +err_out2: module_put(ops->owner); - err_out: *errp = err; - if (sch) - kfree(sch); + if (p) + kfree(p); return NULL; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index ca08449e7b03..fe530156875a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -573,7 +573,6 @@ static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) struct atm_qdisc_data *p = PRIV(sch); DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - 
memset(p,0,sizeof(*p)); p->flows = &p->link; if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) p->link.q = &noop_qdisc; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 01dfcb1ab832..192ad0a9b904 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1746,15 +1746,18 @@ static void cbq_destroy_filters(struct cbq_class *cl) } } -static void cbq_destroy_class(struct cbq_class *cl) +static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) { + struct cbq_sched_data *q = qdisc_priv(sch); + cbq_destroy_filters(cl); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); #ifdef CONFIG_NET_ESTIMATOR qdisc_kill_estimator(&cl->stats); #endif - kfree(cl); + if (cl != &q->link) + kfree(cl); } static void @@ -1777,8 +1780,7 @@ cbq_destroy(struct Qdisc* sch) for (cl = q->classes[h]; cl; cl = next) { next = cl->next; - if (cl != &q->link) - cbq_destroy_class(cl); + cbq_destroy_class(sch, cl); } } @@ -1799,7 +1801,7 @@ static void cbq_put(struct Qdisc *sch, unsigned long arg) spin_unlock_bh(&sch->dev->queue_lock); #endif - cbq_destroy_class(cl); + cbq_destroy_class(sch, cl); } } @@ -2035,7 +2037,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) sch_tree_unlock(sch); if (--cl->refcnt == 0) - cbq_destroy_class(cl); + cbq_destroy_class(sch, cl); return 0; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index e0831a4a4457..28b61f0f87a7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -331,8 +331,6 @@ int dsmark_init(struct Qdisc *sch,struct rtattr *opt) !tb[TCA_DSMARK_INDICES-1] || RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) return -EINVAL; - memset(p,0,sizeof(*p)); - p->filter_list = NULL; p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); if (!p->indices) return -EINVAL; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 82117f9ba7d1..2d7e859d0cd8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -415,6 +415,7 @@ struct Qdisc * 
qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) if (!ops->init || ops->init(sch, NULL) == 0) return sch; + dev_put(dev); kfree(p); return NULL; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 84ef3ab6a843..fa1a9e5494c8 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -62,6 +62,7 @@ #include <linux/slab.h> #include <linux/timer.h> #include <linux/list.h> +#include <linux/rbtree.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> @@ -133,9 +134,11 @@ struct hfsc_class struct list_head children; /* child classes */ struct Qdisc *qdisc; /* leaf qdisc */ - struct list_head actlist; /* active children list */ - struct list_head alist; /* active children list member */ - struct list_head ellist; /* eligible list member */ + struct rb_node el_node; /* qdisc's eligible tree member */ + struct rb_root vt_tree; /* active children sorted by cl_vt */ + struct rb_node vt_node; /* parent's vt_tree member */ + struct rb_root cf_tree; /* active children sorted by cl_f */ + struct rb_node cf_node; /* parent's cf_heap member */ struct list_head hlist; /* hash list member */ struct list_head dlist; /* drop list member */ @@ -161,6 +164,9 @@ struct hfsc_class adjustment */ u64 cl_vtoff; /* inter-period cumulative vt offset */ u64 cl_cvtmax; /* max child's vt in the last period */ + u64 cl_cvtoff; /* cumulative cvtmax of all periods */ + u64 cl_pcvtoff; /* parent's cvtoff at initalization + time */ struct internal_sc cl_rsc; /* internal real-time service curve */ struct internal_sc cl_fsc; /* internal fair service curve */ @@ -183,7 +189,7 @@ struct hfsc_sched u16 defcls; /* default class id */ struct hfsc_class root; /* root class */ struct list_head clhash[HFSC_HSIZE]; /* class hash */ - struct list_head eligible; /* eligible list */ + struct rb_root eligible; /* eligible tree */ struct list_head droplist; /* active leaf class list (for dropping) */ struct sk_buff_head requeue; /* requeued packet */ @@ 
-219,82 +225,51 @@ do { \ /* - * eligible list holds backlogged classes being sorted by their eligible times. - * there is one eligible list per hfsc instance. + * eligible tree holds backlogged classes being sorted by their eligible times. + * there is one eligible tree per hfsc instance. */ static void -ellist_insert(struct hfsc_class *cl) +eltree_insert(struct hfsc_class *cl) { - struct list_head *head = &cl->sched->eligible; - struct hfsc_class *p; + struct rb_node **p = &cl->sched->eligible.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; - /* check the last entry first */ - if (list_empty(head) || - ((p = list_entry(head->prev, struct hfsc_class, ellist)) && - p->cl_e <= cl->cl_e)) { - list_add_tail(&cl->ellist, head); - return; - } - - list_for_each_entry(p, head, ellist) { - if (cl->cl_e < p->cl_e) { - /* insert cl before p */ - list_add_tail(&cl->ellist, &p->ellist); - return; - } + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, el_node); + if (cl->cl_e >= cl1->cl_e) + p = &parent->rb_right; + else + p = &parent->rb_left; } - ASSERT(0); /* should not reach here */ + rb_link_node(&cl->el_node, parent, p); + rb_insert_color(&cl->el_node, &cl->sched->eligible); } static inline void -ellist_remove(struct hfsc_class *cl) +eltree_remove(struct hfsc_class *cl) { - list_del(&cl->ellist); + rb_erase(&cl->el_node, &cl->sched->eligible); } -static void -ellist_update(struct hfsc_class *cl) +static inline void +eltree_update(struct hfsc_class *cl) { - struct list_head *head = &cl->sched->eligible; - struct hfsc_class *p, *last; - - /* - * the eligible time of a class increases monotonically. - * if the next entry has a larger eligible time, nothing to do. 
- */ - if (cl->ellist.next == head || - ((p = list_entry(cl->ellist.next, struct hfsc_class, ellist)) && - cl->cl_e <= p->cl_e)) - return; - - /* check the last entry */ - last = list_entry(head->prev, struct hfsc_class, ellist); - if (last->cl_e <= cl->cl_e) { - list_move_tail(&cl->ellist, head); - return; - } - - /* - * the new position must be between the next entry - * and the last entry - */ - list_for_each_entry_continue(p, head, ellist) { - if (cl->cl_e < p->cl_e) { - list_move_tail(&cl->ellist, &p->ellist); - return; - } - } - ASSERT(0); /* should not reach here */ + eltree_remove(cl); + eltree_insert(cl); } /* find the class with the minimum deadline among the eligible classes */ static inline struct hfsc_class * -ellist_get_mindl(struct list_head *head, u64 cur_time) +eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) { struct hfsc_class *p, *cl = NULL; + struct rb_node *n; - list_for_each_entry(p, head, ellist) { + for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, el_node); if (p->cl_e > cur_time) break; if (cl == NULL || p->cl_d < cl->cl_d) @@ -305,92 +280,62 @@ ellist_get_mindl(struct list_head *head, u64 cur_time) /* find the class with minimum eligible time among the eligible classes */ static inline struct hfsc_class * -ellist_get_minel(struct list_head *head) +eltree_get_minel(struct hfsc_sched *q) { - if (list_empty(head)) + struct rb_node *n; + + n = rb_first(&q->eligible); + if (n == NULL) return NULL; - return list_entry(head->next, struct hfsc_class, ellist); + return rb_entry(n, struct hfsc_class, el_node); } /* - * active children list holds backlogged child classes being sorted - * by their virtual time. each intermediate class has one active - * children list. + * vttree holds backlogged child classes being sorted by their virtual + * time. each intermediate class has one vttree.
*/ static void -actlist_insert(struct hfsc_class *cl) +vttree_insert(struct hfsc_class *cl) { - struct list_head *head = &cl->cl_parent->actlist; - struct hfsc_class *p; + struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; - /* check the last entry first */ - if (list_empty(head) || - ((p = list_entry(head->prev, struct hfsc_class, alist)) && - p->cl_vt <= cl->cl_vt)) { - list_add_tail(&cl->alist, head); - return; - } - - list_for_each_entry(p, head, alist) { - if (cl->cl_vt < p->cl_vt) { - /* insert cl before p */ - list_add_tail(&cl->alist, &p->alist); - return; - } + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, vt_node); + if (cl->cl_vt >= cl1->cl_vt) + p = &parent->rb_right; + else + p = &parent->rb_left; } - ASSERT(0); /* should not reach here */ + rb_link_node(&cl->vt_node, parent, p); + rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); } static inline void -actlist_remove(struct hfsc_class *cl) +vttree_remove(struct hfsc_class *cl) { - list_del(&cl->alist); + rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); } -static void -actlist_update(struct hfsc_class *cl) +static inline void +vttree_update(struct hfsc_class *cl) { - struct list_head *head = &cl->cl_parent->actlist; - struct hfsc_class *p, *last; - - /* - * the virtual time of a class increases monotonically. - * if the next entry has a larger virtual time, nothing to do. 
- */ - if (cl->alist.next == head || - ((p = list_entry(cl->alist.next, struct hfsc_class, alist)) && - cl->cl_vt <= p->cl_vt)) - return; - - /* check the last entry */ - last = list_entry(head->prev, struct hfsc_class, alist); - if (last->cl_vt <= cl->cl_vt) { - list_move_tail(&cl->alist, head); - return; - } - - /* - * the new position must be between the next entry - * and the last entry - */ - list_for_each_entry_continue(p, head, alist) { - if (cl->cl_vt < p->cl_vt) { - list_move_tail(&cl->alist, &p->alist); - return; - } - } - ASSERT(0); /* should not reach here */ + vttree_remove(cl); + vttree_insert(cl); } static inline struct hfsc_class * -actlist_firstfit(struct hfsc_class *cl, u64 cur_time) +vttree_firstfit(struct hfsc_class *cl, u64 cur_time) { struct hfsc_class *p; + struct rb_node *n; - list_for_each_entry(p, &cl->actlist, alist) { - if (p->cl_f <= cur_time) { + for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, vt_node); + if (p->cl_f <= cur_time) return p; - } } return NULL; } @@ -399,14 +344,14 @@ actlist_firstfit(struct hfsc_class *cl, u64 cur_time) * get the leaf class with the minimum vt in the hierarchy */ static struct hfsc_class * -actlist_get_minvt(struct hfsc_class *cl, u64 cur_time) +vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) { /* if root-class's cfmin is bigger than cur_time nothing to do */ if (cl->cl_cfmin > cur_time) return NULL; while (cl->level > 0) { - cl = actlist_firstfit(cl, cur_time); + cl = vttree_firstfit(cl, cur_time); if (cl == NULL) return NULL; /* @@ -418,6 +363,38 @@ actlist_get_minvt(struct hfsc_class *cl, u64 cur_time) return cl; } +static void +cftree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, cf_node); + if (cl->cl_f >= cl1->cl_f) + p = &parent->rb_right; + else + p = 
&parent->rb_left; + } + rb_link_node(&cl->cf_node, parent, p); + rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_update(struct hfsc_class *cl) +{ + cftree_remove(cl); + cftree_insert(cl); +} + /* * service curve support functions * @@ -711,7 +688,7 @@ init_ed(struct hfsc_class *cl, unsigned int next_len) cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); - ellist_insert(cl); + eltree_insert(cl); } static void @@ -720,7 +697,7 @@ update_ed(struct hfsc_class *cl, unsigned int next_len) cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); - ellist_update(cl); + eltree_update(cl); } static inline void @@ -729,32 +706,25 @@ update_d(struct hfsc_class *cl, unsigned int next_len) cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); } -static void +static inline void update_cfmin(struct hfsc_class *cl) { + struct rb_node *n = rb_first(&cl->cf_tree); struct hfsc_class *p; - u64 cfmin; - if (list_empty(&cl->actlist)) { + if (n == NULL) { cl->cl_cfmin = 0; return; } - cfmin = HT_INFINITY; - list_for_each_entry(p, &cl->actlist, alist) { - if (p->cl_f == 0) { - cl->cl_cfmin = 0; - return; - } - if (p->cl_f < cfmin) - cfmin = p->cl_f; - } - cl->cl_cfmin = cfmin; + p = rb_entry(n, struct hfsc_class, cf_node); + cl->cl_cfmin = p->cl_f; } static void init_vf(struct hfsc_class *cl, unsigned int len) { - struct hfsc_class *max_cl, *p; + struct hfsc_class *max_cl; + struct rb_node *n; u64 vt, f, cur_time; int go_active; @@ -767,9 +737,9 @@ init_vf(struct hfsc_class *cl, unsigned int len) go_active = 0; if (go_active) { - if (!list_empty(&cl->cl_parent->actlist)) { - max_cl = list_entry(cl->cl_parent->actlist.prev, - struct hfsc_class, alist); + n = rb_last(&cl->cl_parent->vt_tree); + if (n 
!= NULL) { + max_cl = rb_entry(n, struct hfsc_class,vt_node); /* * set vt to the average of the min and max * classes. if the parent's period didn't @@ -785,19 +755,20 @@ init_vf(struct hfsc_class *cl, unsigned int len) } else { /* * first child for a new parent backlog period. - * add parent's cvtmax to vtoff of children - * to make a new vt (vtoff + vt) larger than - * the vt in the last period for all children. + * add parent's cvtmax to cvtoff to make a new + * vt (vtoff + vt) larger than the vt in the + * last period for all children. */ vt = cl->cl_parent->cl_cvtmax; - list_for_each_entry(p, &cl->cl_parent->children, - siblings) - p->cl_vtoff += vt; - cl->cl_vt = 0; + cl->cl_parent->cl_cvtoff += vt; cl->cl_parent->cl_cvtmax = 0; cl->cl_parent->cl_cvtmin = 0; + cl->cl_vt = 0; } + cl->cl_vtoff = cl->cl_parent->cl_cvtoff - + cl->cl_pcvtoff; + /* update the virtual curve */ vt = cl->cl_vt + cl->cl_vtoff; rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, @@ -814,7 +785,8 @@ init_vf(struct hfsc_class *cl, unsigned int len) cl->cl_parentperiod++; cl->cl_f = 0; - actlist_insert(cl); + vttree_insert(cl); + cftree_insert(cl); if (cl->cl_flags & HFSC_USC) { /* class has upper limit curve */ @@ -834,6 +806,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) f = max(cl->cl_myf, cl->cl_cfmin); if (f != cl->cl_f) { cl->cl_f = f; + cftree_update(cl); update_cfmin(cl->cl_parent); } } @@ -866,9 +839,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) if (cl->cl_vt > cl->cl_parent->cl_cvtmax) cl->cl_parent->cl_cvtmax = cl->cl_vt; - /* remove this class from the vt list */ - actlist_remove(cl); + /* remove this class from the vt tree */ + vttree_remove(cl); + cftree_remove(cl); update_cfmin(cl->cl_parent); continue; @@ -890,8 +864,8 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) cl->cl_vt = cl->cl_parent->cl_cvtmin; } - /* update the vt list */ - actlist_update(cl); + /* update the vt tree */ + vttree_update(cl); if (cl->cl_flags & HFSC_USC) { 
cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, @@ -921,6 +895,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) f = max(cl->cl_myf, cl->cl_cfmin); if (f != cl->cl_f) { cl->cl_f = f; + cftree_update(cl); update_cfmin(cl->cl_parent); } } @@ -941,13 +916,13 @@ static void set_passive(struct hfsc_class *cl) { if (cl->cl_flags & HFSC_RSC) - ellist_remove(cl); + eltree_remove(cl); list_del(&cl->dlist); /* - * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) - * needs to be called explicitly to remove a class from actlist + * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from vttree. */ } @@ -1171,7 +1146,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; cl->stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&cl->children); - INIT_LIST_HEAD(&cl->actlist); + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; sch_tree_lock(sch); list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); @@ -1179,6 +1155,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (parent->level == 0) hfsc_purge_queue(sch, parent); hfsc_adjust_levels(parent); + cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); #ifdef CONFIG_NET_ESTIMATOR @@ -1528,7 +1505,7 @@ hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) u64 next_time = 0; long delay; - if ((cl = ellist_get_minel(&q->eligible)) != NULL) + if ((cl = eltree_get_minel(q)) != NULL) next_time = cl->cl_e; if (q->root.cl_cfmin != 0) { if (next_time == 0 || next_time > q->root.cl_cfmin) @@ -1553,13 +1530,12 @@ hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) return -EINVAL; qopt = RTA_DATA(opt); - memset(q, 0, sizeof(struct hfsc_sched)); sch->stats_lock = &sch->dev->queue_lock; q->defcls = qopt->defcls; for (i = 0; i < HFSC_HSIZE; i++) INIT_LIST_HEAD(&q->clhash[i]); - INIT_LIST_HEAD(&q->eligible); + q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); 
skb_queue_head_init(&q->requeue); @@ -1571,7 +1547,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) q->root.qdisc = &noop_qdisc; q->root.stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&q->root.children); - INIT_LIST_HEAD(&q->root.actlist); + q->root.vt_tree = RB_ROOT; + q->root.cf_tree = RB_ROOT; list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); @@ -1611,6 +1588,8 @@ hfsc_reset_class(struct hfsc_class *cl) cl->cl_vtoff = 0; cl->cl_cvtmin = 0; cl->cl_cvtmax = 0; + cl->cl_cvtoff = 0; + cl->cl_pcvtoff = 0; cl->cl_vtperiod = 0; cl->cl_parentperiod = 0; cl->cl_f = 0; @@ -1618,7 +1597,9 @@ hfsc_reset_class(struct hfsc_class *cl) cl->cl_myfadj = 0; cl->cl_cfmin = 0; cl->cl_nactive = 0; - INIT_LIST_HEAD(&cl->actlist); + + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; qdisc_reset(cl->qdisc); if (cl->cl_flags & HFSC_RSC) @@ -1641,7 +1622,7 @@ hfsc_reset_qdisc(struct Qdisc *sch) hfsc_reset_class(cl); } __skb_queue_purge(&q->requeue); - INIT_LIST_HEAD(&q->eligible); + q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); del_timer(&q->wd_timer); sch->flags &= ~TCQ_F_THROTTLED; @@ -1749,14 +1730,14 @@ hfsc_dequeue(struct Qdisc *sch) * find the class with the minimum deadline among * the eligible classes. 
*/ - if ((cl = ellist_get_mindl(&q->eligible, cur_time)) != NULL) { + if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { realtime = 1; } else { /* * use link-sharing criteria * get the class with the minimum vt in the hierarchy */ - cl = actlist_get_minvt(&q->root, cur_time); + cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { sch->stats.overlimits++; hfsc_schedule_watchdog(sch, cur_time); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index d07dfd8b5cf0..61c8fa4db608 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1277,7 +1277,6 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); return -EINVAL; } - memset(q,0,sizeof(*q)); q->debug = gopt->debug; HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 30f2176b992d..13b5c3414794 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -283,21 +283,18 @@ int ingress_init(struct Qdisc *sch,struct rtattr *opt) #ifndef CONFIG_NET_CLS_ACT #ifndef CONFIG_NETFILTER printk("You MUST compile classifier actions into the kernel\n"); - goto error; + return -EINVAL; #else printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); #endif #endif - if (NULL == p) - goto error; - #ifndef CONFIG_NET_CLS_ACT #ifdef CONFIG_NETFILTER if (!nf_registered) { if (nf_register_hook(&ing_ops) < 0) { printk("ingress qdisc registration error \n"); - goto error; + return -EINVAL; } nf_registered++; } @@ -305,12 +302,8 @@ int ingress_init(struct Qdisc *sch,struct rtattr *opt) #endif DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - memset(p, 0, sizeof(*p)); - p->filter_list = NULL; p->q = &noop_qdisc; return 0; -error: - return -EINVAL; } @@ -346,9 +339,6 @@ static void ingress_destroy(struct Qdisc *sch) p->filter_list = tp->next; tcf_destroy(tp); } - memset(p, 0, sizeof(*p)); - p->filter_list = NULL; - #if 0 /* for 
future use */ qdisc_destroy(p->q); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 16a57cf9bcca..bae07708eb01 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -148,13 +148,12 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, struct in_ifaddr *ifa; struct sctp_sockaddr_entry *addr; - read_lock(&inetdev_lock); + rcu_read_lock(); if ((in_dev = __in_dev_get(dev)) == NULL) { - read_unlock(&inetdev_lock); + rcu_read_unlock(); return; } - read_lock(&in_dev->lock); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { /* Add the address to the local list. */ addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC); @@ -166,8 +165,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, } } - read_unlock(&in_dev->lock); - read_unlock(&inetdev_lock); + rcu_read_unlock(); } /* Extract our IP addresses from the system and stash them in the diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a7ab07fe5cc0..de4bccc1c25d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -753,7 +753,7 @@ udp_data_ready(struct sock *sk, int len) struct rpc_rqst *rovr; struct sk_buff *skb; int err, repsize, copied; - u32 xid; + u32 _xid, *xp; read_lock(&sk->sk_callback_lock); dprintk("RPC: udp_data_ready...\n"); @@ -777,12 +777,14 @@ udp_data_ready(struct sock *sk, int len) } /* Copy the XID from the skb... 
*/ - if (skb_copy_bits(skb, sizeof(struct udphdr), &xid, sizeof(xid)) < 0) + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) goto dropit; /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->sock_lock); - rovr = xprt_lookup_rqst(xprt, xid); + rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; task = rovr->rq_task; diff --git a/net/xfrm/xfrm_export.c b/net/xfrm/xfrm_export.c index f72754953d29..9f335640d254 100644 --- a/net/xfrm/xfrm_export.c +++ b/net/xfrm/xfrm_export.c @@ -33,8 +33,6 @@ EXPORT_SYMBOL(secpath_dup); EXPORT_SYMBOL(xfrm_get_acqseq); EXPORT_SYMBOL(xfrm_parse_spi); EXPORT_SYMBOL(xfrm4_rcv); -EXPORT_SYMBOL(xfrm4_tunnel_register); -EXPORT_SYMBOL(xfrm4_tunnel_deregister); EXPORT_SYMBOL(xfrm_register_type); EXPORT_SYMBOL(xfrm_unregister_type); EXPORT_SYMBOL(xfrm_get_type); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index eccc0231faeb..be298cde3022 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -78,15 +78,6 @@ static int verify_encap_tmpl(struct rtattr **xfrma) if ((rt->rta_len - sizeof(*rt)) < sizeof(*encap)) return -EINVAL; - encap = RTA_DATA(rt); - switch (encap->encap_type) { - case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: - break; - default: - return -ENOPROTOOPT; - } - return 0; } @@ -164,15 +155,24 @@ out: return err; } -static int attach_one_algo(struct xfrm_algo **algpp, struct rtattr *u_arg) +static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, + struct xfrm_algo_desc *(*get_byname)(char *), + struct rtattr *u_arg) { struct rtattr *rta = u_arg; struct xfrm_algo *p, *ualg; + struct xfrm_algo_desc *algo; if (!rta) return 0; ualg = RTA_DATA(rta); + + algo = get_byname(ualg->alg_name); + if (!algo) + return -ENOSYS; + *props = algo->desc.sadb_alg_id; + p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); if (!p) return -ENOMEM; @@ -225,11 +225,17 @@ static struct xfrm_state *xfrm_state_construct(struct 
xfrm_usersa_info *p, copy_from_user_state(x, p); - if ((err = attach_one_algo(&x->aalg, xfrma[XFRMA_ALG_AUTH-1]))) + if ((err = attach_one_algo(&x->aalg, &x->props.aalgo, + xfrm_aalg_get_byname, + xfrma[XFRMA_ALG_AUTH-1]))) goto error; - if ((err = attach_one_algo(&x->ealg, xfrma[XFRMA_ALG_CRYPT-1]))) + if ((err = attach_one_algo(&x->ealg, &x->props.ealgo, + xfrm_ealg_get_byname, + xfrma[XFRMA_ALG_CRYPT-1]))) goto error; - if ((err = attach_one_algo(&x->calg, xfrma[XFRMA_ALG_COMP-1]))) + if ((err = attach_one_algo(&x->calg, &x->props.calgo, + xfrm_calg_get_byname, + xfrma[XFRMA_ALG_COMP-1]))) goto error; if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1]))) goto error; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 52fa3cfdfd5b..587d63bd6861 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2823,48 +2823,50 @@ static void selinux_task_to_inode(struct task_struct *p, static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct avc_audit_data *ad) { int offset, ihlen, ret; - struct iphdr iph; + struct iphdr _iph, *ih; offset = skb->nh.raw - skb->data; - ret = skb_copy_bits(skb, offset, &iph, sizeof(iph)); - if (ret) + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) goto out; - ihlen = iph.ihl * 4; - if (ihlen < sizeof(iph)) + ihlen = ih->ihl * 4; + if (ihlen < sizeof(_iph)) goto out; - ad->u.net.v4info.saddr = iph.saddr; - ad->u.net.v4info.daddr = iph.daddr; + ad->u.net.v4info.saddr = ih->saddr; + ad->u.net.v4info.daddr = ih->daddr; - switch (iph.protocol) { + switch (ih->protocol) { case IPPROTO_TCP: { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; - if (skb_copy_bits(skb, offset, &tcph, sizeof(tcph)) < 0) + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) break; - ad->u.net.sport = tcph.source; - ad->u.net.dport = tcph.dest; + ad->u.net.sport = 
th->source; + ad->u.net.dport = th->dest; break; } case IPPROTO_UDP: { - struct udphdr udph; + struct udphdr _udph, *uh; - if (ntohs(iph.frag_off) & IP_OFFSET) + if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; - if (skb_copy_bits(skb, offset, &udph, sizeof(udph)) < 0) - break; + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) + break; - ad->u.net.sport = udph.source; - ad->u.net.dport = udph.dest; + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; break; } @@ -2882,18 +2884,18 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct avc_audit_data *ad { u8 nexthdr; int ret, offset; - struct ipv6hdr ipv6h; + struct ipv6hdr _ipv6h, *ip6; offset = skb->nh.raw - skb->data; - ret = skb_copy_bits(skb, offset, &ipv6h, sizeof(ipv6h)); - if (ret) + ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + if (ip6 == NULL) goto out; - ipv6_addr_copy(&ad->u.net.v6info.saddr, &ipv6h.saddr); - ipv6_addr_copy(&ad->u.net.v6info.daddr, &ipv6h.daddr); + ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr); + ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr); - nexthdr = ipv6h.nexthdr; - offset += sizeof(ipv6h); + nexthdr = ip6->nexthdr; + offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, skb->tail - skb->head - offset); if (offset < 0) @@ -2901,24 +2903,26 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct avc_audit_data *ad switch (nexthdr) { case IPPROTO_TCP: { - struct tcphdr tcph; + struct tcphdr _tcph, *th; - if (skb_copy_bits(skb, offset, &tcph, sizeof(tcph)) < 0) + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) break; - ad->u.net.sport = tcph.source; - ad->u.net.dport = tcph.dest; + ad->u.net.sport = th->source; + ad->u.net.dport = th->dest; break; } case IPPROTO_UDP: { - struct udphdr udph; + struct udphdr _udph, *uh; - if (skb_copy_bits(skb, offset, &udph, sizeof(udph)) < 0) + uh = skb_header_pointer(skb, offset, sizeof(_udph), 
&_udph); + if (uh == NULL) break; - ad->u.net.sport = udph.source; - ad->u.net.dport = udph.dest; + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; break; } |
