| | | |
|---|---|---|
| author | Linus Torvalds <torvalds@home.osdl.org> | 2003-07-12 21:25:57 -0700 |
| committer | Linus Torvalds <torvalds@home.osdl.org> | 2003-07-12 21:25:57 -0700 |
| commit | 5af7d0d99dff67a314baa9378e8bc28a70a883f7 (patch) | |
| tree | 0bad7c5a7c5a3497383a746ddb584f9072803617 | |
| parent | 86f488778a193d4bacc041baf161c5dd0f3bfa4a (diff) | |
| parent | d4cef02ef663223b3d779cebb457a5cc898e7d8c (diff) | |
Merge bk://kernel.bkbits.net/davem/sparc-2.5
into home.osdl.org:/home/torvalds/v2.5/linux
40 files changed, 13461 insertions, 81 deletions
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 752b31601ee4..c421186227f7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -61,7 +61,11 @@ extern struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, int oif, int flags); -extern struct rt6_info *ip6_dst_alloc(void); +extern struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + int (*output)(struct sk_buff *)); +extern int ndisc_dst_gc(int *more); +extern void fib6_force_start_gc(void); /* * support functions for ND diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h new file mode 100644 index 000000000000..92336b4d945c --- /dev/null +++ b/include/net/ip_vs.h @@ -0,0 +1,1002 @@ +/* + * IP Virtual Server + * data structure and functionality definitions + */ + +#ifndef _IP_VS_H +#define _IP_VS_H + +#include <asm/types.h> /* For __uXX types */ + +#define IP_VS_VERSION_CODE 0x010107 +#define NVERSION(version) \ + (version >> 16) & 0xFF, \ + (version >> 8) & 0xFF, \ + version & 0xFF + +/* + * Virtual Service Flags + */ +#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ +#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ + +/* + * Destination Server Flags + */ +#define IP_VS_DEST_F_AVAILABLE 0x0001 /* server is available */ +#define IP_VS_DEST_F_OVERLOAD 0x0002 /* server is overloaded */ + +/* + * IPVS sync daemon states + */ +#define IP_VS_STATE_NONE 0x0000 /* daemon is stopped */ +#define IP_VS_STATE_MASTER 0x0001 /* started as master */ +#define IP_VS_STATE_BACKUP 0x0002 /* started as backup */ + +/* + * IPVS socket options + */ +#define IP_VS_BASE_CTL (64+1024+64) /* base */ + +#define IP_VS_SO_SET_NONE IP_VS_BASE_CTL /* just peek */ +#define IP_VS_SO_SET_INSERT (IP_VS_BASE_CTL+1) +#define IP_VS_SO_SET_ADD (IP_VS_BASE_CTL+2) +#define IP_VS_SO_SET_EDIT (IP_VS_BASE_CTL+3) +#define IP_VS_SO_SET_DEL (IP_VS_BASE_CTL+4) +#define IP_VS_SO_SET_FLUSH (IP_VS_BASE_CTL+5) +#define IP_VS_SO_SET_LIST (IP_VS_BASE_CTL+6) +#define IP_VS_SO_SET_ADDDEST (IP_VS_BASE_CTL+7) +#define IP_VS_SO_SET_DELDEST (IP_VS_BASE_CTL+8) +#define IP_VS_SO_SET_EDITDEST (IP_VS_BASE_CTL+9) +#define IP_VS_SO_SET_TIMEOUT (IP_VS_BASE_CTL+10) +#define IP_VS_SO_SET_STARTDAEMON (IP_VS_BASE_CTL+11) +#define IP_VS_SO_SET_STOPDAEMON (IP_VS_BASE_CTL+12) +#define IP_VS_SO_SET_RESTORE (IP_VS_BASE_CTL+13) +#define IP_VS_SO_SET_SAVE (IP_VS_BASE_CTL+14) +#define IP_VS_SO_SET_ZERO (IP_VS_BASE_CTL+15) +#define IP_VS_SO_SET_MAX IP_VS_SO_SET_ZERO + +#define IP_VS_SO_GET_VERSION IP_VS_BASE_CTL +#define IP_VS_SO_GET_INFO (IP_VS_BASE_CTL+1) +#define IP_VS_SO_GET_SERVICES (IP_VS_BASE_CTL+2) +#define IP_VS_SO_GET_SERVICE (IP_VS_BASE_CTL+3) +#define IP_VS_SO_GET_DESTS (IP_VS_BASE_CTL+4) +#define IP_VS_SO_GET_DEST (IP_VS_BASE_CTL+5) /* not used now */ +#define IP_VS_SO_GET_TIMEOUT (IP_VS_BASE_CTL+6) +#define IP_VS_SO_GET_DAEMON (IP_VS_BASE_CTL+7) +#define IP_VS_SO_GET_MAX IP_VS_SO_GET_DAEMON + + +/* + * IPVS Connection Flags + */ +#define IP_VS_CONN_F_FWD_MASK 0x0007 /* mask for the fwd methods */ +#define IP_VS_CONN_F_MASQ 0x0000 /* masquerading/NAT */ +#define IP_VS_CONN_F_LOCALNODE 0x0001 /* local node */ +#define IP_VS_CONN_F_TUNNEL 0x0002 /* tunneling */ +#define IP_VS_CONN_F_DROUTE 0x0003 /* direct routing */ +#define IP_VS_CONN_F_BYPASS 0x0004 /* cache bypass */ +#define IP_VS_CONN_F_SYNC 0x0020 /* entry created by sync */ +#define IP_VS_CONN_F_HASHED 0x0040 /* hashed entry */ +#define IP_VS_CONN_F_NOOUTPUT 0x0080 /* no output packets */ +#define IP_VS_CONN_F_INACTIVE 0x0100 /* not 
established */ +#define IP_VS_CONN_F_OUT_SEQ 0x0200 /* must do output seq adjust */ +#define IP_VS_CONN_F_IN_SEQ 0x0400 /* must do input seq adjust */ +#define IP_VS_CONN_F_SEQ_MASK 0x0600 /* in/out sequence mask */ +#define IP_VS_CONN_F_NO_CPORT 0x0800 /* no client port set yet */ + +/* Move it to better place one day, for now keep it unique */ +#define NFC_IPVS_PROPERTY 0x10000 + +#define IP_VS_SCHEDNAME_MAXLEN 16 +#define IP_VS_IFNAME_MAXLEN 16 + + +/* + * The struct ip_vs_service_user and struct ip_vs_dest_user are + * used to set IPVS rules through setsockopt. + */ +struct ip_vs_service_user { + /* virtual service addresses */ + u_int16_t protocol; + u_int32_t addr; /* virtual ip address */ + u_int16_t port; + u_int32_t fwmark; /* firwall mark of service */ + + /* virtual service options */ + char sched_name[IP_VS_SCHEDNAME_MAXLEN]; + unsigned flags; /* virtual service flags */ + unsigned timeout; /* persistent timeout in sec */ + u_int32_t netmask; /* persistent netmask */ +}; + + +struct ip_vs_dest_user { + /* destination server address */ + u_int32_t addr; + u_int16_t port; + + /* real server options */ + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ + + /* thresholds for active connections */ + u_int32_t u_threshold; /* upper threshold */ + u_int32_t l_threshold; /* lower threshold */ +}; + + +/* + * IPVS statistics object (for user space) + */ +struct ip_vs_stats_user +{ + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ + + __u32 cps; /* current connection rate */ + __u32 inpps; /* current in packet rate */ + __u32 outpps; /* current out packet rate */ + __u32 inbps; /* current in byte rate */ + __u32 outbps; /* current out byte rate */ +}; + + +/* The argument to IP_VS_SO_GET_INFO */ +struct ip_vs_getinfo { + /* version number */ + unsigned int version; + + /* size of connection hash table */ + unsigned int size; + + /* number of virtual services */ + unsigned int num_services; +}; + + +/* The argument to IP_VS_SO_GET_SERVICE */ +struct ip_vs_service_entry { + /* which service: user fills in these */ + u_int16_t protocol; + u_int32_t addr; /* virtual address */ + u_int16_t port; + u_int32_t fwmark; /* firwall mark of service */ + + /* service options */ + char sched_name[IP_VS_SCHEDNAME_MAXLEN]; + unsigned flags; /* virtual service flags */ + unsigned timeout; /* persistent timeout */ + u_int32_t netmask; /* persistent netmask */ + + /* number of real servers */ + unsigned int num_dests; + + /* statistics */ + struct ip_vs_stats_user stats; +}; + + +struct ip_vs_dest_entry { + u_int32_t addr; /* destination address */ + u_int16_t port; + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ + + u_int32_t u_threshold; /* upper threshold */ + u_int32_t l_threshold; /* lower threshold */ + + u_int32_t activeconns; /* active connections */ + u_int32_t inactconns; /* inactive connections */ + u_int32_t persistconns; /* persistent connections */ + + /* statistics */ + struct ip_vs_stats_user stats; +}; + + +/* The argument to IP_VS_SO_GET_DESTS */ +struct ip_vs_get_dests { + /* which service: user fills in these */ + u_int16_t protocol; + u_int32_t addr; /* virtual address */ + u_int16_t port; + u_int32_t fwmark; /* firwall mark of service */ + + /* number of real servers */ + unsigned int num_dests; + + /* the real servers */ + struct ip_vs_dest_entry entrytable[0]; +}; + 
+ +/* The argument to IP_VS_SO_GET_SERVICES */ +struct ip_vs_get_services { + /* number of virtual services */ + unsigned int num_services; + + /* service table */ + struct ip_vs_service_entry entrytable[0]; +}; + + +/* The argument to IP_VS_SO_GET_TIMEOUT */ +struct ip_vs_timeout_user { + int tcp_timeout; + int tcp_fin_timeout; + int udp_timeout; +}; + + +/* The argument to IP_VS_SO_GET_DAEMON */ +struct ip_vs_daemon_user { + /* sync daemon state (master/backup) */ + int state; + + /* multicast interface name */ + char mcast_ifn[IP_VS_IFNAME_MAXLEN]; + + /* SyncID we belong to */ + int syncid; +}; + + +#ifdef __KERNEL__ + +#include <linux/config.h> +#include <linux/list.h> /* for struct list_head */ +#include <linux/spinlock.h> /* for struct rwlock_t */ +#include <linux/skbuff.h> /* for struct sk_buff */ +#include <linux/ip.h> /* for struct iphdr */ +#include <asm/atomic.h> /* for struct atomic_t */ +#include <linux/netdevice.h> /* for struct neighbour */ +#include <net/dst.h> /* for struct dst_entry */ +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/compiler.h> + + +#ifdef CONFIG_IP_VS_DEBUG +extern int ip_vs_get_debug_level(void); +#define IP_VS_DBG(level, msg...) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "IPVS: " msg); \ + } while (0) +#define IP_VS_DBG_RL(msg...) \ + do { \ + if (net_ratelimit()) \ + printk(KERN_DEBUG "IPVS: " msg); \ + } while (0) +#define IP_VS_DBG_PKT(level, pp, iph, msg) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + pp->debug_packet(pp, iph, msg); \ + } while (0) +#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) \ + do { \ + if (level <= ip_vs_get_debug_level() && \ + net_ratelimit()) \ + pp->debug_packet(pp, iph, msg); \ + } while (0) +#else /* NO DEBUGGING at ALL */ +#define IP_VS_DBG(level, msg...) do {} while (0) +#define IP_VS_DBG_RL(msg...) do {} while (0) +#define IP_VS_DBG_PKT(level, pp, iph, msg) do {} while (0) +#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) do {} while (0) +#endif + +#define IP_VS_BUG() BUG() +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " msg) +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " msg) +#define IP_VS_WARNING(msg...) \ + printk(KERN_WARNING "IPVS: " msg) +#define IP_VS_ERR_RL(msg...) \ + do { \ + if (net_ratelimit()) \ + printk(KERN_ERR "IPVS: " msg); \ + } while (0) + +#ifdef CONFIG_IP_VS_DEBUG +#define EnterFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#define LeaveFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#else +#define EnterFunction(level) do {} while (0) +#define LeaveFunction(level) do {} while (0) +#endif + +#define IP_VS_WAIT_WHILE(expr) while (expr) { cpu_relax(); } + + +/* + * The port number of FTP service (in network order). 
+ */ +#define FTPPORT __constant_htons(21) +#define FTPDATA __constant_htons(20) + +/* + * IPVS sysctl variables under the /proc/sys/net/ipv4/vs/ + */ +#define NET_IPV4_VS 21 + +enum { + NET_IPV4_VS_DEBUG_LEVEL=1, + NET_IPV4_VS_AMEMTHRESH=2, + NET_IPV4_VS_AMDROPRATE=3, + NET_IPV4_VS_DROP_ENTRY=4, + NET_IPV4_VS_DROP_PACKET=5, + NET_IPV4_VS_SECURE_TCP=6, + NET_IPV4_VS_TO_ES=7, + NET_IPV4_VS_TO_SS=8, + NET_IPV4_VS_TO_SR=9, + NET_IPV4_VS_TO_FW=10, + NET_IPV4_VS_TO_TW=11, + NET_IPV4_VS_TO_CL=12, + NET_IPV4_VS_TO_CW=13, + NET_IPV4_VS_TO_LA=14, + NET_IPV4_VS_TO_LI=15, + NET_IPV4_VS_TO_SA=16, + NET_IPV4_VS_TO_UDP=17, + NET_IPV4_VS_TO_ICMP=18, + NET_IPV4_VS_LBLC_EXPIRE=19, + NET_IPV4_VS_LBLCR_EXPIRE=20, + NET_IPV4_VS_CACHE_BYPASS=22, + NET_IPV4_VS_EXPIRE_NODEST_CONN=23, + NET_IPV4_VS_SYNC_THRESHOLD=24, + NET_IPV4_VS_NAT_ICMP_SEND=25, + NET_IPV4_VS_LAST +}; + +/* + * TCP State Values + */ +enum { + IP_VS_TCP_S_NONE = 0, + IP_VS_TCP_S_ESTABLISHED, + IP_VS_TCP_S_SYN_SENT, + IP_VS_TCP_S_SYN_RECV, + IP_VS_TCP_S_FIN_WAIT, + IP_VS_TCP_S_TIME_WAIT, + IP_VS_TCP_S_CLOSE, + IP_VS_TCP_S_CLOSE_WAIT, + IP_VS_TCP_S_LAST_ACK, + IP_VS_TCP_S_LISTEN, + IP_VS_TCP_S_SYNACK, + IP_VS_TCP_S_LAST +}; + +/* + * UDP State Values + */ +enum { + IP_VS_UDP_S_NORMAL, + IP_VS_UDP_S_LAST, +}; + +/* + * ICMP State Values + */ +enum { + IP_VS_ICMP_S_NORMAL, + IP_VS_ICMP_S_LAST, +}; + +/* + * Transport protocol header + */ +union ip_vs_tphdr { + unsigned char *raw; + struct udphdr *uh; + struct tcphdr *th; + struct icmphdr *icmph; + __u16 *portp; +}; + + +/* + * Delta sequence info structure + * Each ip_vs_conn has 2 (output AND input seq. changes). + * Only used in the VS/NAT. + */ +struct ip_vs_seq { + __u32 init_seq; /* Add delta from this seq */ + __u32 delta; /* Delta in sequence numbers */ + __u32 previous_delta; /* Delta in sequence numbers + before last resized pkt */ +}; + + +/* + * IPVS statistics object + */ +struct ip_vs_stats +{ + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ + + __u32 cps; /* current connection rate */ + __u32 inpps; /* current in packet rate */ + __u32 outpps; /* current out packet rate */ + __u32 inbps; /* current in byte rate */ + __u32 outbps; /* current out byte rate */ + + spinlock_t lock; /* spin lock */ +}; + +struct ip_vs_conn; +struct ip_vs_app; + +struct ip_vs_protocol { + struct ip_vs_protocol *next; + char *name; + __u16 protocol; + int minhlen; + int minhlen_icmp; + int dont_defrag; + int skip_nonexisting; + int slave; /* if controlled by others */ + atomic_t appcnt; /* counter of proto app incs */ + int *timeout_table; /* protocol timeout table */ + + void (*init)(struct ip_vs_protocol *pp); + + void (*exit)(struct ip_vs_protocol *pp); + + int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp); + + struct ip_vs_conn * + (*conn_in_get)(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct iphdr *iph, + union ip_vs_tphdr h, int inverse); + + struct ip_vs_conn * + (*conn_out_get)(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct iphdr *iph, + union ip_vs_tphdr h, int inverse); + + int (*snat_handler)(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr *iph, union ip_vs_tphdr h, int size); + + int (*dnat_handler)(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr 
*iph, union ip_vs_tphdr h, int size); + + int (*csum_check)(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct iphdr *iph, + union ip_vs_tphdr h, int size); + + const char *(*state_name)(int state); + + int (*state_transition)(struct ip_vs_conn *cp, int direction, + struct iphdr *iph, union ip_vs_tphdr h, + struct ip_vs_protocol *pp); + + int (*register_app)(struct ip_vs_app *inc); + + void (*unregister_app)(struct ip_vs_app *inc); + + int (*app_conn_bind)(struct ip_vs_conn *cp); + + void (*debug_packet)(struct ip_vs_protocol *pp, struct iphdr *iph, + char *msg); + + void (*timeout_change)(struct ip_vs_protocol *pp, int flags); + + int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to); +}; + +extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto); + +/* + * IP_VS structure allocated for each dynamically scheduled connection + */ +struct ip_vs_conn { + struct list_head c_list; /* hashed list heads */ + + /* Protocol, addresses and port numbers */ + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + __u16 cport; + __u16 vport; + __u16 dport; + __u16 protocol; /* Which protocol (TCP/UDP) */ + + /* counter and timer */ + atomic_t refcnt; /* reference count */ + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + + /* Flags and state transition */ + spinlock_t lock; /* lock for state transition */ + volatile __u16 flags; /* status flags */ + volatile __u16 state; /* state info */ + + /* Control members */ + struct ip_vs_conn *control; /* Master control connection */ + atomic_t n_control; /* Number of controlled ones */ + struct ip_vs_dest *dest; /* real server */ + atomic_t in_pkts; /* incoming packet counter */ + + /* packet transmitter for different forwarding methods */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp); + + /* Note: we can group the following members into a structure, + in order to save more space, and the following members are + only used in VS/NAT anyway */ + struct ip_vs_app *app; /* bound ip_vs_app object */ + void *app_data; /* Application private data */ + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + + +/* + * The information about the virtual service offered to the net + * and the forwarding entries + */ +struct ip_vs_service { + struct list_head s_list; /* for normal service table */ + struct list_head f_list; /* for fwmark-based service table */ + atomic_t refcnt; /* reference counter */ + atomic_t usecnt; /* use counter */ + + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 addr; /* IP address for virtual service */ + __u16 port; /* port number for the service */ + __u32 fwmark; /* firewall mark of the service */ + unsigned flags; /* service status flags */ + unsigned timeout; /* persistent timeout in ticks */ + __u32 netmask; /* grouping granularity */ + + struct list_head destinations; /* real server d-linked list */ + __u32 num_dests; /* number of servers */ + struct ip_vs_stats stats; /* statistics for the service */ + struct ip_vs_app *inc; /* bind conns to this app inc */ + + /* for scheduling */ + struct ip_vs_scheduler *scheduler; /* bound scheduler object */ + rwlock_t sched_lock; /* lock sched_data */ + void *sched_data; /* scheduler application data */ +}; + + +/* + * The real server destination forwarding entry + * with ip address, port number, and so on. 
+ */ +struct ip_vs_dest { + struct list_head n_list; /* for the dests in the service */ + struct list_head d_list; /* for table with all the dests */ + + __u32 addr; /* IP address of the server */ + __u16 port; /* port number of the server */ + volatile unsigned flags; /* dest status flags */ + atomic_t conn_flags; /* flags to copy to conn */ + atomic_t weight; /* server weight */ + + atomic_t refcnt; /* reference counter */ + struct ip_vs_stats stats; /* statistics */ + + /* connection counters and thresholds */ + atomic_t activeconns; /* active connections */ + atomic_t inactconns; /* inactive connections */ + atomic_t persistconns; /* persistent connections */ + __u32 u_threshold; /* upper threshold */ + __u32 l_threshold; /* lower threshold */ + + /* for destination cache */ + spinlock_t dst_lock; /* lock of dst_cache */ + struct dst_entry *dst_cache; /* destination cache entry */ + u32 dst_rtos; /* RT_TOS(tos) for dst */ + + /* for virtual service */ + struct ip_vs_service *svc; /* service it belongs to */ + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 vaddr; /* virtual IP address */ + __u16 vport; /* virtual port number */ + __u32 vfwmark; /* firewall mark of service */ +}; + + +/* + * The scheduler object + */ +struct ip_vs_scheduler { + struct list_head n_list; /* d-linked list head */ + char *name; /* scheduler name */ + atomic_t refcnt; /* reference counter */ + struct module *module; /* THIS_MODULE/NULL */ + + /* scheduler initializing service */ + int (*init_service)(struct ip_vs_service *svc); + /* scheduling service finish */ + int (*done_service)(struct ip_vs_service *svc); + /* scheduler updating service */ + int (*update_service)(struct ip_vs_service *svc); + + /* selecting a server from the given service */ + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, + struct iphdr *iph); +}; + + +/* + * The application module object (a.k.a. 
app incarnation) + */ +struct ip_vs_app +{ + struct list_head a_list; /* member in app list */ + int type; /* IP_VS_APP_TYPE_xxx */ + char *name; /* application module name */ + __u16 protocol; + struct module *module; /* THIS_MODULE/NULL */ + struct list_head incs_list; /* list of incarnations */ + + /* members for application incarnations */ + struct list_head p_list; /* member in proto app list */ + struct ip_vs_app *app; /* its real application */ + __u16 port; /* port number in net order */ + atomic_t usecnt; /* usage counter */ + + /* output hook */ + int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, + struct sk_buff *); + + /* input hook */ + int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, + struct sk_buff *); + + /* ip_vs_app initializer */ + int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); + + /* ip_vs_app finish */ + int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); + + + /* not used now */ + int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *, + struct ip_vs_protocol *); + + void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *); + + int * timeout_table; + int * timeouts; + int timeouts_size; + + int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp); + + struct ip_vs_conn * + (*conn_in_get)(struct sk_buff *skb, struct ip_vs_app *app, + struct iphdr *iph, union ip_vs_tphdr h, int inverse); + + struct ip_vs_conn * + (*conn_out_get)(struct sk_buff *skb, struct ip_vs_app *app, + struct iphdr *iph, union ip_vs_tphdr h, int inverse); + + int (*state_transition)(struct ip_vs_conn *cp, int direction, + struct iphdr *iph, + union ip_vs_tphdr h, struct ip_vs_app *app); + + void (*timeout_change)(struct ip_vs_app *app, int flags); +}; + + +/* + * IPVS core functions + * (from ip_vs_core.c) + */ +extern const char *ip_vs_proto_name(unsigned proto); +extern unsigned int check_for_ip_vs_out(struct sk_buff **skb_p, + int (*okfn)(struct sk_buff *)); +extern void ip_vs_init_hash_table(struct list_head *table, int rows); +#define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table(t, sizeof(t)/sizeof(t[0])) + +#define IP_VS_APP_TYPE_UNSPEC 0 +#define IP_VS_APP_TYPE_FTP 1 + +/* + * ip_vs_conn handling functions + * (from ip_vs_conn.c) + */ + +/* + * IPVS connection entry hash table + */ +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif +/* make sure that IP_VS_CONN_TAB_BITS is located in [8, 20] */ +#if CONFIG_IP_VS_TAB_BITS < 8 +#define IP_VS_CONN_TAB_BITS 8 +#endif +#if CONFIG_IP_VS_TAB_BITS > 20 +#define IP_VS_CONN_TAB_BITS 20 +#endif +#if 8 <= CONFIG_IP_VS_TAB_BITS && CONFIG_IP_VS_TAB_BITS <= 20 +#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS +#endif +#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) +#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) + +enum { + IP_VS_DIR_INPUT = 0, + IP_VS_DIR_OUTPUT, + IP_VS_DIR_INPUT_ONLY, + IP_VS_DIR_LAST, +}; + +extern struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); +extern struct ip_vs_conn *ip_vs_conn_out_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); + +/* put back the conn without restarting its timer */ +static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) +{ + atomic_dec(&cp->refcnt); +} +extern void ip_vs_conn_put(struct ip_vs_conn *cp); +extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport); + +extern struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 
cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest); +extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); + +extern const char * ip_vs_state_name(__u16 proto, int state); + +extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); +extern int ip_vs_check_template(struct ip_vs_conn *ct); +extern void ip_vs_secure_tcp_set(int on); +extern void ip_vs_random_dropentry(void); +extern int ip_vs_conn_init(void); +extern void ip_vs_conn_cleanup(void); + +static inline void ip_vs_control_del(struct ip_vs_conn *cp) +{ + struct ip_vs_conn *ctl_cp = cp->control; + if (!ctl_cp) { + IP_VS_ERR("request control DEL for uncontrolled: " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + return; + } + + IP_VS_DBG(7, "DELeting control for: " + "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + + cp->control = NULL; + if (atomic_read(&ctl_cp->n_control) == 0) { + IP_VS_ERR("BUG control DEL with n=0 : " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + return; + } + atomic_dec(&ctl_cp->n_control); +} + +static inline void +ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) +{ + if (cp->control) { + IP_VS_ERR("request control ADD for already controlled: " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + ip_vs_control_del(cp); + } + + IP_VS_DBG(7, "ADDing control for: " + "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + + cp->control = ctl_cp; + atomic_inc(&ctl_cp->n_control); +} + + +/* + * IPVS application functions + * (from ip_vs_app.c) + */ +#define IP_VS_APP_MAX_PORTS 8 +extern int register_ip_vs_app(struct ip_vs_app *app); +extern void unregister_ip_vs_app(struct ip_vs_app *app); +extern int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern void ip_vs_unbind_app(struct ip_vs_conn *cp); +extern int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port); +extern int ip_vs_app_inc_get(struct ip_vs_app *inc); +extern void ip_vs_app_inc_put(struct ip_vs_app *inc); + +extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); +extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); +extern int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len); +extern int ip_vs_app_init(void); +extern void ip_vs_app_cleanup(void); + + +/* + * IPVS protocol functions (from ip_vs_proto.c) + */ +extern int ip_vs_protocol_init(void); +extern void ip_vs_protocol_cleanup(void); +extern void ip_vs_protocol_timeout_change(int flags); +extern int *ip_vs_create_timeout_table(int *table, int size); +extern int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to); +extern struct ip_vs_protocol ip_vs_protocol_tcp; +extern struct ip_vs_protocol ip_vs_protocol_udp; +extern struct ip_vs_protocol ip_vs_protocol_icmp; +extern struct ip_vs_protocol ip_vs_protocol_esp; +extern struct ip_vs_protocol ip_vs_protocol_ah; + + +/* + * Registering/unregistering scheduler functions + * (from ip_vs_sched.c) + */ +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler 
*scheduler); +extern int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler); +extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); +extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); +extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); +extern struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph); +extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, union ip_vs_tphdr h); + + +/* + * IPVS control data and functions (from ip_vs_ctl.c) + */ +extern int sysctl_ip_vs_cache_bypass; +extern int sysctl_ip_vs_expire_nodest_conn; +extern int sysctl_ip_vs_sync_threshold[2]; +extern int sysctl_ip_vs_nat_icmp_send; +extern atomic_t ip_vs_dropentry; +extern struct ip_vs_stats ip_vs_stats; + +extern struct ip_vs_service * +ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport); + +static inline void ip_vs_service_put(struct ip_vs_service *svc) +{ + atomic_dec(&svc->usecnt); +} + +extern struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport); +extern int ip_vs_use_count_inc(void); +extern void ip_vs_use_count_dec(void); +extern void update_defense_level(void); +extern int ip_vs_control_init(void); +extern void ip_vs_control_cleanup(void); + + +/* + * IPVS sync daemon data and function prototypes + * (from ip_vs_sync.c) + */ +extern volatile int ip_vs_sync_state; +extern volatile int ip_vs_master_syncid; +extern volatile int ip_vs_backup_syncid; +extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); +extern int stop_sync_thread(int state); +extern void ip_vs_sync_conn(struct ip_vs_conn *cp); + + +/* + * IPVS rate estimator prototypes (from ip_vs_est.c) + */ +extern int ip_vs_new_estimator(struct ip_vs_stats *stats); +extern void ip_vs_kill_estimator(struct ip_vs_stats *stats); +extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); + +/* + * Various IPVS packet transmitters (from ip_vs_xmit.c) + */ +extern int ip_vs_null_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_bypass_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_nat_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_tunnel_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_dr_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_icmp_xmit +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern void ip_vs_dst_reset(struct ip_vs_dest *dest); + + +/* + * This is a simple mechanism to ignore packets when + * we are loaded. 
Just set ip_vs_drop_rate to 'n' and + * we start to drop 1/rate of the packets + */ +extern int ip_vs_drop_rate; +extern int ip_vs_drop_counter; + +static __inline__ int ip_vs_todrop(void) +{ + if (!ip_vs_drop_rate) return 0; + if (--ip_vs_drop_counter > 0) return 0; + ip_vs_drop_counter = ip_vs_drop_rate; + return 1; +} + +/* + * ip_vs_fwd_tag returns the forwarding tag of the connection + */ +#define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) + +extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp) +{ + char fwd; + + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + fwd = 'M'; break; + case IP_VS_CONN_F_LOCALNODE: + fwd = 'L'; break; + case IP_VS_CONN_F_TUNNEL: + fwd = 'T'; break; + case IP_VS_CONN_F_DROUTE: + fwd = 'R'; break; + case IP_VS_CONN_F_BYPASS: + fwd = 'B'; break; + default: + fwd = '?'; break; + } + return fwd; +} + + +static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum) +{ + u32 diff[2] = { old, new }; + + return csum_fold(csum_partial((char *) diff, sizeof(diff), + oldsum ^ 0xFFFF)); +} + +#endif /* __KERNEL__ */ + +#endif /* _IP_VS_H */ diff --git a/net/compat.c b/net/compat.c index 625a95c9e6bf..459307c2038e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -405,34 +405,6 @@ static int do_set_attach_filter(int fd, int level, int optname, sizeof(struct sock_fprog)); } -static int do_set_icmpv6_filter(int fd, int level, int optname, - char *optval, int optlen) -{ - struct icmp6_filter kfilter; - mm_segment_t old_fs; - int ret, i; - - if (optlen < sizeof(kfilter)) - return -EINVAL; - if (copy_from_user(&kfilter, optval, sizeof(kfilter))) - return -EFAULT; - - for (i = 0; i < 8; i += 2) { - u32 tmp = kfilter.data[i]; - - kfilter.data[i] = kfilter.data[i + 1]; - kfilter.data[i + 1] = tmp; - } - - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_setsockopt(fd, level, optname, - (char *) &kfilter, sizeof(kfilter)); - set_fs(old_fs); - - return ret; -} - static int do_set_sock_timeout(int fd, int level, int optname, char *optval, int optlen) { struct compat_timeval *up = (struct compat_timeval *) optval; @@ -465,9 +437,6 @@ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, optval, optlen); if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) return do_set_sock_timeout(fd, level, optname, optval, optlen); - if (level == SOL_ICMPV6 && optname == ICMPV6_FILTER) - return do_set_icmpv6_filter(fd, level, optname, - optval, optlen); return sys_setsockopt(fd, level, optname, optval, optlen); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index dca22740e2d2..fcb38de5ad40 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1,7 +1,7 @@ /* * net-sysfs.c - network device class and attributes * - * Copyright (c) 2003 Stephen Hemminber <shemminger@osdl.org> + * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> * */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 5b60c0683b24..be334d427b70 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -375,4 +375,5 @@ config INET_IPCOMP If unsure, say Y. 
source "net/ipv4/netfilter/Kconfig" +source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9db534a0cf0f..99c8f308bb60 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -21,5 +21,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IP_VS) += ipvs/ obj-y += xfrm4_policy.o xfrm4_state.o xfrm4_input.o xfrm4_tunnel.o diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig new file mode 100644 index 000000000000..a6487f20a50b --- /dev/null +++ b/net/ipv4/ipvs/Kconfig @@ -0,0 +1,257 @@ +# +# IP Virtual Server configuration +# +menu "IP: Virtual Server Configuration" + depends on INET && NETFILTER + +config IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on INET && NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incomming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: + http://www.linuxvirtualserver.org/ + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + depends on IP_VS + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + depends on IP_VS + default "12" + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. 
+ +comment "IPVS transport protocol load balancing support" + depends on IP_VS + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing ESP (Encapsultion + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +comment "IPVS scheduler" + depends on IP_VS + +config IP_VS_RR + tristate "round-robin scheduling" + depends on IP_VS + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + depends on IP_VS + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling scheduling" + depends on IP_VS + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + depends on IP_VS + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection with replication scheduling" + depends on IP_VS + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. 
+ +config IP_VS_LBLCR + tristate "locality-based least-connection with replication schedulin" + depends on IP_VS + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + depends on IP_VS + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + depends on IP_VS + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + depends on IP_VS + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + depends on IP_VS + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +comment 'IPVS application helper' + depends on IP_VS + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. 
In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. If you want to compile + it as a module, say M here and read Documentation/modules.txt. If + unsure, say N. + +endmenu diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile new file mode 100644 index 000000000000..15fcc9e41ef1 --- /dev/null +++ b/net/ipv4/ipvs/Makefile @@ -0,0 +1,40 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c new file mode 100644 index 000000000000..9242eef96bcf --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -0,0 +1,588 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). 
+ * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DECLARE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + /* test and get the module atomically */ + if (app->module) + return try_module_get(app->module); + else + return 1; +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + if (app->module) + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL); + if (!inc) + return -ENOMEM; + memcpy(inc, app, sizeof(*inc)); + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + if (inc->timeout_table) + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + if (inc->timeout_table != NULL) + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + down(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + up(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + 
down(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + up(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc; + struct list_head *l = &app->incs_list; + + down(&__ip_vs_app_mutex); + + while (l->next != l) { + inc = list_entry(l->next, struct ip_vs_app, a_list); + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + up(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +#if 0000 +/* + * Get reference to app by name (called from user context) + */ +struct ip_vs_app *ip_vs_app_get_by_name(char *appname) +{ + struct list_head *p; + struct ip_vs_app *app, *a = NULL; + + down(&__ip_vs_app_mutex); + + list_for_each (p, &ip_vs_app_list) { + app = list_entry(p, struct ip_vs_app, a_list); + if (strcmp(app->name, appname)) + continue; + + /* softirq may call ip_vs_app_get too, so the caller + must disable softirq on the current CPU */ + if (ip_vs_app_get(app)) + a = app; + break; + } + + up(&__ip_vs_app_mutex); + + return a; +} +#endif + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. 
+ */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns (new - old) skb->len diff. + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + int diff; + struct iphdr *iph; + struct tcphdr *th; + __u32 seq; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 0; + + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->protocol == IPPROTO_TCP) { + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + } + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 0; + + diff = app->pkt_out(app, cp, skb); + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0 && cp->protocol == IPPROTO_TCP) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return diff; +} + + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns (new - old) skb->len diff. + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + int diff; + struct iphdr *iph; + struct tcphdr *th; + __u32 seq; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 0; + + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->protocol == IPPROTO_TCP) { + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + } + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 0; + + diff = app->pkt_in(app, cp, skb); + + /* + * Update ip_vs seq stuff if len has changed. 
+ */ + if (diff != 0 && cp->protocol == IPPROTO_TCP) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return diff; +} + + +/* + * /proc/net/ip_vs_app entry function + */ +static int +ip_vs_app_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0; + int len=0; + char temp[64]; + struct ip_vs_app *app, *inc; + struct list_head *e, *i; + + pos = 64; + if (pos > offset) { + len += sprintf(buffer+len, "%-63s\n", + "prot port usecnt name"); + } + + down(&__ip_vs_app_mutex); + list_for_each (e, &ip_vs_app_list) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each (i, &app->incs_list) { + inc = list_entry(i, struct ip_vs_app, a_list); + + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, "%-3s %-7u %-6d %-17s", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + len += sprintf(buffer+len, "%-63s\n", temp); + if (pos >= offset+length) + goto done; + } + } + done: + up(&__ip_vs_app_mutex); + + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + struct iphdr *iph; + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + memcpy(skb->data + o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_create("ip_vs_app", 0, ip_vs_app_getinfo); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove("ip_vs_app"); +} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c new file mode 100644 index 000000000000..73de3e8e8779 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -0,0 +1,869 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> /* for proc_net_* */ +#include <linux/jhash.h> +#include <linux/random.h> + +#include <net/ip_vs.h> + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static kmem_cache_t *ip_vs_conn_cachep; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) +#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) + +struct ip_vs_aligned_lock +{ + rwlock_t l; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +/* lock array for conn table */ +struct ip_vs_aligned_lock +__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + +static inline void ct_read_lock(unsigned key) +{ + read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock(unsigned key) +{ + read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock(unsigned key) +{ + write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock(unsigned key) +{ + write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_lock_bh(unsigned key) +{ + read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock_bh(unsigned key) +{ + read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock_bh(unsigned key) +{ + write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock_bh(unsigned key) +{ + write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + + +/* + * Returns hash value for IPVS connection entry + */ +static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + return jhash_3words(addr, port, proto, ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +} + + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + + if (cp->flags & IP_VS_CONN_F_HASHED) { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + + ct_write_unlock(hash); + + return 1; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. 
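+ * Bucket and lock are derived from the same key, roughly: + * hash = jhash_3words(cp->caddr, cp->cport, cp->protocol, ip_vs_conn_rnd) + * & IP_VS_CONN_TAB_MASK; + * lock = &__ip_vs_conntbl_lock_array[hash & CT_LOCKARRAY_MASK].l; + * so several buckets share one of the 1<<CT_LOCKARRAY_BITS rwlocks.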
+ */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + IP_VS_ERR("ip_vs_conn_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + ct_write_lock(hash); + + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + + ct_write_unlock(hash); + + return 1; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct list_head *l,*e; + + hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + l = &ip_vs_conn_tab[hash]; + + ct_read_lock(hash); + + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (s_addr==cp->caddr && s_port==cp->cport && + d_port==cp->vport && d_addr==cp->vaddr && + protocol==cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + + IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + cp?"hit":"not hit"); + + return cp; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. 
+ * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + l = &ip_vs_conn_tab[hash]; + + ct_read_lock(hash); + + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (d_addr == cp->caddr && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ret?"hit":"not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport) +{ + atomic_dec(&ip_vs_conn_no_cport_cnt); + ip_vs_conn_unhash(cp); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + /* hash on new dport */ + ip_vs_conn_hash(cp); +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. 
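+ * Binding copies atomic_read(&dest->conn_flags) into cp->flags (which + * ip_vs_bind_xmit() later uses to pick the transmitter), takes a + * reference on the dest and updates its counters: inactconns for normal + * connections, persistconns for templates. When u_threshold is set and + * activeconns+inactconns reaches it, the dest is flagged + * IP_VS_DEST_F_OVERLOAD.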
+ */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. 
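+ * If the dest is gone or no longer IP_VS_DEST_F_AVAILABLE, the template + * is rehashed with unusable ports (dport=vport=65535, cport=0) so it + * cannot match client packets any more, and its refcnt is dropped + * without rearming the timer, letting it expire.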
+ */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) { + IP_VS_DBG(9, "check_template: dest not available for " + "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "-> d:%u.%u.%u.%u:%d\n", + ip_vs_proto_name(ct->protocol), + NIPQUAD(ct->caddr), ntohs(ct->cport), + NIPQUAD(ct->vaddr), ntohs(ct->vport), + NIPQUAD(ct->daddr), ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + ip_vs_conn_unhash(ct); + ct->dport = 65535; + ct->vport = 65535; + ct->cport = 0; + ip_vs_conn_hash(ct); + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + ip_vs_conn_unhash(cp); + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* make sure that there is no timer on it now */ + del_timer_sync(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + //ip_vs_timeout_detach(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + cp->timeout = 0; + mod_timer(&cp->timer, jiffies); + __ip_vs_conn_put(cp); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + memset(cp, 0, sizeof(*cp)); + INIT_LIST_HEAD(&cp->c_list); + init_timer(&cp->timer); + cp->timer.data = (unsigned long)cp; + cp->timer.function = ip_vs_conn_expire; + cp->protocol = proto; + cp->caddr = caddr; + cp->cport = cport; + cp->vaddr = vaddr; + cp->vport = vport; + cp->daddr = daddr; + cp->dport = dport; + cp->flags = flags; + cp->lock = SPIN_LOCK_UNLOCKED; + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. 
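+ * Ordering matters here: the refcnt is set to 1 and the dest is bound + * before ip_vs_conn_hash() makes the entry visible; the initial state + * and 3*HZ timeout are provisional until the protocol state machine + * takes over.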
+ */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +static int +ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0; + int idx, len=0; + char temp[70]; + struct ip_vs_conn *cp; + struct list_head *l, *e; + + pos = 128; + if (pos > offset) { + len += sprintf(buffer+len, "%-127s\n", + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires"); + } + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. + */ + ct_read_lock_bh(idx); + + l = &ip_vs_conn_tab[idx]; + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + pos += 128; + if (pos <= offset) + continue; + sprintf(temp, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + len += sprintf(buffer+len, "%-127s\n", temp); + if (pos >= offset+length) { + ct_read_unlock_bh(idx); + goto done; + } + } + ct_read_unlock_bh(idx); + } + + done: + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (cp->timeout+jiffies-cp->timer.expires < 60*HZ) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + + +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + struct list_head *l,*e; + struct ip_vs_conn *ct; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. 
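+ * Drop policy (see todrop_entry() above): connection templates are + * skipped; TCP entries in SYN_RECV/SYNACK are always candidates, while + * ESTABLISHED TCP and non-TCP entries go through todrop_entry(), which + * keeps anything younger than 60 seconds and otherwise drops at a rate + * taken from todrop_rate[] indexed by in_pkts (e.g. in_pkts==3 expires + * roughly one of every three scanned candidates).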
+ */ + ct_write_lock(hash); + + l = &ip_vs_conn_tab[hash]; + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + /* + * Drop the entry, and drop its ct if not referenced + */ + atomic_inc(&cp->refcnt); + ct_write_unlock(hash); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(hash); + } + ct_write_unlock(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + struct list_head *l,*e; + struct ip_vs_conn *ct; + + flush_again: + for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(idx); + + l = &ip_vs_conn_tab[idx]; + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + atomic_inc(&cp->refcnt); + ct_write_unlock(idx); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(idx); + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + __ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED; + } + + proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove("ip_vs_conn"); + vfree(ip_vs_conn_tab); +} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c new file mode 100644 index 000000000000..5695301dc253 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -0,0 +1,1166 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * 
LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/icmp.h> + +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif +EXPORT_SYMBOL(check_for_ip_vs_out); + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) ((icmph->un).echo.id) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.inpkts++; + dest->svc->stats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.inpkts++; + ip_vs_stats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.outpkts++; + ip_vs_stats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 
+{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + struct iphdr *iph, union ip_vs_tphdr h, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 0; + return pp->state_transition(cp, direction, iph, h, pp); +} + + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_dest *dest; + const __u16 *portp; + struct ip_vs_conn *ct; + __u16 dport; /* destination port to forward */ + __u32 snet; /* source network of the client, after masking */ + + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + + /* Mask saddr with the netmask to adjust template granularity */ + snet = iph->saddr & svc->netmask; + + IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " + "mnet %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), ntohs(portp[0]), + NIPQUAD(iph->daddr), ntohs(portp[1]), + NIPQUAD(snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP + * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> + * is created for other persistent services. + */ + if (portp[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, portp[1]); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like <protocol,caddr,0, + * vaddr,vport,daddr,dport> for non-ftp service, + * and <protocol,caddr,0,vaddr,0,daddr,0> + * for ftp service. + */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, portp[1], + dest->addr, dest->port, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. 
+ * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> + * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> + */ + if (svc->fwmark) + ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0, + htonl(svc->fwmark), 0); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) + ct = ip_vs_conn_new(IPPROTO_IP, + snet, 0, + htonl(svc->fwmark), 0, + dest->addr, 0, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = portp[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, portp[0], + iph->daddr, portp[1], + dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_dest *dest; + const __u16 *portp; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, iph); + + /* + * Non-persistent service + */ + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + if (!svc->fwmark && portp[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, portp[0], + iph->daddr, portp[1], + dest->addr, dest->port?dest->port:portp[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " + "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + ip_vs_fwd_tag(cp), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. 
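+ * Three outcomes are possible below: for fwmark services with the + * cache_bypass sysctl set and a unicast destination, an + * IP_VS_CONN_F_BYPASS entry is created and the packet forwarded; packets + * to a virtual FTP address but a non-FTP port are passed (NF_ACCEPT); + * everything else gets ICMP_PORT_UNREACH and the skb is freed + * (NF_STOLEN).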
+ */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, union ip_vs_tphdr h) +{ + struct iphdr *iph = skb->nh.iph; + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is RTN_UNICAST (and not local), then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark + && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + int ret, cs; + struct ip_vs_conn *cp; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, h.portp[0], + iph->daddr, h.portp[1], + 0, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) { + kfree_skb(skb); + return NF_STOLEN; + } + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (h.portp[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff **skb_p, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + + if (!(skb->nfcache & NFC_IPVS_PROPERTY)) + return NF_ACCEPT; + + /* The packet was sent from IPVS, exit this chain */ + (*okfn)(skb); + + return NF_STOLEN; +} + + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + * (Only used in VS/NAT) + */ +static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + struct icmphdr *icmph; + struct iphdr *ciph; /* The ip header contained within the ICMP */ + unsigned short ihl; + unsigned short len; + unsigned short clen, cihl; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + union ip_vs_tphdr h; + + *related = 1; + + /* reassemble IP fragments, but will it happen in ICMP packets?? 
*/ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + *skb_p = skb; + } + + if (skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) + return NF_DROP; + ip_send_check(skb->nh.iph); + } + + iph = skb->nh.iph; + ihl = iph->ihl << 2; + icmph = (struct icmphdr *)((char *)iph + ihl); + len = ntohs(iph->tot_len) - ihl; + if (len < sizeof(struct icmphdr)) + return NF_DROP; + + IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + icmph->type, ntohs(icmp_id(icmph)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + clen = len - sizeof(struct icmphdr); + if (clen < sizeof(struct iphdr)) + return NF_DROP; + ciph = (struct iphdr *) (icmph + 1); + cihl = ciph->ihl << 2; + if (clen < cihl) + return NF_DROP; + + pp = ip_vs_proto_get(ciph->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) && + (pp->minhlen || pp->dont_defrag))) + return NF_ACCEPT; + + /* We need at least TCP/UDP ports here */ + if (clen < cihl + pp->minhlen_icmp) + return NF_DROP; + + h.raw = (char *) ciph + cihl; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) { + /* Failed checksum! */ + IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + return NF_DROP; + } + + IP_VS_DBG_PKT(11, pp, ciph, "Handling outgoing ICMP for"); + + /* ciph content is actually <protocol, caddr, cport, daddr, dport> */ + cp = pp->conn_out_get(skb, pp, ciph, h, 1); + if (!cp) + return NF_ACCEPT; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the" + "half connection in the tun/dr module.\n"); + } + + /* Now we do real damage to this packet...! */ + /* First change the source IP address, and recalc checksum */ + iph->saddr = cp->vaddr; + ip_send_check(iph); + + /* Now change the *dest* address in the contained IP */ + ciph->daddr = cp->vaddr; + ip_send_check(ciph); + + /* the TCP/UDP dest port - cannot redo check */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) + h.portp[1] = cp->vport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + __ip_vs_conn_put(cp); + + IP_VS_DBG_PKT(11, pp, ciph, "Forwarding correct outgoing ICMP"); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + return NF_ACCEPT; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn, + * rewrite addresses of the packet and send it on its way... 
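+ * Rough sequence (sketch): defragment if needed, pull the transport + * header, look the packet up with pp->conn_out_get(); on a hit the + * source address is rewritten to cp->vaddr (plus pp->snat_handler() for + * ports/payload), checksums are redone, stats and state are updated and + * skb->nfcache is marked NFC_IPVS_PROPERTY so ip_vs_post_routing() lets + * the packet skip iptable_nat.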
+ */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + struct ip_vs_protocol *pp; + union ip_vs_tphdr h; + struct ip_vs_conn *cp; + int size, ihl, firstfrag; + + EnterFunction(11); + + if (skb->nfcache & NFC_IPVS_PROPERTY) + return NF_ACCEPT; + + iph = skb->nh.iph; + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb_p, &related); + + if (related) + return verdict; + } + + pp = ip_vs_proto_get(iph->protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ + if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + iph = skb->nh.iph; + *skb_p = skb; + } + + /* make sure that protocol header is available in skb data area, + note that skb data area may be reallocated. */ + ihl = iph->ihl << 2; + firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET)); + /* + * WARNING: we can work with !firstfrag packets, make sure + * each protocol handler checks for firstfrag + */ + if (firstfrag && + !pskb_may_pull(skb, ihl+pp->minhlen)) + return NF_DROP; + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(skb, pp, iph, h, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP) && + ip_vs_lookup_real_service(iph->protocol, + iph->saddr, h.portp[0])) { + /* + * Notify the real server: there is no existing + * entry if it is not RST packet or not TCP packet. + */ + if (!h.th->rst || iph->protocol != IPPROTO_TCP) { + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + kfree_skb(skb); + return NF_STOLEN; + } + } + IP_VS_DBG_PKT(12, pp, iph, + "packet continues traversal as normal"); + if (!pp->dont_defrag) + ip_send_check(iph); + return NF_ACCEPT; + } + + /* + * If it has ip_vs_app helper, the helper may change the payload, + * so it needs full checksum checking and checksum calculation. + * If not, only the header (addr/port) is changed, so it is fast + * to do incremental checksum update, and let the destination host + * do final checksum checking. + */ + + if (unlikely(cp->app && !pp->slave && skb_is_nonlinear(skb))) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) { + ip_vs_conn_put(cp); + return NF_DROP; + } + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + } + + size = skb->len - ihl; + IP_VS_DBG(11, "O-pkt: %s size=%d\n", pp->name, size); + + /* do TCP/UDP checksum checking if it has application helper */ + if (unlikely(cp->app && pp->csum_check && !pp->slave)) { + if (!pp->csum_check(skb, pp, iph, h, size)) { + ip_vs_conn_put(cp); + return NF_DROP; + } + } + + IP_VS_DBG_PKT(11, pp, iph, "Outgoing packet"); + + /* mangle the packet */ + iph->saddr = cp->vaddr; + if (pp->snat_handler) { + pp->snat_handler(skb, pp, cp, iph, h, size); + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + } + ip_send_check(iph); + + IP_VS_DBG_PKT(10, pp, iph, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, iph, h, pp); + ip_vs_conn_put(cp); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + LeaveFunction(11); + return NF_ACCEPT; +} + + +/* + * Check if the packet is for VS/NAT connections, then send it + * immediately. 
+ * Called by ip_fw_compact to detect packets for VS/NAT before + * they are changed by ipchains masquerading code. + */ +unsigned int +check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + + ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL); + if (ret != NF_ACCEPT) { + return ret; + } else { + /* send the packet immediately if it is already mangled + by ip_vs_out */ + if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) { + (*okfn)(*skb_p); + return NF_STOLEN; + } + } + return NF_ACCEPT; +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded + */ +static int ip_vs_in_icmp(struct sk_buff **skb_p, int *related) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + struct icmphdr *icmph; + struct iphdr *ciph; /* The ip header contained within the ICMP */ + unsigned short len; + unsigned short clen, cihl; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + union ip_vs_tphdr h; + int rc; + + *related = 1; + if (skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) + return NF_DROP; + ip_send_check(skb->nh.iph); + } + + iph = skb->nh.iph; + icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2)); + len = ntohs(iph->tot_len) - (iph->ihl<<2); + if (len < sizeof(struct icmphdr)) + return NF_DROP; + + IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n", + icmph->type, ntohs(icmp_id(icmph)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* + * If we get here we have an ICMP error of one of the above 3 types + * Now find the contained IP header + */ + clen = len - sizeof(struct icmphdr); + if (clen < sizeof(struct iphdr)) + return NF_DROP; + ciph = (struct iphdr *) (icmph + 1); + cihl = ciph->ihl << 2; + if (clen < cihl) + return NF_DROP; + + pp = ip_vs_proto_get(ciph->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) && + (pp->minhlen || pp->dont_defrag))) + return NF_ACCEPT; + + /* We need at least TCP/UDP ports here */ + if (clen < cihl + pp->minhlen_icmp) + return NF_DROP; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) { + /* Failed checksum! */ + IP_VS_ERR_RL("incoming ICMP: failed checksum from " + "%d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); + return NF_DROP; + } + + h.raw = (char *) ciph + cihl; + + IP_VS_DBG_PKT(11, pp, ciph, "Handling incoming ICMP for"); + + /* This is pretty much what ip_vs_conn_in_get() does, + except parameters are in the reverse order */ + cp = pp->conn_in_get(skb, pp, ciph, h, 1); + if (cp == NULL) + return NF_ACCEPT; + + ip_vs_in_stats(cp, skb); + rc = ip_vs_icmp_xmit(skb, cp, pp); + __ip_vs_conn_put(cp); + return rc; +} + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... 
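+ * Rough sequence of ip_vs_in() below (sketch): ignore non-PACKET_HOST + * and loopback packets, hand ICMP errors to ip_vs_in_icmp(), then look + * the packet up with pp->conn_in_get(); on a miss pp->conn_schedule() + * may create a new entry via ip_vs_schedule(). Unavailable destinations + * cause a drop (optionally expiring the entry when expire_nodest_conn is + * set); otherwise cp->packet_xmit() forwards the packet and, in master + * sync mode, the connection is periodically pushed to the backup with + * ip_vs_sync_conn().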
+ */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + struct ip_vs_protocol *pp = ip_vs_proto_get(iph->protocol); + union ip_vs_tphdr h; + struct ip_vs_conn *cp; + int ihl, ret, restart; + int firstfrag; + + /* + * Big tappo: only PACKET_HOST (neither loopback nor mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ + if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev)) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + iph->protocol, + NIPQUAD(iph->daddr)); + return NF_ACCEPT; + } + + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(skb_p, &related); + + if (related) + return verdict; + } + + /* Protocol supported? */ + if (unlikely(!pp)) + return NF_ACCEPT; + + /* make sure that protocol header is available in skb data area, + note that skb data area may be reallocated. */ + ihl = iph->ihl << 2; +#if 0 + /* Enable this when not in LOCAL_IN */ + firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET)); + /* + * WARNING: we can work with !firstfrag packets, make sure + * each protocol handler checks for firstfrag + */ +#else + firstfrag = 1; +#endif + if (firstfrag && + !pskb_may_pull(skb, ihl+pp->minhlen)) + return NF_DROP; + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(skb, pp, iph, h, 0); + + if (unlikely(!cp)) { + int v; + + if (!pp->conn_schedule(skb, pp, iph, h, &v, &cp)) { + return v; + } + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, iph, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, iph, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not availabe */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } else { + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + } + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* increase its packet counter and check if it is needed + to be synchronized */ + atomic_inc(&cp->in_pkts); + if (ip_vs_sync_state == IP_VS_STATE_MASTER && + (cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) + ip_vs_sync_conn(cp); + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * and send them to ip_vs_in_icmp. 
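+ * Hook ordering makes this work: ip_vs_forward_icmp() runs at priority + * 99 on NF_IP_FORWARD, just before ip_vs_out() at priority 100, and + * ip_vs_post_routing() sits at NF_IP_PRI_NAT_SRC-1 so IPVS-mangled + * packets leave POST_ROUTING before iptable_nat sees them.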
+ */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **skb_p, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + int r; + + if (iph->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + *skb_p = skb; + } + + return ip_vs_in_icmp(skb_p, &r); +} + + +/* After packet filtering, forward packet through VS/DR, VS/TUN, + or VS/NAT(change destination), so that filtering rules can be + applied to IPVS. */ +static struct nf_hook_ops ip_vs_in_ops = { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = 100, +}; + +/* After packet filtering, change source only for VS/NAT */ +static struct nf_hook_ops ip_vs_out_ops = { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 100, +}; + +/* After packet filtering (but before ip_vs_out_icmp), catch icmp + destined for 0.0.0.0/0, which is for incoming IPVS connections */ +static struct nf_hook_ops ip_vs_forward_icmp_ops = { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 99, +}; + +/* Before the netfilter connection tracking, exit from POST_ROUTING */ +static struct nf_hook_ops ip_vs_post_routing_ops = { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_nothing; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hook(&ip_vs_in_ops); + if (ret < 0) { + IP_VS_ERR("can't register in hook.\n"); + goto cleanup_conn; + } + + ret = nf_register_hook(&ip_vs_out_ops); + if (ret < 0) { + IP_VS_ERR("can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_vs_post_routing_ops); + if (ret < 0) { + IP_VS_ERR("can't register post_routing hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_vs_forward_icmp_ops); + if (ret < 0) { + IP_VS_ERR("can't register forward_icmp hook.\n"); + goto cleanup_postroutingops; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_postroutingops: + nf_unregister_hook(&ip_vs_post_routing_ops); + cleanup_outops: + nf_unregister_hook(&ip_vs_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_vs_in_ops); + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_nothing: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hook(&ip_vs_forward_icmp_ops); + nf_unregister_hook(&ip_vs_post_routing_ops); + nf_unregister_hook(&ip_vs_out_ops); + nf_unregister_hook(&ip_vs_in_ops); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c 
b/net/ipv4/ipvs/ip_vs_ctl.c new file mode 100644 index 000000000000..e0d7aadf7927 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -0,0 +1,2205 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/timer.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip.h> +#include <net/sock.h> + +#include <asm/uaccess.h> + +#include <net/ip_vs.h> + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DECLARE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED; + +/* lock for table with the real services */ +static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED; + +/* lock for state and timeout tables */ +static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED; + +/* lock for drop entry handling */ +static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED; + +/* lock for drop packet handling */ +static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED; + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +/* + * update_defense_level is called from timer bh and from sysctl. 
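+ * Example for the drop_packet defense below: with amemthresh at its + * default of 1024 pages and only 768 pages free+buffered, the rate + * becomes 1024/(1024-768) = 4, i.e. about one in four of the packets + * subject to this defense is dropped until memory recovers.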
+ */ +void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); +} + + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by <protocol, addr, port> */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ 
unsigned +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by <proto,addr,port> + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by <protocol,addr,port> in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static __inline__ struct ip_vs_service * +__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + + l = &ip_vs_svc_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + if ((svc->addr == vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. 
+ */ +static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + l = &ip_vs_svc_fwm_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + if (svc->fwmark == fwmark) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + goto out; + + /* + * Check the table hashed by <protocol,addr,port> + * for "full" addressed entries + */ + svc = __ip_vs_service_get(protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + NIPQUAD(vaddr), ntohs(vport), + svc?"hit":"not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by <proto,addr,port> in the real service table. 
+ */ +struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(daddr, dport); + + l = &ip_vs_rtable[hash]; + + read_lock(&__ip_vs_rs_lock); + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, d_list); + if ((dest->addr == daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + /* + * Find the destination for the given service + */ + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if ((dest->addr == daddr) && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + /* + * Find the destination in trash + */ + l = &ip_vs_dest_trash; + + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " + "refcnt=%d\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->addr == daddr && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (dest->vaddr == svc->addr && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " + "from trash\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port)); + e = e->prev; + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. 
+ */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest; + struct list_head *l; + + l = &ip_vs_dest_trash; + + while (l->next != l) { + dest = list_entry(l->next, struct ip_vs_dest, n_list); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ + if (inet_addr_type(udest->addr) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + + atype = inet_addr_type(udest->addr); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + + dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + memset(dest, 0, sizeof(struct ip_vs_dest)); + + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + dest->addr = udest->addr; + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + dest->dst_lock = SPIN_LOCK_UNLOCKED; + dest->stats.lock = SPIN_LOCK_UNLOCKED; + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, 
daddr, dport); + if (dest != NULL) { + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " + "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + NIPQUAD(daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + NIPQUAD(dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + /* call the update_service, because server weight may be changed */ + svc->scheduler->update_service(svc); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. 
+ */ + list_del(&dest->n_list); + svc->num_dests--; + if (svcupd) { + /* + * Call the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + } +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + + svc = (struct ip_vs_service *) + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + memset(svc, 0, sizeof(struct ip_vs_service)); + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->protocol = u->protocol; + svc->addr = u->addr; + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + svc->sched_lock = RW_LOCK_UNLOCKED; + svc->stats.lock = SPIN_LOCK_UNLOCKED; + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all 
other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out; + } + } + + out: + write_unlock_bh(&__ip_vs_svc_lock); + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct list_head *l; + struct ip_vs_dest *dest; + struct ip_vs_scheduler *old_sched; + + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + l = &svc->destinations; + while (l->next != l) { + dest = list_entry(l->next, struct ip_vs_dest, n_list); + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc; + struct list_head *l; + + /* + * Flush the service table hashed by <protocol,addr,port> + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,s_list); + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. 
+ */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,f_list); + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + memset(stats, 0, (char *)&stats->lock - (char *)stats); + spin_unlock_bh(&stats->lock); + ip_vs_zero_estimator(stats); +} + +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct list_head *l; + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each (l, &svc->destinations) { + dest = list_entry(l, struct ip_vs_dest, n_list); + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct list_head *l; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_table[idx]) { + svc = list_entry(l, struct ip_vs_service, s_list); + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_fwm_table[idx]) { + svc = list_entry(l, struct ip_vs_service, f_list); + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + local_bh_disable(); + update_defense_level(); + local_bh_enable(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ +struct ip_vs_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[NET_IPV4_VS_LAST]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + +static struct ip_vs_sysctl_table ipv4_vs_table = { + NULL, + {{NET_IPV4_VS_AMEMTHRESH, "amemthresh", + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL, + &proc_dointvec}, +#ifdef CONFIG_IP_VS_DEBUG + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level", + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_IPV4_VS_AMDROPRATE, "am_droprate", + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_DROP_ENTRY, "drop_entry", + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL, + &proc_do_defense_mode}, + {NET_IPV4_VS_DROP_PACKET, "drop_packet", + 
&sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL, + &proc_do_defense_mode}, + {NET_IPV4_VS_SECURE_TCP, "secure_tcp", + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL, + &proc_do_defense_mode}, +#if 0 + {NET_IPV4_VS_TO_ES, "timeout_established", + &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SS, "timeout_synsent", + &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SR, "timeout_synrecv", + &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_FW, "timeout_finwait", + &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_TW, "timeout_timewait", + &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_CL, "timeout_close", + &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_CW, "timeout_closewait", + &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LA, "timeout_lastack", + &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LI, "timeout_listen", + &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SA, "timeout_synack", + &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_UDP, "timeout_udp", + &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_ICMP, "timeout_icmp", + &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, +#endif + {NET_IPV4_VS_CACHE_BYPASS, "cache_bypass", + &sysctl_ip_vs_cache_bypass, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn", + &sysctl_ip_vs_expire_nodest_conn, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold", + &sysctl_ip_vs_sync_threshold, sizeof(sysctl_ip_vs_sync_threshold), + 0644, NULL, &proc_do_sync_threshold}, + {NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send", + &sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir}, + {0}} +}; + + +/* + * Write the contents of the VS rule table to a PROCfs file. 
+ * (It is kept just for backward compatibility) + */ +static inline char *ip_vs_fwd_name(unsigned flags) +{ + char *fwd; + + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + fwd = "Local"; + break; + case IP_VS_CONN_F_TUNNEL: + fwd = "Tunnel"; + break; + case IP_VS_CONN_F_DROUTE: + fwd = "Route"; + break; + default: + fwd = "Masq"; + } + return fwd; +} + +static inline int sprintf_dest(char *str, struct ip_vs_dest *dest) +{ + return sprintf(str, " -> %08X:%04X %-7s %-6d %-10d %-10d", + ntohl(dest->addr), ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); +} + +static int ip_vs_get_info(char *buf, char **start, off_t offset, int length) +{ + int len=0; + off_t pos=0; + char temp[64], temp2[32]; + int idx; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct list_head *l, *e, *p, *q; + + /* + * Note: since the length of the buffer is usually the multiple + * of 512, it is good to use fixed record of the divisor of 512, + * so that records won't be truncated at buffer boundary. + */ + pos = 192; + if (pos > offset) { + sprintf(temp, + "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + len += sprintf(buf+len, "%-63s\n", temp); + len += sprintf(buf+len, "%-63s\n", + "Prot LocalAddress:Port Scheduler Flags"); + len += sprintf(buf+len, "%-63s\n", + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn"); + } + + read_lock_bh(&__ip_vs_svc_lock); + + /* print the service table hashed by <protocol,addr,port> */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "%s %08X:%04X %s %s", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf_dest(temp, dest); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + /* print the service table hashed by fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "FWM %08X %s %s", + svc->fwmark, + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf_dest(temp, dest); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + done: + read_unlock_bh(&__ip_vs_svc_lock); + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return 
len; +} + + +struct ip_vs_stats ip_vs_stats; + +static int +ip_vs_stats_get_info(char *buf, char **start, off_t offset, int length) +{ + int len=0; + off_t pos=0; + char temp[64]; + + pos += 320; + if (pos > offset) { + len += sprintf(buf+len, "%-63s\n%-63s\n", +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + " Total Incoming Outgoing Incoming Outgoing", + " Conns Packets Packets Bytes Bytes"); + + spin_lock_bh(&ip_vs_stats.lock); + sprintf(temp, "%8X %8X %8X %8X%08X %8X%08X", + ip_vs_stats.conns, + ip_vs_stats.inpkts, + ip_vs_stats.outpkts, + (__u32)(ip_vs_stats.inbytes>>32), + (__u32)ip_vs_stats.inbytes, + (__u32)(ip_vs_stats.outbytes>>32), + (__u32)ip_vs_stats.outbytes); + len += sprintf(buf+len, "%-62s\n\n", temp); + + len += sprintf(buf+len, "%-63s\n", +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s"); + sprintf(temp, "%8X %8X %8X %16X %16X", + ip_vs_stats.cps, + ip_vs_stats.inpps, + ip_vs_stats.outpps, + ip_vs_stats.inbps, + ip_vs_stats.outbps); + len += sprintf(buf+len, "%-63s\n", temp); + + spin_unlock_bh(&ip_vs_stats.lock); + } + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + 
ip_vs_use_count_inc(); + + if (down_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc = (struct ip_vs_service_user *)arg; + udest = (struct ip_vs_dest_user *)(usvc + 1); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc->fwmark && !usvc->addr && !usvc->port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s", + ntohs(usvc->protocol), NIPQUAD(usvc->addr), + ntohs(usvc->port), usvc->sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by <protocol, addr, port> or fwmark */ + if (usvc->fwmark == 0) + svc = __ip_vs_service_get(usvc->protocol, + usvc->addr, usvc->port); + else + svc = __ip_vs_svc_fwm_get(usvc->fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc->protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + up(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, src, (char*)&src->lock - (char*)src); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr; + dst->port = src->port; + dst->fwmark = src->fwmark; + strcpy(dst->sched_name, src->scheduler->name); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct list_head *l; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_table[idx]) { + if (count >= get->num_services) 
+ goto out; + svc = list_entry(l, struct ip_vs_service, s_list); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_fwm_table[idx]) { + if (count >= get->num_services) + goto out; + svc = list_entry(l, struct ip_vs_service, f_list); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests *uptr) +{ + struct ip_vs_service *svc; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(get->fwmark); + else + svc = __ip_vs_service_get(get->protocol, + get->addr, get->port); + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct list_head *l, *e; + struct ip_vs_dest_entry entry; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + if (count >= get->num_dests) + break; + dest = list_entry(e, struct ip_vs_dest, n_list); + entry.addr = dest->addr; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if 
(down_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + + entry = (struct ip_vs_service_entry *)arg; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(entry->fwmark); + else + svc = __ip_vs_service_get(entry->protocol, + entry->addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + up(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + { NULL, NULL }, PF_INET, + IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl, + IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl +}; + + +int ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + proc_net_create("ip_vs", 0, ip_vs_get_info); + proc_net_create("ip_vs_stats", 0, ip_vs_stats_get_info); + + ipv4_vs_table.sysctl_header = + register_sysctl_table(ipv4_vs_table.root_dir, 0); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + memset(&ip_vs_stats, 0, sizeof(ip_vs_stats)); + 
ip_vs_stats.lock = SPIN_LOCK_UNLOCKED; + ip_vs_new_estimator(&ip_vs_stats); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(ipv4_vs_table.sysctl_header); + proc_net_remove("ip_vs_stats"); + proc_net_remove("ip_vs"); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c new file mode 100644 index 000000000000..a2a425e5fe72 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -0,0 +1,261 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell <proellt@gmx.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
+ */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c new file mode 100644 index 000000000000..159d16083940 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -0,0 +1,197 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public 
License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include <linux/kernel.h> +#include <linux/types.h> + +#include <net/ip_vs.h> + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +struct ip_vs_estimator +{ + struct ip_vs_estimator *next; + struct ip_vs_stats *stats; + + u32 last_conns; + u32 last_inpkts; + u32 last_outpkts; + u64 last_inbytes; + u64 last_outbytes; + + u32 cps; + u32 inpps; + u32 outpps; + u32 inbps; + u32 outbps; +}; + + +static struct ip_vs_estimator *est_list = NULL; +static rwlock_t est_lock = RW_LOCK_UNLOCKED; +static struct timer_list est_timer; + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + read_lock(&est_lock); + for (e = est_list; e; e = e->next) { + s = e->stats; + n_conns = s->conns; + n_inpkts = s->inpkts; + n_outpkts = s->outpkts; + n_inbytes = s->inbytes; + n_outbytes = s->outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->outbps = (e->outbps+0xF)>>5; + } + read_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +int ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOMEM; + + memset(est, 0, sizeof(*est)); + est->stats = stats; + est->last_conns = stats->conns; + est->cps = stats->cps<<10; + + est->last_inpkts = stats->inpkts; + est->inpps = stats->inpps<<10; + + est->last_outpkts = stats->outpkts; + est->outpps = stats->outpps<<10; + + est->last_inbytes = stats->inbytes; + est->inbps = stats->inbps<<5; + + est->last_outbytes = stats->outbytes; + est->outbps = stats->outbps<<5; + + write_lock_bh(&est_lock); + est->next = est_list; + if (est->next == NULL) { + init_timer(&est_timer); + est_timer.expires = jiffies + 2*HZ; + est_timer.function = estimation_timer; + add_timer(&est_timer); + } + est_list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void 
ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est, **pest; + int killed = 0; + + write_lock_bh(&est_lock); + pest = &est_list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + *pest = est->next; + kfree(est); + killed++; + } + if (killed && est_list == NULL) + del_timer_sync(&est_timer); + write_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e; + + write_lock_bh(&est_lock); + for (e = est_list; e; e = e->next) { + if (e->stats != stats) + continue; + + /* set counters zero */ + e->last_conns = 0; + e->last_inpkts = 0; + e->last_outpkts = 0; + e->last_inbytes = 0; + e->last_outbytes = 0; + e->cps = 0; + e->inpps = 0; + e->outpps = 0; + e->inbps = 0; + e->outbps = 0; + } + write_unlock_bh(&est_lock); +} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c new file mode 100644 index 000000000000..12ceca2134e1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -0,0 +1,383 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> + +#include <net/ip_vs.h> + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[IP_VS_APP_MAX_PORTS] = {21, 0}; + +/* + * Debug level + */ +#ifdef CONFIG_IP_VS_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(IP_VS_APP_MAX_PORTS) "i"); + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * <addr,port> is in network order. 
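As an aside on the encoding parsed here (an illustration, not part of the patch): the six comma-separated fields are the four address octets followed by the two port bytes, high byte first, so a reply such as "227 Entering Passive Mode (192,168,0,10,195,149)" names 192.168.0.10 and port 195*256 + 149 = 50069. A minimal userspace sketch of the same assembly, with the fields in the same order as the p1..p6 variables in the helper below:

    /* Illustration only: rebuild <addr,port> the way ip_vs_ftp_get_addrport() does. */
    #include <stdio.h>
    int main(void)
    {
        unsigned char p[6] = {192, 168, 0, 10, 195, 149};   /* "192,168,0,10,195,149" */
        unsigned int   addr = ((unsigned)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
        unsigned short port = (p[5] << 8) | p[4];            /* high port byte kept in the low half */
        /* On a little-endian host both values are laid out in memory in network
         * order, which is what the comment above means; the host-order port is
         * p[4]*256 + p[5] = 50069 here. */
        printf("addr=0x%08x port=%u\n", addr, p[4] * 256 + p[5]);
        return 0;
    }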
+ */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __u32 *addr, __u16 *port, + char **start, char **end) +{ + unsigned char p1,p2,p3,p4,p5,p6; + + while (data < data_limit) { + if (strnicmp(data, pattern, plen) != 0) { + data++; + continue; + } + *start = data+plen; + p1 = simple_strtoul(data+plen, &data, 10); + if (*data != ',') + continue; + p2 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p3 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p4 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p5 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p6 = simple_strtoul(data+1, &data, 10); + if (*data != term) + continue; + + *end = data; + *addr = (p4<<24) | (p3<<16) | (p2<<8) | p1; + *port = (p6<<8) | p5; + return 1; + } + return 0; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, + struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + __u32 from; + __u16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int diff; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb->tail; + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from, &port, + &start, &end) == 0) + return 0; + + IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(iph->protocol, from, port, + cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + cp->caddr, 0, + cp->vaddr, port, + from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from = n_cp->vaddr; + port = n_cp->vport; + sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + port&255, port>>8&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + diff = buf_len - (end-start); + + if (diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + } else { + /* fixme: return value isn't checked here */ + ip_vs_skb_replace(skb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return diff; + } + return 0; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + 
* (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, + struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + __u32 to; + __u16 port; + struct ip_vs_conn *n_cp; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 0; + + /* + * Detecting whether it is passive + */ + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb->tail; + + while (data < data_limit) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + IP_VS_DBG(1-debug, "got PASV at %d of %d\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 0; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + data = data_start; + data_limit = skb->h.raw + skb->len - 18; + + if (ip_vs_ftp_get_addrport(data, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to, &port, + &start, &end) == 0) + return 0; + + IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to), ntohs(port)); + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0); + + n_cp = ip_vs_conn_in_get(iph->protocol, + to, port, + iph->daddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1), + cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + /* no diff required for incoming packets */ + return 0; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { + if (!ports[i]) + continue; + ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); + if (ret) + break; + IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. 
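For completeness, a usage note (hedged, since it depends on the 2.5-era module parameter syntax): the ports[] array above is exposed as a module parameter, so loading the helper for an additional FTP control port would look something like

    insmod ip_vs_ftp ports=21,8021

and the loop in ip_vs_ftp_init() just above registers one application incarnation per nonzero entry.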
+ */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c new file mode 100644 index 000000000000..69dcf0f0ede8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -0,0 +1,629 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * + * return n; + * + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing + * me to write this module. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +/* for systcl */ +#include <linux/fs.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. 
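To put rough numbers on this (a reading of the constants, not new behaviour): the collection timer fires every CHECK_EXPIRE_INTERVAL = 60*HZ, i.e. about once a minute, and the pass counter only advances on passes where the table is not over its size limit, so the COUNT_FOR_FULL_EXPIRATION = 30 threshold defined next corresponds to the half hour mentioned above; the default sysctl_ip_vs_lblc_expiration of 24*60*60*HZ jiffies is one day.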
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ +struct ip_vs_lblc_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration", + &sysctl_ip_vs_lblc_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free a ip_vs_lblc_entry, which is a mapping of a destionation + * IP address to a server. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + return en; +} + + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static int +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblc_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. + * returns bool success. 
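A side note on ip_vs_lblc_hashkey() above (my reading, not stated in the patch): 2654435761 is 0x9E3779B1, essentially 2^32 divided by the golden ratio, the classic constant for Knuth-style multiplicative hashing; multiplying the host-order address by this odd constant scrambles the low-order bits before IP_VS_LBLC_TAB_MASK keeps IP_VS_LBLC_TAB_BITS of them as the bucket index.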
+ */ +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, + struct ip_vs_lblc_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblc_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblc_hashkey(addr); + l = &tbl->bucket[hash]; + + read_lock(&tbl->lock); + + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblc_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblc_entry *en; + + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + write_lock(&tbl->lock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblc_entry, list); + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + e = l = &tbl->bucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblc_expiration) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
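A worked example for the reclaim target computed by the handler that follows (illustrative numbers only): with the default 2^10 buckets, max_size is 16*1024 = 16384 entries; if the table has grown to 20000 entries, goal = (20000 - 16384)*4/3 = 4821, which stays under the max_size/2 = 8192 cap, so the round-robin scan frees entries idle longer than ENTRY_TIMEOUT until roughly that many have been reclaimed.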
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_lblc_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + tbl = (struct ip_vs_lblc_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + e = l = &tbl->bucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblc_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblc_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblc_table)); + + return 0; +} + + +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. 
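To make the no-floats comparison concrete (an illustration, not from the patch): suppose server A has 10 active and 40 inactive connections with weight 2, and server B has 30 active and 100 inactive connections with weight 5. Their overheads are 10*50 + 40 = 540 and 30*50 + 100 = 1600. Instead of comparing 540/2 against 1600/5, the loop below tests 540*5 > 1600*2, i.e. 2700 > 3200, which is false, so A keeps its place as the least-loaded server; that matches the per-weight loads of 270 versus 320.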
+ */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&least->weight) > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblc_table *tbl; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblc_table *)svc->sched_data; + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblc_new(iph->daddr, dest); + if (en == NULL) { + return NULL; + } + ip_vs_lblc_hash(tbl, en); + } else { + dest = en->dest; + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .update_service = ip_vs_lblc_update_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + lblc_sysctl_table.sysctl_header = + register_sysctl_table(lblc_sysctl_table.root_dir, 0); + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(lblc_sysctl_table.sysctl_header); + 
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c new file mode 100644 index 000000000000..0071a7250df3 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -0,0 +1,895 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n <- {weighted least-conn node}; + * add n to serverSet[dest_ip]; + * if |serverSet[dest_ip]| > 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +/* for systcl */ +#include <linux/fs.h> +#include <linux/sysctl.h> +/* for proc_net_create/proc_net_remove */ +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. 
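Before the table code below, it may help to restate how lblcr differs from the plain lblc scheduler above (paraphrasing the header pseudo-code): instead of caching a single server per destination IP, lblcr keeps a whole server set per destination IP; a fresh weighted least-connection pick is added to the set when its members are dead or loaded, and once the set has more than one member and has not changed for longer than the expiration window, the busiest member is dropped again. The per-minute garbage collection with an occasional full sweep works the same way as in lblc.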
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + write_lock(&set->lock); + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + write_unlock(&set->lock); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } + write_unlock(&set->lock); +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. 
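The lifetime rule stated above is worth spelling out (my gloss): each list node took its own reference via atomic_inc(&dest->refcnt) in ip_vs_dest_set_insert(), so the erase paths only ever drop that one reference; actually freeing the ip_vs_dest here would leave the owning service, or the trash list, pointing at freed memory.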
+ */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + 
int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ +struct ip_vs_lblcr_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration", + &sysctl_ip_vs_lblcr_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) +{ + struct ip_vs_lblcr_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + en->set.lock = RW_LOCK_UNLOCKED; + + return en; +} + + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static int +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblcr_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. + * returns bool success. + */ +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, + struct ip_vs_lblcr_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblcr_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblcr_hashkey(addr); + l = &tbl->bucket[hash]; + + read_lock(&tbl->lock); + + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblcr_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. 
+ */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblcr_entry *en; + + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + write_lock(&tbl->lock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblcr_entry, list); + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + e = l = &tbl->bucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblcr_expiration) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + tbl = (struct ip_vs_lblcr_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + e = l = &tbl->bucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG +static struct ip_vs_lblcr_table *lblcr_table_list; + +/* + * /proc/net/ip_vs_lblcr to display the mappings of + * destination IP address <==> its serverSet + */ +static int +ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0, begin; + int len=0, size; + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int i; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + tbl = lblcr_table_list; + + size = sprintf(buffer, "LastTime Dest IP address Server set\n"); + pos += size; + len += size; + + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + l = &tbl->bucket[i]; + read_lock_bh(&tbl->lock); + for (e=l->next; e!=l; e=e->next) { + char tbuf[16]; + struct ip_vs_dest_list *d; + + en = list_entry(e, struct ip_vs_lblcr_entry, list); + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr)); + size = sprintf(buffer+len, "%8lu %-16s ", + now-en->lastuse, tbuf); 
+ + read_lock(&en->set.lock); + for (d=en->set.list; d!=NULL; d=d->next) { + size += sprintf(buffer+len+size, + "%u.%u.%u.%u ", + NIPQUAD(d->dest->addr)); + } + read_unlock(&en->set.lock); + size += sprintf(buffer+len+size, "\n"); + len += size; + pos += size; + if (pos <= offset) + len=0; + if (pos >= offset+length) { + read_unlock_bh(&tbl->lock); + goto done; + } + } + read_unlock_bh(&tbl->lock); + } + + done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} +#endif + + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblcr_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + lblcr_table_list = tbl; +#endif + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblcr_table)); + + return 0; +} + + +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&least->weight) > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblcr_table *tbl; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblcr_new(iph->daddr); + if (en == NULL) { + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + ip_vs_lblcr_hash(tbl, en); + } else { + dest = ip_vs_dest_set_min(&en->set); + if (!dest || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + } + if (atomic_read(&en->set.size) > 1 && + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + struct ip_vs_dest *m; + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .update_service = ip_vs_lblcr_update_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + lblcr_sysctl_table.sysctl_header = + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); +#endif + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_remove("ip_vs_lblcr"); +#endif + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + 
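For orientation, every scheduler in this patch plugs into IPVS through the same small callback table; the following is a minimal, hypothetical sketch written against the same 2.5-era API (illustration only, not part of the patch), a do-nothing scheduler that simply returns the first usable destination:

    #include <linux/module.h>
    #include <linux/init.h>
    #include <net/ip_vs.h>

    static int ip_vs_dummy_init_svc(struct ip_vs_service *svc)   { return 0; }
    static int ip_vs_dummy_done_svc(struct ip_vs_service *svc)   { return 0; }
    static int ip_vs_dummy_update_svc(struct ip_vs_service *svc) { return 0; }

    /* Pick the first available, non-overloaded, non-quiesced destination. */
    static struct ip_vs_dest *
    ip_vs_dummy_schedule(struct ip_vs_service *svc, struct iphdr *iph)
    {
        struct list_head *e;
        struct ip_vs_dest *dest;

        for (e = svc->destinations.next; e != &svc->destinations; e = e->next) {
            dest = list_entry(e, struct ip_vs_dest, n_list);
            if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
                !(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
                atomic_read(&dest->weight) > 0)
                return dest;
        }
        return NULL;
    }

    static struct ip_vs_scheduler ip_vs_dummy_scheduler = {
        .name           = "dummy",
        .refcnt         = ATOMIC_INIT(0),
        .module         = THIS_MODULE,
        .init_service   = ip_vs_dummy_init_svc,
        .done_service   = ip_vs_dummy_done_svc,
        .update_service = ip_vs_dummy_update_svc,
        .schedule       = ip_vs_dummy_schedule,
    };

    static int __init ip_vs_dummy_init(void)
    {
        INIT_LIST_HEAD(&ip_vs_dummy_scheduler.n_list);
        return register_ip_vs_scheduler(&ip_vs_dummy_scheduler);
    }

    static void __exit ip_vs_dummy_cleanup(void)
    {
        unregister_ip_vs_scheduler(&ip_vs_dummy_scheduler);
    }

    module_init(ip_vs_dummy_init);
    module_exit(ip_vs_dummy_cleanup);
    MODULE_LICENSE("GPL");

The real schedulers differ only in the schedule callback and in whatever per-service state they hang off svc->sched_data, as dh, lblc and lblcr do with their hash tables.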
+module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c new file mode 100644 index 000000000000..13b61dfd0590 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -0,0 +1,144 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry (e, struct ip_vs_dest, n_list); + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&least->weight) > 0) { + loh = ip_vs_lc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
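A quick worked case for ip_vs_lc_dest_overhead() above (illustration): a destination with 3 active and 40 inactive connections scores (3 << 8) + 40 = 808, while one with 2 active and 300 inactive scores (2 << 8) + 300 = 812, so the first is preferred; note that the helper weights active connections by 256 (a shift of 8), even though the older comment at the top of ip_vs_lc_schedule() still says a shift of 5.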
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (doh < loh) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lc_init_svc, + .done_service = ip_vs_lc_done_svc, + .update_service = ip_vs_lc_update_svc, + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c new file mode 100644 index 000000000000..23828fcd37d8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -0,0 +1,181 @@ +/* + * IPVS: Never Queue scheduling module + * + * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +static int +ip_vs_nq_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. 
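To illustrate the two-speed rule from the header above with this cost (not from the patch): given server A with 0 active connections and weight 1, and server B with 5 active connections and weight 10, the scheduling loop below returns the idle A immediately, even though the pure SED cost (activeconns + 1)/weight would favour B (6/10 = 0.6 versus 1/1 = 1). Only when no server is idle does it fall back to the SED comparison, written as the usual cross-multiplication to avoid division.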
+ */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (!(least->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&least->weight) > 0) { + loh = ip_vs_nq_dest_overhead(least); + + /* return the server directly if it is idle */ + if (atomic_read(&least->activeconns) == 0) + goto out; + + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + goto out; + } + + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + out: + IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_nq_init_svc, + .done_service = ip_vs_nq_done_svc, + .update_service = ip_vs_nq_update_svc, + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c new file mode 100644 index 000000000000..a369743353a3 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -0,0 +1,231 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +int register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + int *t; + + t = kmalloc(size, GFP_ATOMIC); + if (t == NULL) + return NULL; + memcpy(t, table, size); + return t; +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return "ERR!"; + return pp->state_name(state); +} + + +void +tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) +{ + char buf[128]; + union ip_vs_tphdr h; + + h.raw = (char *) iph + iph->ihl * 4; + if (iph->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(iph->saddr), + ntohs(h.portp[0]), + NIPQUAD(iph->daddr), + ntohs(h.portp[1])); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +int ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + 
strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_ICMP + REGISTER_PROTOCOL(&ip_vs_protocol_icmp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c new file mode 100644 index 000000000000..04e44321b322 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -0,0 +1,181 @@ +/* + * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so the caller should check skip_nonexisting + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse?"ICMP+":"", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so the caller should check skip_nonexisting + * or our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + 
"%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) +{ + char buf[256]; + + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void ah_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .minhlen = 0, + .minhlen_icmp = 0, + .dont_defrag = 1, + .skip_nonexisting = 1, + .slave = 1, + .init = ah_init, + .exit = ah_exit, + .conn_schedule = ah_conn_schedule, + .conn_in_get = ah_conn_in_get, + .conn_out_get = ah_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .csum_check = NULL, + .debug_packet = ah_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c new file mode 100644 index 000000000000..7255a9aebd79 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -0,0 +1,180 @@ +/* + * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so the caller should check skip_nonexisting + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? 
"ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so the caller should check skip_nonexisting + * or our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse?"ICMP+":"", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +esp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) +{ + char buf[256]; + + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .minhlen = 0, + .minhlen_icmp = 0, + .dont_defrag = 1, + .skip_nonexisting = 1, + .slave = 1, + .init = esp_init, + .exit = esp_exit, + .conn_schedule = esp_conn_schedule, + .conn_in_get = esp_conn_in_get, + .conn_out_get = esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c new file mode 100644 index 000000000000..4339fa52c6a5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c @@ -0,0 +1,171 @@ +/* + * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server + * + * Authors: Julian Anastasov <ja@ssi.bg>, March 2002 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +static int icmp_timeouts[1] = { 1*60*HZ }; + +static char * icmp_state_name_table[1] = { "ICMP" }; + +struct ip_vs_conn * +icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = 
ip_vs_conn_in_get(iph->protocol, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; + +#else + return NULL; +#endif +} + +struct ip_vs_conn * +icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; +#else + return NULL; +#endif +} + +static int +icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp) +{ + *verdict = NF_ACCEPT; + return 0; +} + +static int +icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + if (!(iph->frag_off & __constant_htons(IP_OFFSET))) { + if (ip_compute_csum(h.raw, size)) { + IP_VS_DBG_RL_PKT(0, pp, iph, "Failed checksum for"); + return 0; + } + } + return 1; + +} + +static void +icmp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) +{ + char buf[256]; + union ip_vs_tphdr h; + + h.raw = (char *) iph + iph->ihl * 4; + if (iph->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", + pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr), + h.icmph->type, h.icmph->code); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +static int +icmp_state_transition(struct ip_vs_conn *cp, + int direction, struct iphdr *iph, + union ip_vs_tphdr h, struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL]; + return 1; +} + +static int +icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + int num; + char **names; + + num = IP_VS_ICMP_S_LAST; + names = icmp_state_name_table; + return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to); +} + + +static void icmp_init(struct ip_vs_protocol *pp) +{ + pp->timeout_table = icmp_timeouts; +} + +static void icmp_exit(struct ip_vs_protocol *pp) +{ +} + +struct ip_vs_protocol ip_vs_protocol_icmp = { + .name = "ICMP", + .protocol = IPPROTO_ICMP, + .minhlen = sizeof(struct icmphdr), + .minhlen_icmp = 8, + .dont_defrag = 0, + .skip_nonexisting = 0, + .slave = 0, + .init = icmp_init, + .exit = icmp_exit, + .conn_schedule = icmp_conn_schedule, + .conn_in_get = icmp_conn_in_get, + .conn_out_get = icmp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = icmp_csum_check, + .state_transition = icmp_state_transition, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = icmp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = icmp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 000000000000..00c9107b8585 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,611 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either 
version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/compiler.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <linux/netfilter.h> + +#include <net/ip_vs.h> + + +static struct ip_vs_conn * +tcp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + if (likely(!inverse)) { + return ip_vs_conn_in_get(iph->protocol, + iph->saddr, h.th->source, + iph->daddr, h.th->dest); + } else { + return ip_vs_conn_in_get(iph->protocol, + iph->daddr, h.th->dest, + iph->saddr, h.th->source); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + if (likely(!inverse)) { + return ip_vs_conn_out_get(iph->protocol, + iph->saddr, h.th->source, + iph->daddr, h.th->dest); + } else { + return ip_vs_conn_out_get(iph->protocol, + iph->daddr, h.th->dest, + iph->saddr, h.th->source); + } +} + + +static int +tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + + if (h.th->syn && + (svc = ip_vs_service_get(skb->nfmark, iph->protocol, + iph->daddr, h.portp[1]))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, iph); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp, h); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + h->th->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, h->th->check)); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + int ihl = (char *) h.raw - (char *) iph; + + /* We are sure that we work on first fragment */ + + h.th->source = cp->vport; + + /* Call application helper if needed */ + if (ip_vs_app_pkt_out(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*)iph + ihl; + size = skb->len - ihl; + } + + /* Adjust TCP checksums */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(&h, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + h.th->check = 0; + skb->csum = csum_partial(h.raw, size, 0); + h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + skb->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", + pp->name, h.th->check, + (char*)&(h.th->check) - (char*)h.raw); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + int ihl = (char *) h.raw - (char *) iph; + + /* We are sure that we work on first fragment */ + + h.th->dest = cp->dport; + + /* + * Attempt ip_vs_app call. 
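tcp_fast_csum_update() above avoids recomputing the whole TCP checksum when only an address and a port are rewritten: it folds the difference between old and new values into the existing checksum (the RFC 1624 incremental update). A user-space sketch of the principle, reduced to a single 16-bit field; the kernel helper additionally handles the 32-bit address change in the same pass:

#include <stdio.h>
#include <stdint.h>

/* fold a 32-bit sum back into 16 bits, one's-complement style */
static uint16_t csum_fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)sum;
}

/* HC' = ~(~HC + ~m + m'): update checksum HC when field m becomes m' */
static uint16_t csum_update16(uint16_t check, uint16_t oldval, uint16_t newval)
{
    uint32_t sum = (uint16_t)~check;

    sum += (uint16_t)~oldval;
    sum += newval;
    return (uint16_t)~csum_fold(sum);
}

int main(void)
{
    uint16_t check = 0x1c46;                          /* some existing checksum */
    uint16_t fixed = csum_update16(check, 80, 8080);  /* e.g. a port rewritten by NAT */

    printf("0x%04x -> 0x%04x\n", check, fixed);
    return 0;
}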
+ * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (ip_vs_app_pkt_in(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + size = skb->len - ihl; + } + + /* + * Adjust TCP/UDP checksums + */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(&h, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + h.th->check = 0; + h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw, size, 0)); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial(h.raw, size, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, + iph->protocol, skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, iph, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + + +#if 0 + +/* FIXME: This is going to die */ + +static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 10*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 100*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +#endif + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + 
return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? 
tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, union ip_vs_tphdr h) +{ + int state_idx; + struct tcphdr *th = h.th; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" + "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + pp->name, + (state_off==TCP_DIR_OUTPUT)?"output ":"input ", + th->syn? 'S' : '.', + th->fin? 'F' : '.', + th->ack? 'A' : '.', + th->rst? 'R' : '.', + NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr), ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, + int direction, struct iphdr *iph, + union ip_vs_tphdr h, struct ip_vs_protocol *pp) +{ + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, h); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static spinlock_t tcp_app_lock = SPIN_LOCK_UNLOCKED; + +static inline __u16 tcp_app_hashkey(__u16 port) +{ + return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + struct list_head *t, *p; + __u16 hash, port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + t = &tcp_apps[hash]; + + spin_lock_bh(&tcp_app_lock); + for (p = t->next; p != t; p = p->next) { + i = list_entry(p, struct ip_vs_app, p_list); + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, t); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + 
spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct list_head *t, *p; + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + t = &tcp_apps[hash]; + + spin_lock(&tcp_app_lock); + for (p = t->next; p != t; p = p->next) { + inc = list_entry(p, struct ip_vs_app, p_list); + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +extern void +tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg); + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .minhlen = sizeof(struct tcphdr), + .minhlen_icmp = 8, + .dont_defrag = 0, + .skip_nonexisting = 0, + .slave = 0, + .appcnt = ATOMIC_INIT(0), + .init = tcp_init, + .exit = tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c new file mode 100644 index 000000000000..c2cad339364b --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,396 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
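set_tcp_state() above drives TCP connection tracking from a lookup table: the packet is classified as syn/fin/ack/rst, an offset selects the INPUT, OUTPUT or INPUT-ONLY block, and next_state[current] yields the new state, with active/inactive counters adjusted when a connection enters or leaves ESTABLISHED. A heavily reduced sketch of that table-driven idea, with only a few states and events rather than the patch's 11-state, three-direction tables:

#include <stdio.h>

enum state { S_NONE, S_SYN_RECV, S_ESTABLISHED, S_CLOSE, S_LAST };
enum event { E_SYN, E_ACK, E_RST, E_LAST };

static const char *state_name[S_LAST] = {
    "NONE", "SYN_RECV", "ESTABLISHED", "CLOSE"
};

/* next_state[event][current state]; an illustrative subset only */
static const enum state next_state[E_LAST][S_LAST] = {
    [E_SYN] = { S_SYN_RECV, S_SYN_RECV,    S_ESTABLISHED, S_SYN_RECV },
    [E_ACK] = { S_CLOSE,    S_ESTABLISHED, S_ESTABLISHED, S_CLOSE    },
    [E_RST] = { S_CLOSE,    S_CLOSE,       S_CLOSE,       S_CLOSE    },
};

int main(void)
{
    enum state s = S_NONE;
    enum event trace[] = { E_SYN, E_ACK, E_RST };   /* handshake, then a reset */

    for (int i = 0; i < 3; i++) {
        s = next_state[trace[i]][s];
        printf("-> %s\n", state_name[s]);
    }
    return 0;
}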
+ * + * Changes: + * + */ + +#include <linux/kernel.h> +#include <linux/netfilter.h> + +#include <net/ip_vs.h> + + +static struct ip_vs_conn * +udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, h.portp[0], + iph->daddr, h.portp[1]); + } else { + cp = ip_vs_conn_in_get(iph->protocol, + iph->daddr, h.portp[1], + iph->saddr, h.portp[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, h.portp[0], + iph->daddr, h.portp[1]); + } else { + cp = ip_vs_conn_out_get(iph->protocol, + iph->daddr, h.portp[1], + iph->saddr, h.portp[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + + if ((svc = ip_vs_service_get(skb->nfmark, iph->protocol, + iph->daddr, h.portp[1]))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, iph); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp, h); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + h->uh->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, h->uh->check)); + if (!h->uh->check) + h->uh->check = 0xFFFF; +} + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + int ihl = (char *) h.raw - (char *) iph; + + /* We are sure that we work on first fragment */ + + h.portp[0] = cp->vport; + + /* + * Call application helper if needed + */ + if (ip_vs_app_pkt_out(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*)iph + ihl; + size = skb->len - ihl; + } + + /* + * Adjust UDP checksums + */ + if (!cp->app && (h.uh->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(&h, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + h.uh->check = 0; + skb->csum = csum_partial(h.raw, size, 0); + h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + skb->csum); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", + pp->name, h.uh->check, + (char*)&(h.uh->check) - (char*)h.raw); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + int ihl = (char *) h.raw - (char *) iph; + + /* We are sure that we work on first fragment */ + + h.portp[1] = cp->dport; + + /* + * Attempt ip_vs_app call. 
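One detail worth noting in the UDP checksum handling here: a UDP checksum field of zero on the wire means "no checksum was computed", so whenever the (re)calculated checksum happens to be 0 it must be transmitted as 0xFFFF instead, which is why udp_fast_csum_update() and both NAT handlers end with that substitution. In one's-complement arithmetic 0xFFFF and 0 are the same value, so the substitution does not invalidate the sum. The rule in isolation:

#include <stdio.h>
#include <stdint.h>

/* a computed UDP checksum of 0 must be sent as 0xFFFF, because an
 * on-the-wire 0 means "sender did not compute a checksum" */
static uint16_t udp_finalize_csum(uint16_t computed)
{
    return computed == 0 ? 0xFFFF : computed;
}

int main(void)
{
    printf("0x0000 -> 0x%04x\n", udp_finalize_csum(0x0000));
    printf("0x1234 -> 0x%04x\n", udp_finalize_csum(0x1234));
    return 0;
}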
+ * will fix ip_vs_conn and iph ack_seq stuff + */ + if (ip_vs_app_pkt_in(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + size = skb->len - ihl; + } + + /* + * Adjust UDP checksums + */ + if (!cp->app && (h.uh->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(&h, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + h.uh->check = 0; + h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw, size, 0)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct iphdr *iph, union ip_vs_tphdr h, int size) +{ + if (h.uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial(h.raw, size, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, + iph->protocol, skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, iph, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static spinlock_t udp_app_lock = SPIN_LOCK_UNLOCKED; + +static inline __u16 udp_app_hashkey(__u16 port) +{ + return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + struct list_head *t, *p; + __u16 hash, port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + t = &udp_apps[hash]; + + spin_lock_bh(&udp_app_lock); + for (p = t->next; p != t; p = p->next) { + i = list_entry(p, struct ip_vs_app, p_list); + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, t); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct list_head *t, *p; + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + t = &udp_apps[hash]; + + spin_lock(&udp_app_lock); + for (p = t->next; p != t; p = p->next) { + inc = list_entry(p, struct ip_vs_app, p_list); + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + +static int 
udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, + int direction, struct iphdr *iph, + union ip_vs_tphdr h, struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +extern void +tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg); + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .minhlen = sizeof(struct udphdr), + .minhlen_icmp = 8, + .dont_defrag = 0, + .skip_nonexisting = 0, + .slave = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c new file mode 100644 index 000000000000..6401222c2324 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -0,0 +1,122 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
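As with TCP earlier in the patch, the UDP handler keeps its timeouts in a small array indexed by connection state: a state transition just copies the entry into cp->timeout, and ip_vs_set_state_timeout() lets an entry be overridden by state name. A compact sketch of that table lookup (HZ is stubbed to 100 purely for the arithmetic; in the kernel it is the tick rate):

#include <stdio.h>

#define HZ 100                   /* illustrative tick rate for this sketch */

enum udp_state { UDP_S_NORMAL, UDP_S_LAST };

/* per-state timeout table, indexed by connection state */
static int udp_timeouts[UDP_S_LAST + 1] = {
    [UDP_S_NORMAL] = 5 * 60 * HZ,
    [UDP_S_LAST]   = 2 * HZ,
};

int main(void)
{
    enum udp_state state = UDP_S_NORMAL;

    /* the state_transition hook effectively does:
     *     cp->timeout = pp->timeout_table[state];        */
    printf("state %d -> timeout %d ticks (%d s)\n",
           state, udp_timeouts[state], udp_timeouts[state] / HZ);
    return 0;
}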
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_rr_init_svc, + .done_service = ip_vs_rr_done_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c new file mode 100644 index 000000000000..ec23d00a2ee7 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -0,0 +1,261 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
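ip_vs_rr_schedule() above resumes from the list position saved in svc->sched_data, walks the circular destination list at most once, skips the list head plus quiesced (weight 0) and overloaded servers, and stores the new position on a hit. The same cursor idea over a plain array, as a standalone sketch:

#include <stdio.h>

struct server {
    const char *name;
    int weight;        /* 0 means quiesced */
    int overloaded;
};

/* round-robin over a fixed pool, remembering the last pick between calls
 * (the kernel keeps a list pointer in svc->sched_data instead of an index) */
static struct server *rr_pick(struct server *pool, int n, int *last)
{
    for (int tried = 0; tried < n; tried++) {
        int i = (*last + 1 + tried) % n;

        if (pool[i].weight > 0 && !pool[i].overloaded) {
            *last = i;
            return &pool[i];
        }
    }
    return NULL;   /* every server quiesced or overloaded */
}

int main(void)
{
    struct server pool[] = { { "a", 1, 0 }, { "b", 0, 0 }, { "c", 1, 0 } };
    int last = -1;

    for (int k = 0; k < 4; k++) {
        struct server *s = rr_pick(pool, 3, &last);
        printf("%s ", s ? s->name : "-");
    }
    printf("\n");   /* prints: a c a c -- "b" is skipped while its weight is 0 */
    return 0;
}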
+ * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <asm/string.h> +#include <linux/kmod.h> + +#include <net/ip_vs.h> + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static rwlock_t __ip_vs_sched_lock = RW_LOCK_UNLOCKED; + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + struct list_head *l, *e; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + l = &ip_vs_schedulers; + + read_lock_bh(&__ip_vs_sched_lock); + + for (e=l->next; e!=l; e=e->next) { + sched = list_entry(e, struct ip_vs_scheduler, n_list); + + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + char module_name[IP_VS_SCHEDNAME_MAXLEN+8]; + sprintf(module_name,"ip_vs_%s", sched_name); + request_module(module_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + 
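ip_vs_scheduler_get() above first searches the registered schedulers by name and, when the name is unknown, asks for a module called "ip_vs_<name>" to be loaded via request_module() and repeats the search. A user-space sketch of that load-and-retry pattern; the toy registry and loader below exist only for this sketch and are not kernel interfaces:

#include <stdio.h>
#include <string.h>

/* toy registry: pretend only "rr" is present until another "module" loads */
static char loaded_name[32] = "rr";

static const char *registry_find(const char *name)
{
    return strcmp(loaded_name, name) == 0 ? loaded_name : NULL;
}

static void registry_load_module(const char *module)
{
    /* in the kernel this is request_module(); here we just record the name */
    printf("loading %s\n", module);
    snprintf(loaded_name, sizeof(loaded_name), "%s", module + strlen("ip_vs_"));
}

/* look up by name; if missing, try loading "ip_vs_<name>" and look again */
static const char *scheduler_get(const char *name)
{
    const char *sched = registry_find(name);

    if (!sched) {
        char module[40];

        snprintf(module, sizeof(module), "ip_vs_%s", name);
        registry_load_module(module);
        sched = registry_find(name);
    }
    return sched;
}

int main(void)
{
    printf("got scheduler: %s\n", scheduler_get("wlc"));
    return 0;
}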
/* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + sched = ip_vs_sched_getbyname(scheduler->name); + if (sched) { + ip_vs_scheduler_put(sched); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", scheduler->name); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + + if (scheduler->n_list.next != &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (scheduler->n_list.next == &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c new file mode 100644 index 000000000000..b0684065fef8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -0,0 +1,171 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). 
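The SED comment above notes that the only difference from WLC is counting the incoming job: SED minimizes (Ci + 1) / Ui rather than Ci / Ui. A small worked example, with made-up numbers, where that extra +1 flips the decision, using the same integer cross-multiplication the schedulers use:

#include <stdio.h>

int main(void)
{
    /* server A: weight 5, one active job; server B: weight 1, idle */
    unsigned wa = 5, ca = 1;
    unsigned wb = 1, cb = 0;

    /* x/wx < y/wy  <=>  x*wy < y*wx  (all weights positive) */
    int wlc_picks_a = (ca * wb) < (cb * wa);              /* 1 < 0: false -> WLC picks B */
    int sed_picks_a = ((ca + 1) * wb) < ((cb + 1) * wa);  /* 2 < 5: true  -> SED picks A */

    printf("WLC picks %s, SED picks %s\n",
           wlc_picks_a ? "A" : "B",
           sed_picks_a ? "A" : "B");
    return 0;
}

Because SED charges the job being placed to the candidate server, the fast but busy server A ends up cheaper than the idle slow server B, which is exactly the behaviour the file header describes for heterogeneous weights.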
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +static int +ip_vs_sed_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (!(least->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&least->weight) > 0) { + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sed_init_svc, + .done_service = ip_vs_sed_done_svc, + .update_service = ip_vs_sed_update_svc, + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c new file mode 100644 index 000000000000..4b66ea2b0e91 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
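ip_vs_sed_scheduler above, like the rr and nq ones, is just a filled-in struct ip_vs_scheduler: a name, the owning module, and function pointers the IPVS core calls through once register_ip_vs_scheduler() has linked it into the global list. A stripped-down sketch of that ops-table pattern (the types and hooks here are invented for the sketch, not the kernel's):

#include <stdio.h>

struct service;                       /* opaque in this sketch */

/* a scheduler is a named table of hooks the core calls through */
struct scheduler {
    const char *name;
    int         (*init_service)(struct service *svc);
    const char *(*schedule)(struct service *svc);
};

static int demo_init(struct service *svc)
{
    (void)svc;
    return 0;
}

static const char *demo_schedule(struct service *svc)
{
    (void)svc;
    return "10.0.0.1:80";             /* a fixed pick, just for the demo */
}

static struct scheduler demo_sched = {
    .name         = "demo",
    .init_service = demo_init,
    .schedule     = demo_schedule,
};

int main(void)
{
    struct service *svc = NULL;       /* unused by the demo hooks */

    demo_sched.init_service(svc);
    printf("[%s] scheduled -> %s\n", demo_sched.name, demo_sched.schedule(svc));
    return 0;
}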
+ * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
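ip_vs_sh_hashkey() above maps the packet's source address to one of 256 buckets with a multiplicative hash (2654435761 is the familiar Knuth golden-ratio constant), so a given client IP always resolves to the same bucket, and therefore to the same real server, for as long as the table is unchanged. The hash in isolation, taking host-order addresses:

#include <stdio.h>
#include <stdint.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1u << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* multiplicative hash of the (host-order) source address, as in the patch */
static unsigned int sh_hashkey(uint32_t addr)
{
    return (unsigned int)((addr * 2654435761UL) & SH_TAB_MASK);
}

int main(void)
{
    uint32_t clients[] = { 0xC0A80001u, 0xC0A80002u, 0x0A000001u };
    /* 192.168.0.1, 192.168.0.2, 10.0.0.1 */

    for (int i = 0; i < 3; i++)
        printf("0x%08x -> bucket %u\n", clients[i], sh_hashkey(clients[i]));
    return 0;
}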
+ */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c new file mode 100644 index 000000000000..540c38d9980a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -0,0 +1,909 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
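ip_vs_sh_assign() and the init/update hooks above rebuild the bucket table by walking the destination list cyclically until every bucket points at a server (taking a reference on each), while ip_vs_sh_flush() drops those references again. The assignment loop on its own, with strings standing in for struct ip_vs_dest, a smaller table, and reference counting omitted:

#include <stdio.h>

#define SH_TAB_SIZE 8   /* 256 in the patch; smaller here so the output fits */

/* fill every bucket by cycling through the destination list; an empty
 * list leaves NULL buckets */
static void sh_assign(const char *buckets[], const char *dests[], int ndests)
{
    int d = 0;

    for (int i = 0; i < SH_TAB_SIZE; i++) {
        buckets[i] = ndests ? dests[d] : NULL;
        if (ndests)
            d = (d + 1) % ndests;
    }
}

int main(void)
{
    const char *dests[] = { "rs1", "rs2", "rs3" };
    const char *buckets[SH_TAB_SIZE];

    sh_assign(buckets, dests, 3);
    for (int i = 0; i < SH_TAB_SIZE; i++)
        printf("bucket %d -> %s\n", i, buckets[i]);
    return 0;
}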
+ *
+ * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ */
+
+#define __KERNEL_SYSCALLS__ /* for waitpid */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/completion.h>
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <asm/uaccess.h> /* for get_fs and set_fs */
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+
+/*
+ * IPVS sync connection entry
+ */
+struct ip_vs_sync_conn {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __u16 cport;
+ __u16 vport;
+ __u16 dport;
+ __u32 caddr; /* client address */
+ __u32 vaddr; /* virtual address */
+ __u32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __u16 flags; /* status flags */
+ __u16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+ The master multicasts messages to the backup load balancers in the
+ following format.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ | . |
+ | .
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static spinlock_t ip_vs_sync_lock = SPIN_LOCK_UNLOCKED; + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static spinlock_t curr_sb_lock = SPIN_LOCK_UNLOCKED; + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* multicast addr */ +static struct sockaddr_in mcast_addr; + + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + list_add_tail(&sb->list, &ip_vs_sync_queue); + spin_unlock(&ip_vs_sync_lock); +} + +static inline struct ip_vs_sync_buff * sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && + (jiffies - curr_sb->firstuse > time || time == 0)) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr; + s->vaddr = cp->vaddr; + s->daddr = cp->daddr; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + char *p; + int i; + + if (buflen != m->size) { + IP_VS_ERR("bogus message\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; i<m->nr_conns; i++) { + s = (struct ip_vs_sync_conn *)p; + cp = ip_vs_conn_in_get(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport); + if (!cp) { + cp = ip_vs_conn_new(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport, + s->daddr, s->dport, + ntohs(s->flags), NULL); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + cp->state = ntohs(s->state); + } else if (!cp->dest) { + /* it is an entry created by the synchronization */ + cp->state = ntohs(s->state); + cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED; + } /* Note that we don't touch its state and flags + if it is a normal entry. */ + + if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(&cp->in_seq, opt, sizeof(*opt)); + p += FULL_CONN_SIZE; + } else + p += SIMPLE_CONN_SIZE; + + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; + ip_vs_conn_put(cp); + + if (p > buffer+buflen) { + IP_VS_ERR("bogus message\n"); + return; + } + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_opt *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 
1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_opt *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_opt *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = + SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. 
+ */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + u32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + if (sock->ops->connect(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr), 0) < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + if (sock->ops->bind(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr)) < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + if (join_mcast_group(sock->sk, + (struct in_addr*)&mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn) < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg; + mm_segment_t oldfs; + struct iovec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_sendmsg(sock, &msg, (size_t)(length)); + set_fs(oldfs); + + LeaveFunction(7); + return len; +} + + +static 
int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg; + struct iovec iov; + int len; + mm_segment_t oldfs; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_recvmsg(sock, &msg, buflen, 0); + set_fs(oldfs); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int errno; + +static DECLARE_WAIT_QUEUE_HEAD(sync_wait); +static pid_t sync_master_pid = 0; +static pid_t sync_backup_pid = 0; + +static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); +static int stop_master_sync = 0; +static int stop_backup_sync = 0; + +static void sync_master_loop(void) +{ + struct socket *sock; + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg *m; + + /* create the sending multicast socket */ + sock = make_send_sock(); + if (!sock) + return; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + for (;;) { + while ((sb=sb_dequeue())) { + m = sb->mesg; + if (ip_vs_send_async(sock, (char *)m, + m->size) != m->size) + IP_VS_ERR("ip_vs_send_async error\n"); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + if ((sb = get_curr_sync_buff(2*HZ))) { + m = sb->mesg; + if (ip_vs_send_async(sock, (char *)m, + m->size) != m->size) + IP_VS_ERR("ip_vs_send_async error\n"); + ip_vs_sync_buff_release(sb); + } + + if (stop_master_sync) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + __set_current_state(TASK_RUNNING); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(sock); +} + + +static void sync_backup_loop(void) +{ + struct socket *sock; + char *buf; + int len; + + if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { + IP_VS_ERR("sync_backup_loop: kmalloc error\n"); + return; + } + + /* create the receiving multicast socket */ + sock = make_receive_sock(); + if (!sock) + goto out; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + for (;;) { + /* do you have data now? 
*/
+ while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
+ if ((len =
+ ip_vs_receive(sock, buf,
+ sync_recv_mesg_maxlen)) <= 0) {
+ IP_VS_ERR("receiving message error\n");
+ break;
+ }
+ /* disable bottom half, because it accesses data
+ shared with softirq while getting/creating conns */
+ local_bh_disable();
+ ip_vs_process_message(buf, len);
+ local_bh_enable();
+ }
+
+ if (stop_backup_sync)
+ break;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* release the receiving multicast socket */
+ sock_release(sock);
+
+ out:
+ kfree(buf);
+}
+
+
+static void set_sync_pid(int sync_state, pid_t sync_pid)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ sync_master_pid = sync_pid;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ sync_backup_pid = sync_pid;
+}
+
+static void set_stop_sync(int sync_state, int set)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ stop_master_sync = set;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ stop_backup_sync = set;
+ else {
+ stop_master_sync = set;
+ stop_backup_sync = set;
+ }
+}
+
+static int sync_thread(void *startup)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ mm_segment_t oldmm;
+ int state;
+ const char *name;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
+ state = IP_VS_STATE_MASTER;
+ name = "ipvs syncmaster";
+ } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
+ state = IP_VS_STATE_BACKUP;
+ name = "ipvs syncbackup";
+ } else {
+ IP_VS_BUG();
+ ip_vs_use_count_dec();
+ return -EINVAL;
+ }
+
+ daemonize(name);
+
+ oldmm = get_fs();
+ set_fs(KERNEL_DS);
+
+ /* Block all signals */
+ spin_lock_irq(&current->sighand->siglock);
+ siginitsetinv(&current->blocked, 0);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* set the maximum length of sync message */
+ set_sync_mesg_maxlen(state);
+
+ /* set up multicast address */
+ mcast_addr.sin_family = AF_INET;
+ mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
+ mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
+
+ add_wait_queue(&sync_wait, &wait);
+
+ set_sync_pid(state, current->pid);
+ complete((struct completion *)startup);
+
+ /* processing master/backup loop here */
+ if (state == IP_VS_STATE_MASTER)
+ sync_master_loop();
+ else if (state == IP_VS_STATE_BACKUP)
+ sync_backup_loop();
+ else IP_VS_BUG();
+
+ remove_wait_queue(&sync_wait, &wait);
+
+ /* thread exits */
+ set_sync_pid(state, 0);
+ IP_VS_INFO("sync thread stopped!\n");
+
+ set_fs(oldmm);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ set_stop_sync(state, 0);
+ wake_up(&stop_sync_wait);
+
+ return 0;
+}
+
+
+static int fork_sync_thread(void *startup)
+{
+ /* fork the sync thread here, then the parent process of the
+ sync thread is the init process after this thread exits.
*/ + if (kernel_thread(sync_thread, startup, 0) < 0) + IP_VS_BUG(); + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + DECLARE_COMPLETION(startup); + pid_t pid; + int waitpid_result; + + if ((state == IP_VS_STATE_MASTER && sync_master_pid) || + (state == IP_VS_STATE_BACKUP && sync_backup_pid)) + return -EEXIST; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %d bytes\n", + sizeof(struct ip_vs_sync_conn)); + + ip_vs_sync_state |= state; + if (state == IP_VS_STATE_MASTER) { + strcpy(ip_vs_master_mcast_ifn, mcast_ifn); + ip_vs_master_syncid = syncid; + } else { + strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); + ip_vs_backup_syncid = syncid; + } + + if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) + IP_VS_BUG(); + + if ((waitpid_result = waitpid(pid, NULL, __WCLONE)) != pid) { + IP_VS_ERR("%s: waitpid(%d,...) failed, errno %d\n", + __FUNCTION__, pid, -waitpid_result); + } + + wait_for_completion(&startup); + + return 0; +} + + +int stop_sync_thread(int state) +{ + DECLARE_WAITQUEUE(wait, current); + + if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || + (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) + return -ESRCH; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_INFO("stopping sync thread %d ...\n", + (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&stop_sync_wait, &wait); + set_stop_sync(state, 1); + ip_vs_sync_state -= state; + wake_up(&sync_wait); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&stop_sync_wait, &wait); + + /* Note: no need to reap the sync thread, because its parent + process is the init process */ + + if ((state == IP_VS_STATE_MASTER && stop_master_sync) || + (state == IP_VS_STATE_BACKUP && stop_backup_sync)) + IP_VS_BUG(); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c new file mode 100644 index 000000000000..9ba05f2ad458 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -0,0 +1,159 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We think the overhead of processing active connections is 256
+ * times higher than that of inactive connections on average. (This
+ * 256 times might not be accurate; we will change it later.) We
+ * use the following formula to estimate the overhead now:
+ * dest->activeconns*256 + dest->inactconns
+ */
+ return (atomic_read(&dest->activeconns) << 8) +
+ atomic_read(&dest->inactconns);
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&least->weight) > 0) {
+ loh = ip_vs_wlc_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wlc_init_svc, + .done_service = ip_vs_wlc_done_svc, + .update_service = ip_vs_wlc_update_svc, + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c new file mode 100644 index 000000000000..8dc38ca2158c --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -0,0 +1,251 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> + +#include <net/ip_vs.h> + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest; + int weight; + int g = 1; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + weight = atomic_read(&dest->weight); + if (weight > 0) { + g = weight; + break; + } + } + if (e == l) + return g; + + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + weight = atomic_read(&dest->weight); + if (weight > 0) + g = gcd(weight, g); + } + + return g; +} + + +/* + * Get the maximum weight of the service destinations. 
+ */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest; + int weight = 0; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no availabe servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_INFO("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p) { + /* back to the start, and no dest is found. 
+ It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c new file mode 100644 index 000000000000..0207a3f3f8d0 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -0,0 +1,635 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/compiler.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> /* for ip_route_output */ +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = 0; + return NULL; + } + dst_hold(dst); + return dst; +} + +static inline struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr, + .saddr = 0, + .tos = rtos, } }, + .proto = cp->protocol, + }; + + if (ip_route_output_key(&rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr), 
+ atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr, + .saddr = 0, + .tos = rtos, } }, + .proto = cp->protocol, + }; + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + return NULL; + } + } + + return rt; +} + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + + +static inline int +ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom, + struct iphdr **iph_p, unsigned char **t_p) +{ + int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); + + if (delta < 0) + delta = 0; + + if (delta ||skb_cloned(skb)) { + if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* skb data changed, update pointers */ + *iph_p = skb->nh.iph; + *t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4; + } + return 0; +} + + +#define IP_VS_XMIT(skb, rt) \ +do { \ + skb->nfcache |= NFC_IPVS_PROPERTY; \ + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \ + rt->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + .proto = iph->protocol, + }; + + EnterFunction(10); + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_pmtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + if (skb_is_nonlinear(skb) && skb->len <= mtu) + ip_send_check(iph); + + if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { + ip_rt_put(rt); + IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n"); + goto tx_error; + } + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif /* CONFIG_NETFILTER_DEBUG */ + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph; + union ip_vs_tphdr h; + int ihl; + unsigned short size; + int mtu; + + EnterFunction(10); + + /* + * If it has ip_vs_app helper, the helper may change the 
payload,
+ * so it needs full checksum checking and checksum calculation.
+ * If not, only the header (such as IP address and port number)
+ * will be changed, so it is fast to do incremental checksum update,
+ * and let the destination host do final checksum checking.
+ */
+
+ if (unlikely(cp->app && !pp->slave)) {
+ if (skb_is_nonlinear(skb) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0)
+ return NF_DROP;
+ }
+
+ iph = skb->nh.iph;
+ ihl = iph->ihl << 2;
+ h.raw = (char*) iph + ihl;
+ size = ntohs(iph->tot_len) - ihl;
+
+ /* do TCP/UDP checksum checking if it has application helper */
+ if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
+ if (!pp->csum_check(skb, pp, iph, h, size))
+ goto tx_error;
+ }
+
+ /*
+ * Check if it is a no-client-port connection ...
+ */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ ip_vs_conn_fill_cport(cp, h.portp[0]);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
+ }
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for");
+ goto tx_error;
+ }
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* copy-on-write the packet before mangling it */
+ if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
+ return NF_DROP;
+
+ /* mangle the packet */
+ iph->daddr = cp->daddr;
+ if (pp->dnat_handler) {
+ pp->dnat_handler(skb, pp, cp, iph, h, size);
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ }
+ ip_send_check(iph);
+
+ IP_VS_DBG_PKT(10, pp, iph, "After DNAT");
+
+ /* FIXME: when the application helper enlarges the packet and the length
+ is larger than the MTU of the outgoing device, there will still
+ be an MTU problem. */
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, its
+ * destination will be set to cp->daddr. Most code of this function
+ * is taken from ipip.c.
+ *
+ * It is used in a VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ * the encapsulated packet, it will decapsulate the packet, process
+ * the request and return the response packets directly to the client
+ * without passing the load balancer. This can greatly increase the
+ * scalability of the virtual server.
+ * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + u8 tos = old_iph->tos; + u16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + __constant_htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + if (skb_is_nonlinear(skb)) + ip_send_check(old_iph); + + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return -EINVAL; + } + kfree_skb(skb); + skb = new_skb; + } + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, NULL); + ip_send_check(iph); + + skb->ip_summed = CHECKSUM_NONE; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif /* CONFIG_NETFILTER_DEBUG */ + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_pmtu(&rt->u.dst); + if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + if (skb_is_nonlinear(skb) && skb->len <= mtu) + ip_send_check(iph); + + if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { + ip_rt_put(rt); + IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n"); + goto tx_error; + } + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif /* CONFIG_NETFILTER_DEBUG */ + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph; + struct icmphdr *icmph; + struct iphdr *ciph; /* The ip header contained within the ICMP */ + unsigned short len; + union ip_vs_tphdr h; + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + atomic_inc(&cp->in_pkts); + __ip_vs_conn_put(cp); + goto out; + } + + iph = skb->nh.iph; + icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2)); + len = ntohs(iph->tot_len) - (iph->ihl<<2); + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_pmtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* copy-on-write the packet before mangling it */ + if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, + &iph, (unsigned char**)&icmph)) { + rc = NF_DROP; + goto out; + } + ciph = (struct iphdr *) (icmph + 1); + h.raw = (char *) ciph + (ciph->ihl << 2); + + /* 
The ICMP packet for VS/NAT must be written to correct addresses + before being forwarded to the right server */ + + /* First change the dest IP address, and recalc checksum */ + iph->daddr = cp->daddr; + ip_send_check(iph); + + /* Now change the *source* address in the contained IP */ + ciph->saddr = cp->daddr; + ip_send_check(ciph); + + /* the TCP/UDP source port - cannot redo check */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) + h.portp[0] = cp->dport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP"); + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif /* CONFIG_NETFILTER_DEBUG */ + IP_VS_XMIT(skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; +} diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c index 967ee1266d11..ed9ceca79e56 100644 --- a/net/ipv4/netfilter/ip_fw_compat.c +++ b/net/ipv4/netfilter/ip_fw_compat.c @@ -19,6 +19,12 @@ struct notifier_block; static struct firewall_ops *fwops; +#ifdef CONFIG_IP_VS +/* From ip_vs_core.c */ +extern unsigned int +check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *)); +#endif + /* They call these; we do what they want. */ int register_firewall(int pf, struct firewall_ops *fw) { @@ -134,8 +140,14 @@ fw_in(unsigned int hooknum, return NF_ACCEPT; case FW_MASQUERADE: - if (hooknum == NF_IP_FORWARD) + if (hooknum == NF_IP_FORWARD) { +#ifdef CONFIG_IP_VS + /* check if it is for ip_vs */ + if (check_for_ip_vs_out(pskb, okfn) == NF_STOLEN) + return NF_STOLEN; +#endif return do_masquerade(pskb, out); + } else return NF_ACCEPT; case FW_REDIRECT: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5edda7d7a18..9956944a50bf 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -496,6 +496,12 @@ static __inline__ void fib6_start_gc(struct rt6_info *rt) mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); } +void fib6_force_start_gc(void) +{ + if (ip6_fib_timer.expires == 0) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + /* * Add routing information to the routing tree. 
* <destination addr>/<source addr> @@ -1214,6 +1220,7 @@ void fib6_run_gc(unsigned long dummy) write_lock_bh(&rt6_lock); + ndisc_dst_gc(&gc_args.more); fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); write_unlock_bh(&rt6_lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b7cca33ff75b..75a9eeaa05d0 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -402,25 +402,6 @@ static int ndisc_output(struct sk_buff *skb) return -EINVAL; } -static inline struct dst_entry *ndisc_dst_alloc(struct net_device *dev, - struct neighbour *neigh) -{ - struct rt6_info *rt = ip6_dst_alloc(); - - if (unlikely(rt == NULL)) - goto out; - - rt->rt6i_dev = dev; - rt->rt6i_nexthop = neigh; - rt->rt6i_expires = 0; - rt->rt6i_flags = RTF_LOCAL; - rt->rt6i_metric = 0; - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; - rt->u.dst.output = ndisc_output; -out: - return (struct dst_entry *)rt; -} - static inline void ndisc_flow_init(struct flowi *fl, u8 type, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -463,13 +444,13 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr); - dst = ndisc_dst_alloc(dev, neigh); + dst = ndisc_dst_alloc(dev, neigh, ndisc_output); if (!dst) return; err = xfrm_lookup(&dst, &fl, NULL, 0); if (err < 0) { - dst_free(dst); + dst_release(dst); return; } @@ -485,7 +466,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, if (skb == NULL) { ND_PRINTK1("send_na: alloc skb failed\n"); - dst_free(dst); + dst_release(dst); return; } @@ -515,7 +496,6 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, csum_partial((__u8 *) msg, len, 0)); - dst_clone(dst); skb->dst = dst; idev = in6_dev_get(dst->dev); dst_output(skb); @@ -550,10 +530,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr); - dst = ndisc_dst_alloc(dev, neigh); + dst = ndisc_dst_alloc(dev, neigh, ndisc_output); if (!dst) return; - dst_clone(dst); err = xfrm_lookup(&dst, &fl, NULL, 0); if (err < 0) { @@ -570,6 +549,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, 1, &err); if (skb == NULL) { ND_PRINTK1("send_ns: alloc skb failed\n"); + dst_release(dst); return; } @@ -595,7 +575,6 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, csum_partial((__u8 *) msg, len, 0)); /* send it! */ - dst_clone(dst); skb->dst = dst; idev = in6_dev_get(dst->dev); dst_output(skb); @@ -622,10 +601,9 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr); - dst = ndisc_dst_alloc(dev, NULL); + dst = ndisc_dst_alloc(dev, NULL, ndisc_output); if (!dst) return; - dst_clone(dst); err = xfrm_lookup(&dst, &fl, NULL, 0); if (err < 0) { @@ -664,7 +642,6 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, csum_partial((__u8 *) hdr, len, 0)); /* send it! 
*/ - dst_clone(dst); skb->dst = dst; idev = in6_dev_get(dst->dev); dst_output(skb); @@ -1321,7 +1298,6 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, if (rt == NULL) return; dst = &rt->u.dst; - dst_clone(dst); err = xfrm_lookup(&dst, &fl, NULL, 0); if (err) { @@ -1329,16 +1305,17 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, return; } + rt = (struct rt6_info *) dst; + if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); - dst_release(&rt->u.dst); + dst_release(dst); return; } - if (!xrlim_allow(&rt->u.dst, 1*HZ)) { - dst_release(&rt->u.dst); + if (!xrlim_allow(dst, 1*HZ)) { + dst_release(dst); return; } - dst_release(&rt->u.dst); if (dev->addr_len) { if (neigh->nud_state&NUD_VALID) { @@ -1348,6 +1325,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, We will make it later, when will be sure, that it is alive. */ + dst_release(dst); return; } } @@ -1366,11 +1344,11 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, hlen = 0; - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + skb_reserve(buff, (dev->hard_header_len + 15) & ~15); ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, IPPROTO_ICMPV6, len); - skb->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len); + buff->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len); memset(icmph, 0, sizeof(struct icmp6hdr)); icmph->icmp6_type = NDISC_REDIRECT; @@ -1408,9 +1386,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); - skb->dst = dst; + buff->dst = dst; idev = in6_dev_get(dst->dev); - dst_output(skb); + dst_output(buff); ICMP6_INC_STATS(idev, Icmp6OutRedirects); ICMP6_INC_STATS(idev, Icmp6OutMsgs); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ff37d74152fb..1b9b06e5fc51 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -131,16 +131,11 @@ rwlock_t rt6_lock = RW_LOCK_UNLOCKED; /* allocate dst with ip6_dst_ops */ -static __inline__ struct rt6_info *__ip6_dst_alloc(void) +static __inline__ struct rt6_info *ip6_dst_alloc(void) { return dst_alloc(&ip6_dst_ops); } -struct rt6_info *ip6_dst_alloc(void) -{ - return __ip6_dst_alloc(); -} - /* * Route lookup. Any rt6_lock is implied. */ @@ -560,6 +555,60 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } +/* Protected by rt6_lock. 
*/ +static struct dst_entry *ndisc_dst_gc_list; + +struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + int (*output)(struct sk_buff *)) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (unlikely(rt == NULL)) + goto out; + + rt->rt6i_dev = dev; + rt->rt6i_nexthop = neigh; + rt->rt6i_expires = 0; + rt->rt6i_flags = RTF_LOCAL; + rt->rt6i_metric = 0; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; + rt->u.dst.output = output; + + write_lock_bh(&rt6_lock); + rt->u.dst.next = ndisc_dst_gc_list; + ndisc_dst_gc_list = &rt->u.dst; + write_unlock_bh(&rt6_lock); + + fib6_force_start_gc(); + +out: + return (struct dst_entry *)rt; +} + +int ndisc_dst_gc(int *more) +{ + struct dst_entry *dst, *next, **pprev; + int freed; + + next = NULL; + pprev = &ndisc_dst_gc_list; + freed = 0; + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + freed++; + } else { + pprev = &dst->next; + (*more)++; + } + } + + return freed; +} + static int ip6_dst_gc(void) { static unsigned expire = 30*HZ; @@ -655,7 +704,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) if (rtmsg->rtmsg_metric == 0) rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; - rt = __ip6_dst_alloc(); + rt = ip6_dst_alloc(); if (rt == NULL) return -ENOMEM; @@ -1066,7 +1115,7 @@ out: static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) { - struct rt6_info *rt = __ip6_dst_alloc(); + struct rt6_info *rt = ip6_dst_alloc(); if (rt) { rt->u.dst.input = ort->u.dst.input; @@ -1209,7 +1258,7 @@ int ip6_pkt_discard(struct sk_buff *skb) int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev) { - struct rt6_info *rt = __ip6_dst_alloc(); + struct rt6_info *rt = ip6_dst_alloc(); if (rt == NULL) return -ENOMEM; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 54d88de80e20..8cd678de82eb 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -430,6 +430,10 @@ retry: goto no_dst; nlk = nlk_sk(sk); + /* Don't bother queuing skb if kernel socket has no input function */ + if (nlk->pid == 0 && !nlk->data_ready) + goto no_dst; + #ifdef NL_EMULATE_DEV if (nlk->handler) { skb_orphan(skb); diff --git a/net/netsyms.c b/net/netsyms.c index f327ebe6c755..7a6b4148a020 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -265,6 +265,7 @@ EXPORT_SYMBOL(inet_family_ops); EXPORT_SYMBOL(in_aton); EXPORT_SYMBOL(ip_mc_inc_group); EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(ip_mc_join_group); EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_dgram_ops); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dcfd387ab328..45774f24940c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -855,6 +855,7 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x, { return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && + (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->aalgos & (1<<x->props.aalgo)) && !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family)); |
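
The following user-space sketches illustrate the arithmetic used by the schedulers and the sync protocol added in this merge; they are not part of the patch, and the server names, addresses and counters in them are made up for illustration. First, the source-hashing scheduler (ip_vs_sh.c) maps a packet's source address onto one of 256 buckets with a multiplicative hash and fills the buckets cyclically from the destination list, so the source address alone determines the real server. A minimal sketch of that idea, assuming plain arrays in place of the kernel's ip_vs_dest list:

#include <arpa/inet.h>	/* inet_addr, ntohl */
#include <stdint.h>
#include <stdio.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* same multiplicative (golden-ratio) hash as ip_vs_sh_hashkey() */
static unsigned sh_hashkey(uint32_t addr_be)
{
	return (ntohl(addr_be) * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
	/* stand-ins for the real servers on svc->destinations */
	const char *servers[] = { "192.168.0.10", "192.168.0.11", "192.168.0.12" };
	const char *bucket[SH_TAB_SIZE];
	const char *clients[] = { "10.0.0.1", "10.0.0.2", "172.16.3.4" };
	int i;

	/* like ip_vs_sh_assign(): walk the server list cyclically over all buckets */
	for (i = 0; i < SH_TAB_SIZE; i++)
		bucket[i] = servers[i % 3];

	/* like ip_vs_sh_schedule(): only the source address picks the server */
	for (i = 0; i < 3; i++) {
		unsigned h = sh_hashkey(inet_addr(clients[i]));
		printf("%s -> bucket %u -> %s\n", clients[i], h, bucket[h]);
	}
	return 0;
}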
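
ip_vs_wlc.c estimates per-server overhead as activeconns*256 + inactconns and, since floating point is unavailable in the kernel, compares overhead/weight ratios by cross-multiplication (loh*w2 > doh*w1 instead of loh/w1 > doh/w2). A small sketch of that selection over made-up counters, assuming every non-quiesced weight is positive:

#include <stdio.h>

struct dest {
	const char *name;
	unsigned activeconns, inactconns, weight;	/* weight 0 = quiesced */
};

/* same estimate as ip_vs_wlc_dest_overhead() */
static unsigned overhead(const struct dest *d)
{
	return (d->activeconns << 8) + d->inactconns;
}

int main(void)
{
	struct dest dests[] = {
		{ "rs1", 10, 50, 1 },
		{ "rs2", 12, 10, 2 },
		{ "rs3",  3,  5, 0 },	/* quiesced, never selected */
	};
	struct dest *least = NULL;
	unsigned loh = 0, doh;
	int i;

	for (i = 0; i < 3; i++) {
		if (dests[i].weight == 0)
			continue;
		doh = overhead(&dests[i]);
		/* loh/least->weight > doh/dests[i].weight, written without division */
		if (!least || loh * dests[i].weight > doh * least->weight) {
			least = &dests[i];
			loh = doh;
		}
	}
	if (least)
		printf("least loaded: %s (overhead %u, weight %u)\n",
		       least->name, loh, least->weight);
	return 0;
}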
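
The sync protocol (ip_vs_sync.c) packs a 4-byte header followed by fixed-size connection records into one multicast UDP datagram, and set_sync_mesg_maxlen() derives the send-buffer size from the interface MTU. A rough illustration of the same sizing arithmetic; the structures only mirror the field widths in the patch (the exact sizeof depends on compiler padding) and the 1500-byte MTU is just an example value:

#include <stdint.h>
#include <stdio.h>

/* field-for-field mirrors of struct ip_vs_sync_mesg / ip_vs_sync_conn */
struct sync_mesg { uint8_t nr_conns; uint8_t syncid; uint16_t size; };
struct sync_conn {
	uint8_t  reserved, protocol;
	uint16_t cport, vport, dport;
	uint32_t caddr, vaddr, daddr;
	uint16_t flags, state;
};

int main(void)
{
	int mtu = 1500;				/* example interface MTU */
	int iphdr = 20, udphdr = 8;		/* IPv4 + UDP header sizes */
	int header = (int)sizeof(struct sync_mesg);	/* SYNC_MESG_HEADER_LEN */
	int conn = (int)sizeof(struct sync_conn);	/* SIMPLE_CONN_SIZE */

	/* like set_sync_mesg_maxlen() for the master: how many simple entries
	 * fit in one datagram, keeping the same 20 bytes of slack */
	int num = (mtu - iphdr - udphdr - header - 20) / conn;
	int maxlen = header + conn * num;

	printf("conn record %d bytes, header %d bytes\n", conn, header);
	printf("entries per datagram %d, send buffer %d bytes\n", num, maxlen);
	return 0;
}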
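
ip_vs_wrr.c keeps a current weight that starts from the maximum weight and drops by the gcd of all weights each time the destination list wraps; a server is chosen whenever its weight is at least the current weight. A compact sketch of one run of that algorithm over three hypothetical servers with positive weights (the all-zero-weight case that the kernel code guards against is ignored here):

#include <stdio.h>

/* same gcd helper as in ip_vs_wrr.c */
static int gcd(int a, int b)
{
	int c;

	while ((c = a % b)) {
		a = b;
		b = c;
	}
	return b;
}

int main(void)
{
	const char *name[] = { "rs1", "rs2", "rs3" };
	int weight[] = { 4, 3, 2 };
	int n = 3, i;

	/* the mark state from struct ip_vs_wrr_mark */
	int mw = 4;			/* maximum weight */
	int di = gcd(gcd(4, 3), 2);	/* decreasing interval */
	int cl = n;			/* slot n stands for the list head */
	int cw = 0;			/* current weight */

	/* nine requests should split 4:3:2 between the three servers */
	for (i = 0; i < 9; i++) {
		for (;;) {
			if (cl == n) {		/* wrapped: lower the current weight */
				cl = 0;
				cw -= di;
				if (cw <= 0)
					cw = mw;
			} else if (++cl == n) {
				continue;	/* reached the head, handle the wrap next pass */
			}
			if (weight[cl] >= cw)
				break;		/* this server takes the request */
		}
		printf("request %d -> %s\n", i + 1, name[cl]);
	}
	return 0;
}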
