From b3bba872ddb0320a7ecb54decae53c13ceb2ed4c Mon Sep 17 00:00:00 2001
From: Wu Fengguang
Date: Thu, 8 Dec 2011 16:53:54 -0600
Subject: writeback: show writeback reason with __print_symbolic

This makes the binary trace understandable by trace-cmd.

CC: Dave Chinner
CC: Curt Wohlgemuth
CC: Steven Rostedt
Signed-off-by: Wu Fengguang
---
 include/trace/events/writeback.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index b99caa8b780c..99d1d0decf88 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -21,6 +21,16 @@
 		{I_REFERENCED,		"I_REFERENCED"} \
 	)

+#define WB_WORK_REASON							\
+	{WB_REASON_BACKGROUND,		"background"},		\
+	{WB_REASON_TRY_TO_FREE_PAGES,	"try_to_free_pages"},	\
+	{WB_REASON_SYNC,		"sync"},		\
+	{WB_REASON_PERIODIC,		"periodic"},		\
+	{WB_REASON_LAPTOP_TIMER,	"laptop_timer"},	\
+	{WB_REASON_FREE_MORE_MEM,	"free_more_memory"},	\
+	{WB_REASON_FS_FREE_SPACE,	"fs_free_space"},	\
+	{WB_REASON_FORKER_THREAD,	"forker_thread"}
+
 struct wb_writeback_work;

 DECLARE_EVENT_CLASS(writeback_work_class,
@@ -55,7 +65,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__entry->for_kupdate,
 		__entry->range_cyclic,
 		__entry->for_background,
-		wb_reason_name[__entry->reason]
+		__print_symbolic(__entry->reason, WB_WORK_REASON)
 	)
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
@@ -184,7 +194,8 @@ TRACE_EVENT(writeback_queue_io,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
 		__entry->moved,
-		wb_reason_name[__entry->reason])
+		__print_symbolic(__entry->reason, WB_WORK_REASON)
+	)
 );

 TRACE_EVENT(global_dirty_state,
--
cgit v1.2.3
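A note on the mechanism: __print_symbolic() records only the raw integer in the ring buffer and emits the value-to-name table into the event's format file, which is exactly what lets offline decoders such as trace-cmd print the names. A minimal, self-contained sketch of the idiom, with a hypothetical event and states (not part of the patch above):

    /* demo_trace.h - hypothetical tracepoint using __print_symbolic() */
    #undef TRACE_SYSTEM
    #define TRACE_SYSTEM demo

    #if !defined(_TRACE_DEMO_H) || defined(TRACE_HEADER_MULTI_READ)
    #define _TRACE_DEMO_H

    #include <linux/tracepoint.h>

    TRACE_EVENT(demo_state_change,
            TP_PROTO(int state),
            TP_ARGS(state),
            TP_STRUCT__entry(
                    __field(int, state)
            ),
            TP_fast_assign(
                    __entry->state = state;
            ),
            /* only __entry->state hits the ring buffer; the table below
             * lands in the event's format description for decoders */
            TP_printk("state=%s",
                    __print_symbolic(__entry->state,
                                     { 0, "idle" },
                                     { 1, "running" },
                                     { 2, "blocked" }))
    );

    #endif /* _TRACE_DEMO_H */
    #include <trace/define_trace.h>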
From 2692ba61a82203404abd7dd2a027bda962861f74 Mon Sep 17 00:00:00 2001
From: Xi Wang
Date: Fri, 16 Dec 2011 12:44:15 +0000
Subject: sctp: fix incorrect overflow check on autoclose

Commit 8ffd3208 voids the previous patches f6778aab and 810c0719 for
limiting the autoclose value.  If userspace passes in -1 on a 32-bit
platform, the overflow check didn't work and autoclose would be set
to 0xffffffff.

This patch defines a max_autoclose (in seconds) for limiting the value
and exposes it through sysctl, with the following intentions.

1) Avoid overflowing autoclose * HZ.

2) Keep the default autoclose bound consistent across 32- and 64-bit
   platforms (INT_MAX / HZ in this patch).

3) Keep the autoclose value consistent between setsockopt() and
   getsockopt() calls.

Suggested-by: Vlad Yasevich
Signed-off-by: Xi Wang
Signed-off-by: David S. Miller
---
 include/net/sctp/structs.h |  4 ++++
 net/sctp/associola.c       |  2 +-
 net/sctp/protocol.c        |  3 +++
 net/sctp/socket.c          |  2 --
 net/sctp/sysctl.c          | 13 +++++++++++++
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e90e7a9935dd..a15432da27c3 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -241,6 +241,9 @@ extern struct sctp_globals {
 	 * bits is an indicator of when to send and window update SACK.
 	 */
 	int rwnd_update_shift;
+
+	/* Threshold for autoclose timeout, in seconds. */
+	unsigned long max_autoclose;
 } sctp_globals;

 #define sctp_rto_initial		(sctp_globals.rto_initial)
@@ -281,6 +284,7 @@ extern struct sctp_globals {
 #define sctp_auth_enable		(sctp_globals.auth_enable)
 #define sctp_checksum_disable		(sctp_globals.checksum_disable)
 #define sctp_rwnd_upd_shift		(sctp_globals.rwnd_update_shift)
+#define sctp_max_autoclose		(sctp_globals.max_autoclose)

 /* SCTP Socket type: UDP or TCP style. */
 typedef enum {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 152b5b3c3fff..acd2edbc073e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -173,7 +173,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
-		(unsigned long)sp->autoclose * HZ;
+		min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ;

 	/* Initializes the timers */
 	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 61b9fca5a173..6f6ad8686833 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1285,6 +1285,9 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_max_instreams		= SCTP_DEFAULT_INSTREAMS;
 	sctp_max_outstreams		= SCTP_DEFAULT_OUTSTREAMS;

+	/* Initialize maximum autoclose timeout. */
+	sctp_max_autoclose		= INT_MAX / HZ;
+
 	/* Initialize handle used for association ids. */
 	idr_init(&sctp_assocs_id);

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 13bf5fcdbff1..54a7cd2fdd7a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2200,8 +2200,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
 		return -EINVAL;
 	if (copy_from_user(&sp->autoclose, optval, optlen))
 		return -EFAULT;
-	/* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */
-	sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ);

 	return 0;
 }
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 6b3952961b85..60ffbd067ff7 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -53,6 +53,10 @@ static int sack_timer_min = 1;
 static int sack_timer_max = 500;
 static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
 static int rwnd_scale_max = 16;
+static unsigned long max_autoclose_min = 0;
+static unsigned long max_autoclose_max =
+	(MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX)
+	? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;

 extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
@@ -258,6 +262,15 @@ static ctl_table sctp_table[] = {
 		.extra1		= &one,
 		.extra2		= &rwnd_scale_max,
 	},
+	{
+		.procname	= "max_autoclose",
+		.data		= &sctp_max_autoclose,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.extra1		= &max_autoclose_min,
+		.extra2		= &max_autoclose_max,
+	},

 	{ /* sentinel */ }
 };
--
cgit v1.2.3
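The arithmetic behind the fix is easy to reproduce in isolation. A standalone sketch of the 32-bit failure mode and the clamp (plain userspace C, not kernel code; the HZ value is an assumption):

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    #define HZ 250                          /* assumption: a common CONFIG_HZ */

    int main(void)
    {
            uint32_t autoclose = (uint32_t)-1;      /* userspace passed -1 */

            /* on a 32-bit unsigned long, autoclose * HZ wraps around */
            uint32_t wrapped = autoclose * HZ;

            /* the fix: clamp to a bound that cannot overflow */
            uint32_t clamped = autoclose < INT_MAX / HZ ? autoclose
                                                        : INT_MAX / HZ;

            printf("wrapped timeout: %u jiffies\n", wrapped);
            printf("clamped timeout: %u s (%u jiffies)\n",
                   clamped, clamped * HZ);
            return 0;
    }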
From c0ed1c14a72ca9ebacd51fb94a8aca488b0d361e Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 21 Dec 2011 16:48:08 -0500
Subject: net: Add a flow_cache_flush_deferred function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

flow_cache_flush() might sleep but can be called from atomic context
via the xfrm garbage collector.  So add a flow_cache_flush_deferred()
function and use this if the xfrm garbage collector is invoked from
within the packet path.

Signed-off-by: Steffen Klassert
Acked-by: Timo Teräs
Signed-off-by: David S. Miller
---
 include/net/flow.h     |  1 +
 net/core/flow.c        | 12 ++++++++++++
 net/xfrm/xfrm_policy.c | 18 ++++++++++++++----
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index a09447749e2d..57f15a7f1cdd 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -207,6 +207,7 @@ extern struct flow_cache_object *flow_cache_lookup(
 		u8 dir, flow_resolve_t resolver, void *ctx);

 extern void flow_cache_flush(void);
+extern void flow_cache_flush_deferred(void);
 extern atomic_t flow_cache_genid;

 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index 8ae42de9c79e..e318c7e98042 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }

+static void flow_cache_flush_task(struct work_struct *work)
+{
+	flow_cache_flush();
+}
+
+static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+
+void flow_cache_flush_deferred(void)
+{
+	schedule_work(&flow_cache_flush_work);
+}
+
 static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
 {
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2118d6446630..9049a5caeb25 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2276,8 +2276,6 @@ static void __xfrm_garbage_collect(struct net *net)
 {
 	struct dst_entry *head, *next;

-	flow_cache_flush();
-
 	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
 	head = xfrm_policy_sk_bundles;
 	xfrm_policy_sk_bundles = NULL;
@@ -2290,6 +2288,18 @@ static void __xfrm_garbage_collect(struct net *net)
 	}
 }

+static void xfrm_garbage_collect(struct net *net)
+{
+	flow_cache_flush();
+	__xfrm_garbage_collect(net);
+}
+
+static void xfrm_garbage_collect_deferred(struct net *net)
+{
+	flow_cache_flush_deferred();
+	__xfrm_garbage_collect(net);
+}
+
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2422,7 +2432,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
 		if (likely(afinfo->garbage_collect == NULL))
-			afinfo->garbage_collect = __xfrm_garbage_collect;
+			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		xfrm_policy_afinfo[afinfo->family] = afinfo;
 	}
 	write_unlock_bh(&xfrm_policy_afinfo_lock);
@@ -2516,7 +2526,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void

 	switch (event) {
 	case NETDEV_DOWN:
-		__xfrm_garbage_collect(dev_net(dev));
+		xfrm_garbage_collect(dev_net(dev));
 	}
 	return NOTIFY_DONE;
 }
--
cgit v1.2.3
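The deferral uses the stock workqueue idiom: the sleeping operation (flow_cache_flush() calls get_online_cpus(), which may sleep) is wrapped in a work item, so atomic-context callers only enqueue it and a kworker thread runs it later in process context. A generic sketch of the same pattern with hypothetical names, not from this patch:

    #include <linux/mutex.h>
    #include <linux/workqueue.h>

    static DEFINE_MUTEX(demo_mutex);

    /* stand-in for any operation that may sleep and therefore
     * must not run in atomic context */
    static void might_sleep_op(void)
    {
            mutex_lock(&demo_mutex);        /* may sleep */
            /* ... do the slow work ... */
            mutex_unlock(&demo_mutex);
    }

    static void deferred_op_task(struct work_struct *work)
    {
            might_sleep_op();               /* runs later, in a kworker */
    }
    static DECLARE_WORK(deferred_op_work, deferred_op_task);

    /* safe from softirq/atomic context: only queues the work item */
    static void might_sleep_op_deferred(void)
    {
            schedule_work(&deferred_op_work);
    }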
From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat"
Date: Thu, 22 Dec 2011 02:45:29 +0530
Subject: VFS: Fix race between CPU hotplug and lglocks

Currently, the *_global_[un]lock_online() routines are not at all
synchronized with CPU hotplug.  Soft-lockups detected as a consequence
of this race were reported earlier at
https://lkml.org/lkml/2011/8/24/185.  (Thanks to Cong Meng for finding
out that the root cause of this issue is the race condition between
br_write_[un]lock() and CPU hotplug, which results in the lock states
getting messed up.)

Fixing this race by just adding {get,put}_online_cpus() at appropriate
places in *_global_[un]lock_online() is not a good option, because then
suddenly br_write_[un]lock() would become blocking, whereas they have
been kept as non-blocking all this time, and we would want to keep them
that way.

So, overall, we want to ensure 3 things:

1. br_write_lock() and br_write_unlock() must remain non-blocking.

2. The corresponding lock and unlock of the per-cpu spinlocks must not
   happen for different sets of CPUs.

3. Either prevent any new CPU online operation in between this
   lock-unlock, or ensure that the newly onlined CPU does not proceed
   with its corresponding per-cpu spinlock unlocked.

To achieve all this:

(a) We introduce a new spinlock that is taken by the
    *_global_lock_online() routine and released by the
    *_global_unlock_online() routine.

(b) We register a callback for CPU hotplug notifications, and this
    callback takes the same spinlock as above.

(c) We maintain a bitmap which is close to the cpu_online_mask, and
    once it is initialized in the lock_init() code, all future updates
    to it are done in the callback, under the above spinlock.

(d) The above bitmap is used (instead of cpu_online_mask) while locking
    and unlocking the per-cpu locks.

The callback takes the spinlock upon the CPU_UP_PREPARE event.  So, if
the br_write_lock-unlock sequence is in progress, the callback keeps
spinning, thus preventing the CPU online operation till the lock-unlock
sequence is complete.  This takes care of requirement (3).

The bitmap that we maintain remains unmodified throughout the
lock-unlock sequence, since all updates to it are managed by the
callback, which takes the same spinlock as the one taken by the lock
code and released only by the unlock routine.  Combining this with (d)
above, satisfies requirement (2).

Overall, since we use a spinlock (mentioned in (a)) to prevent CPU
hotplug operations from racing with br_write_lock-unlock, requirement
(1) is also taken care of.

By the way, it is to be noted that a CPU offline operation can actually
run in parallel with our lock-unlock sequence, because our callback
doesn't react to notifications earlier than CPU_DEAD (in order to
maintain our bitmap properly).  And this means, since we use our own
bitmap (which is stale, on purpose) during the lock-unlock sequence, we
could end up unlocking the per-cpu lock of an offline CPU (because we
had locked it earlier, when the CPU was online), in order to satisfy
requirement (2).  But this is harmless, though it looks a bit awkward.

Debugged-by: Cong Meng
Signed-off-by: Srivatsa S. Bhat
Signed-off-by: Al Viro
Cc: stable@vger.kernel.org
---
 include/linux/lglock.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index f549056fb20b..87f402ccec55 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -22,6 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
+#include <linux/cpu.h>

 /* can make br locks by using local lock for read side, global lock for write */
 #define br_lock_init(name)	name##_lock_init()
@@ -72,9 +73,31 @@

 #define DEFINE_LGLOCK(name) \
 \
+ DEFINE_SPINLOCK(name##_cpu_lock); \
+ cpumask_t name##_cpus __read_mostly; \
 DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \
 DEFINE_LGLOCK_LOCKDEP(name); \
 \
+ static int \
+ name##_lg_cpu_callback(struct notifier_block *nb, \
+				unsigned long action, void *hcpu) \
+ { \
+	switch (action & ~CPU_TASKS_FROZEN) { \
+	case CPU_UP_PREPARE: \
+		spin_lock(&name##_cpu_lock); \
+		cpu_set((unsigned long)hcpu, name##_cpus); \
+		spin_unlock(&name##_cpu_lock); \
+		break; \
+	case CPU_UP_CANCELED: case CPU_DEAD: \
+		spin_lock(&name##_cpu_lock); \
+		cpu_clear((unsigned long)hcpu, name##_cpus); \
+		spin_unlock(&name##_cpu_lock); \
+	} \
+	return NOTIFY_OK; \
+ } \
+ static struct notifier_block name##_lg_cpu_notifier = { \
+	.notifier_call = name##_lg_cpu_callback, \
+ }; \
 void name##_lock_init(void) { \
 	int i; \
 	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
@@ -83,6 +106,11 @@
 		lock = &per_cpu(name##_lock, i); \
 		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \
 	} \
+	register_hotcpu_notifier(&name##_lg_cpu_notifier); \
+	get_online_cpus(); \
+	for_each_online_cpu(i) \
+		cpu_set(i, name##_cpus); \
+	put_online_cpus(); \
 } \
 EXPORT_SYMBOL(name##_lock_init); \
 \
@@ -124,9 +152,9 @@
 \
 void name##_global_lock_online(void) { \
 	int i; \
-	preempt_disable(); \
+	spin_lock(&name##_cpu_lock); \
 	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \
-	for_each_online_cpu(i) { \
+	for_each_cpu(i, &name##_cpus) { \
 		arch_spinlock_t *lock; \
 		lock = &per_cpu(name##_lock, i); \
 		arch_spin_lock(lock); \
@@ -137,12 +165,12 @@
 void name##_global_unlock_online(void) { \
 	int i; \
 	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \
-	for_each_online_cpu(i) { \
+	for_each_cpu(i, &name##_cpus) { \
 		arch_spinlock_t *lock; \
 		lock = &per_cpu(name##_lock, i); \
 		arch_spin_unlock(lock); \
 	} \
-	preempt_enable(); \
+	spin_unlock(&name##_cpu_lock); \
 } \
 EXPORT_SYMBOL(name##_global_unlock_online); \
 \
--
cgit v1.2.3
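For orientation, a hedged usage sketch of the br_* API this header defines, with a placeholder lock name (the real callers in this era live in the VFS, e.g. the mount and file-list locks); read-siders take only their own CPU's lock, while the write side sweeps all of them, which is why the lock/unlock iteration sets must match:

    #include <linux/lglock.h>

    /* DECLARE_LGLOCK(my_lglock) would go in a header for other files */
    DEFINE_LGLOCK(my_lglock);

    static int __init my_init(void)
    {
            br_lock_init(my_lglock);  /* now also registers the hotplug callback */
            return 0;
    }

    static void reader(void)
    {
            br_read_lock(my_lglock);  /* cheap: this cpu's arch_spinlock only */
            /* ... read mostly-per-cpu state ... */
            br_read_unlock(my_lglock);
    }

    static void writer(void)
    {
            br_write_lock(my_lglock); /* takes every cpu's lock, non-blocking */
            /* ... exclusive update ... */
            br_write_unlock(my_lglock);
    }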
From e688a604807647c9450f9c12a7cb6d027150a895 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 22 Dec 2011 04:15:53 +0000
Subject: net: introduce DST_NOPEER dst flag

Chris Boot reported crashes occurring in ipv6_select_ident().

[  461.457562] RIP: 0010:[]  [] ipv6_select_ident+0x31/0xa7
[  461.578229] Call Trace:
[  461.580742]
[  461.582870] [] ? udp6_ufo_fragment+0x124/0x1a2
[  461.589054] [] ? ipv6_gso_segment+0xc0/0x155
[  461.595140] [] ? skb_gso_segment+0x208/0x28b
[  461.601198] [] ? ipv6_confirm+0x146/0x15e [nf_conntrack_ipv6]
[  461.608786] [] ? nf_iterate+0x41/0x77
[  461.614227] [] ? dev_hard_start_xmit+0x357/0x543
[  461.620659] [] ? nf_hook_slow+0x73/0x111
[  461.626440] [] ? br_parse_ip_options+0x19a/0x19a [bridge]
[  461.633581] [] ? dev_queue_xmit+0x3af/0x459
[  461.639577] [] ? br_dev_queue_push_xmit+0x72/0x76 [bridge]
[  461.646887] [] ? br_nf_post_routing+0x17d/0x18f [bridge]
[  461.653997] [] ? nf_iterate+0x41/0x77
[  461.659473] [] ? br_flood+0xfa/0xfa [bridge]
[  461.665485] [] ? nf_hook_slow+0x73/0x111
[  461.671234] [] ? br_flood+0xfa/0xfa [bridge]
[  461.677299] [] ? nf_bridge_update_protocol+0x20/0x20 [bridge]
[  461.684891] [] ? nf_ct_zone+0xa/0x17 [nf_conntrack]
[  461.691520] [] ? br_flood+0xfa/0xfa [bridge]
[  461.697572] [] ? NF_HOOK.constprop.8+0x3c/0x56 [bridge]
[  461.704616] [] ? nf_bridge_push_encap_header+0x1c/0x26 [bridge]
[  461.712329] [] ? br_nf_forward_finish+0x8a/0x95 [bridge]
[  461.719490] [] ? nf_bridge_pull_encap_header+0x1c/0x27 [bridge]
[  461.727223] [] ? br_nf_forward_ip+0x1c0/0x1d4 [bridge]
[  461.734292] [] ? nf_iterate+0x41/0x77
[  461.739758] [] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.746203] [] ? nf_hook_slow+0x73/0x111
[  461.751950] [] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.758378] [] ? NF_HOOK.constprop.4+0x56/0x56 [bridge]

This is caused by the bridge netfilter special dst_entry (fake_rtable),
a shared entry where attaching an inetpeer makes no sense.  The problem
has been present since commit 87c48fa3b46 (ipv6: make fragment
identifications less predictable).

Introduce a DST_NOPEER dst flag and make sure ipv6_select_ident() and
__ip_select_ident() fall back to the 'no peer attached' handling.

Reported-by: Chris Boot
Tested-by: Chris Boot
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/dst.h         | 1 +
 net/bridge/br_netfilter.c | 2 +-
 net/ipv4/route.c          | 4 ++--
 net/ipv6/ip6_output.c     | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 6faec1a60216..75766b42660e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -53,6 +53,7 @@ struct dst_entry {
 #define DST_NOHASH		0x0008
 #define DST_NOCACHE		0x0010
 #define DST_NOCOUNT		0x0020
+#define DST_NOPEER		0x0040

 	short			error;
 	short			obsolete;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 08757dc670a4..fa8b8f763580 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -147,7 +147,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 	rt->dst.dev = br->dev;
 	rt->dst.path = &rt->dst;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
-	rt->dst.flags	= DST_NOXFRM;
+	rt->dst.flags	= DST_NOXFRM | DST_NOPEER;
 	rt->dst.ops = &fake_dst_ops;
 }

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85cc053d9d6e..94cdbc55ca7e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1367,7 +1367,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
 	struct rtable *rt = (struct rtable *) dst;

-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		if (rt->peer == NULL)
 			rt_bind_peer(rt, rt->rt_dst, 1);

@@ -1378,7 +1378,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 			iph->id = htons(inet_getid(rt->peer, more));
 			return;
 		}
-	} else
+	} else if (!rt)
 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
 		       __builtin_return_address(0));

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 84d0bd5cac93..ec562713db9b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -603,7 +603,7 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	static atomic_t ipv6_fragmentation_id;
 	int old, new;

-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		struct inet_peer *peer;

 		if (!rt->rt6i_peer)
--
cgit v1.2.3
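The shape of the fix generalizes: a statically allocated object shared by many flows must not have per-destination state (here, an inetpeer) bound to it lazily, so it opts out via a flag and every path that would bind that state tests the flag first. An illustrative standalone sketch with demo types, not kernel code:

    #include <stdbool.h>
    #include <stddef.h>

    #define DEMO_NOPEER 0x0040      /* mirrors the new DST_NOPEER bit */

    struct demo_peer;               /* per-destination state, bound lazily */

    struct demo_dst {
            unsigned short flags;
            struct demo_peer *peer; /* never bound for NOPEER entries */
    };

    static bool demo_can_bind_peer(const struct demo_dst *dst)
    {
            /* shared/fake entries take the "no peer attached" fallback */
            return dst != NULL && !(dst->flags & DEMO_NOPEER);
    }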
From 0fd7bac6b6157eed6cf0cb86a1e88ba29e57c033 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 21 Dec 2011 07:11:44 +0000
Subject: net: relax rcvbuf limits

skb->truesize might be big even for a small packet.  It's even bigger
after commit 87fb4b7b533 (net: more accurate skb truesize) and with a
big MTU.

We should allow queueing at least one packet per receiver, even with a
low RCVBUF setting.

Reported-by: Michal Simek
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/sock.h     | 4 +++-
 net/core/sock.c        | 6 +-----
 net/packet/af_packet.c | 6 ++----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index abb6e0f0c3c3..32e39371fba6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -637,12 +637,14 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)

 /*
  * Take into account size of receive queue and backlog queue
+ * Do not take into account this skb truesize,
+ * to allow even a single big packet to come.
  */
 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb)
 {
 	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

-	return qsize + skb->truesize > sk->sk_rcvbuf;
+	return qsize > sk->sk_rcvbuf;
 }

 /* The per-socket spinlock must be held here. */
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d12f5e..b23f174ab84c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -288,11 +288,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;

-	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
-	   number of warnings when compiling with -W --ANK
-	 */
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf) {
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 		atomic_inc(&sk->sk_drops);
 		trace_sock_rcvqueue_full(sk, skb);
 		return -ENOMEM;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 82a6f34d39d0..3891702b81df 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1630,8 +1630,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (snaplen > res)
 		snaplen = res;

-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto drop_n_acct;

 	if (skb_shared(skb)) {
@@ -1762,8 +1761,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (po->tp_version <= TPACKET_V2) {
 		if (macoff + snaplen > po->rx_ring.frame_size) {
 			if (po->copy_thresh &&
-			    atomic_read(&sk->sk_rmem_alloc) + skb->truesize
-			    < (unsigned)sk->sk_rcvbuf) {
+			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 				if (skb_shared(skb)) {
 					copy_skb = skb_clone(skb, GFP_ATOMIC);
 				} else {
--
cgit v1.2.3
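A quick way to poke at the setting involved from userspace; note the kernel doubles the requested value and enforces a floor, so the effective buffer printed here may differ from the request (plain C sketch, assuming Linux defaults):

    #include <stdio.h>
    #include <sys/socket.h>

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_DGRAM, 0);
            int rcvbuf = 256;   /* deliberately smaller than one skb truesize */
            socklen_t len = sizeof(rcvbuf);

            setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
            getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len);

            /* before this patch, a socket near such a tiny limit could drop
             * every packet because truesize was charged up front; after it,
             * the first queued packet is always accepted */
            printf("effective SO_RCVBUF: %d\n", rcvbuf);
            return 0;
    }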
From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 21 Dec 2011 12:28:29 +0100
Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid

Unlike all of the other cpuid bits, the TSC deadline timer bit is set
unconditionally, regardless of what userspace wants.

This is broken in several ways:

- if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the
  TSC deadline timer feature, a guest that uses the feature will break

- live migration to older host kernels that don't support the TSC
  deadline timer will cause the feature to be pulled from under the
  guest's feet; breaking it

- guests that are broken wrt the feature will fail

Fix by not enabling the feature automatically; instead report it to
userspace.  Because the feature depends on KVM_CREATE_IRQCHIP, which we
cannot guarantee will be called, we expose it via
KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID.

Fixes the Illumos guest kernel, which uses the TSC deadline timer
feature.

[avi: add the KVM_CAP + documentation]

Reported-by: Alexey Zaytsev
Tested-by: Alexey Zaytsev
Signed-off-by: Jan Kiszka
Signed-off-by: Avi Kivity
---
 Documentation/virtual/kvm/api.txt |  9 +++++++++
 arch/x86/kvm/x86.c                | 19 +++++++++----------
 include/linux/kvm.h               |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4df9af4f6132..e2a4b5287361 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination

+The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned
+as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC
+support.  Instead it is reported via
+
+  ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER)
+
+if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the
+feature in userspace, then you can enable the feature for KVM_SET_CPUID2.
+
 4.47 KVM_PPC_GET_PVINFO

 Capability: KVM_CAP_PPC_GET_PVINFO
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..4c938da2ba00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u32 timer_mode_mask;

 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	if (!best)
@@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 		best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}

-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-	    best->function == 0x1) {
-		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
-		timer_mode_mask = 3 << 17;
-	} else
-		timer_mode_mask = 1 << 17;
-
-	if (apic)
-		apic->lapic_timer.timer_mode_mask = timer_mode_mask;
+	if (apic) {
+		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+	}
 }

 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
+	case KVM_CAP_TSC_DEADLINE_TIMER:
+		r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
+		break;
 	default:
 		r = 0;
 		break;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c3892fc1d538..68e67e50d028 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_TSC_DEADLINE_TIMER 72

 #ifdef KVM_CAP_IRQ_ROUTING
--
cgit v1.2.3
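Following the documentation added above, a userspace sketch of how a VMM would probe the capability before setting the CPUID bit for its guest (requires a kernel with this patch; error handling omitted):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int has_tdt = ioctl(kvm, KVM_CHECK_EXTENSION,
                                KVM_CAP_TSC_DEADLINE_TIMER);

            printf("TSC deadline timer %savailable\n",
                   has_tdt > 0 ? "" : "not ");

            /* only if has_tdt > 0 and KVM_CREATE_IRQCHIP is used (or the
             * feature is emulated in userspace) may the VMM set leaf 1
             * ecx bit 24 in the entries passed to KVM_SET_CPUID2 */
            return 0;
    }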
From 34845636a184f3be91a531098192592cbe6db587 Mon Sep 17 00:00:00 2001
From: Andreas Schwab
Date: Wed, 28 Dec 2011 15:57:15 -0800
Subject: procfs: do not confuse jiffies with cputime64_t

Commit 2a95ea6c0d129b4 ("procfs: do not overflow get_{idle,iowait}_time
for nohz") did not take into account that on some architectures jiffies
and cputime use different units.

This causes get_idle_time() to return numbers in the wrong units, making
the idle time fields in /proc/stat wrong.

Instead of converting the usec value returned by
get_cpu_{idle,iowait}_time_us to units of jiffies, use the new function
usecs_to_cputime64 to convert it to the correct unit of cputime64_t.

Signed-off-by: Andreas Schwab
Acked-by: Michal Hocko
Cc: Arnd Bergmann
Cc: "Artem S. Tashkinov"
Cc: Dave Jones
Cc: Alexey Dobriyan
Cc: Thomas Gleixner
Cc: "Luck, Tony"
Cc: Benjamin Herrenschmidt
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/ia64/include/asm/cputime.h    | 1 +
 arch/powerpc/include/asm/cputime.h | 2 ++
 arch/s390/include/asm/cputime.h    | 2 ++
 fs/proc/stat.c                     | 4 ++--
 include/asm-generic/cputime.h      | 1 +
 5 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 6073b187528a..5a274af31b2b 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -60,6 +60,7 @@ typedef u64 cputime64_t;
  */
 #define cputime_to_usecs(__ct)		((__ct) / NSEC_PER_USEC)
 #define usecs_to_cputime(__usecs)	((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs)	usecs_to_cputime(__usecs)

 /*
  * Convert cputime <-> seconds
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 1cf20bdfbeca..98b7c4b49c9d 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -150,6 +150,8 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
 	return ct;
 }

+#define usecs_to_cputime64(us)		usecs_to_cputime(us)
+
 /*
  * Convert cputime <-> seconds
 */
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 081434878296..b9acaaa175d8 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -87,6 +87,8 @@ usecs_to_cputime(const unsigned int m)
 	return (cputime_t) m * 4096;
 }

+#define usecs_to_cputime64(m)		usecs_to_cputime(m)
+
 /*
  * Convert cputime to milliseconds and back.
  */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 2a30d67dd6b8..0855e6f20391 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
 		idle = kstat_cpu(cpu).cpustat.idle;
 		idle = cputime64_add(idle, arch_idle_time(cpu));
 	} else
-		idle = nsecs_to_jiffies64(1000 * idle_time);
+		idle = usecs_to_cputime64(idle_time);

 	return idle;
 }
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
 		/* !NO_HZ so we can rely on cpustat.iowait */
 		iowait = kstat_cpu(cpu).cpustat.iowait;
 	else
-		iowait = nsecs_to_jiffies64(1000 * iowait_time);
+		iowait = usecs_to_cputime64(iowait_time);

 	return iowait;
 }
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 62ce6823c0f2..12a1764f612b 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -40,6 +40,7 @@ typedef u64 cputime64_t;
  */
 #define cputime_to_usecs(__ct)		jiffies_to_usecs(__ct)
 #define usecs_to_cputime(__msecs)	usecs_to_jiffies(__msecs)
+#define usecs_to_cputime64(__msecs)	nsecs_to_jiffies64((__msecs) * 1000)

 /*
  * Convert cputime to seconds and back.
--
cgit v1.2.3
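The unit mismatch is easy to see with concrete numbers: per the ia64 hunk above, cputime there is kept in nanoseconds, while jiffies tick at HZ. A standalone sketch with an assumed HZ of 100 (plain C, not kernel code):

    #include <stdio.h>

    #define HZ 100
    #define NSEC_PER_USEC 1000ULL

    int main(void)
    {
            /* 5 seconds of idle time reported by NO_HZ, in microseconds */
            unsigned long long idle_us = 5ULL * 1000 * 1000;

            /* same interval in the two units */
            unsigned long long as_jiffies =
                    idle_us / (1000000ULL / HZ);            /* 500 */
            unsigned long long as_ia64_cputime =
                    idle_us * NSEC_PER_USEC;                /* 5e9 */

            /* adding a jiffies-valued number into a nanosecond-valued sum
             * (what the old code did) is off by a factor of 10^7 here */
            printf("jiffies: %llu, ia64 cputime: %llu\n",
                   as_jiffies, as_ia64_cputime);
            return 0;
    }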
From 52793dbe3d60bd73bbebe28b2bfc9f6b4b920d4c Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Fri, 30 Dec 2011 14:19:02 +0900
Subject: ipvs: try also real server with port 0 in backup server

We should not forget to try also the real server with port 0 in the
backup server when processing the sync message.  We should do it in
all cases because the backup server can use a different forwarding
method.

Signed-off-by: Julian Anastasov
Signed-off-by: Simon Horman
Signed-off-by: Pablo Neira Ayuso
---
 include/net/ip_vs.h             |  2 +-
 net/netfilter/ipvs/ip_vs_conn.c |  2 +-
 net/netfilter/ipvs/ip_vs_ctl.c  | 10 ++++++++--
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 873d5be7926c..e5a7b9aaf552 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1207,7 +1207,7 @@ extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
 ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
 		__be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
-		__u16 protocol, __u32 fwmark);
+		__u16 protocol, __u32 fwmark, __u32 flags);

 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 12571fb2881c..29fa5badde75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -616,7 +616,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	if ((cp) && (!cp->dest)) {
 		dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
 				       cp->dport, &cp->vaddr, cp->vport,
-				       cp->protocol, cp->fwmark);
+				       cp->protocol, cp->fwmark, cp->flags);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 008bf97cc91a..e1a66cf37f9a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -619,15 +619,21 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
 				   const union nf_inet_addr *daddr,
 				   __be16 dport,
 				   const union nf_inet_addr *vaddr,
-				   __be16 vport, __u16 protocol, __u32 fwmark)
+				   __be16 vport, __u16 protocol, __u32 fwmark,
+				   __u32 flags)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
+	__be16 port = dport;

 	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
-	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+		port = 0;
+	dest = ip_vs_lookup_dest(svc, daddr, port);
+	if (!dest)
+		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
 	if (dest)
 		atomic_inc(&dest->refcnt);
 	ip_vs_service_put(svc);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3cdd479f9b5d..2b6678c0ce14 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -740,7 +740,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 	 * but still handled.
 	 */
 	dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
-			       param->vport, protocol, fwmark);
+			       param->vport, protocol, fwmark, flags);

 	/*  Set the approprite ativity flag */
 	if (protocol == IPPROTO_TCP) {
--
cgit v1.2.3
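One subtlety the log message leaves implicit: the retry in ip_vs_find_dest() uses port ^ dport, so whichever of {0, dport} was tried first, the second lookup tries the other one. A standalone check of that identity (plain C):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint16_t dport = 8080;

            /* fwmark service, non-MASQ forwarding: port 0 is tried first */
            uint16_t first = 0;
            assert((uint16_t)(first ^ dport) == dport); /* retry: real port */

            /* all other cases: the real port is tried first */
            first = dport;
            assert((uint16_t)(first ^ dport) == 0);     /* retry: port 0 */
            return 0;
    }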
From 30e053248da178cf6154bb7e950dc8713567e3fa Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 3 Jan 2012 13:14:29 +0100
Subject: security: Fix security_old_inode_init_security() when CONFIG_SECURITY is not set

Commit 1e39f384bb01 ("evm: fix build problems") makes the stub version
of security_old_inode_init_security() return 0 when CONFIG_SECURITY is
not set.  But that makes callers such as reiserfs_security_init()
assume that security_old_inode_init_security() has set the name, value,
and len arguments properly, while security_old_inode_init_security()
left them uninitialized, which then results in interesting failures.

Revert security_old_inode_init_security() to the old behavior of
returning EOPNOTSUPP, since both callers (reiserfs and ocfs2) handle
this just fine.

[ Also fixed the S_PRIVATE(inode) case of the actual non-stub
  security_old_inode_init_security() function to return EOPNOTSUPP for
  the same reason, as pointed out by Mimi Zohar.

  It got incorrectly changed to match the new function in commit
  fb88c2b6cbb1: "evm: fix security/security_old_init_security return
  code".  - Linus ]

Reported-by: Jorge Bastos
Acked-by: James Morris
Acked-by: Mimi Zohar
Signed-off-by: Jan Kara
Signed-off-by: Linus Torvalds
---
 include/linux/security.h | 2 +-
 security/security.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index 19d8e04e1688..e8c619d39291 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2056,7 +2056,7 @@ static inline int security_old_inode_init_security(struct inode *inode,
 						     char **name, void **value,
 						     size_t *len)
 {
-	return 0;
+	return -EOPNOTSUPP;
 }

 static inline int security_inode_create(struct inode *dir,
diff --git a/security/security.c b/security/security.c
index 0c6cc69c8f86..e2f684aeb70c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -381,7 +381,7 @@ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
 				     void **value, size_t *len)
 {
 	if (unlikely(IS_PRIVATE(inode)))
-		return 0;
+		return -EOPNOTSUPP;
 	return security_ops->inode_init_security(inode, dir, qstr, name, value,
 						 len);
 }
--
cgit v1.2.3
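A hedged sketch of the caller pattern the revert preserves, modeled loosely on the reiserfs/ocfs2 usage: -EOPNOTSUPP means "no security xattr to install" and is the only error a caller may ignore, precisely because the out parameters are untouched on every error path (demo_write_xattr is a hypothetical helper):

    #include <linux/security.h>
    #include <linux/slab.h>

    static int demo_write_xattr(struct inode *inode, const char *name,
                                const void *value, size_t len); /* hypothetical */

    static int demo_init_security(struct inode *inode, struct inode *dir,
                                  const struct qstr *qstr)
    {
            char *name;
            void *value;
            size_t len;
            int err;

            err = security_old_inode_init_security(inode, dir, qstr,
                                                   &name, &value, &len);
            if (err) {
                    /* "nothing to do" must stay distinguishable from
                     * success: name/value/len are untouched here */
                    return err == -EOPNOTSUPP ? 0 : err;
            }

            /* name/value/len are valid only on the success path */
            err = demo_write_xattr(inode, name, value, len);
            kfree(name);
            kfree(value);
            return err;
    }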