diff options
| author | Rafael J. Wysocki <rjw@sisk.pl> | 2011-12-21 21:59:45 +0100 | 
|---|---|---|
| committer | Rafael J. Wysocki <rjw@sisk.pl> | 2011-12-21 21:59:45 +0100 | 
| commit | b00f4dc5ff022cb9cbaffd376d9454d7fa1e496f (patch) | |
| tree | 40f1b232e2f1e8ac365317a14fdcbcb331722b46 /fs/ocfs2/stack_o2cb.c | |
| parent | 1eac8111e0763853266a171ce11214da3a347a0a (diff) | |
| parent | b9e26dfdad5a4f9cbdaacafac6998614cc9c41bc (diff) | |
Merge branch 'master' into pm-sleep
* master: (848 commits)
  SELinux: Fix RCU deref check warning in sel_netport_insert()
  binary_sysctl(): fix memory leak
  mm/vmalloc.c: remove static declaration of va from __get_vm_area_node
  ipmi_watchdog: restore settings when BMC reset
  oom: fix integer overflow of points in oom_badness
  memcg: keep root group unchanged if creation fails
  nilfs2: potential integer overflow in nilfs_ioctl_clean_segments()
  nilfs2: unbreak compat ioctl
  cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask
  evm: prevent racing during tfm allocation
  evm: key must be set once during initialization
  mmc: vub300: fix type of firmware_rom_wait_states module parameter
  Revert "mmc: enable runtime PM by default"
  mmc: sdhci: remove "state" argument from sdhci_suspend_host
  x86, dumpstack: Fix code bytes breakage due to missing KERN_CONT
  IB/qib: Correct sense on freectxts increment and decrement
  RDMA/cma: Verify private data length
  cgroups: fix a css_set not found bug in cgroup_attach_proc
  oprofile: Fix uninitialized memory access when writing to writing to oprofilefs
  Revert "xen/pv-on-hvm kexec: add xs_reset_watches to shutdown watches from old kernel"
  ...
Conflicts:
	kernel/cgroup_freezer.c
Diffstat (limited to 'fs/ocfs2/stack_o2cb.c')
| -rw-r--r-- | fs/ocfs2/stack_o2cb.c | 71 | 
1 files changed, 63 insertions, 8 deletions
| diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 19965b00c43c..94368017edb3 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -28,6 +28,7 @@  #include "cluster/masklog.h"  #include "cluster/nodemanager.h"  #include "cluster/heartbeat.h" +#include "cluster/tcp.h"  #include "stackglue.h" @@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)  }  /* + * Check if this node is heartbeating and is connected to all other + * heartbeating nodes. + */ +static int o2cb_cluster_check(void) +{ +	u8 node_num; +	int i; +	unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +	unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + +	node_num = o2nm_this_node(); +	if (node_num == O2NM_MAX_NODES) { +		printk(KERN_ERR "o2cb: This node has not been configured.\n"); +		return -EINVAL; +	} + +	/* +	 * o2dlm expects o2net sockets to be created. If not, then +	 * dlm_join_domain() fails with a stack of errors which are both cryptic +	 * and incomplete. The idea here is to detect upfront whether we have +	 * managed to connect to all nodes or not. If not, then list the nodes +	 * to allow the user to check the configuration (incorrect IP, firewall, +	 * etc.) Yes, this is racy. But its not the end of the world. +	 */ +#define	O2CB_MAP_STABILIZE_COUNT	60 +	for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { +		o2hb_fill_node_map(hbmap, sizeof(hbmap)); +		if (!test_bit(node_num, hbmap)) { +			printk(KERN_ERR "o2cb: %s heartbeat has not been " +			       "started.\n", (o2hb_global_heartbeat_active() ? +					      "Global" : "Local")); +			return -EINVAL; +		} +		o2net_fill_node_map(netmap, sizeof(netmap)); +		/* Force set the current node to allow easy compare */ +		set_bit(node_num, netmap); +		if (!memcmp(hbmap, netmap, sizeof(hbmap))) +			return 0; +		if (i < O2CB_MAP_STABILIZE_COUNT) +			msleep(1000); +	} + +	printk(KERN_ERR "o2cb: This node could not connect to nodes:"); +	i = -1; +	while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, +				  i + 1)) < O2NM_MAX_NODES) { +		if (!test_bit(i, netmap)) +			printk(" %u", i); +	} +	printk(".\n"); + +	return -ENOTCONN; +} + +/*   * Called from the dlm when it's about to evict a node. This is how the   * classic stack signals node death.   */ @@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)  {  	struct ocfs2_cluster_connection *conn = data; -	mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", -	     node_num, conn->cc_namelen, conn->cc_name); +	printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", +	       node_num, conn->cc_namelen, conn->cc_name);  	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);  } @@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)  	BUG_ON(conn == NULL);  	BUG_ON(conn->cc_proto == NULL); -	/* for now we only have one cluster/node, make sure we see it -	 * in the heartbeat universe */ -	if (!o2hb_check_local_node_heartbeating()) { -		if (o2hb_global_heartbeat_active()) -			mlog(ML_ERROR, "Global heartbeat not started\n"); -		rc = -EINVAL; +	/* Ensure cluster stack is up and all nodes are connected */ +	rc = o2cb_cluster_check(); +	if (rc) { +		printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " +		       "before retrying.\n");  		goto out;  	} | 
