diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
| -rw-r--r-- | fs/btrfs/volumes.c | 506 | 
1 file changed, 225 insertions, 281 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index be3fc701f389..e034ad9e23b4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 1,  		.devs_increment	= 2,  		.ncopies	= 2, +		.raid_name	= "raid10", +		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10, +		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,  	},  	[BTRFS_RAID_RAID1] = {  		.sub_stripes	= 1, @@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 1,  		.devs_increment	= 2,  		.ncopies	= 2, +		.raid_name	= "raid1", +		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1, +		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,  	},  	[BTRFS_RAID_DUP] = {  		.sub_stripes	= 1, @@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 2, +		.raid_name	= "dup", +		.bg_flag	= BTRFS_BLOCK_GROUP_DUP, +		.mindev_error	= 0,  	},  	[BTRFS_RAID_RAID0] = {  		.sub_stripes	= 1, @@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 1, +		.raid_name	= "raid0", +		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0, +		.mindev_error	= 0,  	},  	[BTRFS_RAID_SINGLE] = {  		.sub_stripes	= 1, @@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 0,  		.devs_increment	= 1,  		.ncopies	= 1, +		.raid_name	= "single", +		.bg_flag	= 0, +		.mindev_error	= 0,  	},  	[BTRFS_RAID_RAID5] = {  		.sub_stripes	= 1, @@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 1,  		.devs_increment	= 1,  		.ncopies	= 2, +		.raid_name	= "raid5", +		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5, +		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,  	},  	[BTRFS_RAID_RAID6] = {  		.sub_stripes	= 1, @@ -94,33 
+112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {  		.tolerated_failures = 2,  		.devs_increment	= 1,  		.ncopies	= 3, +		.raid_name	= "raid6", +		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6, +		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,  	},  }; -const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { -	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, -	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1, -	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP, -	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0, -	[BTRFS_RAID_SINGLE] = 0, -	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5, -	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6, -}; +const char *get_raid_name(enum btrfs_raid_types type) +{ +	if (type >= BTRFS_NR_RAID_TYPES) +		return NULL; -/* - * Table to convert BTRFS_RAID_* to the error code if minimum number of devices - * condition is not met. Zero means there's no corresponding - * BTRFS_ERROR_DEV_*_NOT_MET value. - */ -const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { -	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, -	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, -	[BTRFS_RAID_DUP]    = 0, -	[BTRFS_RAID_RAID0]  = 0, -	[BTRFS_RAID_SINGLE] = 0, -	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, -	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -}; +	return btrfs_raid_array[type].raid_name; +}  static int init_first_rw_device(struct btrfs_trans_handle *trans,  				struct btrfs_fs_info *fs_info); @@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,   * may be used to exclude some operations from running concurrently without any   * modifications to the list (see write_all_supers)   * - * volume_mutex - * ------------ - * coarse lock owned by a mounted filesystem; used to exclude some operations - * that cannot run in parallel and affect the higher-level properties of the - * filesystem like: device add/deleting/resize/replace, or balance - *   * balance_mutex   * 
-------------   * protects balance structures (status, state) and context accessed from @@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,   *     device_list_mutex   *       chunk_mutex   *     balance_mutex + * + * + * Exclusive operations, BTRFS_FS_EXCL_OP + * ====================================== + * + * Maintains the exclusivity of the following operations that apply to the + * whole filesystem and cannot run in parallel. + * + * - Balance (*) + * - Device add + * - Device remove + * - Device replace (*) + * - Resize + * + * The device operations (as above) can be in one of the following states: + * + * - Running state + * - Paused state + * - Completed state + * + * Only device operations marked with (*) can go into the Paused state for the + * following reasons: + * + * - ioctl (only Balance can be Paused through ioctl) + * - filesystem remounted as read-only + * - filesystem unmounted and mounted as read-only + * - system power-cycle and filesystem mounted as read-only + * - filesystem or device errors leading to forced read-only + * + * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. + * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * A device operation in Paused or Running state can be canceled or resumed + * either by ioctl (Balance only) or when remounted as read-write. + * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * completed.   
*/  DEFINE_MUTEX(uuid_mutex); @@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)  	INIT_LIST_HEAD(&fs_devs->devices);  	INIT_LIST_HEAD(&fs_devs->resized_devices);  	INIT_LIST_HEAD(&fs_devs->alloc_list); -	INIT_LIST_HEAD(&fs_devs->list); +	INIT_LIST_HEAD(&fs_devs->fs_list);  	if (fsid)  		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);  	return fs_devs;  } -static void free_device(struct btrfs_device *device) +void btrfs_free_device(struct btrfs_device *device)  {  	rcu_string_free(device->name);  	bio_put(device->flush_bio); @@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)  		device = list_entry(fs_devices->devices.next,  				    struct btrfs_device, dev_list);  		list_del(&device->dev_list); -		free_device(device); +		btrfs_free_device(device);  	}  	kfree(fs_devices);  } @@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void)  	while (!list_empty(&fs_uuids)) {  		fs_devices = list_entry(fs_uuids.next, -					struct btrfs_fs_devices, list); -		list_del(&fs_devices->list); +					struct btrfs_fs_devices, fs_list); +		list_del(&fs_devices->fs_list);  		free_fs_devices(fs_devices);  	}  } @@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void)  /*   * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.   * Returned struct is not linked onto any lists and must be destroyed using - * free_device. + * btrfs_free_device.   
*/  static struct btrfs_device *__alloc_device(void)  { @@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void)  static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,  		u64 devid, const u8 *uuid)  { -	struct list_head *head = &fs_devices->devices;  	struct btrfs_device *dev; -	list_for_each_entry(dev, head, dev_list) { +	list_for_each_entry(dev, &fs_devices->devices, dev_list) {  		if (dev->devid == devid &&  		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {  			return dev; @@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)  {  	struct btrfs_fs_devices *fs_devices; -	list_for_each_entry(fs_devices, &fs_uuids, list) { +	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {  		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)  			return fs_devices;  	} @@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path,  	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;  	struct btrfs_device *dev, *tmp_dev; -	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { +	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {  		if (fs_devs->opened)  			continue; @@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path,  			/* delete the stale device */  			if (fs_devs->num_devices == 1) {  				btrfs_sysfs_remove_fsid(fs_devs); -				list_del(&fs_devs->list); +				list_del(&fs_devs->fs_list);  				free_fs_devices(fs_devs);  				break;  			} else {  				fs_devs->num_devices--;  				list_del(&dev->dev_list); -				free_device(dev); +				btrfs_free_device(dev);  			}  		}  	} @@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,  		if (IS_ERR(fs_devices))  			return ERR_CAST(fs_devices); -		list_add(&fs_devices->list, &fs_uuids); +		list_add(&fs_devices->fs_list, &fs_uuids);  		device = NULL;  	} else { @@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,  		name = 
rcu_string_strdup(path, GFP_NOFS);  		if (!name) { -			free_device(device); +			btrfs_free_device(device);  			return ERR_PTR(-ENOMEM);  		}  		rcu_assign_pointer(device->name, name); @@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)  			name = rcu_string_strdup(orig_dev->name->str,  					GFP_KERNEL);  			if (!name) { -				free_device(device); +				btrfs_free_device(device);  				goto error;  			}  			rcu_assign_pointer(device->name, name); @@ -938,7 +970,7 @@ again:  		}  		list_del_init(&device->dev_list);  		fs_devices->num_devices--; -		free_device(device); +		btrfs_free_device(device);  	}  	if (fs_devices->seed) { @@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head)  	struct btrfs_device *device;  	device = container_of(head, struct btrfs_device, rcu); -	free_device(device); +	btrfs_free_device(device);  }  static void btrfs_close_bdev(struct btrfs_device *device) @@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)  	new_device->fs_devices = device->fs_devices;  } -static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +static int close_fs_devices(struct btrfs_fs_devices *fs_devices)  {  	struct btrfs_device *device, *tmp;  	struct list_head pending_put; @@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  	int ret;  	mutex_lock(&uuid_mutex); -	ret = __btrfs_close_devices(fs_devices); +	ret = close_fs_devices(fs_devices);  	if (!fs_devices->opened) {  		seed_devices = fs_devices->seed;  		fs_devices->seed = NULL; @@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  	while (seed_devices) {  		fs_devices = seed_devices;  		seed_devices = fs_devices->seed; -		__btrfs_close_devices(fs_devices); +		close_fs_devices(fs_devices);  		free_fs_devices(fs_devices);  	}  	return ret;  } -static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, +static int 
open_fs_devices(struct btrfs_fs_devices *fs_devices,  				fmode_t flags, void *holder)  { -	struct list_head *head = &fs_devices->devices;  	struct btrfs_device *device;  	struct btrfs_device *latest_dev = NULL;  	int ret = 0;  	flags |= FMODE_EXCL; -	list_for_each_entry(device, head, dev_list) { +	list_for_each_entry(device, &fs_devices->devices, dev_list) {  		/* Just open everything we can; ignore failures here */  		if (btrfs_open_one_device(fs_devices, device, flags, holder))  			continue; @@ -1115,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  {  	int ret; -	mutex_lock(&uuid_mutex); +	mutex_lock(&fs_devices->device_list_mutex);  	if (fs_devices->opened) {  		fs_devices->opened++;  		ret = 0;  	} else {  		list_sort(NULL, &fs_devices->devices, devid_cmp); -		ret = __btrfs_open_devices(fs_devices, flags, holder); +		ret = open_fs_devices(fs_devices, flags, holder);  	} -	mutex_unlock(&uuid_mutex); +	mutex_unlock(&fs_devices->device_list_mutex); +  	return ret;  } @@ -1201,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  	 */  	bytenr = btrfs_sb_offset(0);  	flags |= FMODE_EXCL; -	mutex_lock(&uuid_mutex);  	bdev = blkdev_get_by_path(path, flags, holder); -	if (IS_ERR(bdev)) { -		ret = PTR_ERR(bdev); -		goto error; -	} +	if (IS_ERR(bdev)) +		return PTR_ERR(bdev);  	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {  		ret = -EINVAL;  		goto error_bdev_put;  	} +	mutex_lock(&uuid_mutex);  	device = device_list_add(path, disk_super);  	if (IS_ERR(device))  		ret = PTR_ERR(device);  	else  		*fs_devices_ret = device->fs_devices; +	mutex_unlock(&uuid_mutex);  	btrfs_release_disk_super(page);  error_bdev_put:  	blkdev_put(bdev, flags); -error: -	mutex_unlock(&uuid_mutex); +  	return ret;  } @@ -1857,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,  	} while (read_seqretry(&fs_info->profiles_lock, seq));  	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { -		
if (!(all_avail & btrfs_raid_group[i])) +		if (!(all_avail & btrfs_raid_array[i].bg_flag))  			continue;  		if (num_devices < btrfs_raid_array[i].devs_min) { -			int ret = btrfs_raid_mindev_error[i]; +			int ret = btrfs_raid_array[i].mindev_error;  			if (ret)  				return ret; @@ -1917,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  {  	struct btrfs_device *device;  	struct btrfs_fs_devices *cur_devices; +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;  	u64 num_devices;  	int ret = 0; -	mutex_lock(&fs_info->volume_mutex);  	mutex_lock(&uuid_mutex); -	num_devices = fs_info->fs_devices->num_devices; +	num_devices = fs_devices->num_devices;  	btrfs_dev_replace_read_lock(&fs_info->dev_replace);  	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {  		WARN_ON(num_devices < 1); @@ -1986,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  	 * (super_copy) should hold the device list mutex.  	 */ +	/* +	 * In normal cases the cur_devices == fs_devices. But in case +	 * of deleting a seed device, the cur_devices should point to +	 * its own fs_devices listed under the fs_devices->seed. 
+	 */  	cur_devices = device->fs_devices; -	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	mutex_lock(&fs_devices->device_list_mutex);  	list_del_rcu(&device->dev_list); -	device->fs_devices->num_devices--; -	device->fs_devices->total_devices--; +	cur_devices->num_devices--; +	cur_devices->total_devices--;  	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) -		device->fs_devices->missing_devices--; +		cur_devices->missing_devices--;  	btrfs_assign_next_active_device(fs_info, device, NULL);  	if (device->bdev) { -		device->fs_devices->open_devices--; +		cur_devices->open_devices--;  		/* remove sysfs entry */ -		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); +		btrfs_sysfs_rm_device_link(fs_devices, device);  	}  	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;  	btrfs_set_super_num_devices(fs_info->super_copy, num_devices); -	mutex_unlock(&fs_info->fs_devices->device_list_mutex); +	mutex_unlock(&fs_devices->device_list_mutex);  	/*  	 * at this point, the device is zero sized and detached from @@ -2020,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  	call_rcu(&device->rcu, free_device_rcu);  	if (cur_devices->open_devices == 0) { -		struct btrfs_fs_devices *fs_devices; -		fs_devices = fs_info->fs_devices;  		while (fs_devices) {  			if (fs_devices->seed == cur_devices) {  				fs_devices->seed = cur_devices->seed; @@ -2030,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  			fs_devices = fs_devices->seed;  		}  		cur_devices->seed = NULL; -		__btrfs_close_devices(cur_devices); +		close_fs_devices(cur_devices);  		free_fs_devices(cur_devices);  	}  out:  	mutex_unlock(&uuid_mutex); -	mutex_unlock(&fs_info->volume_mutex);  	return ret;  error_undo:  	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {  		mutex_lock(&fs_info->chunk_mutex);  		list_add(&device->dev_alloc_list, -			 &fs_info->fs_devices->alloc_list); +			 
&fs_devices->alloc_list);  		device->fs_devices->rw_devices++;  		mutex_unlock(&fs_info->chunk_mutex);  	} @@ -2112,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,  			tmp_fs_devices = tmp_fs_devices->seed;  		}  		fs_devices->seed = NULL; -		__btrfs_close_devices(fs_devices); +		close_fs_devices(fs_devices);  		free_fs_devices(fs_devices);  	}  } @@ -2120,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,  void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,  				      struct btrfs_device *tgtdev)  { -	mutex_lock(&uuid_mutex); +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; +  	WARN_ON(!tgtdev); -	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	mutex_lock(&fs_devices->device_list_mutex); -	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); +	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);  	if (tgtdev->bdev) -		fs_info->fs_devices->open_devices--; +		fs_devices->open_devices--; -	fs_info->fs_devices->num_devices--; +	fs_devices->num_devices--;  	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);  	list_del_rcu(&tgtdev->dev_list); -	mutex_unlock(&fs_info->fs_devices->device_list_mutex); -	mutex_unlock(&uuid_mutex); +	mutex_unlock(&fs_devices->device_list_mutex);  	/*  	 * The update_dev_time() with in btrfs_scratch_superblocks() @@ -2188,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,  		struct btrfs_device *tmp;  		devices = &fs_info->fs_devices->devices; -		/* -		 * It is safe to read the devices since the volume_mutex -		 * is held by the caller. 
-		 */  		list_for_each_entry(tmp, devices, dev_list) {  			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,  					&tmp->dev_state) && !tmp->bdev) { @@ -2259,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)  		return PTR_ERR(old_devices);  	} -	list_add(&old_devices->list, &fs_uuids); +	list_add(&old_devices->fs_list, &fs_uuids);  	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));  	seed_devices->opened = 1; @@ -2570,7 +2598,7 @@ error_trans:  	if (trans)  		btrfs_end_transaction(trans);  error_free_device: -	free_device(device); +	btrfs_free_device(device);  error:  	blkdev_put(bdev, FMODE_EXCL);  	if (seeding_dev && !unlocked) { @@ -2580,99 +2608,6 @@ error:  	return ret;  } -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, -				  const char *device_path, -				  struct btrfs_device *srcdev, -				  struct btrfs_device **device_out) -{ -	struct btrfs_device *device; -	struct block_device *bdev; -	struct list_head *devices; -	struct rcu_string *name; -	u64 devid = BTRFS_DEV_REPLACE_DEVID; -	int ret = 0; - -	*device_out = NULL; -	if (fs_info->fs_devices->seeding) { -		btrfs_err(fs_info, "the filesystem is a seed filesystem!"); -		return -EINVAL; -	} - -	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, -				  fs_info->bdev_holder); -	if (IS_ERR(bdev)) { -		btrfs_err(fs_info, "target device %s is invalid!", device_path); -		return PTR_ERR(bdev); -	} - -	filemap_write_and_wait(bdev->bd_inode->i_mapping); - -	devices = &fs_info->fs_devices->devices; -	list_for_each_entry(device, devices, dev_list) { -		if (device->bdev == bdev) { -			btrfs_err(fs_info, -				  "target device is in the filesystem!"); -			ret = -EEXIST; -			goto error; -		} -	} - - -	if (i_size_read(bdev->bd_inode) < -	    btrfs_device_get_total_bytes(srcdev)) { -		btrfs_err(fs_info, -			  "target device is smaller than source device!"); -		ret = -EINVAL; -		goto error; -	} - - -	device = btrfs_alloc_device(NULL, &devid, NULL); -	if 
(IS_ERR(device)) { -		ret = PTR_ERR(device); -		goto error; -	} - -	name = rcu_string_strdup(device_path, GFP_KERNEL); -	if (!name) { -		free_device(device); -		ret = -ENOMEM; -		goto error; -	} -	rcu_assign_pointer(device->name, name); - -	mutex_lock(&fs_info->fs_devices->device_list_mutex); -	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); -	device->generation = 0; -	device->io_width = fs_info->sectorsize; -	device->io_align = fs_info->sectorsize; -	device->sector_size = fs_info->sectorsize; -	device->total_bytes = btrfs_device_get_total_bytes(srcdev); -	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); -	device->bytes_used = btrfs_device_get_bytes_used(srcdev); -	device->commit_total_bytes = srcdev->commit_total_bytes; -	device->commit_bytes_used = device->bytes_used; -	device->fs_info = fs_info; -	device->bdev = bdev; -	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); -	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); -	device->mode = FMODE_EXCL; -	device->dev_stats_valid = 1; -	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); -	device->fs_devices = fs_info->fs_devices; -	list_add(&device->dev_list, &fs_info->fs_devices->devices); -	fs_info->fs_devices->num_devices++; -	fs_info->fs_devices->open_devices++; -	mutex_unlock(&fs_info->fs_devices->device_list_mutex); - -	*device_out = device; -	return ret; - -error: -	blkdev_put(bdev, FMODE_EXCL); -	return ret; -} -  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,  					struct btrfs_device *device)  { @@ -3273,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl)  }  /* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper.  Same goes for unset_balance_control. + * Clear the balance status in fs_info and delete the balance item from disk.   
*/ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ -	struct btrfs_fs_info *fs_info = bctl->fs_info; - -	BUG_ON(fs_info->balance_ctl); - -	spin_lock(&fs_info->balance_lock); -	fs_info->balance_ctl = bctl; -	spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) +static void reset_balance_state(struct btrfs_fs_info *fs_info)  {  	struct btrfs_balance_control *bctl = fs_info->balance_ctl; +	int ret;  	BUG_ON(!fs_info->balance_ctl); @@ -3299,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)  	spin_unlock(&fs_info->balance_lock);  	kfree(bctl); +	ret = del_balance_item(fs_info); +	if (ret) +		btrfs_handle_fs_error(fs_info, ret, NULL);  }  /* @@ -3835,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)  		 atomic_read(&fs_info->balance_cancel_req) == 0);  } -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ -	int ret; - -	unset_balance_control(fs_info); -	ret = del_balance_item(fs_info); -	if (ret) -		btrfs_handle_fs_error(fs_info, ret, NULL); - -	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); -} -  /* Non-zero return value signifies invalidity */  static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,  		u64 allowed) @@ -3857,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,  }  /* - * Should be called with both balance and volume mutexes held + * Should be called with balance mutex held   */ -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, +		  struct btrfs_balance_control *bctl,  		  struct btrfs_ioctl_balance_args *bargs)  { -	struct btrfs_fs_info *fs_info = bctl->fs_info;  	u64 meta_target, data_target;  	u64 allowed;  	int mixed = 0; @@ -3891,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||  		    memcmp(&bctl->data, &bctl->meta, 
sizeof(bctl->data))) {  			btrfs_err(fs_info, -				  "with mixed groups data and metadata balance options must be the same"); +	  "balance: mixed groups data and metadata options must be the same");  			ret = -EINVAL;  			goto out;  		} @@ -3913,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |  			    BTRFS_BLOCK_GROUP_RAID6);  	if (validate_convert_profile(&bctl->data, allowed)) { +		int index = btrfs_bg_flags_to_raid_index(bctl->data.target); +  		btrfs_err(fs_info, -			  "unable to start balance with target data profile %llu", -			  bctl->data.target); +			  "balance: invalid convert data profile %s", +			  get_raid_name(index));  		ret = -EINVAL;  		goto out;  	}  	if (validate_convert_profile(&bctl->meta, allowed)) { +		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); +  		btrfs_err(fs_info, -			  "unable to start balance with target metadata profile %llu", -			  bctl->meta.target); +			  "balance: invalid convert metadata profile %s", +			  get_raid_name(index));  		ret = -EINVAL;  		goto out;  	}  	if (validate_convert_profile(&bctl->sys, allowed)) { +		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); +  		btrfs_err(fs_info, -			  "unable to start balance with target system profile %llu", -			  bctl->sys.target); +			  "balance: invalid convert system profile %s", +			  get_raid_name(index));  		ret = -EINVAL;  		goto out;  	} @@ -3950,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		     !(bctl->meta.target & allowed))) {  			if (bctl->flags & BTRFS_BALANCE_FORCE) {  				btrfs_info(fs_info, -					   "force reducing metadata integrity"); +				"balance: force reducing metadata integrity");  			} else {  				btrfs_err(fs_info, -					  "balance will reduce metadata integrity, use force if you want this"); +	"balance: reduces metadata integrity, use --force if you want this");  				ret = -EINVAL;  				goto out;  			} @@ -3967,9 +3887,12 @@ int 
btrfs_balance(struct btrfs_balance_control *bctl,  		bctl->data.target : fs_info->avail_data_alloc_bits;  	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <  		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { +		int meta_index = btrfs_bg_flags_to_raid_index(meta_target); +		int data_index = btrfs_bg_flags_to_raid_index(data_target); +  		btrfs_warn(fs_info, -			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", -			   meta_target, data_target); +	"balance: metadata profile %s has lower redundancy than data profile %s", +			   get_raid_name(meta_index), get_raid_name(data_index));  	}  	ret = insert_balance_item(fs_info, bctl); @@ -3978,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {  		BUG_ON(ret == -EEXIST); -		set_balance_control(bctl); +		BUG_ON(fs_info->balance_ctl); +		spin_lock(&fs_info->balance_lock); +		fs_info->balance_ctl = bctl; +		spin_unlock(&fs_info->balance_lock);  	} else {  		BUG_ON(ret != -EEXIST);  		spin_lock(&fs_info->balance_lock); @@ -3986,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		spin_unlock(&fs_info->balance_lock);  	} -	atomic_inc(&fs_info->balance_running); +	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); +	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);  	mutex_unlock(&fs_info->balance_mutex);  	ret = __btrfs_balance(fs_info);  	mutex_lock(&fs_info->balance_mutex); -	atomic_dec(&fs_info->balance_running); +	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);  	if (bargs) {  		memset(bargs, 0, sizeof(*bargs)); -		update_ioctl_balance_args(fs_info, 0, bargs); +		btrfs_update_ioctl_balance_args(fs_info, bargs);  	}  	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||  	    balance_need_close(fs_info)) { -		__cancel_balance(fs_info); +		reset_balance_state(fs_info); +		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);  	}  	wake_up(&fs_info->balance_wait_q); @@ -4009,11 
+3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	return ret;  out:  	if (bctl->flags & BTRFS_BALANCE_RESUME) -		__cancel_balance(fs_info); -	else { +		reset_balance_state(fs_info); +	else  		kfree(bctl); -		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); -	} +	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); +  	return ret;  } @@ -4022,16 +3950,12 @@ static int balance_kthread(void *data)  	struct btrfs_fs_info *fs_info = data;  	int ret = 0; -	mutex_lock(&fs_info->volume_mutex);  	mutex_lock(&fs_info->balance_mutex); -  	if (fs_info->balance_ctl) { -		btrfs_info(fs_info, "continuing balance"); -		ret = btrfs_balance(fs_info->balance_ctl, NULL); +		btrfs_info(fs_info, "balance: resuming"); +		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);  	} -  	mutex_unlock(&fs_info->balance_mutex); -	mutex_unlock(&fs_info->volume_mutex);  	return ret;  } @@ -4040,15 +3964,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)  {  	struct task_struct *tsk; -	spin_lock(&fs_info->balance_lock); +	mutex_lock(&fs_info->balance_mutex);  	if (!fs_info->balance_ctl) { -		spin_unlock(&fs_info->balance_lock); +		mutex_unlock(&fs_info->balance_mutex);  		return 0;  	} -	spin_unlock(&fs_info->balance_lock); +	mutex_unlock(&fs_info->balance_mutex);  	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { -		btrfs_info(fs_info, "force skipping balance"); +		btrfs_info(fs_info, "balance: resume skipped");  		return 0;  	} @@ -4100,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)  	leaf = path->nodes[0];  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); -	bctl->fs_info = fs_info;  	bctl->flags = btrfs_balance_flags(leaf, item);  	bctl->flags |= BTRFS_BALANCE_RESUME; @@ -4111,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)  	btrfs_balance_sys(leaf, item, &disk_bargs);  	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); -	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); +	/* +	 * 
This should never happen, as the paused balance state is recovered +	 * during mount without any chance of other exclusive ops to collide. +	 * +	 * This gives the exclusive op status to balance and keeps in paused +	 * state until user intervention (cancel or umount). If the ownership +	 * cannot be assigned, show a message but do not fail. The balance +	 * is in a paused state and must have fs_info::balance_ctl properly +	 * set up. +	 */ +	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) +		btrfs_warn(fs_info, +	"balance: cannot set exclusive op status, resume manually"); -	mutex_lock(&fs_info->volume_mutex);  	mutex_lock(&fs_info->balance_mutex); - -	set_balance_control(bctl); - +	BUG_ON(fs_info->balance_ctl); +	spin_lock(&fs_info->balance_lock); +	fs_info->balance_ctl = bctl; +	spin_unlock(&fs_info->balance_lock);  	mutex_unlock(&fs_info->balance_mutex); -	mutex_unlock(&fs_info->volume_mutex);  out:  	btrfs_free_path(path);  	return ret; @@ -4135,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)  		return -ENOTCONN;  	} -	if (atomic_read(&fs_info->balance_running)) { +	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {  		atomic_inc(&fs_info->balance_pause_req);  		mutex_unlock(&fs_info->balance_mutex);  		wait_event(fs_info->balance_wait_q, -			   atomic_read(&fs_info->balance_running) == 0); +			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));  		mutex_lock(&fs_info->balance_mutex);  		/* we are good with balance_ctl ripped off from under us */ -		BUG_ON(atomic_read(&fs_info->balance_running)); +		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));  		atomic_dec(&fs_info->balance_pause_req);  	} else {  		ret = -ENOTCONN; @@ -4156,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)  int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)  { -	if (sb_rdonly(fs_info->sb)) -		return -EROFS; -  	mutex_lock(&fs_info->balance_mutex);  	if (!fs_info->balance_ctl) {  		
mutex_unlock(&fs_info->balance_mutex);  		return -ENOTCONN;  	} +	/* +	 * A paused balance with the item stored on disk can be resumed at +	 * mount time if the mount is read-write. Otherwise it's still paused +	 * and we must not allow cancelling as it deletes the item. +	 */ +	if (sb_rdonly(fs_info->sb)) { +		mutex_unlock(&fs_info->balance_mutex); +		return -EROFS; +	} +  	atomic_inc(&fs_info->balance_cancel_req);  	/*  	 * if we are running just wait and return, balance item is  	 * deleted in btrfs_balance in this case  	 */ -	if (atomic_read(&fs_info->balance_running)) { +	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {  		mutex_unlock(&fs_info->balance_mutex);  		wait_event(fs_info->balance_wait_q, -			   atomic_read(&fs_info->balance_running) == 0); +			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));  		mutex_lock(&fs_info->balance_mutex);  	} else { -		/* __cancel_balance needs volume_mutex */  		mutex_unlock(&fs_info->balance_mutex); -		mutex_lock(&fs_info->volume_mutex); +		/* +		 * Lock released to allow other waiters to continue, we'll +		 * reexamine the status again. 
+		 */  		mutex_lock(&fs_info->balance_mutex); -		if (fs_info->balance_ctl) -			__cancel_balance(fs_info); - -		mutex_unlock(&fs_info->volume_mutex); +		if (fs_info->balance_ctl) { +			reset_balance_state(fs_info); +			clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); +			btrfs_info(fs_info, "balance: canceled"); +		}  	} -	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); +	BUG_ON(fs_info->balance_ctl || +		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));  	atomic_dec(&fs_info->balance_cancel_req);  	mutex_unlock(&fs_info->balance_mutex);  	return 0; @@ -4264,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data)  		}  update_tree:  		if (!btrfs_is_empty_uuid(root_item.uuid)) { -			ret = btrfs_uuid_tree_add(trans, fs_info, -						  root_item.uuid, +			ret = btrfs_uuid_tree_add(trans, root_item.uuid,  						  BTRFS_UUID_KEY_SUBVOL,  						  key.objectid);  			if (ret < 0) { @@ -4276,7 +4220,7 @@ update_tree:  		}  		if (!btrfs_is_empty_uuid(root_item.received_uuid)) { -			ret = btrfs_uuid_tree_add(trans, fs_info, +			ret = btrfs_uuid_tree_add(trans,  						  root_item.received_uuid,  						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,  						  key.objectid); @@ -4482,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  	if (!path)  		return -ENOMEM; -	path->reada = READA_FORWARD; +	path->reada = READA_BACK;  	mutex_lock(&fs_info->chunk_mutex); @@ -6043,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);  } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, -		     u64 chunk_start, u64 physical, u64 devid, -		     u64 **logical, int *naddrs, int *stripe_len) +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)  {  	struct extent_map *em;  	struct map_lookup *map; @@ -6077,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info 
*fs_info,  	BUG_ON(!buf); /* -ENOMEM */  	for (i = 0; i < map->num_stripes; i++) { -		if (devid && map->stripes[i].dev->devid != devid) -			continue;  		if (map->stripes[i].physical > physical ||  		    map->stripes[i].physical + length <= physical)  			continue; @@ -6410,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,   *   * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()   * on error.  Returned struct is not linked onto any lists and must be - * destroyed with free_device. + * destroyed with btrfs_free_device.   */  struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,  					const u64 *devid, @@ -6433,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,  		ret = find_next_devid(fs_info, &tmp);  		if (ret) { -			free_device(dev); +			btrfs_free_device(dev);  			return ERR_PTR(ret);  		}  	} @@ -6684,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,  	if (IS_ERR(fs_devices))  		return fs_devices; -	ret = __btrfs_open_devices(fs_devices, FMODE_READ, -				   fs_info->bdev_holder); +	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);  	if (ret) {  		free_fs_devices(fs_devices);  		fs_devices = ERR_PTR(ret); @@ -6693,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,  	}  	if (!fs_devices->seeding) { -		__btrfs_close_devices(fs_devices); +		close_fs_devices(fs_devices);  		free_fs_devices(fs_devices);  		fs_devices = ERR_PTR(-EINVAL);  		goto out; @@ -7002,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)  	if (!path)  		return -ENOMEM; +	/* +	 * uuid_mutex is needed only if we are mounting a sprout FS +	 * otherwise we don't need it. +	 */  	mutex_lock(&uuid_mutex);  	mutex_lock(&fs_info->chunk_mutex);  | 
