diff options
| author | Ilya Dryomov <idryomov@gmail.com> | 2016-04-15 16:22:16 +0200 |
|---|---|---|
| committer | Ben Hutchings <ben@decadent.org.uk> | 2016-06-15 21:29:28 +0100 |
| commit | 80e4da2500457c054fd4ed8243b183bdec6bc6d6 (patch) | |
| tree | 59342fa3d7edf04fada4fb0bb7f459cf24276be5 | |
| parent | b5752ee2fab8340a7544c11635b227a2865f2933 (diff) | |
rbd: fix rbd map vs notify races
commit 811c6688774613a78bfa020f64b570b73f6974c8 upstream.
A while ago, commit 9875201e1049 ("rbd: fix use-after free of
rbd_dev->disk") fixed rbd unmap vs notify race by introducing
an exported wrapper for flushing notifies and sticking it into
do_rbd_remove().
A similar problem exists on the rbd map path, though: the watch is
registered in rbd_dev_image_probe(), while the disk is set up quite
a few steps later, in rbd_dev_device_setup(). Nothing prevents
a notify from coming in and crashing on a NULL rbd_dev->disk:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000050
Call Trace:
[<ffffffffa0508344>] rbd_watch_cb+0x34/0x180 [rbd]
[<ffffffffa04bd290>] do_event_work+0x40/0xb0 [libceph]
[<ffffffff8109d5db>] process_one_work+0x17b/0x470
[<ffffffff8109e3ab>] worker_thread+0x11b/0x400
[<ffffffff8109e290>] ? rescuer_thread+0x400/0x400
[<ffffffff810a5acf>] kthread+0xcf/0xe0
[<ffffffff810b41b3>] ? finish_task_switch+0x53/0x170
[<ffffffff810a5a00>] ? kthread_create_on_node+0x140/0x140
[<ffffffff81645dd8>] ret_from_fork+0x58/0x90
[<ffffffff810a5a00>] ? kthread_create_on_node+0x140/0x140
RIP [<ffffffffa050828a>] rbd_dev_refresh+0xfa/0x180 [rbd]
If an error occurs during rbd map, we have to error out, potentially
tearing down a watch. Just like on rbd unmap, notifies have to be
flushed, otherwise rbd_watch_cb() may end up trying to read in the
image header after rbd_dev_image_release() has run:
Assertion failure in rbd_dev_header_info() at line 4722:
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Call Trace:
[<ffffffff81cccee0>] ? rbd_parent_request_create+0x150/0x150
[<ffffffff81cd4e59>] rbd_dev_refresh+0x59/0x390
[<ffffffff81cd5229>] rbd_watch_cb+0x69/0x290
[<ffffffff81fde9bf>] do_event_work+0x10f/0x1c0
[<ffffffff81107799>] process_one_work+0x689/0x1a80
[<ffffffff811076f7>] ? process_one_work+0x5e7/0x1a80
[<ffffffff81132065>] ? finish_task_switch+0x225/0x640
[<ffffffff81107110>] ? pwq_dec_nr_in_flight+0x2b0/0x2b0
[<ffffffff81108c69>] worker_thread+0xd9/0x1320
[<ffffffff81108b90>] ? process_one_work+0x1a80/0x1a80
[<ffffffff8111b02d>] kthread+0x21d/0x2e0
[<ffffffff8111ae10>] ? kthread_stop+0x550/0x550
[<ffffffff82022802>] ret_from_fork+0x22/0x40
[<ffffffff8111ae10>] ? kthread_stop+0x550/0x550
RIP [<ffffffff81ccd8f9>] rbd_dev_header_info+0xa19/0x1e30
To fix this, a) check if RBD_DEV_FLAG_EXISTS is set before calling
revalidate_disk(), b) move ceph_osdc_flush_notifies() call into
rbd_dev_header_unwatch_sync() to cover rbd map error paths and c) turn
header read-in into a critical section. The latter also happens to
take care of rbd map foo@bar vs rbd snap rm foo@bar race.
Fixes: http://tracker.ceph.com/issues/15490
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
[bwh: Backported to 3.16: adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
| -rw-r--r-- | drivers/block/rbd.c | 43 |
1 files changed, 19 insertions, 24 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 34cd70d06d3b..6eea35b05f6a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -528,7 +528,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, u8 *order, u64 *snap_size); static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 *snap_features); -static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -2957,9 +2956,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) struct rbd_device *rbd_dev = (struct rbd_device *)data; int ret; - if (!rbd_dev) - return; - dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long)notify_id, (unsigned int)opcode); @@ -3099,6 +3095,9 @@ out_cancel: ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; + dout("%s flushing notifies\n", __func__); + ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); + return ret; } @@ -3534,21 +3533,14 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev) static void rbd_dev_update_size(struct rbd_device *rbd_dev) { sector_t size; - bool removing; /* - * Don't hold the lock while doing disk operations, - * or lock ordering will conflict with the bdev mutex via: - * rbd_add() -> blkdev_get() -> rbd_open() + * If EXISTS is not set, rbd_dev->disk may be NULL, so don't + * try to update its size. If REMOVING is set, updating size + * is just useless work since the device can't be opened. */ - spin_lock_irq(&rbd_dev->lock); - removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - spin_unlock_irq(&rbd_dev->lock); - /* - * If the device is being removed, rbd_dev->disk has - * been destroyed, so don't try to update its size - */ - if (!removing) { + if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && + !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; dout("setting size to %llu sectors", (unsigned long long)size); set_capacity(rbd_dev->disk, size); @@ -5078,6 +5070,10 @@ out_err: return ret; } +/* + * rbd_dev->header_rwsem must be locked for write and will be unlocked + * upon return. + */ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; @@ -5086,7 +5082,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) ret = rbd_dev_id_get(rbd_dev); if (ret) - return ret; + goto err_out_unlock; BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); @@ -5133,8 +5129,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) /* Everything's ready. Announce the disk to the world. */ set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - add_disk(rbd_dev->disk); + up_write(&rbd_dev->header_rwsem); + add_disk(rbd_dev->disk); pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, (unsigned long long) rbd_dev->mapping.size); @@ -5153,6 +5150,8 @@ err_out_blkdev: err_out_id: rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); +err_out_unlock: + up_write(&rbd_dev->header_rwsem); return ret; } @@ -5315,6 +5314,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ + down_write(&rbd_dev->header_rwsem); rc = rbd_dev_image_probe(rbd_dev, 0); if (rc < 0) goto err_out_rbd_dev; @@ -5340,6 +5340,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, return count; err_out_rbd_dev: + up_write(&rbd_dev->header_rwsem); rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); @@ -5453,12 +5454,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, return ret; rbd_dev_header_unwatch_sync(rbd_dev); - /* - * flush remaining watch callbacks - these must be complete - * before the osd_client is shutdown - */ - dout("%s: flushing notifies", __func__); - ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); /* * Don't free anything from rbd_dev->disk until after all |
