diff options
| author | Alexander Viro <viro@math.psu.edu> | 2002-08-10 02:22:02 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@penguin.transmeta.com> | 2002-08-10 02:22:02 -0700 |
| commit | 951f4be9c9bab97236c84734e892ee4d379c68af (patch) | |
| tree | ed6b6027d4b4c5c8933bb1f91b4f0771154fa225 | |
| parent | 9804df6ca4f1a9fe9e1045801ceec383a3aaf0ad (diff) | |
[PATCH] fix check_disk_change() deadlocks
Small, but tricky: fix for check_disk_change() deadlocks.
What we do is
a) opening block device shifted from check_partition() to
grok_partitions(); check_partitions() takes opened
struct block_device.
b) all callers of check_disk_change() fall in two groups -
ones that are called only from some ->open() and ones
that are _never_ called from ->open(). There is no
middle ground. We split the thing in two functions -
check_disk_change() for the first class and full_check_....
for the second. The former (ones inside ->open()) doesn't
touch partition tables but marks the bdev as "had been
invalidated". In the end of do_open() we check if
bdev is marked and call wipe_partitions()/check_partition()
if it is - at that point bdev is fully set up and ready.
c) ->bd_part_sem kludge is gone - we use ->bd_sem instead.
That is, do_open() on a partition grabs ->bd_sem on entire
disk and picks partition data while under it; do_open() on
entire disk rereads partition if needed before dropping
->bd_sem (right before dropping it); BLKRRPART does
trylock on ->bd_sem and then checks ->bd_part_count -
same logics as before, except that we use ->bd_sem instead
of ->bd_part_sem.
That kills recursive open(), gives us the same exclusion rules as
we had and makes sure that actual IO (including rereading partition
tables) is done only when we are ready to do it.
It actually sounds a lot nastier than it is. do_open() is a one sick
puppy right now, but we have everything in one place and _out_ of drivers
(and 20-odd equally sick puppies are gone from them, along with about
the same number of races).
Now we are almost ready to clean it up for good - all that remains to
do before that is to get the rest of drivers (cciss, DAC960, i2o and
a couple of ancients - xd and acsi) using per-disk gendisks. Then
most of that crap will disappear.
BTW, the only generic ioctl remaining in the drivers is HDIO_GETGEO -
a lot of foo_ioctl() starts with if (cmd != HDIO_GETGEO) return -EINVAL; ;-)
| -rw-r--r-- | fs/block_dev.c | 153 | ||||
| -rw-r--r-- | fs/devfs/base.c | 2 | ||||
| -rw-r--r-- | fs/partitions/check.c | 37 | ||||
| -rw-r--r-- | fs/super.c | 1 | ||||
| -rw-r--r-- | include/linux/blkdev.h | 1 | ||||
| -rw-r--r-- | include/linux/fs.h | 3 |
6 files changed, 113 insertions, 84 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 255861a3ce50..e43631fac6df 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -310,7 +310,7 @@ struct block_device *bdget(dev_t dev) new_bdev->bd_contains = NULL; new_bdev->bd_inode = inode; new_bdev->bd_part_count = 0; - sema_init(&new_bdev->bd_part_sem, 1); + new_bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; inode->i_rdev = kdev; inode->i_bdev = new_bdev; @@ -518,24 +518,34 @@ int check_disk_change(struct block_device *bdev) disk = get_gendisk(dev); part = disk->part + minor(dev) - disk->first_minor; - if (disk && disk->minor_shift) { - if (!down_trylock(&bdev->bd_part_sem)) { - if (!bdev->bd_part_count) { - if (wipe_partitions(dev) == 0) { - if (bdops->revalidate) - bdops->revalidate(dev); - grok_partitions(dev, part[0].nr_sects); - } - } - up(&bdev->bd_part_sem); - } - } else { - if (bdops->revalidate) - bdops->revalidate(dev); - } + if (bdops->revalidate) + bdops->revalidate(dev); + if (disk && disk->minor_shift) + bdev->bd_invalidated = 1; return 1; } +int full_check_disk_change(struct block_device *bdev) +{ + int res; + down(&bdev->bd_sem); + res = check_disk_change(bdev); + if (bdev->bd_invalidated && !bdev->bd_part_count) { + struct gendisk *g = get_gendisk(to_kdev_t(bdev->bd_dev)); + struct hd_struct *part; + part = g->part + MINOR(bdev->bd_dev) - g->first_minor; + bdev->bd_invalidated = 0; + wipe_partitions(to_kdev_t(bdev->bd_dev)); + if (part[0].nr_sects) + check_partition(g, bdev); + } + up(&bdev->bd_sem); + return res; +} + +/* + * Will die as soon as two remaining callers get converted. + */ int __check_disk_change(dev_t dev) { struct block_device *bdev = bdget(dev); @@ -544,11 +554,24 @@ int __check_disk_change(dev_t dev) return 0; if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0) return 0; - res = check_disk_change(bdev); + res = full_check_disk_change(bdev); blkdev_put(bdev, BDEV_RAW); return res; } +static void bd_set_size(struct block_device *bdev, loff_t size) +{ + unsigned bsize = bdev_hardsect_size(bdev); + bdev->bd_inode->i_size = size; + while (bsize < PAGE_CACHE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + static int do_open(struct block_device *bdev, struct inode *inode, struct file *file) { int ret = -ENXIO; @@ -595,53 +618,64 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file * } } if (bdev->bd_contains == bdev) { + struct gendisk *g = get_gendisk(dev); if (bdev->bd_op->open) { ret = bdev->bd_op->open(inode, file); if (ret) goto out2; } - } else { - down(&bdev->bd_contains->bd_part_sem); - bdev->bd_contains->bd_part_count++; - up(&bdev->bd_contains->bd_part_sem); - } - if (!bdev->bd_openers) { - struct blk_dev_struct *p = blk_dev + major(dev); - struct gendisk *g = get_gendisk(dev); - unsigned bsize = bdev_hardsect_size(bdev); - - bdev->bd_offset = 0; - if (g) { - struct hd_struct *p; - p = g->part + minor(dev) - g->first_minor; - bdev->bd_inode->i_size = (loff_t) p->nr_sects << 9; - bdev->bd_offset = p->start_sect; - } else if (blk_size[major(dev)]) - bdev->bd_inode->i_size = - (loff_t) blk_size[major(dev)][minor(dev)] << 10; - else - bdev->bd_inode->i_size = 0; - while (bsize < PAGE_CACHE_SIZE) { - if (bdev->bd_inode->i_size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); - if (p->queue) - bdev->bd_queue = p->queue(dev); - else - bdev->bd_queue = &p->request_queue; - if (bdev->bd_inode->i_data.backing_dev_info == - &default_backing_dev_info) { + if (!bdev->bd_openers) { + struct blk_dev_struct *p = blk_dev + major(dev); struct backing_dev_info *bdi; - + sector_t sect = 0; + + bdev->bd_offset = 0; + if (g) { + struct hd_struct *p; + p = g->part + minor(dev) - g->first_minor; + sect = p->nr_sects; + } else if (blk_size[major(dev)]) + sect = blk_size[major(dev)][minor(dev)] << 1; + if (p->queue) + bdev->bd_queue = p->queue(dev); + else + bdev->bd_queue = &p->request_queue; + bd_set_size(bdev, (loff_t)sect << 9); bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) bdi = &default_backing_dev_info; inode->i_data.backing_dev_info = bdi; bdev->bd_inode->i_data.backing_dev_info = bdi; } + if (bdev->bd_invalidated && !bdev->bd_part_count) { + struct hd_struct *part; + part = g->part + minor(dev) - g->first_minor; + bdev->bd_invalidated = 0; + wipe_partitions(dev); + if (part[0].nr_sects) + check_partition(g, bdev); + } + } else { + down(&bdev->bd_contains->bd_sem); + bdev->bd_contains->bd_part_count++; + if (!bdev->bd_openers) { + struct gendisk *g = get_gendisk(dev); + struct hd_struct *p; + p = g->part + minor(dev) - g->first_minor; + inode->i_data.backing_dev_info = + bdev->bd_inode->i_data.backing_dev_info = + bdev->bd_contains->bd_inode->i_data.backing_dev_info; + if (!p->nr_sects) { + bdev->bd_contains->bd_part_count--; + up(&bdev->bd_contains->bd_sem); + ret = -ENXIO; + goto out2; + } + bdev->bd_queue = bdev->bd_contains->bd_queue; + bdev->bd_offset = p->start_sect; + bd_set_size(bdev, (loff_t) p->nr_sects << 9); + } + up(&bdev->bd_contains->bd_sem); } bdev->bd_openers++; up(&bdev->bd_sem); @@ -725,9 +759,9 @@ int blkdev_put(struct block_device *bdev, int kind) if (bdev->bd_op->release) ret = bdev->bd_op->release(bd_inode, NULL); } else { - down(&bdev->bd_contains->bd_part_sem); + down(&bdev->bd_contains->bd_sem); bdev->bd_contains->bd_part_count--; - up(&bdev->bd_contains->bd_part_sem); + up(&bdev->bd_contains->bd_sem); } if (!bdev->bd_openers) { if (bdev->bd_op->owner) @@ -758,24 +792,25 @@ static int blkdev_reread_part(struct block_device *bdev) struct hd_struct *part; int res; - if (!disk) + if (!disk || !disk->minor_shift) return -EINVAL; part = disk->part + minor(dev) - disk->first_minor; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (down_trylock(&bdev->bd_part_sem)); + if (down_trylock(&bdev->bd_sem)); return -EBUSY; if (bdev->bd_part_count) { - up(&bdev->bd_part_sem); + up(&bdev->bd_sem); return -EBUSY; } res = wipe_partitions(dev); if (!res) { if (bdev->bd_op->revalidate) bdev->bd_op->revalidate(dev); - grok_partitions(dev, part[0].nr_sects); + if (part[0].nr_sects) + check_partition(disk, bdev); } - up(&bdev->bd_part_sem); + up(&bdev->bd_sem); return res; } diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 172ef0e9113b..98492594db7a 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2390,7 +2390,7 @@ static int check_disc_changed (struct devfs_entry *de) /* Ugly hack to disable messages about unable to read partition table */ tmp = warn_no_part; warn_no_part = 0; - retval = check_disk_change(bdev); + retval = full_check_disk_change(bdev); warn_no_part = tmp; out: devfs_put_ops (de); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 19ad765de4b3..862d80ba5513 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -289,10 +289,13 @@ void driverfs_remove_partitions(struct gendisk *hd, int minor) return; } -static void check_partition(struct gendisk *hd, kdev_t dev) +/* + * DON'T EXPORT + */ +void check_partition(struct gendisk *hd, struct block_device *bdev) { devfs_handle_t de = NULL; - struct block_device *bdev; + kdev_t dev = to_kdev_t(bdev->bd_dev); char buf[64]; struct parsed_partitions *state; int i; @@ -314,9 +317,6 @@ static void check_partition(struct gendisk *hd, kdev_t dev) if (n - COMPAQ_SMART2_MAJOR <= 7 || n - COMPAQ_CISS_MAJOR <= 7) sprintf(state->name, "p"); } - bdev = bdget(kdev_t_to_nr(dev)); - if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW)) - goto out; state->limit = 1<<hd->minor_shift; for (i = 0; check_part[i]; i++) { int res, j; @@ -328,7 +328,7 @@ static void check_partition(struct gendisk *hd, kdev_t dev) if (res < 0) { if (warn_no_part) printk(" unable to read partition table\n"); - goto setup_devfs; + goto out; } p = hd->part + minor(dev) - hd->first_minor; for (j = 1; j < state->limit; j++) { @@ -340,12 +340,10 @@ static void check_partition(struct gendisk *hd, kdev_t dev) md_autodetect_dev(mk_kdev(major(dev),minor(dev)+j)); #endif } - goto setup_devfs; + goto out; } printk(" unknown partition table\n"); -setup_devfs: - blkdev_put(bdev, BDEV_RAW); out: driverfs_create_partitions(hd, minor(dev)); devfs_register_partitions (hd, minor(dev), 0); @@ -463,34 +461,29 @@ void register_disk(struct gendisk *gdev, kdev_t dev, unsigned minors, void grok_partitions(kdev_t dev, long size) { - int minors, first_minor, end_minor; + struct block_device *bdev; struct gendisk *g = get_gendisk(dev); struct hd_struct *p; if (!g) return; - minors = 1 << g->minor_shift; - first_minor = minor(dev); - if (first_minor & (minors-1)) { - printk("grok_partitions: bad device 0x%02x:%02x\n", - major(dev), first_minor); - first_minor &= ~(minors-1); - } - end_minor = first_minor + minors; - - p = g->part + first_minor - g->first_minor; + p = g->part + minor(dev) - g->first_minor; p[0].nr_sects = size; /* No minors to use for partitions */ - if (minors == 1) + if (!g->minor_shift) return; /* No such device (e.g., media were just removed) */ if (!size) return; - check_partition(g, mk_kdev(g->major, first_minor)); + bdev = bdget(kdev_t_to_nr(dev)); + if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0) + return; + check_partition(g, bdev); + blkdev_put(bdev, BDEV_RAW); } unsigned char *read_dev_sector(struct block_device *bdev, unsigned long n, Sector *p) diff --git a/fs/super.c b/fs/super.c index e5232d531047..acae6165e572 100644 --- a/fs/super.c +++ b/fs/super.c @@ -502,7 +502,6 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, devfs_put_ops (de); /* Decrement module use count now we're safe */ if (error) goto out; - check_disk_change(bdev); error = -EACCES; if (!(flags & MS_RDONLY) && bdev_read_only(bdev)) goto out1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f83c52f82ab0..d0a89877ad94 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -279,6 +279,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern void grok_partitions(kdev_t dev, long size); extern int wipe_partitions(kdev_t dev); extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); +extern void check_partition(struct gendisk *disk, struct block_device *bdev); extern void generic_make_request(struct bio *bio); extern inline request_queue_t *bdev_get_queue(struct block_device *bdev); extern void blk_put_request(struct request *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad9cdf3e668..192107379fd9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,8 +356,8 @@ struct block_device { struct block_device * bd_contains; unsigned bd_block_size; unsigned long bd_offset; - struct semaphore bd_part_sem; unsigned bd_part_count; + int bd_invalidated; }; struct inode { @@ -1132,6 +1132,7 @@ extern int fs_may_remount_ro(struct super_block *); #define bio_data_dir(bio) ((bio)->bi_rw & 1) extern int check_disk_change(struct block_device *); +extern int full_check_disk_change(struct block_device *); extern int __check_disk_change(dev_t); extern int invalidate_inodes(struct super_block *); extern int invalidate_device(kdev_t, int); |
