summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexander Viro <viro@math.psu.edu> 2002-08-10 02:22:02 -0700
committer Linus Torvalds <torvalds@penguin.transmeta.com> 2002-08-10 02:22:02 -0700
commit 951f4be9c9bab97236c84734e892ee4d379c68af (patch)
tree ed6b6027d4b4c5c8933bb1f91b4f0771154fa225
parent 9804df6ca4f1a9fe9e1045801ceec383a3aaf0ad (diff)
[PATCH] fix check_disk_change() deadlocks
Small, but tricky: fix for check_disk_change() deadlocks. What we do is a) opening the block device is shifted from check_partition() to grok_partitions(); check_partition() now takes an already-opened struct block_device. b) all callers of check_disk_change() fall into two groups - ones that are called only from some ->open() and ones that are _never_ called from ->open(). There is no middle ground. We split the thing into two functions - check_disk_change() for the first class and full_check_disk_change() for the second. The former (used inside ->open()) doesn't touch partition tables but marks the bdev as "had been invalidated". At the end of do_open() we check if the bdev is marked and call wipe_partitions()/check_partition() if it is - at that point the bdev is fully set up and ready. c) the ->bd_part_sem kludge is gone - we use ->bd_sem instead. That is, do_open() on a partition grabs ->bd_sem on the entire disk and picks up partition data while under it; do_open() on the entire disk rereads the partition table if needed before dropping ->bd_sem (right before dropping it); BLKRRPART does trylock on ->bd_sem and then checks ->bd_part_count - same logic as before, except that we use ->bd_sem instead of ->bd_part_sem. That kills recursive open(), gives us the same exclusion rules as we had and makes sure that actual IO (including rereading partition tables) is done only when we are ready to do it. It actually sounds a lot nastier than it is. do_open() is one sick puppy right now, but we have everything in one place and _out_ of drivers (and 20-odd equally sick puppies are gone from them, along with about the same number of races). Now we are almost ready to clean it up for good - all that remains to do before that is to get the rest of the drivers (cciss, DAC960, i2o and a couple of ancients - xd and acsi) using per-disk gendisks. Then most of that crap will disappear. BTW, the only generic ioctl remaining in the drivers is HDIO_GETGEO - a lot of foo_ioctl() functions start with if (cmd != HDIO_GETGEO) return -EINVAL; ;-)
-rw-r--r-- fs/block_dev.c 153
-rw-r--r-- fs/devfs/base.c 2
-rw-r--r-- fs/partitions/check.c 37
-rw-r--r-- fs/super.c 1
-rw-r--r-- include/linux/blkdev.h 1
-rw-r--r-- include/linux/fs.h 3
6 files changed, 113 insertions, 84 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 255861a3ce50..e43631fac6df 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -310,7 +310,7 @@ struct block_device *bdget(dev_t dev)
new_bdev->bd_contains = NULL;
new_bdev->bd_inode = inode;
new_bdev->bd_part_count = 0;
- sema_init(&new_bdev->bd_part_sem, 1);
+ new_bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = kdev;
inode->i_bdev = new_bdev;
@@ -518,24 +518,34 @@ int check_disk_change(struct block_device *bdev)
disk = get_gendisk(dev);
part = disk->part + minor(dev) - disk->first_minor;
- if (disk && disk->minor_shift) {
- if (!down_trylock(&bdev->bd_part_sem)) {
- if (!bdev->bd_part_count) {
- if (wipe_partitions(dev) == 0) {
- if (bdops->revalidate)
- bdops->revalidate(dev);
- grok_partitions(dev, part[0].nr_sects);
- }
- }
- up(&bdev->bd_part_sem);
- }
- } else {
- if (bdops->revalidate)
- bdops->revalidate(dev);
- }
+ if (bdops->revalidate)
+ bdops->revalidate(dev);
+ if (disk && disk->minor_shift)
+ bdev->bd_invalidated = 1;
return 1;
}
+int full_check_disk_change(struct block_device *bdev)
+{
+ int res;
+ down(&bdev->bd_sem);
+ res = check_disk_change(bdev);
+ if (bdev->bd_invalidated && !bdev->bd_part_count) {
+ struct gendisk *g = get_gendisk(to_kdev_t(bdev->bd_dev));
+ struct hd_struct *part;
+ part = g->part + MINOR(bdev->bd_dev) - g->first_minor;
+ bdev->bd_invalidated = 0;
+ wipe_partitions(to_kdev_t(bdev->bd_dev));
+ if (part[0].nr_sects)
+ check_partition(g, bdev);
+ }
+ up(&bdev->bd_sem);
+ return res;
+}
+
+/*
+ * Will die as soon as two remaining callers get converted.
+ */
int __check_disk_change(dev_t dev)
{
struct block_device *bdev = bdget(dev);
@@ -544,11 +554,24 @@ int __check_disk_change(dev_t dev)
return 0;
if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
return 0;
- res = check_disk_change(bdev);
+ res = full_check_disk_change(bdev);
blkdev_put(bdev, BDEV_RAW);
return res;
}
+static void bd_set_size(struct block_device *bdev, loff_t size)
+{
+ unsigned bsize = bdev_hardsect_size(bdev);
+ bdev->bd_inode->i_size = size;
+ while (bsize < PAGE_CACHE_SIZE) {
+ if (size & bsize)
+ break;
+ bsize <<= 1;
+ }
+ bdev->bd_block_size = bsize;
+ bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
{
int ret = -ENXIO;
@@ -595,53 +618,64 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file *
}
}
if (bdev->bd_contains == bdev) {
+ struct gendisk *g = get_gendisk(dev);
if (bdev->bd_op->open) {
ret = bdev->bd_op->open(inode, file);
if (ret)
goto out2;
}
- } else {
- down(&bdev->bd_contains->bd_part_sem);
- bdev->bd_contains->bd_part_count++;
- up(&bdev->bd_contains->bd_part_sem);
- }
- if (!bdev->bd_openers) {
- struct blk_dev_struct *p = blk_dev + major(dev);
- struct gendisk *g = get_gendisk(dev);
- unsigned bsize = bdev_hardsect_size(bdev);
-
- bdev->bd_offset = 0;
- if (g) {
- struct hd_struct *p;
- p = g->part + minor(dev) - g->first_minor;
- bdev->bd_inode->i_size = (loff_t) p->nr_sects << 9;
- bdev->bd_offset = p->start_sect;
- } else if (blk_size[major(dev)])
- bdev->bd_inode->i_size =
- (loff_t) blk_size[major(dev)][minor(dev)] << 10;
- else
- bdev->bd_inode->i_size = 0;
- while (bsize < PAGE_CACHE_SIZE) {
- if (bdev->bd_inode->i_size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
- if (p->queue)
- bdev->bd_queue = p->queue(dev);
- else
- bdev->bd_queue = &p->request_queue;
- if (bdev->bd_inode->i_data.backing_dev_info ==
- &default_backing_dev_info) {
+ if (!bdev->bd_openers) {
+ struct blk_dev_struct *p = blk_dev + major(dev);
struct backing_dev_info *bdi;
-
+ sector_t sect = 0;
+
+ bdev->bd_offset = 0;
+ if (g) {
+ struct hd_struct *p;
+ p = g->part + minor(dev) - g->first_minor;
+ sect = p->nr_sects;
+ } else if (blk_size[major(dev)])
+ sect = blk_size[major(dev)][minor(dev)] << 1;
+ if (p->queue)
+ bdev->bd_queue = p->queue(dev);
+ else
+ bdev->bd_queue = &p->request_queue;
+ bd_set_size(bdev, (loff_t)sect << 9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
inode->i_data.backing_dev_info = bdi;
bdev->bd_inode->i_data.backing_dev_info = bdi;
}
+ if (bdev->bd_invalidated && !bdev->bd_part_count) {
+ struct hd_struct *part;
+ part = g->part + minor(dev) - g->first_minor;
+ bdev->bd_invalidated = 0;
+ wipe_partitions(dev);
+ if (part[0].nr_sects)
+ check_partition(g, bdev);
+ }
+ } else {
+ down(&bdev->bd_contains->bd_sem);
+ bdev->bd_contains->bd_part_count++;
+ if (!bdev->bd_openers) {
+ struct gendisk *g = get_gendisk(dev);
+ struct hd_struct *p;
+ p = g->part + minor(dev) - g->first_minor;
+ inode->i_data.backing_dev_info =
+ bdev->bd_inode->i_data.backing_dev_info =
+ bdev->bd_contains->bd_inode->i_data.backing_dev_info;
+ if (!p->nr_sects) {
+ bdev->bd_contains->bd_part_count--;
+ up(&bdev->bd_contains->bd_sem);
+ ret = -ENXIO;
+ goto out2;
+ }
+ bdev->bd_queue = bdev->bd_contains->bd_queue;
+ bdev->bd_offset = p->start_sect;
+ bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+ }
+ up(&bdev->bd_contains->bd_sem);
}
bdev->bd_openers++;
up(&bdev->bd_sem);
@@ -725,9 +759,9 @@ int blkdev_put(struct block_device *bdev, int kind)
if (bdev->bd_op->release)
ret = bdev->bd_op->release(bd_inode, NULL);
} else {
- down(&bdev->bd_contains->bd_part_sem);
+ down(&bdev->bd_contains->bd_sem);
bdev->bd_contains->bd_part_count--;
- up(&bdev->bd_contains->bd_part_sem);
+ up(&bdev->bd_contains->bd_sem);
}
if (!bdev->bd_openers) {
if (bdev->bd_op->owner)
@@ -758,24 +792,25 @@ static int blkdev_reread_part(struct block_device *bdev)
struct hd_struct *part;
int res;
- if (!disk)
+ if (!disk || !disk->minor_shift)
return -EINVAL;
part = disk->part + minor(dev) - disk->first_minor;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- if (down_trylock(&bdev->bd_part_sem));
+ if (down_trylock(&bdev->bd_sem));
return -EBUSY;
if (bdev->bd_part_count) {
- up(&bdev->bd_part_sem);
+ up(&bdev->bd_sem);
return -EBUSY;
}
res = wipe_partitions(dev);
if (!res) {
if (bdev->bd_op->revalidate)
bdev->bd_op->revalidate(dev);
- grok_partitions(dev, part[0].nr_sects);
+ if (part[0].nr_sects)
+ check_partition(disk, bdev);
}
- up(&bdev->bd_part_sem);
+ up(&bdev->bd_sem);
return res;
}
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 172ef0e9113b..98492594db7a 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2390,7 +2390,7 @@ static int check_disc_changed (struct devfs_entry *de)
/* Ugly hack to disable messages about unable to read partition table */
tmp = warn_no_part;
warn_no_part = 0;
- retval = check_disk_change(bdev);
+ retval = full_check_disk_change(bdev);
warn_no_part = tmp;
out:
devfs_put_ops (de);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 19ad765de4b3..862d80ba5513 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -289,10 +289,13 @@ void driverfs_remove_partitions(struct gendisk *hd, int minor)
return;
}
-static void check_partition(struct gendisk *hd, kdev_t dev)
+/*
+ * DON'T EXPORT
+ */
+void check_partition(struct gendisk *hd, struct block_device *bdev)
{
devfs_handle_t de = NULL;
- struct block_device *bdev;
+ kdev_t dev = to_kdev_t(bdev->bd_dev);
char buf[64];
struct parsed_partitions *state;
int i;
@@ -314,9 +317,6 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
if (n - COMPAQ_SMART2_MAJOR <= 7 || n - COMPAQ_CISS_MAJOR <= 7)
sprintf(state->name, "p");
}
- bdev = bdget(kdev_t_to_nr(dev));
- if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW))
- goto out;
state->limit = 1<<hd->minor_shift;
for (i = 0; check_part[i]; i++) {
int res, j;
@@ -328,7 +328,7 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
if (res < 0) {
if (warn_no_part)
printk(" unable to read partition table\n");
- goto setup_devfs;
+ goto out;
}
p = hd->part + minor(dev) - hd->first_minor;
for (j = 1; j < state->limit; j++) {
@@ -340,12 +340,10 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
md_autodetect_dev(mk_kdev(major(dev),minor(dev)+j));
#endif
}
- goto setup_devfs;
+ goto out;
}
printk(" unknown partition table\n");
-setup_devfs:
- blkdev_put(bdev, BDEV_RAW);
out:
driverfs_create_partitions(hd, minor(dev));
devfs_register_partitions (hd, minor(dev), 0);
@@ -463,34 +461,29 @@ void register_disk(struct gendisk *gdev, kdev_t dev, unsigned minors,
void grok_partitions(kdev_t dev, long size)
{
- int minors, first_minor, end_minor;
+ struct block_device *bdev;
struct gendisk *g = get_gendisk(dev);
struct hd_struct *p;
if (!g)
return;
- minors = 1 << g->minor_shift;
- first_minor = minor(dev);
- if (first_minor & (minors-1)) {
- printk("grok_partitions: bad device 0x%02x:%02x\n",
- major(dev), first_minor);
- first_minor &= ~(minors-1);
- }
- end_minor = first_minor + minors;
-
- p = g->part + first_minor - g->first_minor;
+ p = g->part + minor(dev) - g->first_minor;
p[0].nr_sects = size;
/* No minors to use for partitions */
- if (minors == 1)
+ if (!g->minor_shift)
return;
/* No such device (e.g., media were just removed) */
if (!size)
return;
- check_partition(g, mk_kdev(g->major, first_minor));
+ bdev = bdget(kdev_t_to_nr(dev));
+ if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
+ return;
+ check_partition(g, bdev);
+ blkdev_put(bdev, BDEV_RAW);
}
unsigned char *read_dev_sector(struct block_device *bdev, unsigned long n, Sector *p)
diff --git a/fs/super.c b/fs/super.c
index e5232d531047..acae6165e572 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -502,7 +502,6 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
devfs_put_ops (de); /* Decrement module use count now we're safe */
if (error)
goto out;
- check_disk_change(bdev);
error = -EACCES;
if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
goto out1;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f83c52f82ab0..d0a89877ad94 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -279,6 +279,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
extern void grok_partitions(kdev_t dev, long size);
extern int wipe_partitions(kdev_t dev);
extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
+extern void check_partition(struct gendisk *disk, struct block_device *bdev);
extern void generic_make_request(struct bio *bio);
extern inline request_queue_t *bdev_get_queue(struct block_device *bdev);
extern void blk_put_request(struct request *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad9cdf3e668..192107379fd9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -356,8 +356,8 @@ struct block_device {
struct block_device * bd_contains;
unsigned bd_block_size;
unsigned long bd_offset;
- struct semaphore bd_part_sem;
unsigned bd_part_count;
+ int bd_invalidated;
};
struct inode {
@@ -1132,6 +1132,7 @@ extern int fs_may_remount_ro(struct super_block *);
#define bio_data_dir(bio) ((bio)->bi_rw & 1)
extern int check_disk_change(struct block_device *);
+extern int full_check_disk_change(struct block_device *);
extern int __check_disk_change(dev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);