diff options
| author | Andrew Morton <akpm@digeo.com> | 2002-10-28 16:22:23 -0800 |
|---|---|---|
| committer | Jens Axboe <axboe@suse.de> | 2002-10-28 16:22:23 -0800 |
| commit | 4a4c6811f4fb8aa7f59fbb04c678e48d080e1071 (patch) | |
| tree | dba6ecf5bd85ac2c0100be691b45de6ce5490c8a | |
| parent | a9577554546f46b9c53773e7dbfb295a2e968799 (diff) | |
[PATCH] permit direct IO with finer-than-fs-blocksize alignments
Mainly from Badari Pulavarty
Traditionally we have only supported O_DIRECT I/O at an alignment and
granularity which matches the underlying filesystem. That typically
means that all IO must be 4k-aligned and a multiple of 4k in size.
Here, we relax that so that direct I/O happens with (typically)
512-byte alignment and multiple-of-512-byte size.
The tricky part is when a write starts and/or ends partway through a
filesystem block which has just been added. We need to zero out the
parts of that block which lie outside the written region.
We handle that by putting appropriately-sized parts of the ZERO_PAGE
into separate BIOs.
The generic_direct_IO() function has been changed so that the
filesystem must pass in the address of the block_device against which
the IO is to be performed. I'd have preferred to not do this, but we
do need that info at that time so that alignment checks can be
performed.
If the filesystem passes in a NULL block_device pointer then we fall
back to the old behaviour - must align with the fs blocksize.
There is no trivial way for userspace to know what the minimum
alignment is - it depends on what bdev_hardsect_size() says about the
device. It is _usually_ 512 bytes, but not always. This introduces
the risk that someone will develop and test applications which work
fine on their hardware, but will fail on someone else's hardware.
It is possible to query the hardsect size using the BLKSSZGET ioctl
against the backing block device. This can be performed at runtime or
at application installation time.
| -rw-r--r-- | fs/block_dev.c | 2 | ||||
| -rw-r--r-- | fs/direct-io.c | 180 | ||||
| -rw-r--r-- | fs/ext2/inode.c | 2 | ||||
| -rw-r--r-- | fs/ext3/inode.c | 2 | ||||
| -rw-r--r-- | fs/jfs/inode.c | 2 | ||||
| -rw-r--r-- | fs/xfs/linux/xfs_aops.c | 4 | ||||
| -rw-r--r-- | include/linux/fs.h | 4 |
7 files changed, 166 insertions, 30 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index b48d3cfe82e6..2afc62c188f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -120,7 +120,7 @@ blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov, { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; - return generic_direct_IO(rw, inode, iov, offset, + return generic_direct_IO(rw, inode, inode->i_bdev, iov, offset, nr_segs, blkdev_get_blocks); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 3637657eb123..04c7143afccb 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -29,14 +29,35 @@ */ #define DIO_PAGES 64 +/* + * This code generally works in units of "dio_blocks". A dio_block is + * somewhere between the hard sector size and the filesystem block size. it + * is determined on a per-invokation basis. When talking to the filesystem + * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity + * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted + * to bio_block quantities by shifting left by blkfactor. + * + * If blkfactor is zero then the user's request was aligned to the filesystem's + * blocksize. + */ + struct dio { /* BIO submission state */ struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; unsigned blkbits; /* doesn't change */ + unsigned blkfactor; /* When we're using an aligment which + is finer than the filesystem's soft + blocksize, this specifies how much + finer. blkfactor=2 means 1/4-block + alignment. Does not change */ + unsigned start_zero_done; /* flag: sub-blocksize zeroing has + been performed at the start of a + write */ int pages_in_io; /* approximate total IO pages */ - sector_t block_in_file; /* changes */ + sector_t block_in_file; /* Current offset into the underlying + file in dio_block units. */ unsigned blocks_available; /* At block_in_file. 
changes */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ @@ -44,7 +65,8 @@ struct dio { int reap_counter; /* rate limit reaping */ get_blocks_t *get_blocks; /* block mapping function */ sector_t final_block_in_bio; /* current final block in bio + 1 */ - sector_t next_block_for_io; /* next block to be put under IO */ + sector_t next_block_for_io; /* next block to be put under IO, + in dio_blocks units */ struct buffer_head map_bh; /* last get_blocks() result */ /* @@ -340,6 +362,10 @@ static int get_more_blocks(struct dio *dio) { int ret; struct buffer_head *map_bh = &dio->map_bh; + sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + unsigned long fs_count; /* Number of filesystem-sized blocks */ + unsigned long dio_count;/* Number of dio_block-sized blocks */ + unsigned long blkmask; /* * If there was a memory error and we've overwritten all the @@ -350,8 +376,14 @@ static int get_more_blocks(struct dio *dio) map_bh->b_state = 0; map_bh->b_size = 0; BUG_ON(dio->block_in_file >= dio->final_block_in_request); - ret = (*dio->get_blocks)(dio->inode, dio->block_in_file, - dio->final_block_in_request-dio->block_in_file, + fs_startblk = dio->block_in_file >> dio->blkfactor; + dio_count = dio->final_block_in_request - dio->block_in_file; + fs_count = dio_count >> dio->blkfactor; + blkmask = (1 << dio->blkfactor) - 1; + if (dio_count & blkmask) + fs_count++; + + ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, map_bh, dio->rw == WRITE); } return ret; @@ -524,6 +556,49 @@ static void clean_blockdev_aliases(struct dio *dio) } /* + * If we are not writing the entire block and get_block() allocated + * the block for us, we need to fill-in the unused portion of the + * block with zeros. This happens only if user-buffer, fileoffset or + * io length is not filesystem block-size multiple. + * + * `end' is zero if we're doing the start of the IO, 1 at the end of the + * IO. 
+ */ +static void dio_zero_block(struct dio *dio, int end) +{ + unsigned dio_blocks_per_fs_block; + unsigned this_chunk_blocks; /* In dio_blocks */ + unsigned this_chunk_bytes; + struct page *page; + + dio->start_zero_done = 1; + if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + return; + + dio_blocks_per_fs_block = 1 << dio->blkfactor; + this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + + if (!this_chunk_blocks) + return; + + /* + * We need to zero out part of an fs block. It is either at the + * beginning or the end of the fs block. + */ + if (end) + this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; + + this_chunk_bytes = this_chunk_blocks << dio->blkbits; + + page = ZERO_PAGE(dio->cur_user_address); + if (submit_page_section(dio, page, 0, this_chunk_bytes, + dio->next_block_for_io)) + return; + + dio->next_block_for_io += this_chunk_blocks; +} + +/* * Walk the user pages, and the file, mapping blocks to disk and generating * a sequence of (page,offset,len,block) mappings. 
These mappings are injected * into submit_page_section(), which takes care of the next stage of submission @@ -565,21 +640,49 @@ static int do_direct_IO(struct dio *dio) unsigned u; if (dio->blocks_available == 0) { + /* + * Need to go and map some more disk + */ + unsigned long blkmask; + unsigned long dio_remainder; + ret = get_more_blocks(dio); if (ret) { page_cache_release(page); goto out; } - if (buffer_mapped(map_bh)) { - dio->blocks_available = + if (!buffer_mapped(map_bh)) + goto do_holes; + + dio->blocks_available = map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); - } + dio->next_block_for_io = + map_bh->b_blocknr << dio->blkfactor; + if (buffer_new(map_bh)) + clean_blockdev_aliases(dio); + + if (!dio->blkfactor) + goto do_holes; + + blkmask = (1 << dio->blkfactor) - 1; + dio_remainder = (dio->block_in_file & blkmask); + + /* + * If we are at the start of IO and that IO + * starts partway into a fs-block, + * dio_remainder will be non-zero. If the IO + * is a read then we can simply advance the IO + * cursor to the first block which is to be + * read. But if the IO is a write and the + * block was newly allocated we cannot do that; + * the start of the fs block must be zeroed out + * on-disk + */ + if (!buffer_new(map_bh)) + dio->next_block_for_io += dio_remainder; + dio->blocks_available -= dio_remainder; } - +do_holes: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr = kmap_atomic(page, KM_USER0); @@ -593,6 +696,14 @@ static int do_direct_IO(struct dio *dio) } /* + * If we're performing IO which has an alignment which + * is finer than the underlying fs, go check to see if + * we must zero out the start of this block. 
+ */ + if (unlikely(dio->blkfactor && !dio->start_zero_done)) + dio_zero_block(dio, 0); + + /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ @@ -635,9 +746,9 @@ out: static int direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks) + loff_t offset, unsigned long nr_segs, unsigned blkbits, + get_blocks_t get_blocks) { - const unsigned blkbits = inode->i_blkbits; unsigned long user_addr; int seg, ret2, ret = 0; struct dio dio; @@ -647,6 +758,8 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, dio.inode = inode; dio.rw = rw; dio.blkbits = blkbits; + dio.blkfactor = inode->i_blkbits - blkbits; + dio.start_zero_done = 0; dio.block_in_file = offset >> blkbits; dio.blocks_available = 0; @@ -702,6 +815,12 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, } /* end iovec loop */ + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. + */ + dio_zero_block(&dio, 1); + if (dio.cur_page) { ret2 = dio_send_cur_page(&dio); page_cache_release(dio.cur_page); @@ -723,27 +842,44 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, * This is a library function for use by filesystem drivers. 
*/ int -generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks) +generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + get_blocks_t get_blocks) { int seg; size_t size; unsigned long addr; - unsigned blocksize_mask = (1 << inode->i_blkbits) - 1; + unsigned blkbits = inode->i_blkbits; + unsigned bdev_blkbits = 0; + unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - if (offset & blocksize_mask) - goto out; + if (bdev) + bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); + + if (offset & blocksize_mask) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if (offset & blocksize_mask) + goto out; + } /* Check the memory alignment. Blocks cannot straddle pages */ for (seg = 0; seg < nr_segs; seg++) { addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; + if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (bdev) + blkbits = bdev_blkbits; + blocksize_mask = (1 << blkbits) - 1; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } } - retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks); + retval = direct_io_worker(rw, inode, iov, offset, + nr_segs, blkbits, get_blocks); out: return retval; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6cc6052871ad..aa998d7d1527 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -624,7 +624,7 @@ ext2_direct_IO(int rw, struct file *file, const struct iovec *iov, { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; - return generic_direct_IO(rw, inode, iov, + return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext2_get_blocks); } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d6a0d9dfcd71..64332feab0a4 100644 --- a/fs/ext3/inode.c +++ 
b/fs/ext3/inode.c @@ -1431,7 +1431,7 @@ static int ext3_direct_IO(int rw, struct file *file, } } - ret = generic_direct_IO(rw, inode, iov, offset, + ret = generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext3_direct_io_get_blocks); out_stop: diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 64f1103c0012..fe9560e59e87 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -315,7 +315,7 @@ static int jfs_direct_IO(int rw, struct file *file, const struct iovec *iov, { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; - return generic_direct_IO(rw, inode, iov, + return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, jfs_get_blocks); } diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c index 8364f6c3eb41..e4ecd6729214 100644 --- a/fs/xfs/linux/xfs_aops.c +++ b/fs/xfs/linux/xfs_aops.c @@ -607,8 +607,8 @@ linvfs_direct_IO( { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; - return generic_direct_IO(rw, inode, iov, offset, nr_segs, - linvfs_get_blocks_direct); + return generic_direct_IO(rw, inode, NULL, + iov, offset, nr_segs, linvfs_get_blocks_direct); } diff --git a/include/linux/fs.h b/include/linux/fs.h index d6d1e30c01dd..862d767fe310 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1252,8 +1252,8 @@ extern void do_generic_mapping_read(struct address_space *, struct file_ra_state loff_t *, read_descriptor_t *, read_actor_t); extern ssize_t generic_file_direct_IO(int rw, struct file *file, const struct iovec *iov, loff_t offset, unsigned long nr_segs); -extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec - *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks); +extern int generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long 
nr_segs, loff_t *ppos); ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, |
