diff options
| -rw-r--r-- | fs/direct-io.c | 335 |
1 files changed, 212 insertions, 123 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 35b8550d5334..3637657eb123 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -35,6 +35,7 @@ struct dio { struct inode *inode; int rw; unsigned blkbits; /* doesn't change */ + int pages_in_io; /* approximate total IO pages */ sector_t block_in_file; /* changes */ unsigned blocks_available; /* At block_in_file. changes */ sector_t final_block_in_request;/* doesn't change */ @@ -42,17 +43,31 @@ struct dio { int boundary; /* prev block is at a boundary */ int reap_counter; /* rate limit reaping */ get_blocks_t *get_blocks; /* block mapping function */ - sector_t last_block_in_bio; /* current final block in bio */ - sector_t next_block_in_bio; /* next block to be added to bio */ + sector_t final_block_in_bio; /* current final block in bio + 1 */ + sector_t next_block_for_io; /* next block to be put under IO */ struct buffer_head map_bh; /* last get_blocks() result */ - /* Page fetching state */ + /* + * Deferred addition of a page to the dio. These variables are + * private to dio_send_cur_page(), submit_page_section() and + * dio_bio_add_page(). + */ + struct page *cur_page; /* The page */ + unsigned cur_page_offset; /* Offset into it, in bytes */ + unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ + sector_t cur_page_block; /* Where it starts */ + + /* + * Page fetching state. These variables belong to dio_refill_pages(). + */ int curr_page; /* changes */ int total_pages; /* doesn't change */ - int pages_left; /* approximate total IO pages */ unsigned long curr_user_address;/* changes */ - /* Page queue */ + /* + * Page queue. These variables belong to dio_refill_pages() and + * dio_get_page(). 
+ */ struct page *pages[DIO_PAGES]; /* page buffer */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ @@ -318,74 +333,31 @@ static int dio_bio_reap(struct dio *dio) * * In the case of filesystem holes: the fs may return an arbitrarily-large * hole by returning an appropriate value in b_size and by clearing - * buffer_mapped(). This code _should_ handle that case correctly, but it has - * only been tested against single-block holes (b_size == blocksize). + * buffer_mapped(). However the direct-io code will only process holes one + * block at a time - it will repeatedly call get_blocks() as it walks the hole. */ static int get_more_blocks(struct dio *dio) { int ret; struct buffer_head *map_bh = &dio->map_bh; - if (dio->blocks_available) - return 0; - /* * If there was a memory error and we've overwritten all the * mapped blocks then we can now return that memory error */ - if (dio->page_errors) { - ret = dio->page_errors; - goto out; - } - - map_bh->b_state = 0; - map_bh->b_size = 0; - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - ret = (*dio->get_blocks)(dio->inode, dio->block_in_file, - dio->final_block_in_request - dio->block_in_file, - map_bh, dio->rw == WRITE); - if (ret) - goto out; - - if (buffer_mapped(map_bh)) { - BUG_ON(map_bh->b_size == 0); - BUG_ON((map_bh->b_size & ((1 << dio->blkbits) - 1)) != 0); - - dio->blocks_available = map_bh->b_size >> dio->blkbits; - - /* blockdevs do not set buffer_new */ - if (buffer_new(map_bh)) { - sector_t block = map_bh->b_blocknr; - unsigned i; - - for (i = 0; i < dio->blocks_available; i++) - unmap_underlying_metadata(map_bh->b_bdev, - block++); - } - } else { - BUG_ON(dio->rw != READ); - if (dio->bio) - dio_bio_submit(dio); + ret = dio->page_errors; + if (ret == 0) { + map_bh->b_state = 0; + map_bh->b_size = 0; + BUG_ON(dio->block_in_file >= dio->final_block_in_request); + ret = (*dio->get_blocks)(dio->inode, dio->block_in_file, + 
dio->final_block_in_request-dio->block_in_file, + map_bh, dio->rw == WRITE); } - dio->next_block_in_bio = map_bh->b_blocknr; -out: return ret; } /* - * Check to see if we can continue to grow the BIO. If not, then send it. - */ -static void dio_prep_bio(struct dio *dio) -{ - if (dio->bio == NULL) - return; - - if (dio->boundary || - dio->last_block_in_bio != dio->next_block_in_bio - 1) - dio_bio_submit(dio); -} - -/* * There is no bio. Make one now. */ static int dio_new_bio(struct dio *dio, sector_t blkno) @@ -397,7 +369,7 @@ static int dio_new_bio(struct dio *dio, sector_t blkno) if (ret) goto out; sector = blkno << (dio->blkbits - 9); - nr_pages = min(dio->pages_left, bio_get_nr_vecs(dio->map_bh.b_bdev)); + nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); BUG_ON(nr_pages <= 0); ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); dio->boundary = 0; @@ -405,37 +377,156 @@ out: return ret; } +/* + * Attempt to put the current chunk of 'cur_page' into the current BIO. If + * that was successful then update final_block_in_bio and take a ref against + * the just-added page. + */ +static int dio_bio_add_page(struct dio *dio) +{ + int ret; -static int -dio_bio_add_page(struct dio *dio, struct page *page, - unsigned int bv_len, unsigned int bv_offset, sector_t blkno) + ret = bio_add_page(dio->bio, dio->cur_page, + dio->cur_page_len, dio->cur_page_offset); + if (ret == dio->cur_page_len) { + dio->pages_in_io--; + page_cache_get(dio->cur_page); + dio->final_block_in_bio = dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits); + ret = 0; + } + return ret; +} + +/* + * Put cur_page under IO. The section of cur_page which is described by + * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page + * starts on-disk at cur_page_block. + * + * We take a ref against the page here (on behalf of its presence in the bio). 
+ * + * The caller of this function is responsible for removing cur_page from the + * dio, and for dropping the refcount which came from that presence. + */ +static int dio_send_cur_page(struct dio *dio) { int ret = 0; - if (bv_len == 0) - goto out; + if (dio->bio) { + /* + * See whether this new request is contiguous with the old + */ + if (dio->final_block_in_bio != dio->cur_page_block) + dio_bio_submit(dio); + /* + * Submit now if the underlying fs is about to perform a + * metadata read + */ + if (dio->boundary) + dio_bio_submit(dio); + } + + if (dio->bio == NULL) { + ret = dio_new_bio(dio, dio->cur_page_block); + if (ret) + goto out; + } - /* Take a ref against the page each time it is placed into a BIO */ - page_cache_get(page); - if (bio_add_page(dio->bio, page, bv_len, bv_offset) < bv_len) { + if (dio_bio_add_page(dio) != 0) { dio_bio_submit(dio); - ret = dio_new_bio(dio, blkno); + ret = dio_new_bio(dio, dio->cur_page_block); if (ret == 0) { - ret = bio_add_page(dio->bio, page, bv_len, bv_offset); - BUG_ON(ret < bv_len); - } else { - /* The page didn't make it into a BIO */ - page_cache_release(page); + ret = dio_bio_add_page(dio); + BUG_ON(ret != 0); } } - dio->pages_left--; out: return ret; } +/* + * An autonomous function to put a chunk of a page under deferred IO. + * + * The caller doesn't actually know (or care) whether this piece of page is in + * a BIO, or is under IO or whatever. We just take care of all possible + * situations here. The separation between the logic of do_direct_IO() and + * that of submit_page_section() is important for clarity. Please don't break. + * + * The chunk of page starts on-disk at blocknr. + * + * We perform deferred IO, by recording the last-submitted page inside our + * private part of the dio structure. If possible, we just expand the IO + * across that page here. + * + * If that doesn't work out then we put the old page into the bio and add this + * page to the dio instead. 
+ */ +static int +submit_page_section(struct dio *dio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr) +{ + int ret = 0; + + /* + * Can we just grow the current page's presence in the dio? + */ + if ( (dio->cur_page == page) && + (dio->cur_page_offset + dio->cur_page_len == offset) && + (dio->cur_page_block + + (dio->cur_page_len >> dio->blkbits) == blocknr)) { + dio->cur_page_len += len; + + /* + * If dio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (dio->boundary) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } + goto out; + } + + /* + * If there's a deferred page already there then send it. + */ + if (dio->cur_page) { + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + if (ret) + goto out; + } + + page_cache_get(page); /* It is in dio */ + dio->cur_page = page; + dio->cur_page_offset = offset; + dio->cur_page_len = len; + dio->cur_page_block = blocknr; +out: + return ret; +} /* - * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs. + * Clean any dirty buffers in the blockdev mapping which alias newly-created + * file blocks. Only called for S_ISREG files - blockdevs do not set + * buffer_new + */ +static void clean_blockdev_aliases(struct dio *dio) +{ + unsigned i; + + for (i = 0; i < dio->blocks_available; i++) { + unmap_underlying_metadata(dio->map_bh.b_bdev, + dio->map_bh.b_blocknr + i); + } +} + +/* + * Walk the user pages, and the file, mapping blocks to disk and generating + * a sequence of (page,offset,len,block) mappings. These mappings are injected + * into submit_page_section(), which takes care of the next stage of submission * * Direct IO against a blockdev is different from a file. Because we can * happily perform page-sized but 512-byte aligned IOs. It is important that @@ -448,73 +539,65 @@ out: * it should set b_size to PAGE_SIZE or more inside get_blocks(). 
This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio) { const unsigned blkbits = dio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; + struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ block_in_page = dio->first_block_in_page; while (dio->block_in_file < dio->final_block_in_request) { - int new_page; /* Need to insert this page into the BIO? */ - unsigned int bv_offset; - unsigned int bv_len; - sector_t curr_blkno; - page = dio_get_page(dio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } - new_page = 1; - bv_offset = 0; - bv_len = 0; - curr_blkno = 0; while (block_in_page < blocks_per_page) { + unsigned offset_in_page = block_in_page << blkbits; unsigned this_chunk_bytes; /* # of bytes mapped */ unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - ret = get_more_blocks(dio); - if (ret) - goto fail_release; + if (dio->blocks_available == 0) { + ret = get_more_blocks(dio); + if (ret) { + page_cache_release(page); + goto out; + } + if (buffer_mapped(map_bh)) { + dio->blocks_available = + map_bh->b_size >> dio->blkbits; + dio->next_block_for_io = + map_bh->b_blocknr; + if (buffer_new(map_bh)) + clean_blockdev_aliases(dio); + } + } /* Handle holes */ - if (!buffer_mapped(&dio->map_bh)) { + if (!buffer_mapped(map_bh)) { char *kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + (block_in_page << blkbits), 0, 1 << blkbits); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); dio->block_in_file++; - dio->next_block_in_bio++; block_in_page++; goto next_block; } - dio_prep_bio(dio); - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->next_block_in_bio); - if (ret) - goto fail_release; - new_page = 1; - } - - if (new_page) { - bv_len = 0; - bv_offset = block_in_page << blkbits; - curr_blkno = 
dio->next_block_in_bio; - new_page = 0; - } - - /* Work out how much disk we can add to this page */ + /* + * Work out, in this_chunk_blocks, how much disk we + * can add to this page + */ this_chunk_blocks = dio->blocks_available; - u = (PAGE_SIZE - (bv_len + bv_offset)) >> blkbits; + u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = dio->final_block_in_request - dio->block_in_file; @@ -523,10 +606,15 @@ int do_direct_IO(struct dio *dio) this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - bv_len += this_chunk_bytes; - dio->next_block_in_bio += this_chunk_blocks; - dio->last_block_in_bio = dio->next_block_in_bio - 1; - dio->boundary = buffer_boundary(&dio->map_bh); + dio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, page, offset_in_page, + this_chunk_bytes, dio->next_block_for_io); + if (ret) { + page_cache_release(page); + goto out; + } + dio->next_block_for_io += this_chunk_blocks; + dio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; dio->blocks_available -= this_chunk_blocks; @@ -536,23 +624,16 @@ next_block: if (dio->block_in_file == dio->final_block_in_request) break; } - ret = dio_bio_add_page(dio, page, bv_len, - bv_offset, curr_blkno); - if (ret) - goto fail_release; /* Drop the ref which was taken in get_user_pages() */ page_cache_release(page); block_in_page = 0; } - goto out; -fail_release: - page_cache_release(page); out: return ret; } -int +static int direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks) { @@ -569,11 +650,13 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, dio.block_in_file = offset >> blkbits; dio.blocks_available = 0; + dio.cur_page = NULL; + dio.boundary = 0; dio.reap_counter = 0; dio.get_blocks = get_blocks; - dio.last_block_in_bio = -1; - dio.next_block_in_bio = -1; + dio.final_block_in_bio = -1; + 
dio.next_block_for_io = -1; dio.page_errors = 0; @@ -582,10 +665,10 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, spin_lock_init(&dio.bio_list_lock); dio.bio_list = NULL; dio.waiter = NULL; - dio.pages_left = 0; + dio.pages_in_io = 0; for (seg = 0; seg < nr_segs; seg++) - dio.pages_left += (iov[seg].iov_len / PAGE_SIZE) + 2; + dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2; for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; @@ -619,6 +702,12 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, } /* end iovec loop */ + if (dio.cur_page) { + ret2 = dio_send_cur_page(&dio); + page_cache_release(dio.cur_page); + if (ret == 0) + ret = ret2; + } ret2 = dio_await_completion(&dio); if (ret == 0) ret = ret2; |
