From 334fd34d76f237c0ee58dfc400d2c4e34d660544 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 29 Jun 2017 11:43:20 -0700 Subject: vfs: Add page_cache_seek_hole_data helper Both ext4 and xfs implement seeking for the next hole or piece of data in unwritten extents by scanning the page cache, and both versions share the same bug when iterating the buffers of a page: the start offset into the page isn't taken into account, so when a page fits more than two filesystem blocks, things will go wrong. For example, on a filesystem with a block size of 1k, the following command will fail: xfs_io -f -c "falloc 0 4k" \ -c "pwrite 1k 1k" \ -c "pwrite 3k 1k" \ -c "seek -a -r 0" foo In this example, neither lseek(fd, 1024, SEEK_HOLE) nor lseek(fd, 2048, SEEK_DATA) will return the correct result. Introduce a generic vfs helper for seeking in the page cache that gets this right. The next commits will replace the filesystem specific implementations. Signed-off-by: Andreas Gruenbacher [hch: dropped the export] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/buffer.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..b3674eb7c9c0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3492,6 +3492,130 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * + * Returns the offset within the file on success, and -ENOENT otherwise. + */ +static loff_t +page_seek_hole_data(struct page *page, loff_t lastoff, int whence) +{ + loff_t offset = page_offset(page); + struct buffer_head *bh, *head; + bool seek_data = whence == SEEK_DATA; + + if (lastoff < offset) + lastoff = offset; + + bh = head = page_buffers(page); + do { + offset += bh->b_size; + if (lastoff >= offset) + continue; + + /* + * Unwritten extents that have data in the page cache covering + * them can be identified by the BH_Unwritten state flag. + * Pages with multiple buffers might have a mix of holes, data + * and unwritten extents - any buffer with valid data in it + * should have BH_Uptodate flag set on it. + */ + + if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) + return lastoff; + + lastoff = offset; + } while ((bh = bh->b_this_page) != head); + return -ENOENT; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: unwritten and uptodate buffer heads count as data; + * everything else counts as a hole. + * + * Returns the resulting offset on successs, and -ENOENT otherwise. + */ +loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec, 0); + + do { + unsigned want, nr_pages, i; + + want = min_t(unsigned, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + * + * If current page offset is beyond where we've ended, + * we've found a hole. + */ + if (whence == SEEK_HOLE && + lastoff < page_offset(page)) + goto check_range; + + /* Searching done if the page index is out of range. */ + if (page->index >= end) + goto not_found; + + lock_page(page); + if (likely(page->mapping == inode->i_mapping) && + page_has_buffers(page)) { + lastoff = page_seek_hole_data(page, lastoff, whence); + if (lastoff >= 0) { + unlock_page(page); + goto check_range; + } + } + unlock_page(page); + lastoff = page_offset(page) + PAGE_SIZE; + } + + /* Searching done if fewer pages returned than wanted. */ + if (nr_pages < want) + break; + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + void __init buffer_init(void) { unsigned long nrpages; -- cgit v1.2.3 From 241f01fbeda2521f802eeef4de0261387e6e9c1d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 10 Jul 2017 15:47:29 -0700 Subject: fs/buffer.c: make bh_lru_install() more efficient To install a buffer_head into the cpu's LRU queue, bh_lru_install() would construct a new copy of the queue and then memcpy it over the real queue. But it's easily possible to do the update in-place, which is faster and simpler. Some work can also be skipped if the buffer_head was already in the queue. As a microbenchmark I timed how long it takes to run sb_getblk() 10,000,000 times alternating between BH_LRU_SIZE + 1 blocks. Effectively, this benchmarks looking up buffer_heads that are in the page cache but not in the LRU: Before this patch: 1.758s After this patch: 1.653s This patch also removes about 350 bytes of compiled code (on x86_64), partly due to removal of the memcpy() which was being inlined+unrolled. Link: http://lkml.kernel.org/r/20161229193445.1913-1-ebiggers3@gmail.com Signed-off-by: Eric Biggers Cc: Alexander Viro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index ea0e05ec2916..5715dac7821f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1281,44 +1281,31 @@ static inline void check_irqs_on(void) } /* - * The LRU management algorithm is dopey-but-simple. Sorry. + * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is + * inserted at the front, and the buffer_head at the back if any is evicted. + * Or, if already in the LRU it is moved to the front. */ static void bh_lru_install(struct buffer_head *bh) { - struct buffer_head *evictee = NULL; + struct buffer_head *evictee = bh; + struct bh_lru *b; + int i; check_irqs_on(); bh_lru_lock(); - if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { - struct buffer_head *bhs[BH_LRU_SIZE]; - int in; - int out = 0; - - get_bh(bh); - bhs[out++] = bh; - for (in = 0; in < BH_LRU_SIZE; in++) { - struct buffer_head *bh2 = - __this_cpu_read(bh_lrus.bhs[in]); - if (bh2 == bh) { - __brelse(bh2); - } else { - if (out >= BH_LRU_SIZE) { - BUG_ON(evictee != NULL); - evictee = bh2; - } else { - bhs[out++] = bh2; - } - } + b = this_cpu_ptr(&bh_lrus); + for (i = 0; i < BH_LRU_SIZE; i++) { + swap(evictee, b->bhs[i]); + if (evictee == bh) { + bh_lru_unlock(); + return; } - while (out < BH_LRU_SIZE) - bhs[out++] = NULL; - memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } - bh_lru_unlock(); - if (evictee) - __brelse(evictee); + get_bh(bh); + bh_lru_unlock(); + brelse(evictee); } /* -- cgit v1.2.3