Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 167
1 files changed, 129 insertions, 38 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c953297aa89a..c123a3ef154a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -101,6 +101,26 @@ struct btrfs_bio_ctrl {
 	enum btrfs_compression_type compress_type;
 	u32 len_to_oe_boundary;
 	blk_opf_t opf;
+	/*
+	 * For data read bios, we attempt to optimize csum lookups if the extent
+	 * generation is older than the current one. To make this possible, we
+	 * need to track the maximum generation of an extent in a bio_ctrl to
+	 * make the decision when submitting the bio.
+	 *
+	 * The pattern between do_readpage(), submit_one_bio() and
+	 * submit_extent_folio() is quite subtle, so tracking this is tricky.
+	 *
+	 * As we process extent E, we might submit a bio with existing built up
+	 * extents before adding E to a new bio, or we might just add E to the
+	 * bio. As a result, E's generation could apply to the current bio or
+	 * to the next one, so we need to be careful to update the bio_ctrl's
+	 * generation with E's only when we are sure E is added to bio_ctrl->bbio
+	 * in submit_extent_folio().
+	 *
+	 * See the comment in btrfs_lookup_bio_sums() for more detail on the
+	 * need for this optimization.
+	 */
+	u64 generation;
 	btrfs_bio_end_io_t end_io_func;
 	struct writeback_control *wbc;
@@ -111,8 +131,46 @@ struct btrfs_bio_ctrl {
 	 */
 	unsigned long submit_bitmap;
 	struct readahead_control *ractl;
+
+	/*
+	 * The start offset of the last used extent map by a read operation.
+	 *
+	 * This is for proper compressed read merge.
+	 * U64_MAX means we are starting the read and have made no progress yet.
+	 *
+	 * The current btrfs_bio_is_contig() only uses disk_bytenr as
+	 * the condition to check if the read can be merged with previous
+	 * bio, which is not correct. E.g. two file extents pointing to the
+	 * same extent but with different offset.
+	 *
+	 * So here we need to do extra checks to only merge reads that are
+	 * covered by the same extent map.
+	 * Just extent_map::start will be enough, as they are unique
+	 * inside the same inode.
+	 */
+	u64 last_em_start;
 };
 
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+	struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+	ASSERT(bbio);
+
+	if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+		return;
+
+	bio_ctrl->bbio->csum_search_commit_root =
+		(bio_ctrl->generation &&
+		 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct btrfs_bio *bbio = bio_ctrl->bbio;
@@ -123,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 	/* Caller should ensure the bio has at least some range added */
 	ASSERT(bbio->bio.bi_iter.bi_size);
 
+	bio_set_csum_search_commit_root(bio_ctrl);
+
 	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
 	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
 		btrfs_submit_compressed_read(bbio);
@@ -131,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 	/* The bbio is owned by the end_io handler now */
 	bio_ctrl->bbio = NULL;
+
+	/*
+	 * We used the generation to decide whether to lookup csums in the
+	 * commit_root or not when we called bio_set_csum_search_commit_root()
+	 * above. Now, reset the generation for the next bio.
+	 */
+	bio_ctrl->generation = 0;
 }
 
 /*
@@ -327,6 +393,13 @@ again:
 	/* step one, find a bunch of delalloc bytes starting at start */
 	delalloc_start = *start;
 	delalloc_end = 0;
+
+	/*
+	 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+	 * return early without handling any dirty ranges.
+	 */
+	ASSERT(max_bytes >= fs_info->sectorsize);
+
 	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
 					  max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -352,18 +425,19 @@ again:
 	if (delalloc_end + 1 - delalloc_start > max_bytes)
 		delalloc_end = delalloc_start + max_bytes - 1;
 
-	/* step two, lock all the folioss after the folios that has start */
+	/* step two, lock all the folios after the folios that has start */
 	ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
 				   delalloc_end);
 	ASSERT(!ret || ret == -EAGAIN);
 	if (ret == -EAGAIN) {
-		/* some of the folios are gone, lets avoid looping by
-		 * shortening the size of the delalloc range we're searching
+		/*
+		 * Some of the folios are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching.
 		 */
 		btrfs_free_extent_state(cached_state);
 		cached_state = NULL;
 		if (!loops) {
-			max_bytes = PAGE_SIZE;
+			max_bytes = fs_info->sectorsize;
			loops = 1;
 			goto again;
 		} else {
@@ -552,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
  * Populate every free slot in a provided array with folios using GFP_NOFS.
  *
  * @nr_folios:   number of folios to allocate
+ * @order:       the order of the folios to be allocated
 * @folio_array: the array to fill with folios; any existing non-NULL entries in
  *               the array will be skipped
  *
@@ -559,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
  * -ENOMEM otherwise, the partially allocated folios would be freed and
  *         the array slots zeroed
  */
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+			    struct folio **folio_array)
 {
 	for (int i = 0; i < nr_folios; i++) {
 		if (folio_array[i])
 			continue;
-		folio_array[i] = folio_alloc(GFP_NOFS, 0);
+		folio_array[i] = folio_alloc(GFP_NOFS, order);
 		if (!folio_array[i])
 			goto error;
 	}
@@ -573,6 +649,7 @@ error:
 	for (int i = 0; i < nr_folios; i++) {
 		if (folio_array[i])
 			folio_put(folio_array[i]);
+		folio_array[i] = NULL;
 	}
 	return -ENOMEM;
 }
@@ -701,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode,
 * @size:	portion of page that we want to write to
 * @pg_offset:	offset of the new bio or to check whether we are adding
 *		a contiguous page to the previous one
+ * @read_em_generation:	generation of the extent_map we are submitting
+ *			(only used for read)
 *
 * The will either add the page into the existing @bio_ctrl->bbio, or allocate a
 * new one in @bio_ctrl->bbio.
- * The mirror number for this IO should already be initizlied in
+ * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
 static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 				u64 disk_bytenr, struct folio *folio,
-				size_t size, unsigned long pg_offset)
+				size_t size, unsigned long pg_offset,
+				u64 read_em_generation)
 {
 	struct btrfs_inode *inode = folio_to_inode(folio);
 	loff_t file_offset = folio_pos(folio) + pg_offset;
@@ -740,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
 			submit_one_bio(bio_ctrl);
 			continue;
 		}
+		/*
+		 * Now that the folio is definitely added to the bio, include its
+		 * generation in the max generation calculation.
+		 */
+		bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
 		bio_ctrl->next_file_offset += len;
 
 		if (bio_ctrl->wbc)
@@ -909,7 +994,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
 * return 0 on success, otherwise return error
 */
 static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
-			     struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+			     struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct inode *inode = folio->mapping->host;
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -942,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		bool force_bio_submit = false;
 		u64 disk_bytenr;
 		u64 block_start;
+		u64 em_gen;
 
 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
 		if (cur >= last_byte) {
@@ -1019,13 +1105,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		 * non-optimal behavior (submitting 2 bios for the same extent).
 		 */
 		if (compress_type != BTRFS_COMPRESS_NONE &&
-		    prev_em_start && *prev_em_start != (u64)-1 &&
-		    *prev_em_start != em->start)
+		    bio_ctrl->last_em_start != U64_MAX &&
+		    bio_ctrl->last_em_start != em->start)
 			force_bio_submit = true;
 
-		if (prev_em_start)
-			*prev_em_start = em->start;
+		bio_ctrl->last_em_start = em->start;
+		em_gen = em->generation;
 
 		btrfs_free_extent_map(em);
 		em = NULL;
@@ -1049,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 		if (force_bio_submit)
 			submit_one_bio(bio_ctrl);
 		submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
-				    pg_offset);
+				    pg_offset, em_gen);
 	}
 	return 0;
 }
@@ -1238,12 +1324,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
 	const u64 start = folio_pos(folio);
 	const u64 end = start + folio_size(folio) - 1;
 	struct extent_state *cached_state = NULL;
-	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+	struct btrfs_bio_ctrl bio_ctrl = {
+		.opf = REQ_OP_READ,
+		.last_em_start = U64_MAX,
+	};
 	struct extent_map *em_cached = NULL;
 	int ret;
 
 	lock_extents_for_read(inode, start, end, &cached_state);
-	ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+	ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
 	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
 
 	btrfs_free_extent_map(em_cached);
@@ -1580,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
 	ASSERT(folio_test_writeback(folio));
 
 	submit_extent_folio(bio_ctrl, disk_bytenr, folio,
-			    sectorsize, filepos - folio_pos(folio));
+			    sectorsize, filepos - folio_pos(folio), 0);
 	return 0;
 }
 
@@ -1601,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	unsigned long range_bitmap = 0;
 	bool submitted_io = false;
-	bool error = false;
+	int found_error = 0;
 	const u64 folio_start = folio_pos(folio);
 	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
 	u64 cur;
@@ -1665,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 			 */
 			btrfs_mark_ordered_io_finished(inode, folio, cur,
 						       fs_info->sectorsize, false);
-			error = true;
+			if (!found_error)
+				found_error = ret;
 			continue;
 		}
 		submitted_io = true;
@@ -1682,11 +1772,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 	 * If we hit any error, the corresponding sector will have its dirty
 	 * flag cleared and writeback finished, thus no need to handle the error case.
 	 */
-	if (!submitted_io && !error) {
+	if (!submitted_io && !found_error) {
 		btrfs_folio_set_writeback(fs_info, folio, start, len);
 		btrfs_folio_clear_writeback(fs_info, folio, start, len);
 	}
-	return ret;
+	return found_error;
 }
 
 /*
@@ -2147,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 * @fs_info:  The fs_info for this file system.
 * @start:    The offset of the range to start waiting on writeback.
 * @end:      The end of the range, inclusive. This is meant to be used in
- *            conjuction with wait_marked_extents, so this will usually be
+ *            conjunction with wait_marked_extents, so this will usually be
 *            the_next_eb->start - 1.
 */
 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
@@ -2417,7 +2507,7 @@ retry:
 			 * In above case, [32K, 96K) is asynchronously submitted
 			 * for compression, and [124K, 128K) needs to be written back.
 			 *
-			 * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+			 * If we didn't wait writeback for page 64K, [128K, 128K)
 			 * won't be submitted as the page still has writeback flag
 			 * and will be skipped in the next check.
 			 *
@@ -2583,7 +2673,8 @@ void btrfs_readahead(struct readahead_control *rac)
 {
 	struct btrfs_bio_ctrl bio_ctrl = {
 		.opf = REQ_OP_READ | REQ_RAHEAD,
-		.ractl = rac
+		.ractl = rac,
+		.last_em_start = U64_MAX,
 	};
 	struct folio *folio;
 	struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2591,12 +2682,11 @@ void btrfs_readahead(struct readahead_control *rac)
 	const u64 end = start + readahead_length(rac) - 1;
 	struct extent_state *cached_state = NULL;
 	struct extent_map *em_cached = NULL;
-	u64 prev_em_start = (u64)-1;
 
 	lock_extents_for_read(inode, start, end, &cached_state);
 
 	while ((folio = readahead_folio(rac)) != NULL)
-		btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+		btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
 
 	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
 
@@ -2901,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
 {
 	const int num_folios = num_extent_folios(eb);
 
-	/* We canont use num_extent_folios() as loop bound as eb->folios changes. */
+	/* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
 	for (int i = 0; i < num_folios; i++) {
 		ASSERT(eb->folios[i]);
 		detach_extent_buffer_folio(eb, eb->folios[i]);
@@ -3148,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
 */
 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
 {
-	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+	const u32 nodesize = fs_info->nodesize;
+
+	if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
 		btrfs_err(fs_info, "bad tree block start %llu", start);
 		return true;
 	}
 
-	if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) {
+	if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
 		btrfs_err(fs_info,
 			  "tree block is not nodesize aligned, start %llu nodesize %u",
-			  start, fs_info->nodesize);
+			  start, nodesize);
 		return true;
 	}
-	if (fs_info->nodesize >= PAGE_SIZE &&
-	    !PAGE_ALIGNED(start)) {
+	if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
 		btrfs_err(fs_info,
 			  "tree block is not page aligned, start %llu nodesize %u",
-			  start, fs_info->nodesize);
+			  start, nodesize);
 		return true;
 	}
-	if (!IS_ALIGNED(start, fs_info->nodesize) &&
-	    !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+	if (unlikely(!IS_ALIGNED(start, nodesize) &&
+		     !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
 		btrfs_warn(fs_info,
 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
-			   start, fs_info->nodesize);
+			   start, nodesize);
 	}
 	return false;
 }
@@ -3789,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
 		return ret;
 
 	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+	if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
 		return -EIO;
 	return 0;
 }
@@ -4465,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
 	if (IS_ERR(eb))
 		return;
 
-	if (btrfs_buffer_uptodate(eb, gen, 1)) {
+	if (btrfs_buffer_uptodate(eb, gen, true)) {
 		free_extent_buffer(eb);
 		return;
 	}
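To illustrate the ordering constraint described in the new bio_ctrl comment (fold an extent's generation into the bio only once the extent is definitely part of bio_ctrl->bbio, and reset the generation when the bio is submitted), here is a minimal userspace sketch. It is not kernel code: the toy_bio_ctrl, toy_submit() and toy_add_extent() names are made up for illustration, and fs_generation stands in for btrfs_get_fs_generation().

/* Minimal userspace model of the bio_ctrl generation tracking pattern. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t fs_generation = 100;	/* stands in for btrfs_get_fs_generation() */

struct toy_bio_ctrl {
	bool bio_open;		/* stands in for bio_ctrl->bbio != NULL */
	uint64_t generation;	/* max extent generation added to the open bio */
};

/* Submit the built-up bio: decide on commit-root csum lookup, then reset. */
static void toy_submit(struct toy_bio_ctrl *ctrl)
{
	if (!ctrl->bio_open)
		return;
	bool commit_root = ctrl->generation && ctrl->generation < fs_generation;
	printf("submit bio: max generation %llu, csum_search_commit_root=%d\n",
	       (unsigned long long)ctrl->generation, commit_root);
	ctrl->bio_open = false;
	ctrl->generation = 0;	/* reset for the next bio, as submit_one_bio() does */
}

/*
 * Add one extent to the bio.  If it cannot be merged, the current bio is
 * submitted first, so the extent's generation must be folded in only *after*
 * the extent is definitely part of the (possibly new) bio.
 */
static void toy_add_extent(struct toy_bio_ctrl *ctrl, uint64_t em_gen, bool mergeable)
{
	if (ctrl->bio_open && !mergeable)
		toy_submit(ctrl);		/* em_gen must not leak into this bio */
	ctrl->bio_open = true;
	if (em_gen > ctrl->generation)		/* max(), as submit_extent_folio() does */
		ctrl->generation = em_gen;
}

int main(void)
{
	struct toy_bio_ctrl ctrl = { 0 };

	toy_add_extent(&ctrl, 90, true);	/* old extent */
	toy_add_extent(&ctrl, 95, true);	/* old extent, merged into same bio */
	toy_add_extent(&ctrl, 100, false);	/* current generation, forces a new bio */
	toy_submit(&ctrl);
	return 0;
}

In this model the first bio is submitted with generation 95 and can use the commit root for csum lookup, while the second bio carries the current generation and cannot.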
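The bio_ctrl->last_em_start comment explains why a disk_bytenr-only contiguity check is not enough for compressed reads. The following standalone sketch models that check under assumed toy types (toy_em and can_merge_compressed() are hypothetical names, not btrfs API); it only demonstrates the decision, not the actual bio plumbing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_U64_MAX UINT64_MAX

/* Toy stand-in for the extent_map fields that matter here. */
struct toy_em {
	uint64_t start;		/* logical start, unique within an inode */
	uint64_t disk_bytenr;	/* on-disk location of the (compressed) extent */
};

/*
 * Decide whether a compressed read of @em may share the bio built for the
 * previous extent map.  Checking disk_bytenr alone would wrongly merge two
 * file extents that reference the same compressed extent at different
 * offsets; comparing extent_map::start against the tracked last_em_start
 * does not.
 */
static bool can_merge_compressed(uint64_t last_em_start, const struct toy_em *em)
{
	return last_em_start != TOY_U64_MAX && last_em_start == em->start;
}

int main(void)
{
	/* Two file extents sharing one compressed extent on disk. */
	struct toy_em a = { .start = 0,     .disk_bytenr = 1 << 20 };
	struct toy_em b = { .start = 65536, .disk_bytenr = 1 << 20 };
	uint64_t last_em_start = TOY_U64_MAX;	/* "no progress yet", as in bio_ctrl */

	printf("merge a? %d\n", can_merge_compressed(last_em_start, &a));	/* 0: first read */
	last_em_start = a.start;
	printf("merge b? %d\n", can_merge_compressed(last_em_start, &b));	/* 0: same disk_bytenr, different em */
	printf("merge a again? %d\n", can_merge_compressed(last_em_start, &a));	/* 1: same extent map */
	return 0;
}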