Diffstat (limited to 'fs/btrfs/zoned.c')
-rw-r--r--	fs/btrfs/zoned.c	209
1 file changed, 143 insertions(+), 66 deletions(-)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 245e813ecd78..e00036672f33 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
 #include "accessors.h"
 #include "bio.h"
 #include "transaction.h"
+#include "sysfs.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -42,6 +43,9 @@
 /* Number of superblock log zones */
 #define BTRFS_NR_SB_LOG_ZONES 2
 
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
 /*
  * Minimum of active zones we need:
  *
@@ -270,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 		return ret;
 	}
 	*nr_zones = ret;
-	if (!ret)
+	if (unlikely(!ret))
 		return -EIO;
 
 	/* Populate cache */
@@ -311,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
 		if (ret < 0)
 			return ret;
 		/* No dev extents at all? Not good */
-		if (ret > 0)
+		if (unlikely(ret > 0))
 			return -EUCLEAN;
 	}
 
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 	if (!IS_ALIGNED(nr_sectors, zone_sectors))
 		zone_info->nr_zones++;
 
-	max_active_zones = bdev_max_active_zones(bdev);
+	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+					bdev_max_open_zones(bdev));
+	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
 	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
 		btrfs_err(fs_info,
 "zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -496,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 	}
 
-	if (nreported != zone_info->nr_zones) {
+	if (unlikely(nreported != zone_info->nr_zones)) {
 		btrfs_err(device->fs_info,
 			  "inconsistent number of zones on %s (%u/%u)",
 			  rcu_dereference(device->name), nreported,
@@ -506,7 +513,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 	}
 
 	if (max_active_zones) {
-		if (nactive > max_active_zones) {
+		if (unlikely(nactive > max_active_zones)) {
+			if (bdev_max_active_zones(bdev) == 0) {
+				max_active_zones = 0;
+				zone_info->max_active_zones = 0;
+				goto validate;
+			}
 			btrfs_err(device->fs_info,
 			"zoned: %u active zones on %s exceeds max_active_zones %u",
 				  nactive, rcu_dereference(device->name),
@@ -519,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
 	}
 
+validate:
 	/* Validate superblock log */
 	nr_zones = BTRFS_NR_SB_LOG_ZONES;
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -537,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		if (ret)
 			goto out;
 
-		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+		if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
 			btrfs_err(device->fs_info,
 	"zoned: failed to read super block log zone info at devid %llu zone %u",
 				  device->devid, sb_zone);
@@ -555,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 
 		ret = sb_write_pointer(device->bdev,
 				       &zone_info->sb_zones[sb_pos], &sb_wp);
-		if (ret != -ENOENT && ret) {
+		if (unlikely(ret != -ENOENT && ret)) {
 			btrfs_err(device->fs_info,
 			"zoned: super block log zone corrupted devid %llu zone %u",
 				  device->devid, sb_zone);
@@ -888,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 				  zones);
 	if (ret < 0)
 		return ret;
-	if (ret != BTRFS_NR_SB_LOG_ZONES)
+	if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
 		return -EIO;
 
 	return sb_log_location(bdev, zones, rw, bytenr_ret);
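Note on the max_active_zones hunks above: the limit is now the smaller non-zero value of the device's max-active and max-open zone limits, and a device that reports neither limit (a typical SMR HDD, for instance) is capped at BTRFS_DEFAULT_MAX_ACTIVE_ZONES once it has more zones than that, which keeps btrfs's own active-zone accounting bounded. A minimal userspace sketch of that selection, with min_not_zero() re-implemented locally (in the kernel it is a macro from <linux/minmax.h>) and the zone counts chosen purely for illustration:

    #include <stdio.h>

    #define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128

    /* Local re-implementation of the kernel's min_not_zero(). */
    static unsigned int min_not_zero(unsigned int x, unsigned int y)
    {
        return x == 0 ? y : (y == 0 ? x : (x < y ? x : y));
    }

    static unsigned int pick_max_active_zones(unsigned int max_active,
                                              unsigned int max_open,
                                              unsigned int nr_zones)
    {
        unsigned int limit = min_not_zero(max_active, max_open);

        /* No device limit: fall back to the default cap on large devices. */
        if (limit == 0 && nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
            limit = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
        return limit;
    }

    int main(void)
    {
        /* SMR HDD reporting no limits, many zones: capped at 128. */
        printf("%u\n", pick_max_active_zones(0, 0, 74508));
        /* ZNS SSD with max_active = max_open = 14: stays 14. */
        printf("%u\n", pick_max_active_zones(14, 14, 3688));
        return 0;
    }

This is also why the nactive check later in the hunk resets the limit to zero and jumps to the new validate label: an over-limit count is only an error when the device itself enforces a limit.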
@@ -1240,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 	root = btrfs_extent_root(fs_info, key.objectid);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	/* We should not find the exact match */
-	if (!ret)
+	if (unlikely(!ret))
 		ret = -EUCLEAN;
 	if (ret < 0)
 		return ret;
@@ -1261,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 	else
 		length = fs_info->nodesize;
 
-	if (!(found_key.objectid >= cache->start &&
-	      found_key.objectid + length <= cache->start + cache->length)) {
+	if (unlikely(!(found_key.objectid >= cache->start &&
+		       found_key.objectid + length <= cache->start + cache->length))) {
 		return -EUCLEAN;
 	}
 	*offset_ret = found_key.objectid + length - cache->start;
@@ -1344,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
 		return 0;
 	}
 
-	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+	if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
 		btrfs_err(fs_info,
 		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
 			  zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
@@ -1386,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
 					 struct zone_info *info,
 					 unsigned long *active)
 {
-	if (info->alloc_offset == WP_MISSING_DEV) {
+	if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
 		btrfs_err(bg->fs_info,
 			"zoned: cannot recover write pointer for zone %llu",
 			info->physical);
@@ -1415,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
 	bg->zone_capacity = min_not_zero(zone_info[0].capacity,
 					 zone_info[1].capacity);
 
-	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+	if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
 		btrfs_err(bg->fs_info,
 			  "zoned: cannot recover write pointer for zone %llu",
 			  zone_info[0].physical);
 		return -EIO;
 	}
-	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+	if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
 		btrfs_err(bg->fs_info,
 			  "zoned: cannot recover write pointer for zone %llu",
 			  zone_info[1].physical);
@@ -1434,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
 	if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
 		zone_info[1].alloc_offset = last_alloc;
 
-	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+	if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
 		btrfs_err(bg->fs_info,
 			  "zoned: write pointer offset mismatch of zones in DUP profile");
 		return -EIO;
 	}
 
 	if (test_bit(0, active) != test_bit(1, active)) {
-		if (!btrfs_zone_activate(bg))
+		if (unlikely(!btrfs_zone_activate(bg)))
 			return -EIO;
 	} else if (test_bit(0, active)) {
 		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
@@ -1476,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
 		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
 			zone_info[i].alloc_offset = last_alloc;
 
-		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
-		    !btrfs_test_opt(fs_info, DEGRADED)) {
+		if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+			     !btrfs_test_opt(fs_info, DEGRADED))) {
 			btrfs_err(fs_info,
 			"zoned: write pointer offset mismatch of zones in %s profile",
 				  btrfs_bg_type_to_raid_name(map->type));
 			return -EIO;
 		}
 		if (test_bit(0, active) != test_bit(i, active)) {
-			if (!btrfs_test_opt(fs_info, DEGRADED) &&
-			    !btrfs_zone_activate(bg)) {
+			if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+				     !btrfs_zone_activate(bg))) {
 				return -EIO;
 			}
 		} else {
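The DUP and RAID1 hunks above wrap the write-pointer consistency checks in unlikely(): after conventional (emulated) zones inherit the last allocated offset, every mirror must report the same write pointer, or the block group is rejected with -EIO. A small userspace sketch of that rule, with WP_CONVENTIONAL matching the kernel's ((u64)-2) sentinel and the offsets made up for the example:

    #include <stdio.h>

    #define WP_CONVENTIONAL ((unsigned long long)-2)

    /* Mirror of the DUP rule: both zones must agree on the write pointer. */
    static int check_dup_write_pointers(unsigned long long wp[2],
                                        unsigned long long last_alloc)
    {
        for (int i = 0; i < 2; i++)
            if (wp[i] == WP_CONVENTIONAL)
                wp[i] = last_alloc; /* emulated zone: use on-disk extent info */

        return wp[0] == wp[1] ? 0 : -5; /* -EIO in the kernel */
    }

    int main(void)
    {
        unsigned long long zones[2] = { 1 << 20, WP_CONVENTIONAL };

        printf("%s\n", check_dup_write_pointers(zones, 1 << 20) == 0 ?
               "consistent" : "mismatch");
        return 0;
    }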
@@ -1541,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
 		}
 
 		if (test_bit(0, active) != test_bit(i, active)) {
-			if (!btrfs_zone_activate(bg))
+			if (unlikely(!btrfs_zone_activate(bg)))
 				return -EIO;
 		} else {
 			if (test_bit(0, active))
@@ -1573,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
 			continue;
 
 		if (test_bit(0, active) != test_bit(i, active)) {
-			if (!btrfs_zone_activate(bg))
+			if (unlikely(!btrfs_zone_activate(bg)))
 				return -EIO;
 		} else {
 			if (test_bit(0, active))
@@ -1630,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		return 0;
 
 	/* Sanity check */
-	if (!IS_ALIGNED(length, fs_info->zone_size)) {
+	if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
 		btrfs_err(fs_info,
 		"zoned: block group %llu len %llu unaligned to zone size %llu",
 			  logical, length, fs_info->zone_size);
@@ -1743,7 +1756,7 @@ out:
 		return -EINVAL;
 	}
 
-	if (cache->alloc_offset > cache->zone_capacity) {
+	if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
 		btrfs_err(fs_info,
 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
 			  cache->alloc_offset, cache->zone_capacity,
@@ -2074,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
 
 	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
 			      &mapped_length, &bioc, NULL, NULL);
-	if (ret || !bioc || mapped_length < PAGE_SIZE) {
+	if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
 		ret = -EIO;
 		goto out_put_bioc;
 	}
@@ -2132,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
 	if (physical_pos == wp)
 		return 0;
 
-	if (physical_pos > wp)
+	if (unlikely(physical_pos > wp))
 		return -EUCLEAN;
 
 	length = wp - physical_pos;
@@ -2168,10 +2181,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		goto out_unlock;
 	}
 
-	/* No space left */
-	if (btrfs_zoned_bg_is_full(block_group)) {
-		ret = false;
-		goto out_unlock;
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+		/* The caller should check if the block group is full. */
+		if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+			ret = false;
+			goto out_unlock;
+		}
+	} else {
+		/* Since it is already written, it should have been active. */
+		WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
 	}
 
 	for (i = 0; i < map->num_stripes; i++) {
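btrfs_zone_activate() above now distinguishes data from metadata block groups: a full data block group reaching this point is a caller bug (hence WARN_ON_ONCE() rather than a silent failure), while a metadata block group is only ever activated before anything is written to it, so its write pointer must still sit at the block group start. A simplified sketch of those two preconditions, with struct bg and its fields invented for the example:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Invented stand-in for struct btrfs_block_group. */
    struct bg {
        bool is_data;
        unsigned long long start;
        unsigned long long alloc_offset;
        unsigned long long capacity;
        unsigned long long meta_write_pointer;
    };

    static bool zone_activate(const struct bg *bg)
    {
        if (bg->is_data) {
            /* The caller must reject full data block groups first. */
            if (bg->alloc_offset >= bg->capacity)
                return false; /* kernel: WARN_ON_ONCE() and bail out */
        } else {
            /* Metadata is activated before any byte is written to it. */
            assert(bg->meta_write_pointer == bg->start);
        }
        return true;
    }

    int main(void)
    {
        struct bg data = { true, 0, 0, 256 << 20, 0 };
        struct bg meta = { false, 256ULL << 20, 0, 256 << 20, 256ULL << 20 };

        printf("data: %d meta: %d\n", zone_activate(&data), zone_activate(&meta));
        return 0;
    }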
@@ -2230,7 +2248,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	const u64 end = block_group->start + block_group->length;
 	struct extent_buffer *eb;
-	unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
+	unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
 
 	rcu_read_lock();
 	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2263,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 	rcu_read_unlock();
 }
 
+static int call_zone_finish(struct btrfs_block_group *block_group,
+			    struct btrfs_io_stripe *stripe)
+{
+	struct btrfs_device *device = stripe->dev;
+	const u64 physical = stripe->physical;
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	int ret;
+
+	if (!device->bdev)
+		return 0;
+
+	if (zinfo->max_active_zones == 0)
+		return 0;
+
+	if (btrfs_dev_is_sequential(device, physical)) {
+		unsigned int nofs_flags;
+
+		nofs_flags = memalloc_nofs_save();
+		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+				       physical >> SECTOR_SHIFT,
+				       zinfo->zone_size >> SECTOR_SHIFT);
+		memalloc_nofs_restore(nofs_flags);
+
+		if (ret)
+			return ret;
+	}
+
+	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+		zinfo->reserved_active_zones++;
+	btrfs_dev_clear_active_zone(device, physical);
+
+	return 0;
+}
+
 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2381,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 	down_read(&dev_replace->rwsem);
 	map = block_group->physical_map;
 	for (i = 0; i < map->num_stripes; i++) {
-		struct btrfs_device *device = map->stripes[i].dev;
-		const u64 physical = map->stripes[i].physical;
-		struct btrfs_zoned_device_info *zinfo = device->zone_info;
-		unsigned int nofs_flags;
-
-		if (!device->bdev)
-			continue;
-
-		if (zinfo->max_active_zones == 0)
-			continue;
-
-		nofs_flags = memalloc_nofs_save();
-		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
-				       physical >> SECTOR_SHIFT,
-				       zinfo->zone_size >> SECTOR_SHIFT);
-		memalloc_nofs_restore(nofs_flags);
+		ret = call_zone_finish(block_group, &map->stripes[i]);
 
 		if (ret) {
 			up_read(&dev_replace->rwsem);
 			return ret;
 		}
-
-		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
-			zinfo->reserved_active_zones++;
-		btrfs_dev_clear_active_zone(device, physical);
 	}
 	up_read(&dev_replace->rwsem);
 
@@ -2431,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
 	return ret;
 }
 
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
 {
 	struct btrfs_block_group *block_group;
 	u64 min_alloc_bytes;
 
 	if (!btrfs_is_zoned(fs_info))
-		return;
+		return 0;
 
 	block_group = btrfs_lookup_block_group(fs_info, logical);
-	ASSERT(block_group);
+	if (WARN_ON_ONCE(!block_group))
+		return -ENOENT;
 
 	/* No MIXED_BG on zoned btrfs. */
 	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
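The new call_zone_finish() helper above factors the per-stripe body out of do_zone_finish(): skip missing devices and devices without an active-zone limit, issue REQ_OP_ZONE_FINISH only for sequential zones, and on success return a metadata zone to the device's reserved pool before clearing the active bit. A userspace sketch of that flow, with the device struct and the zone_finish() stub standing in for the kernel types and blkdev_zone_mgmt():

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented stand-in for btrfs_device plus its zone info. */
    struct device {
        bool has_bdev;
        unsigned int max_active_zones;
        unsigned int reserved_active_zones;
    };

    /* Stub for blkdev_zone_mgmt(REQ_OP_ZONE_FINISH, ...); always succeeds. */
    static int zone_finish(struct device *dev, unsigned long long physical)
    {
        (void)dev;
        (void)physical;
        return 0;
    }

    static int call_zone_finish(struct device *dev, unsigned long long physical,
                                bool sequential, bool is_data)
    {
        if (!dev->has_bdev)
            return 0;   /* missing device: nothing to finish */
        if (dev->max_active_zones == 0)
            return 0;   /* no limit: no accounting to update */

        if (sequential) {
            int ret = zone_finish(dev, physical);

            if (ret)
                return ret;     /* propagated up to the stripe loop */
        }

        /* Metadata zones go back to the device's reserved pool. */
        if (!is_data)
            dev->reserved_active_zones++;
        return 0;
    }

    int main(void)
    {
        struct device dev = { true, 14, 0 };

        printf("ret=%d reserved=%u\n",
               call_zone_finish(&dev, 0, true, false),
               dev.reserved_active_zones);
        return 0;
    }

Pulling the body into a helper lets do_zone_finish() stop at the first failing stripe instead of open-coding the cleanup inside the loop.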
@@ -2457,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
 
 out:
 	btrfs_put_block_group(block_group);
+	return 0;
 }
 
 static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
 {
+	int ret;
 	struct btrfs_block_group *bg =
 		container_of(work, struct btrfs_block_group, zone_finish_work);
 
 	wait_on_extent_buffer_writeback(bg->last_eb);
 	free_extent_buffer(bg->last_eb);
-	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+	ret = do_zone_finish(bg, true);
+	if (ret)
+		btrfs_handle_fs_error(bg->fs_info, ret,
+				      "Failed to finish block-group's zone");
 	btrfs_put_block_group(bg);
 }
 
@@ -2488,7 +2527,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
 	refcount_inc(&eb->refs);
 	bg->last_eb = eb;
 	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
-	queue_work(system_unbound_wq, &bg->zone_finish_work);
+	queue_work(system_dfl_wq, &bg->zone_finish_work);
 }
 
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2504,12 +2543,12 @@
 void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
-	struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+	struct btrfs_space_info *space_info = data_sinfo;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_block_group *bg;
 	struct list_head *bg_list;
 	u64 alloc_flags;
-	bool initial = false;
+	bool first = true;
 	bool did_chunk_alloc = false;
 	int index;
 	int ret;
@@ -2523,21 +2562,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
 	if (sb_rdonly(fs_info->sb))
 		return;
 
-	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
 	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
 	index = btrfs_bg_flags_to_raid_index(alloc_flags);
-	bg_list = &data_sinfo->block_groups[index];
+	/* Scan the data space_info to find empty block groups. Take the second one. */
 again:
+	bg_list = &space_info->block_groups[index];
 	list_for_each_entry(bg, bg_list, list) {
-		if (bg->used > 0)
+		if (bg->alloc_offset != 0)
 			continue;
 
-		if (!initial) {
-			initial = true;
+		if (first) {
+			first = false;
 			continue;
 		}
 
+		if (space_info == data_sinfo) {
+			/* Migrate the block group to the data relocation space_info. */
+			struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+			int factor;
+
+			ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+			factor = btrfs_bg_type_to_factor(bg->flags);
+
+			down_write(&space_info->groups_sem);
+			list_del_init(&bg->list);
+			/* We can assume this as we choose the second empty one. */
+			ASSERT(!list_empty(&space_info->block_groups[index]));
+			up_write(&space_info->groups_sem);
+
+			spin_lock(&space_info->lock);
+			space_info->total_bytes -= bg->length;
+			space_info->disk_total -= bg->length * factor;
+			space_info->disk_total -= bg->zone_unusable;
+			/* There is no allocation ever happened. */
+			ASSERT(bg->used == 0);
+			/* No super block in a block group on the zoned setup. */
+			ASSERT(bg->bytes_super == 0);
+			spin_unlock(&space_info->lock);
+
+			bg->space_info = reloc_sinfo;
+			if (reloc_sinfo->block_group_kobjs[index] == NULL)
+				btrfs_sysfs_add_block_group_type(bg);
+
+			btrfs_add_bg_to_space_info(fs_info, bg);
+		}
+
 		fs_info->data_reloc_bg = bg->start;
 		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
 		btrfs_zone_activate(bg);
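The migration block above moves an empty data block group into the data-relocation sub-space_info, so its bytes must leave the source space_info's accounting first: total_bytes drops by the block group length and disk_total by the RAID-factor-scaled length plus zone_unusable, after which btrfs_add_bg_to_space_info() re-adds the block group to the target. A toy model of that transfer, with the structs simplified, the target-side addition modeled symmetrically as an assumption, and factor 2 (DUP-like) chosen for the example:

    #include <stdio.h>

    /* Invented stand-in for the accounting fields of btrfs_space_info. */
    struct space_info {
        unsigned long long total_bytes;
        unsigned long long disk_total;
    };

    struct bg {
        unsigned long long length;
        unsigned long long zone_unusable;
        int factor; /* btrfs_bg_type_to_factor(): 2 for DUP/RAID1 */
    };

    static void migrate_bg(struct space_info *from, struct space_info *to,
                           const struct bg *bg)
    {
        /* Mirrors the spin_lock()ed subtraction in the hunk above. */
        from->total_bytes -= bg->length;
        from->disk_total -= bg->length * bg->factor;
        from->disk_total -= bg->zone_unusable;

        /* Assumed inverse, modeled after btrfs_add_bg_to_space_info(). */
        to->total_bytes += bg->length;
        to->disk_total += bg->length * bg->factor + bg->zone_unusable;
    }

    int main(void)
    {
        struct space_info data = { 10ULL << 30, 20ULL << 30 };
        struct space_info reloc = { 0, 0 };
        struct bg bg = { 1ULL << 30, 0, 2 };

        migrate_bg(&data, &reloc, &bg);
        printf("data: %llu/%llu reloc: %llu/%llu\n",
               data.total_bytes, data.disk_total,
               reloc.total_bytes, reloc.disk_total);
        return 0;
    }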
@@ -2552,11 +2622,18 @@ again:
 	if (IS_ERR(trans))
 		return;
 
+	/* Allocate new BG in the data relocation space_info. */
+	space_info = data_sinfo->sub_group[0];
+	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
 	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
 	btrfs_end_transaction(trans);
 	if (ret == 1) {
+		/*
+		 * We allocated a new block group in the data relocation space_info. We
+		 * can take that one.
+		 */
+		first = false;
 		did_chunk_alloc = true;
-		bg_list = &space_info->block_groups[index];
 		goto again;
 	}
 }
@@ -2650,7 +2727,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
 		spin_lock(&block_group->lock);
 		if (block_group->reserved || block_group->alloc_offset == 0 ||
-		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+		    !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
 		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
 			     &block_group->runtime_flags)) {
 			spin_unlock(&block_group->lock);
 			continue;
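btrfs_zoned_reserve_data_reloc_bg() above now scans the plain data space_info and reserves the second fully-empty block group it finds (alloc_offset == 0), leaving the first one for regular data allocation; only when no candidate exists does it force a chunk allocation directly in the relocation space_info, clearing first so the freshly created group is taken on the retry. A compact sketch of the selection loop, with struct bg invented for the example:

    #include <stdio.h>

    /* Invented stand-in for struct btrfs_block_group. */
    struct bg {
        unsigned long long start;
        unsigned long long alloc_offset;
    };

    static struct bg *pick_second_empty(struct bg *bgs, int n)
    {
        int first = 1;

        for (int i = 0; i < n; i++) {
            if (bgs[i].alloc_offset != 0)
                continue;
            if (first) {        /* leave the first empty group alone */
                first = 0;
                continue;
            }
            return &bgs[i];
        }
        return NULL;    /* caller falls back to chunk allocation */
    }

    int main(void)
    {
        struct bg bgs[] = { { 0, 4096 }, { 1, 0 }, { 2, 0 }, { 3, 0 } };
        struct bg *bg = pick_second_empty(bgs, 4);

        printf("picked bg %llu\n", bg ? bg->start : ~0ULL);
        return 0;
    }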