// SPDX-License-Identifier: GPL-2.0-or-later
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "md.h"
#include "md-bitmap.h"

/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the way the
 * redundant data is stored varies depending on the RAID level. It is
 * important to maintain the consistency of the redundant data.
 *
 * The bitmap is used to record which data blocks have been synchronized and
 * which ones need to be resynchronized or recovered. Each bit in the bitmap
 * represents a segment of data in the array. When a bit is set, it indicates
 * that the multiple redundant copies of that data segment may not be
 * consistent. Data synchronization can be performed based on the bitmap after
 * a power failure or after re-adding a disk. If there is no bitmap, a full
 * disk synchronization is required.
 *
 * #### Key Features
 *
 * - The IO fast path is lockless; if the user issues lots of write IO to the
 *   same bitmap bit in a short time, only the first write has the additional
 *   overhead of updating the bitmap bit, and there is no additional overhead
 *   for the following writes;
 * - Only written data is resynced or recovered, which means that when
 *   creating a new array or replacing with a new disk, there is no need to do
 *   a full disk resync/recovery;
 *
 * #### Key Concept
 *
 * ##### State Machine
 *
 * Each bit is one byte and contains 6 different states, see llbitmap_state.
 * There are 8 different actions, see llbitmap_action, that can change the
 * state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync |
 * | --------- | ---------- | --------- | ------- | --------- |
 * | Unwritten | Dirty      | x         | x       | x         |
 * | Clean     | Dirty      | x         | x       | x         |
 * | Dirty     | x          | x         | x       | x         |
 * | NeedSync  | x          | Syncing   | x       | x         |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync  |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
 *
 * An 'x' means the action does not change the state of the bit.
 *
 * Typical scenarios:
 *
 * 1) Create a new array
 * All bits will be set to Unwritten by default; if --assume-clean is set,
 * all bits will be set to Clean instead.
 *
 * 2) Write data; raid1/raid10 have a full copy of the data, while raid456
 * does not and relies on xor data:
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy
 * initial recovery is finished, the bit will finally be set to Dirty
 * (see 5.1 and 5.4);
 *
 * 2.3) overwrite existing data:
 * Clean --StartWrite--> Dirty
 *
 * 3) Daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) Discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) Resync and recover
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recover while replacing with a new disk
 * By default, the old bitmap framework will recover all data; llbitmap
 * implements this with a new helper, llbitmap_skip_sync_blocks, which skips
 * recovery for bits other than Dirty or Clean;
 *
 * 5.4) lazy initial recovery for raid456:
 * By default, the old bitmap framework only allows a new recovery when there
 * are spares (a new disk); a new recovery flag, MD_RECOVERY_LAZY_RECOVER, is
 * added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) Special handling for degraded arrays:
 *
 * - Dirty bits will never be cleared and the daemon does nothing, so that if
 *   a disk is re-added, recovery can skip the Clean bits;
 * - Dirty bits will convert to Syncing on Startsync, to do data recovery for
 *   newly added disks;
 * - New writes will convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including a 1k bitmap super block, and the
 * default size of the segment of array data covered by each bit (the
 * chunksize) is 64k. The chunksize is doubled, each time to twice the old
 * size, while the total number of bits is not less than 127k (see
 * llbitmap_init). For example, with 127k of bits a 64k chunksize can only
 * cover an array of roughly 8G; larger arrays end up with a proportionally
 * larger chunksize.
 *
 * ##### READ
 *
 * While creating the bitmap, all pages will be allocated and read for the
 * llbitmap; there won't be any reads afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into blocks of the array's logical_block_size, and the
 * dirty state of each block is tracked independently. For example:
 *
 * each page is 4k and contains 8 blocks; each block is 512 bytes and contains
 * 512 bits;
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                |
 * | block0 | block1 | ... | block 7|
 * |        |
 * |         \-----------------\
 * |                           |
 * | bit0 | bit1 | ... | bit511 |
 *
 * From the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding block is marked dirty, and such a block must be written
 * before the data IO is issued. This behaviour affects IO performance; to
 * reduce the impact, if multiple bits are changed in the same block in a
 * short time, all bits in this block will be changed to Dirty/NeedSync, so
 * that there won't be any overhead until the daemon clears the dirty bits.
 *
 * ##### Dirty Bits Synchronization
 *
 * The IO fast path will set bits to Dirty, and those dirty bits will be
 * cleared by the daemon after the IO is done.
 * llbitmap_page_ctl is used to synchronize between the IO path and the
 * daemon:
 *
 * IO path:
 *  1) try to grab a reference; if that succeeds, set the expire time to
 *     barrier_idle (5s by default) from now and return;
 *  2) if grabbing a reference fails, wait for the daemon to finish clearing
 *     the dirty bits;
 *
 * Daemon (woken up every daemon_sleep seconds), for each page:
 *  1) check if the page has expired; if not, skip this page; for an expired
 *     page:
 *  2) suspend the page and wait for inflight write IO to be done;
 *  3) change the page's Dirty bits to Clean;
 *  4) resume the page;
 */

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data, the initial state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent a
	 *    full disk synchronization after re-adding a disk;
	 * 2) set when the array is reassembled after a power failure and
	 *    dirty bits are found after reloading the bitmap;
	 * 3) set on the first write for raid5, to build the initial xor data
	 *    lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data, this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale; mark all bits other than BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per-block dirty state, maximum 64k page / 1 sector = 128 */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits will be cleared if the page
	 * is not accessed.
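	 * Can be changed at runtime through the barrier_idle entry in the
	 * llbitmap sysfs group (see llbitmap_barrier_idle below).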
*/ unsigned long barrier_idle; /* fires on first BitDirty state */ struct timer_list pending_timer; struct work_struct daemon_work; unsigned long flags; __u64 events_cleared; /* for slow disks */ atomic_t behind_writes; wait_queue_head_t behind_wait; }; struct llbitmap_unplug_work { struct work_struct work; struct llbitmap *llbitmap; struct completion *done; }; static struct workqueue_struct *md_llbitmap_io_wq; static struct workqueue_struct *md_llbitmap_unplug_wq; static char state_machine[BitStateCount][BitmapActionCount] = { [BitUnwritten] = { [BitmapActionStartwrite] = BitDirty, [BitmapActionStartsync] = BitNone, [BitmapActionEndsync] = BitNone, [BitmapActionAbortsync] = BitNone, [BitmapActionReload] = BitNone, [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, [BitmapActionStartsync] = BitNone, [BitmapActionEndsync] = BitNone, [BitmapActionAbortsync] = BitNone, [BitmapActionReload] = BitNone, [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, [BitmapActionStartsync] = BitNone, [BitmapActionEndsync] = BitNone, [BitmapActionAbortsync] = BitNone, [BitmapActionReload] = BitNeedSync, [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, [BitmapActionStartsync] = BitSyncing, [BitmapActionEndsync] = BitNone, [BitmapActionAbortsync] = BitNone, [BitmapActionReload] = BitNone, [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, [BitmapActionStartsync] = BitSyncing, [BitmapActionEndsync] = BitDirty, [BitmapActionAbortsync] = BitNeedSync, [BitmapActionReload] = BitNeedSync, [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, }, }; static void __llbitmap_flush(struct mddev *mddev); static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) { unsigned int idx; unsigned int offset; pos += BITMAP_DATA_OFFSET; idx = pos >> PAGE_SHIFT; offset = offset_in_page(pos); return llbitmap->pctl[idx]->state[offset]; } /* set all the bits in the subpage as dirty */ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, struct llbitmap_page_ctl *pctl, unsigned int block) { bool level_456 = raid_is_456(llbitmap->mddev); unsigned int io_size = llbitmap->io_size; int pos; for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { switch (pctl->state[pos]) { case BitUnwritten: pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; break; case BitClean: pctl->state[pos] = BitDirty; break; }; } } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, int offset) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; int block = offset / io_size; int pos; if (!test_bit(LLPageDirty, &pctl->flags)) set_bit(LLPageDirty, &pctl->flags); /* * For degraded array, dirty bits will never be cleared, and we must * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. */ if (llbitmap->mddev->degraded) { set_bit(block, pctl->dirty); return; } /* * The subpage usually contains a total of 512 bits. If any single bit * within the subpage is marked as dirty, the entire sector will be * written. 
To avoid impacting write performance, when multiple bits * within the same sector are modified within llbitmap->barrier_idle, * all bits in the sector will be collectively marked as dirty at once. */ if (test_and_set_bit(block, pctl->dirty)) { llbitmap_infect_dirty_bits(llbitmap, pctl, block); return; } for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { if (pos == offset) continue; if (pctl->state[pos] == BitDirty || pctl->state[pos] == BitNeedSync) { llbitmap_infect_dirty_bits(llbitmap, pctl, block); return; } } } static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, loff_t pos) { unsigned int idx; unsigned int bit; pos += BITMAP_DATA_OFFSET; idx = pos >> PAGE_SHIFT; bit = offset_in_page(pos); llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) llbitmap_set_page_dirty(llbitmap, idx, bit); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) { struct mddev *mddev = llbitmap->mddev; struct page *page = NULL; struct md_rdev *rdev; if (llbitmap->pctl && llbitmap->pctl[idx]) page = llbitmap->pctl[idx]->page; if (page) return page; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return ERR_PTR(-ENOMEM); rdev_for_each(rdev, mddev) { sector_t sector; if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + (idx << PAGE_SECTORS_SHIFT); if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, true)) return page; md_error(mddev, rdev); } __free_page(page); return ERR_PTR(-EIO); } static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) { struct page *page = llbitmap->pctl[idx]->page; struct mddev *mddev = llbitmap->mddev; struct md_rdev *rdev; int block; for (block = 0; block < llbitmap->blocks_per_page; block++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; if (!test_and_clear_bit(block, pctl->dirty)) continue; rdev_for_each(rdev, mddev) { sector_t sector; sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + rdev->sb_start + (idx << PAGE_SECTORS_SHIFT) + block * bit_sector; md_write_metadata(mddev, rdev, sector, llbitmap->io_size, page, block * llbitmap->io_size); } } } static void active_release(struct percpu_ref *ref) { struct llbitmap_page_ctl *pctl = container_of(ref, struct llbitmap_page_ctl, active); wake_up(&pctl->wait); } static void llbitmap_free_pages(struct llbitmap *llbitmap) { int i; if (!llbitmap->pctl) return; for (i = 0; i < llbitmap->nr_pages; i++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; if (!pctl || !pctl->page) break; __free_page(pctl->page); percpu_ref_exit(&pctl->active); } kfree(llbitmap->pctl[0]); kfree(llbitmap->pctl); llbitmap->pctl = NULL; } static int llbitmap_cache_pages(struct llbitmap *llbitmap) { struct llbitmap_page_ctl *pctl; unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + BITMAP_DATA_OFFSET, PAGE_SIZE); unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( llbitmap->blocks_per_page)); int i; llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL | __GFP_ZERO); if (!llbitmap->pctl) return -ENOMEM; size = round_up(size, cache_line_size()); pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); if (!pctl) { kfree(llbitmap->pctl); return -ENOMEM; } llbitmap->nr_pages = nr_pages; for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { struct page *page = llbitmap_read_page(llbitmap, i); llbitmap->pctl[i] = pctl; if (IS_ERR(page)) 
{ llbitmap_free_pages(llbitmap); return PTR_ERR(page); } if (percpu_ref_init(&pctl->active, active_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { __free_page(page); llbitmap_free_pages(llbitmap); return -ENOMEM; } pctl->page = page; pctl->state = page_address(page); init_waitqueue_head(&pctl->wait); } return 0; } static void llbitmap_init_state(struct llbitmap *llbitmap) { enum llbitmap_state state = BitUnwritten; unsigned long i; if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) state = BitClean; for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); } /* The return value is only used from resync, where @start == @end. */ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, unsigned long start, unsigned long end, enum llbitmap_action action) { struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitNone; bool level_456 = raid_is_456(llbitmap->mddev); bool need_resync = false; bool need_recovery = false; if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) return BitNone; if (action == BitmapActionInit) { llbitmap_init_state(llbitmap); return BitNone; } while (start <= end) { enum llbitmap_state c = llbitmap_read(llbitmap, start); if (c < 0 || c >= BitStateCount) { pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", __func__, start, c, action); state = BitNeedSync; goto write_bitmap; } if (c == BitNeedSync) need_resync = !mddev->degraded; state = state_machine[c][action]; write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. */ if (state == BitDirty && action == BitmapActionStartwrite) state = BitNeedSync; /* * For degraded array, resync dirty data as well, noted * if array is still degraded after resync is done, all * new data will still be dirty until array is clean. */ else if (c == BitDirty && action == BitmapActionStartsync) state = BitSyncing; } else if (c == BitUnwritten && state == BitDirty && action == BitmapActionStartwrite && level_456) { /* Delay raid456 initial recovery to first write. 
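 * The xor data for this chunk is not built yet, so mark the bit NeedSync and
 * let the lazy initial recovery (MD_RECOVERY_LAZY_RECOVER) rebuild it later,
 * see 2.2 and 5.4 in the header comment.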
*/ state = BitNeedSync; } if (state == BitNone) { start++; continue; } llbitmap_write(llbitmap, state, start); if (state == BitNeedSync) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) mod_timer(&llbitmap->pending_timer, jiffies + mddev->bitmap_info.daemon_sleep * HZ); start++; } if (need_resync && level_456) need_recovery = true; if (need_recovery) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); md_wakeup_thread(mddev->thread); } else if (need_resync) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); md_wakeup_thread(mddev->thread); } return state; } static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; retry: if (likely(percpu_ref_tryget_live(&pctl->active))) { WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); return; } wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); goto retry; } static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; percpu_ref_put(&pctl->active); } static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; percpu_ref_kill(&pctl->active); if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) return -ETIMEDOUT; return 0; } static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; pctl->expire = LONG_MAX; percpu_ref_resurrect(&pctl->active); wake_up(&pctl->wait); } static int llbitmap_check_support(struct mddev *mddev) { if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", mdname(mddev)); return -EBUSY; } if (mddev->bitmap_info.space == 0) { if (mddev->bitmap_info.default_space == 0) { pr_notice("md/llbitmap: %s: no space for bitmap\n", mdname(mddev)); return -ENOSPC; } } if (!mddev->persistent) { pr_notice("md/llbitmap: %s: array must be persistent\n", mdname(mddev)); return -EOPNOTSUPP; } if (mddev->bitmap_info.file) { pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", mdname(mddev)); return -EOPNOTSUPP; } if (mddev->bitmap_info.external) { pr_notice("md/llbitmap: %s: doesn't support external metadata\n", mdname(mddev)); return -EOPNOTSUPP; } if (mddev_is_dm(mddev)) { pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", mdname(mddev)); return -EOPNOTSUPP; } return 0; } static int llbitmap_init(struct llbitmap *llbitmap) { struct mddev *mddev = llbitmap->mddev; sector_t blocks = mddev->resync_max_sectors; unsigned long chunksize = MIN_CHUNK_SIZE; unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; int ret; while (chunks > space) { chunksize = chunksize << 1; chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); } llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; llbitmap->chunkshift = ffz(~chunksize); llbitmap->chunksize = chunksize; llbitmap->chunks = chunks; mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; ret = llbitmap_cache_pages(llbitmap); if (ret) return ret; llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, BitmapActionInit); /* flush initial llbitmap to disk */ __llbitmap_flush(mddev); return 0; } static int llbitmap_read_sb(struct llbitmap *llbitmap) { 
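	/*
	 * Read the bitmap super block from the first bitmap page and validate
	 * magic, version, uuid, chunksize and daemon_sleep, then cache all
	 * bitmap pages; on first use, initialise the bitmap through
	 * llbitmap_init() instead.
	 */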
struct mddev *mddev = llbitmap->mddev; unsigned long daemon_sleep; unsigned long chunksize; unsigned long events; struct page *sb_page; bitmap_super_t *sb; int ret = -EINVAL; if (!mddev->bitmap_info.offset) { pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); return -EINVAL; } sb_page = llbitmap_read_page(llbitmap, 0); if (IS_ERR(sb_page)) { pr_err("md/llbitmap: %s: read super block failed", mdname(mddev)); return -EIO; } sb = kmap_local_page(sb_page); if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { pr_err("md/llbitmap: %s: invalid super block magic number", mdname(mddev)); goto out_put_page; } if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { pr_err("md/llbitmap: %s: invalid super block version", mdname(mddev)); goto out_put_page; } if (memcmp(sb->uuid, mddev->uuid, 16)) { pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", mdname(mddev)); goto out_put_page; } if (mddev->bitmap_info.space == 0) { int room = le32_to_cpu(sb->sectors_reserved); if (room) mddev->bitmap_info.space = room; else mddev->bitmap_info.space = mddev->bitmap_info.default_space; } llbitmap->flags = le32_to_cpu(sb->state); if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { ret = llbitmap_init(llbitmap); goto out_put_page; } chunksize = le32_to_cpu(sb->chunksize); if (!is_power_of_2(chunksize)) { pr_err("md/llbitmap: %s: chunksize not a power of 2", mdname(mddev)); goto out_put_page; } if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, mddev->bitmap_info.space << SECTOR_SHIFT)) { pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", mdname(mddev), chunksize, mddev->resync_max_sectors, mddev->bitmap_info.space); goto out_put_page; } daemon_sleep = le32_to_cpu(sb->daemon_sleep); if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", mdname(mddev), daemon_sleep); goto out_put_page; } events = le64_to_cpu(sb->events); if (events < mddev->events) { pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", mdname(mddev), events, mddev->events); set_bit(BITMAP_STALE, &llbitmap->flags); } sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); mddev->bitmap_info.chunksize = chunksize; mddev->bitmap_info.daemon_sleep = daemon_sleep; llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; llbitmap->chunksize = chunksize; llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); llbitmap->chunkshift = ffz(~chunksize); ret = llbitmap_cache_pages(llbitmap); out_put_page: __free_page(sb_page); kunmap_local(sb); return ret; } static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) { struct llbitmap *llbitmap = container_of(pending_timer, struct llbitmap, pending_timer); if (work_busy(&llbitmap->daemon_work)) { pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", mdname(llbitmap->mddev), llbitmap->mddev->bitmap_info.daemon_sleep); set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); return; } queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); } static void md_llbitmap_daemon_fn(struct work_struct *work) { struct llbitmap *llbitmap = container_of(work, struct llbitmap, daemon_work); unsigned long start; unsigned long end; bool restart; int idx; if (llbitmap->mddev->degraded) return; retry: start = 0; end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; restart = false; for (idx = 0; idx < llbitmap->nr_pages; idx++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; if (idx > 0) { start = end + 1; end = 
min(end + PAGE_SIZE, llbitmap->chunks - 1); } if (!test_bit(LLPageFlush, &pctl->flags) && time_before(jiffies, pctl->expire)) { restart = true; continue; } if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", mdname(llbitmap->mddev), __func__, idx); continue; } llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); llbitmap_resume(llbitmap, idx); } /* * If the daemon took a long time to finish, retry to prevent missing * clearing dirty bits. */ if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) goto retry; /* If some page is dirty but not expired, setup timer again */ if (restart) mod_timer(&llbitmap->pending_timer, jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); } static int llbitmap_create(struct mddev *mddev) { struct llbitmap *llbitmap; int ret; ret = llbitmap_check_support(mddev); if (ret) return ret; llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); if (!llbitmap) return -ENOMEM; llbitmap->mddev = mddev; llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); atomic_set(&llbitmap->behind_writes, 0); init_waitqueue_head(&llbitmap->behind_wait); mutex_lock(&mddev->bitmap_info.mutex); mddev->bitmap = llbitmap; ret = llbitmap_read_sb(llbitmap); mutex_unlock(&mddev->bitmap_info.mutex); if (ret) { kfree(llbitmap); mddev->bitmap = NULL; } return ret; } static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long chunks; if (chunksize == 0) chunksize = llbitmap->chunksize; /* If there is enough space, leave the chunksize unchanged. 
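 * Otherwise, as in llbitmap_init(), keep doubling the chunksize until all
 * chunks fit into the reserved bitmap space.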
*/ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { chunksize = chunksize << 1; chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); } llbitmap->chunkshift = ffz(~chunksize); llbitmap->chunksize = chunksize; llbitmap->chunks = chunks; return 0; } static int llbitmap_load(struct mddev *mddev) { enum llbitmap_action action = BitmapActionReload; struct llbitmap *llbitmap = mddev->bitmap; if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) action = BitmapActionStale; llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); return 0; } static void llbitmap_destroy(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; if (!llbitmap) return; mutex_lock(&mddev->bitmap_info.mutex); timer_delete_sync(&llbitmap->pending_timer); flush_workqueue(md_llbitmap_io_wq); flush_workqueue(md_llbitmap_unplug_wq); mddev->bitmap = NULL; llbitmap_free_pages(llbitmap); kfree(llbitmap); mutex_unlock(&mddev->bitmap_info.mutex); } static void llbitmap_start_write(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long start = offset >> llbitmap->chunkshift; unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } } static void llbitmap_end_write(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long start = offset >> llbitmap->chunkshift; unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; while (page_start <= page_end) { llbitmap_release_barrier(llbitmap, page_start); page_start++; } } static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } } static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; while (page_start <= page_end) { llbitmap_release_barrier(llbitmap, page_start); page_start++; } } static void llbitmap_unplug_fn(struct work_struct *work) { struct llbitmap_unplug_work *unplug_work = container_of(work, struct llbitmap_unplug_work, work); struct llbitmap *llbitmap = unplug_work->llbitmap; struct blk_plug plug; int i; blk_start_plug(&plug); for (i = 0; i < llbitmap->nr_pages; i++) { if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) continue; llbitmap_write_page(llbitmap, i); } 
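	/*
	 * Flush the plug so all queued bitmap writes are issued;
	 * md_super_wait() below waits for them to complete before the waiter
	 * is signalled.
	 */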
blk_finish_plug(&plug); md_super_wait(llbitmap->mddev); complete(unplug_work->done); } static bool llbitmap_dirty(struct llbitmap *llbitmap) { int i; for (i = 0; i < llbitmap->nr_pages; i++) if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) return true; return false; } static void llbitmap_unplug(struct mddev *mddev, bool sync) { DECLARE_COMPLETION_ONSTACK(done); struct llbitmap *llbitmap = mddev->bitmap; struct llbitmap_unplug_work unplug_work = { .llbitmap = llbitmap, .done = &done, }; if (!llbitmap_dirty(llbitmap)) return; /* * Issue new bitmap IO under submit_bio() context will deadlock: * - the bio will wait for bitmap bio to be done, before it can be * issued; * - bitmap bio will be added to current->bio_list and wait for this * bio to be issued; */ INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); queue_work(md_llbitmap_unplug_wq, &unplug_work.work); wait_for_completion(&done); destroy_work_on_stack(&unplug_work.work); } /* * Force to write all bitmap pages to disk, called when stopping the array, or * every daemon_sleep seconds when sync_thread is running. */ static void __llbitmap_flush(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; struct blk_plug plug; int i; blk_start_plug(&plug); for (i = 0; i < llbitmap->nr_pages; i++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; /* mark all blocks as dirty */ set_bit(LLPageDirty, &pctl->flags); bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); llbitmap_write_page(llbitmap, i); } blk_finish_plug(&plug); md_super_wait(llbitmap->mddev); } static void llbitmap_flush(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; int i; for (i = 0; i < llbitmap->nr_pages; i++) set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); timer_delete_sync(&llbitmap->pending_timer); queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); flush_work(&llbitmap->daemon_work); __llbitmap_flush(mddev); } /* This is used for raid5 lazy initial recovery */ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; enum llbitmap_state c = llbitmap_read(llbitmap, p); return c == BitClean || c == BitDirty; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); enum llbitmap_state c = llbitmap_read(llbitmap, p); /* always skip unwritten blocks */ if (c == BitUnwritten) return blocks; /* For degraded array, don't skip */ if (mddev->degraded) return 0; /* For resync also skip clean/dirty blocks */ if ((c == BitClean || c == BitDirty) && test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) return blocks; return 0; } static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. 
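	 * md_do_sync() will simply call start_sync again for the next chunk.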
*/ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); return llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync) == BitSyncing; } /* Something is wrong, sync_thread stop at @offset */ static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, sector_t *blocks) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, BitmapActionAbortsync); } /* A full sync_thread is finished */ static void llbitmap_close_sync(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; int i; for (i = 0; i < llbitmap->nr_pages; i++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; /* let daemon_fn clear dirty bits immediately */ WRITE_ONCE(pctl->expire, jiffies); } llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, BitmapActionEndsync); } /* * sync_thread have reached @sector, update metadata every daemon_sleep seconds, * just in case sync_thread have to restart after power failure. */ static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, bool force) { struct llbitmap *llbitmap = mddev->bitmap; if (sector == 0) { llbitmap->last_end_sync = jiffies; return; } if (time_before(jiffies, llbitmap->last_end_sync + HZ * mddev->bitmap_info.daemon_sleep)) return; wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); mddev->curr_resync_completed = sector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, BitmapActionEndsync); __llbitmap_flush(mddev); llbitmap->last_end_sync = jiffies; sysfs_notify_dirent_safe(mddev->sysfs_completed); } static bool llbitmap_enabled(void *data, bool flush) { struct llbitmap *llbitmap = data; return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); } static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, unsigned long e) { llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); } static void llbitmap_write_sb(struct llbitmap *llbitmap) { int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); llbitmap_write_page(llbitmap, 0); md_super_wait(llbitmap->mddev); } static void llbitmap_update_sb(void *data) { struct llbitmap *llbitmap = data; struct mddev *mddev = llbitmap->mddev; struct page *sb_page; bitmap_super_t *sb; if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) return; sb_page = llbitmap_read_page(llbitmap, 0); if (IS_ERR(sb_page)) { pr_err("%s: %s: read super block failed", __func__, mdname(mddev)); set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); return; } if (mddev->events < llbitmap->events_cleared) llbitmap->events_cleared = mddev->events; sb = kmap_local_page(sb_page); sb->events = cpu_to_le64(mddev->events); sb->state = cpu_to_le32(llbitmap->flags); sb->chunksize = cpu_to_le32(llbitmap->chunksize); sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); kunmap_local(sb); llbitmap_write_sb(llbitmap); } static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) { struct llbitmap *llbitmap = data; memset(stats, 0, sizeof(*stats)); stats->missing_pages = 0; stats->pages = llbitmap->nr_pages; stats->file_pages = llbitmap->nr_pages; stats->behind_writes = 
atomic_read(&llbitmap->behind_writes); stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); stats->events_cleared = llbitmap->events_cleared; return 0; } /* just flag all pages as needing to be written */ static void llbitmap_write_all(struct mddev *mddev) { int i; struct llbitmap *llbitmap = mddev->bitmap; for (i = 0; i < llbitmap->nr_pages; i++) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; set_bit(LLPageDirty, &pctl->flags); bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); } } static void llbitmap_start_behind_write(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; atomic_inc(&llbitmap->behind_writes); } static void llbitmap_end_behind_write(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; if (atomic_dec_and_test(&llbitmap->behind_writes)) wake_up(&llbitmap->behind_wait); } static void llbitmap_wait_behind_writes(struct mddev *mddev) { struct llbitmap *llbitmap = mddev->bitmap; if (!llbitmap) return; wait_event(llbitmap->behind_wait, atomic_read(&llbitmap->behind_writes) == 0); } static ssize_t bits_show(struct mddev *mddev, char *page) { struct llbitmap *llbitmap; int bits[BitStateCount] = {0}; loff_t start = 0; mutex_lock(&mddev->bitmap_info.mutex); llbitmap = mddev->bitmap; if (!llbitmap || !llbitmap->pctl) { mutex_unlock(&mddev->bitmap_info.mutex); return sprintf(page, "no bitmap\n"); } if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { mutex_unlock(&mddev->bitmap_info.mutex); return sprintf(page, "bitmap io error\n"); } while (start < llbitmap->chunks) { enum llbitmap_state c = llbitmap_read(llbitmap, start); if (c < 0 || c >= BitStateCount) pr_err("%s: invalid bit %llu state %d\n", __func__, start, c); else bits[c]++; start++; } mutex_unlock(&mddev->bitmap_info.mutex); return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], bits[BitNeedSync], bits[BitSyncing]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); static ssize_t metadata_show(struct mddev *mddev, char *page) { struct llbitmap *llbitmap; ssize_t ret; mutex_lock(&mddev->bitmap_info.mutex); llbitmap = mddev->bitmap; if (!llbitmap) { mutex_unlock(&mddev->bitmap_info.mutex); return sprintf(page, "no bitmap\n"); } ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", llbitmap->chunksize, llbitmap->chunkshift, llbitmap->chunks, mddev->bitmap_info.offset, llbitmap->mddev->bitmap_info.daemon_sleep); mutex_unlock(&mddev->bitmap_info.mutex); return ret; } static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); static ssize_t daemon_sleep_show(struct mddev *mddev, char *page) { return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); } static ssize_t daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long timeout; int rv = kstrtoul(buf, 10, &timeout); if (rv) return rv; mddev->bitmap_info.daemon_sleep = timeout; return len; } static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); static ssize_t barrier_idle_show(struct mddev *mddev, char *page) { struct llbitmap *llbitmap = mddev->bitmap; return sprintf(page, "%lu\n", llbitmap->barrier_idle); } static ssize_t barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) { struct llbitmap *llbitmap = mddev->bitmap; unsigned long timeout; int rv = kstrtoul(buf, 10, &timeout); if (rv) return rv; llbitmap->barrier_idle = timeout; return len; } static struct md_sysfs_entry llbitmap_barrier_idle = 
__ATTR_RW(barrier_idle); static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, &llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, NULL }; static struct attribute_group md_llbitmap_group = { .name = "llbitmap", .attrs = md_llbitmap_attrs, }; static struct bitmap_operations llbitmap_ops = { .head = { .type = MD_BITMAP, .id = ID_LLBITMAP, .name = "llbitmap", }, .enabled = llbitmap_enabled, .create = llbitmap_create, .resize = llbitmap_resize, .load = llbitmap_load, .destroy = llbitmap_destroy, .start_write = llbitmap_start_write, .end_write = llbitmap_end_write, .start_discard = llbitmap_start_discard, .end_discard = llbitmap_end_discard, .unplug = llbitmap_unplug, .flush = llbitmap_flush, .start_behind_write = llbitmap_start_behind_write, .end_behind_write = llbitmap_end_behind_write, .wait_behind_writes = llbitmap_wait_behind_writes, .blocks_synced = llbitmap_blocks_synced, .skip_sync_blocks = llbitmap_skip_sync_blocks, .start_sync = llbitmap_start_sync, .end_sync = llbitmap_end_sync, .close_sync = llbitmap_close_sync, .cond_end_sync = llbitmap_cond_end_sync, .update_sb = llbitmap_update_sb, .get_stats = llbitmap_get_stats, .dirty_bits = llbitmap_dirty_bits, .write_all = llbitmap_write_all, .group = &md_llbitmap_group, }; int md_llbitmap_init(void) { md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!md_llbitmap_io_wq) return -ENOMEM; md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!md_llbitmap_unplug_wq) { destroy_workqueue(md_llbitmap_io_wq); md_llbitmap_io_wq = NULL; return -ENOMEM; } return register_md_submodule(&llbitmap_ops.head); } void md_llbitmap_exit(void) { destroy_workqueue(md_llbitmap_io_wq); md_llbitmap_io_wq = NULL; destroy_workqueue(md_llbitmap_unplug_wq); md_llbitmap_unplug_wq = NULL; unregister_md_submodule(&llbitmap_ops.head); }