Diffstat (limited to 'drivers/md')
57 files changed, 7965 insertions, 521 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ddb37f6670de..104aa5355090 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -37,6 +37,32 @@ config BLK_DEV_MD If unsure, say N. +config MD_BITMAP + bool "MD RAID bitmap support" + default y + depends on BLK_DEV_MD + help + If you say Y here, support for the write intent bitmap will be + enabled. The bitmap can be used to optimize resync speed after power + failure or readding a disk, limiting it to recorded dirty sectors in + bitmap. + + This feature can be added to existing MD array or MD array can be + created with bitmap via mdadm(8). + + If unsure, say Y. + +config MD_LLBITMAP + bool "MD RAID lockless bitmap support" + depends on BLK_DEV_MD + help + If you say Y here, support for the lockless write intent bitmap will + be enabled. + + Note, this is an experimental feature. + + If unsure, say N. + config MD_AUTODETECT bool "Autodetect RAID arrays during kernel boot" depends on BLK_DEV_MD=y @@ -54,6 +80,7 @@ config MD_AUTODETECT config MD_BITMAP_FILE bool "MD bitmap file support (deprecated)" default y + depends on MD_BITMAP help If you say Y here, support for write intent bitmaps in files on an external file system is enabled. This is an alternative to the internal @@ -174,6 +201,7 @@ config MD_RAID456 config MD_CLUSTER tristate "Cluster Support for MD" + select MD_BITMAP depends on BLK_DEV_MD depends on DLM default n @@ -393,6 +421,7 @@ config DM_RAID select MD_RAID1 select MD_RAID10 select MD_RAID456 + select MD_BITMAP select BLK_DEV_MD help A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings @@ -659,4 +688,6 @@ config DM_AUDIT source "drivers/md/dm-vdo/Kconfig" +source "drivers/md/dm-pcache/Kconfig" + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 87bdfc9fe14c..c338cc6fbe2e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o -md-mod-y += md.o md-bitmap.o +md-mod-y += md.o +md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o +md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o linear-y += md-linear.o @@ -71,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_VDO) += dm-vdo/ +obj-$(CONFIG_DM_PCACHE) += dm-pcache/ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_EBS) += dm-ebs.o diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + 
bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..e6d28be11c5c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1337,12 +1337,12 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, char *ptr; unsigned int len; - bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + bio = bio_kmalloc(1, GFP_NOWAIT); if (!bio) { use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; @@ -1601,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct 
dm_bufio_client * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOWAIT: don't wait; if we need to sleep we'll release our - * mutex and wait ourselves. + * GFP_NOWAIT: don't wait and don't print a warning in case of + * failure; if we need to sleep we'll release our mutex + * and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves - * __GFP_NOWARN: don't print a warning in case of failure * * For debugging, if we set the cache size to 1, no new buffers will * be allocated. */ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC); if (b) return b; } diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 2ed894155cab..7e1e8cc0e33a 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); ht->hash_bits = __ffs(nr_buckets); - ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets))); + ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets)); if (!ht->buckets) return -ENOMEM; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index c889332e533b..a3c9f74fe2dc 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -162,6 +162,7 @@ struct mapped_device { #define DMF_SUSPENDED_INTERNALLY 7 #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 +#define DMF_QUEUE_STOPPED 10 static inline sector_t dm_get_size(struct mapped_device *md) { @@ -291,6 +292,7 @@ struct dm_io { struct dm_io *next; struct dm_stats_aux stats_aux; blk_status_t status; + bool requeue_flush_with_data; atomic_t io_count; struct mapped_device *md; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index cf17fd46e255..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 8b50c908c6f4..efb3cd4f9cd4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf) /* * Internal function to allocate memory for IMA measurements. 
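The bcache, dm-bufio and dm-flakey hunks above all perform the same mechanical conversion: callers that used to pass their own bio->bi_inline_vecs pointer to bio_init() now call bio_init_inline(), which locates the inline vector array itself, and the struct_size() allocations are open-coded, presumably because bi_inline_vecs is no longer meant to be referenced by name. A minimal sketch of the resulting pattern, assuming a hypothetical my_io structure; the bio_init_inline() signature is inferred from the call sites in this diff.

/* Sketch only: struct my_io and my_io_alloc() are hypothetical. */
#include <linux/bio.h>
#include <linux/slab.h>

struct my_io {
	/* driver-private fields ... */
	struct bio bio;		/* kept last: the inline bio_vecs trail the bio */
};

static struct my_io *my_io_alloc(struct block_device *bdev, unsigned short nr_vecs)
{
	struct my_io *io;

	/* Same open-coded sizing as the movinggc/writeback hunks above. */
	io = kzalloc(sizeof(*io) + nr_vecs * sizeof(struct bio_vec), GFP_KERNEL);
	if (!io)
		return NULL;

	/*
	 * Previously: bio_init(&io->bio, bdev, io->bio.bi_inline_vecs, nr_vecs, REQ_OP_READ);
	 * Now the helper finds the inline vectors on its own.
	 */
	bio_init_inline(&io->bio, bdev, nr_vecs, REQ_OP_READ);
	return io;
}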
*/ -static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +static void *dm_ima_alloc(size_t len, bool noio) { unsigned int noio_flag; void *ptr; @@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) if (noio) noio_flag = memalloc_noio_save(); - ptr = kzalloc(len, flags); + ptr = kzalloc(len, GFP_KERNEL); if (noio) memalloc_noio_restore(noio_flag); @@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_ char **dev_uuid, bool noio) { int r; - *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio); if (!(*dev_name)) { r = -ENOMEM; goto error; } - *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio); if (!(*dev_uuid)) { r = -ENOMEM; goto error; @@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de if (r) return r; - *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!(*device_data)) { r = -ENOMEM; goto error; @@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c capacity = get_capacity(md->disk); - *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio); + *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio); if (!(*capacity_str)) return -ENOMEM; - scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", - capacity); - - return 0; + return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", + capacity); } /* @@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; - ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); if (!ima_buf) return; - target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio); if (!target_metadata_buf) goto error; - target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio); if (!target_data_buf) goto error; @@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl shash->tfm = tfm; digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + digest = dm_ima_alloc(digest_size, noio); if (!digest) goto error; @@ -327,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (r < 0) goto error; - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio); + digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); if (!digest_buf) goto error; @@ -371,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) { char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char active[] = "active_table_hash="; - unsigned int active_len = strlen(active), capacity_len = 0; + unsigned int active_len = strlen(active); unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data 
= dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -445,8 +443,7 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) } if (nodata) { - r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); - if (r) + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, @@ -454,7 +451,6 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -483,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) unsigned int device_active_len = strlen(device_active_str); unsigned int device_inactive_len = strlen(device_inactive_str); unsigned int remove_all_len = strlen(remove_all_str); - unsigned int capacity_len = 0; unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio); if (!device_table_data) goto exit; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) { + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) { kfree(device_table_data); goto exit; } @@ -570,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) memcpy(device_table_data + l, remove_all ? 
"y;" : "n;", 2); l += 2; - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -602,20 +596,20 @@ exit: */ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) { - unsigned int l = 0, capacity_len = 0; + unsigned int l = 0; char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char inactive_str[] = "inactive_table_hash="; unsigned int inactive_len = strlen(inactive_str); bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error1; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -650,7 +644,6 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -703,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r, len; + int len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -712,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio)) goto error; - combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio); + combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio); if (!combined_device_data) goto error; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0) goto error; old_device_data = md->ima.active_table.device_metadata; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index efeee0a873c0..170bf67a2edd 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -133,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 @@ -219,10 +219,13 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; + bool internal_hash; int failed; - struct crypto_shash *internal_hash; + struct crypto_shash *internal_shash; + struct crypto_ahash *internal_ahash; + unsigned int internal_hash_digestsize; struct dm_target *ti; @@ -277,6 +280,9 @@ struct dm_integrity_c { bool fix_hmac; bool legacy_recalculate; + mempool_t ahash_req_pool; + struct ahash_request *journal_ahash_req; + struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; @@ -326,6 +332,8 @@ struct dm_integrity_io { unsigned payload_len; bool integrity_payload_from_mempool; bool integrity_range_locked; + + struct ahash_request *ahash_req; }; struct journal_completion { @@ -352,6 +360,7 @@ struct bitmap_block_status { static struct kmem_cache *journal_io_cache; #define 
JOURNAL_IO_MEMPOOL 32 +#define AHASH_MEMPOOL 32 #ifdef DEBUG_PRINT #define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) @@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio) dec_in_flight(dio); } -static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, - const char *data, char *result) +static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector, + const char *data, unsigned offset, char *result) { __le64 sector_le = cpu_to_le64(sector); - SHASH_DESC_ON_STACK(req, ic->internal_hash); + SHASH_DESC_ON_STACK(req, ic->internal_shash); int r; unsigned int digest_size; - req->tfm = ic->internal_hash; + req->tfm = ic->internal_shash; r = crypto_shash_init(req); if (unlikely(r < 0)) { @@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); + r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto failed; @@ -1676,7 +1685,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; if (unlikely(digest_size < ic->tag_size)) memset(result + digest_size, 0, ic->tag_size - digest_size); @@ -1687,6 +1696,104 @@ failed: get_random_bytes(result, ic->tag_size); } +static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, struct page *page, unsigned offset, char *result) +{ + __le64 sector_le = cpu_to_le64(sector); + struct ahash_request *req; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist sg[3], *s = sg; + int r; + unsigned int digest_size; + unsigned int nbytes = 0; + + might_sleep(); + + req = *ahash_req; + if (unlikely(!req)) { + req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO); + *ahash_req = req; + } + + ahash_request_set_tfm(req, ic->internal_ahash); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + sg_init_table(sg, 3); + sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE); + nbytes += SALT_SIZE; + s++; + } else { + sg_init_table(sg, 2); + } + + if (likely(!is_vmalloc_addr(§or_le))) { + sg_set_buf(s, §or_le, sizeof(sector_le)); + } else { + struct page *sec_page = vmalloc_to_page(§or_le); + unsigned int sec_off = offset_in_page(§or_le); + sg_set_page(s, sec_page, sizeof(sector_le), sec_off); + } + nbytes += sizeof(sector_le); + s++; + + sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset); + nbytes += ic->sectors_per_block << SECTOR_SHIFT; + + ahash_request_set_crypt(req, sg, result, nbytes); + + r = crypto_wait_req(crypto_ahash_digest(req), &wait); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_ahash_digest", r); + goto failed; + } + + digest_size = ic->internal_hash_digestsize; + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, const char *data, unsigned offset, char *result) +{ + if (likely(ic->internal_shash != NULL)) + 
integrity_sector_checksum_shash(ic, sector, data, offset, result); + else + integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result); +} + +static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p) +{ + if (likely(ic->internal_shash != NULL)) + return kmap_local_page(p); + else + return p; +} + +static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr) +{ + if (likely(ic->internal_shash != NULL)) + kunmap_local(ptr); +} + +static void *integrity_identity(struct dm_integrity_c *ic, void *data) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(offset_in_page(data)); + BUG_ON(!virt_addr_valid(data)); +#endif + if (likely(ic->internal_shash != NULL)) + return data; + else + return virt_to_page(data); +} + static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks sector_t alignment; char *mem; char *buffer = page_to_virt(page); + unsigned int buffer_offset; int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks alignment &= -alignment; io_loc.sector = round_down(io_loc.sector, alignment); io_loc.count += sector - io_loc.sector; - buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT; io_loc.count = round_up(io_loc.count, alignment); r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); @@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks goto free_ret; } - integrity_sector_checksum(ic, logical_sector, buffer, checksum); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum); r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, &dio->metadata_offset, ic->tag_size, TAG_CMP); if (r) { @@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks } mem = bvec_kmap_local(&bv); - memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT); kunmap_local(mem); pos += ic->sectors_per_block << SECTOR_SHIFT; @@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w) if (ic->internal_hash) { struct bvec_iter iter; struct bio_vec bv; - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; @@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w) char *mem, *checksums_ptr; again: - mem = bvec_kmap_local(&bv_copy); + mem = integrity_kmap(ic, bv_copy.bv_page); pos = 0; checksums_ptr = checksums; do { - integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr); checksums_ptr += ic->tag_size; sectors_to_process -= ic->sectors_per_block; pos += ic->sectors_per_block << SECTOR_SHIFT; sector += ic->sectors_per_block; } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); - kunmap_local(mem); + integrity_kunmap(ic, mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); @@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; dio->op = bio_op(bio); + dio->ahash_req = NULL; if (ic->mode == 'I') { bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); @@ -2071,19 +2180,6 @@ retry_kmap: js++; mem_ptr += 1 << SECTOR_SHIFT; } while (++s < ic->sectors_per_block); -#ifdef INTERNAL_VERIFY - if (ic->internal_hash) { - char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; - - integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { - DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - logical_sector); - dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", - bio, logical_sector, 0); - } - } -#endif } if (!ic->internal_hash) { @@ -2124,15 +2220,17 @@ retry_kmap: } while (++s < ic->sectors_per_block); if (ic->internal_hash) { - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); if (unlikely(digest_size > ic->tag_size)) { char checksums_onstack[HASH_MAX_DIGESTSIZE]; - integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack); memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else - integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je)); } journal_entry_set_sector(je, logical_sector); @@ -2428,7 +2526,7 @@ retry: if (!dio->integrity_payload) { unsigned digest_size, extra_size; dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; dio->payload_len += extra_size; dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -2505,11 +2603,11 @@ skip_spinlock: unsigned pos = 0; while (dio->bio_details.bi_iter.bi_size) { struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - const char *mem = bvec_kmap_local(&bv); + const char *mem = integrity_kmap(ic, bv.bv_page); if (ic->tag_size < ic->tuple_size) memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tuple_size); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos); - kunmap_local(mem); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos); + integrity_kunmap(ic, mem); pos += ic->tuple_size; bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); } @@ -2588,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) } bio_put(outgoing_bio); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2612,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_endio(bio); } +static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned pos = 0; + + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = integrity_kmap(ic, bv.bv_page); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, + min(ic->internal_hash_digestsize, ic->tag_size)))) { + integrity_kunmap(ic, mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return false; + } + integrity_kunmap(ic, mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + return true; +} + +static void dm_integrity_inline_async_check(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (likely(dm_integrity_check(ic, dio))) + bio_endio(bio); +} + static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) { struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); if (ic->mode == 'I') { - struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { - unsigned pos = 0; + if 
(dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && unlikely(dio->integrity_range_locked)) - goto skip_check; - while (dio->bio_details.bi_iter.bi_size) { - char digest[HASH_MAX_DIGESTSIZE]; - struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - char *mem = bvec_kmap_local(&bv); - //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, - min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { - kunmap_local(mem); - dm_integrity_free_payload(dio); - INIT_WORK(&dio->work, dm_integrity_inline_recheck); - queue_work(ic->offload_wq, &dio->work); + goto skip_check; + if (likely(ic->internal_shash != NULL)) { + if (unlikely(!dm_integrity_check(ic, dio))) return DM_ENDIO_INCOMPLETE; - } - kunmap_local(mem); - pos += ic->tuple_size; - bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } else { + INIT_WORK(&dio->work, dm_integrity_inline_async_check); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; } } skip_check: @@ -2646,6 +2769,8 @@ skip_check: if (unlikely(dio->integrity_range_locked)) remove_range(ic, &dio->range); } + if (unlikely(dio->ahash_req)) + mempool_free(dio->ahash_req, &ic->ahash_req_pool); return DM_ENDIO_DONE; } @@ -2902,9 +3027,12 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start #endif ic->internal_hash) { char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + struct journal_sector *js = access_journal_data(ic, i, l); + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); - integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), - (char *)access_journal_data(ic, i, l), test_tag); + integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block), + js_page, js_offset, test_tag); if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); @@ -2987,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -3001,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w) unsigned recalc_sectors = RECALC_SECTORS; retry: - recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO); + recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN); if (!recalc_buffer) { oom: recalc_sectors >>= 1; @@ -3011,11 +3140,11 @@ oom: goto free_ret; } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + if (ic->internal_hash_digestsize > ic->tag_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size; recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); if (!recalc_tags) { - vfree(recalc_buffer); + kfree(recalc_buffer); recalc_buffer = NULL; goto oom; } @@ 
-3081,7 +3210,7 @@ next_chunk: goto err; io_req.bi_opf = REQ_OP_READ; - io_req.mem.type = DM_IO_VMA; + io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = recalc_buffer; io_req.notify.fn = NULL; io_req.client = ic->io; @@ -3097,7 +3226,10 @@ next_chunk: t = recalc_tags; for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); + integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t); t += ic->tag_size; } @@ -3139,8 +3271,9 @@ unlock_ret: recalc_write_super(ic); free_ret: - vfree(recalc_buffer); + kfree(recalc_buffer); kvfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void integrity_recalc_inline(struct work_struct *w) @@ -3149,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct bio *bio; struct bio_integrity_payload *bip; @@ -3171,8 +3305,8 @@ oom: } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size; + if (ic->internal_hash_digestsize > ic->tuple_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size; recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN); if (!recalc_tags) { kfree(recalc_buffer); @@ -3217,8 +3351,11 @@ next_chunk: t = recalc_tags; for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); memset(t, 0, ic->tuple_size); - integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + integrity_sector_checksum(ic, &ahash_req, range.logical_sector + i, ptr_page, ptr_offset, t); t += ic->tuple_size; } @@ -3270,6 +3407,7 @@ unlock_ret: free_ret: kfree(recalc_buffer); kfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void bitmap_block_work(struct work_struct *w) @@ -4210,30 +4348,53 @@ nomem: return -ENOMEM; } -static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, - char *error_alg, char *error_key) +static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash, + struct alg_spec *a, char **error, char *error_alg, char *error_key) { int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(*hash)) { - *error = error_alg; - r = PTR_ERR(*hash); - *hash = NULL; - return r; - } - - if (a->key) { - r = crypto_shash_setkey(*hash, a->key, a->key_size); - if (r) { + if (shash) { + *shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*shash)) { + *shash = NULL; + goto try_ahash; + } + if (a->key) { + r = crypto_shash_setkey(*shash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) { *error = error_key; + return -ENOKEY; + } + return 0; + } +try_ahash: + if (ahash) { + *ahash = crypto_alloc_ahash(a->alg_string, 0, 
CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*ahash)) { + *error = error_alg; + r = PTR_ERR(*ahash); + *ahash = NULL; return r; } - } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { - *error = error_key; - return -ENOKEY; + if (a->key) { + r = crypto_ahash_setkey(*ahash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; + } + return 0; } + *error = error_alg; + return -ENOENT; } return 0; @@ -4690,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv buffer_sectors = 1; ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); - r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) goto bad; + if (ic->internal_shash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash); + } + if (ic->internal_ahash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash); + r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL, + sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash)); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + } - r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error, "Invalid journal mac", "Error setting journal mac key"); if (r) goto bad; @@ -4706,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv r = -EINVAL; goto bad; } - ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + ic->tag_size = ic->internal_hash_digestsize; } if (ic->tag_size > MAX_TAG_SIZE) { ti->error = "Too big tag size"; @@ -5178,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool); + mempool_exit(&ic->ahash_req_pool); bioset_exit(&ic->recalc_bios); bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); @@ -5215,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti) if (ic->sb) free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); - if (ic->internal_hash) - crypto_free_shash(ic->internal_hash); + if (ic->internal_shash) + crypto_free_shash(ic->internal_shash); + if (ic->internal_ahash) + crypto_free_ahash(ic->internal_ahash); free_alg(&ic->internal_hash_alg); if (ic->journal_crypt) @@ -5233,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 679b07dee229..7bb7174f8f4f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc) } /* - * Super sector should be writen in-order, otherwise the + * Super sector should be written in-order, otherwise the * nr_entries could be rewritten incorrectly by an old bio. 
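The dm-integrity rework above splits the internal hash into a shash path (linear, kmapped buffer) and an ahash fallback (scatterlist plus completion wait), with get_mac() trying crypto_alloc_shash() first and integrity_sector_checksum() dispatching on whichever tfm was obtained. A condensed sketch of that dispatch, assuming a hypothetical hash_ctx holding the two tfms and a preallocated ahash request (the patch keeps these in struct dm_integrity_c and a mempool); the crypto calls themselves are standard kernel crypto API.

/* Sketch: hash_ctx and block_checksum() are illustrative stand-ins. */
#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

struct hash_ctx {
	struct crypto_shash *shash;	/* synchronous implementation, may be NULL */
	struct crypto_ahash *ahash;	/* fallback, possibly offloaded */
};

static int block_checksum(struct hash_ctx *h, struct ahash_request *req,
			  struct page *page, unsigned int offset,
			  unsigned int len, u8 *out)
{
	if (h->shash) {
		/* shash wants a virtual address, so map the page */
		void *virt = kmap_local_page(page);
		int r = crypto_shash_tfm_digest(h->shash, virt + offset, len, out);

		kunmap_local(virt);
		return r;
	} else {
		/* ahash works on a scatterlist, no mapping needed */
		DECLARE_CRYPTO_WAIT(wait);
		struct scatterlist sg;

		sg_init_table(&sg, 1);
		sg_set_page(&sg, page, len, offset);
		ahash_request_set_tfm(req, h->ahash);
		ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
					   crypto_req_done, &wait);
		ahash_request_set_crypt(req, &sg, out, len);
		return crypto_wait_req(crypto_ahash_digest(req), &wait);
	}
}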
*/ wait_for_completion_io(&lc->super_done); diff --git a/drivers/md/dm-pcache/Kconfig b/drivers/md/dm-pcache/Kconfig new file mode 100644 index 000000000000..0e251eca892e --- /dev/null +++ b/drivers/md/dm-pcache/Kconfig @@ -0,0 +1,17 @@ +config DM_PCACHE + tristate "Persistent cache for Block Device (Experimental)" + depends on BLK_DEV_DM + depends on DEV_DAX + help + PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory, + DAX-enabled devices) as a high-performance cache layer in front of + traditional block devices such as SSDs or HDDs. + + PCACHE is implemented as a kernel module that integrates with the block + layer and supports direct access (DAX) to persistent memory for low-latency, + byte-addressable caching. + + Note: This feature is experimental and should be tested thoroughly + before use in production environments. + + If unsure, say 'N'. diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile new file mode 100644 index 000000000000..86776e4acad2 --- /dev/null +++ b/drivers/md/dm-pcache/Makefile @@ -0,0 +1,3 @@ +dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o + +obj-m += dm-pcache.o diff --git a/drivers/md/dm-pcache/backing_dev.c b/drivers/md/dm-pcache/backing_dev.c new file mode 100644 index 000000000000..7165fc0364bb --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blkdev.h> + +#include "../dm-core.h" +#include "pcache_internal.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static struct kmem_cache *backing_req_cache; +static struct kmem_cache *backing_bvec_cache; + +static void backing_dev_exit(struct pcache_backing_dev *backing_dev) +{ + mempool_exit(&backing_dev->req_pool); + mempool_exit(&backing_dev->bvec_pool); +} + +static void req_submit_fn(struct work_struct *work); +static void req_complete_fn(struct work_struct *work); +static int backing_dev_init(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache); + if (ret) + goto err; + + ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache); + if (ret) + goto req_pool_exit; + + INIT_LIST_HEAD(&backing_dev->submit_list); + INIT_LIST_HEAD(&backing_dev->complete_list); + spin_lock_init(&backing_dev->submit_lock); + spin_lock_init(&backing_dev->complete_lock); + INIT_WORK(&backing_dev->req_submit_work, req_submit_fn); + INIT_WORK(&backing_dev->req_complete_work, req_complete_fn); + atomic_set(&backing_dev->inflight_reqs, 0); + init_waitqueue_head(&backing_dev->inflight_wq); + + return 0; + +req_pool_exit: + mempool_exit(&backing_dev->req_pool); +err: + return ret; +} + +int backing_dev_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = backing_dev_init(pcache); + if (ret) + return ret; + + backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev); + + return 0; +} + +void backing_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + + /* + * There should not be any new request comming, just wait + * inflight requests done. 
+ */ + wait_event(backing_dev->inflight_wq, + atomic_read(&backing_dev->inflight_reqs) == 0); + + flush_work(&backing_dev->req_submit_work); + flush_work(&backing_dev->req_complete_work); + + backing_dev_exit(backing_dev); +} + +/* pcache_backing_dev_req functions */ +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (backing_req->end_req) + backing_req->end_req(backing_req, backing_req->ret); + + switch (backing_req->type) { + case BACKING_DEV_REQ_TYPE_REQ: + if (backing_req->req.upper_req) + pcache_req_put(backing_req->req.upper_req, backing_req->ret); + break; + case BACKING_DEV_REQ_TYPE_KMEM: + if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs) + mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool); + break; + default: + BUG(); + } + + mempool_free(backing_req, &backing_dev->req_pool); + + if (atomic_dec_and_test(&backing_dev->inflight_reqs)) + wake_up(&backing_dev->inflight_wq); +} + +static void req_complete_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock_irq(&backing_dev->complete_lock); + list_splice_init(&backing_dev->complete_list, &tmp_list); + spin_unlock_irq(&backing_dev->complete_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } +} + +static void backing_dev_bio_end(struct bio *bio) +{ + struct pcache_backing_dev_req *backing_req = bio->bi_private; + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + unsigned long flags; + + backing_req->ret = blk_status_to_errno(bio->bi_status); + + spin_lock_irqsave(&backing_dev->complete_lock, flags); + list_move_tail(&backing_req->node, &backing_dev->complete_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work); + spin_unlock_irqrestore(&backing_dev->complete_lock, flags); +} + +static void req_submit_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock(&backing_dev->submit_lock); + list_splice_init(&backing_dev->submit_list, &tmp_list); + spin_unlock(&backing_dev->submit_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + submit_bio_noacct(&backing_req->bio); + } +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (direct) { + submit_bio_noacct(&backing_req->bio); + return; + } + + spin_lock(&backing_dev->submit_lock); + list_add_tail(&backing_req->node, &backing_dev->submit_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work); + spin_unlock(&backing_dev->submit_lock); +} + +static void bio_map(struct bio *bio, void *base, size_t size) +{ + struct page *page; + unsigned int offset; + unsigned int len; + + if (!is_vmalloc_addr(base)) { + page = virt_to_page(base); + offset = offset_in_page(base); + + BUG_ON(!bio_add_page(bio, page, size, offset)); + return; + } + + flush_kernel_vmap_range(base, 
size); + while (size) { + page = vmalloc_to_page(base); + offset = offset_in_page(base); + len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct pcache_backing_dev_req *backing_req; + struct bio *orig = pcache_req->bio; + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask); + + backing_req->type = BACKING_DEV_REQ_TYPE_REQ; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; +} + +static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len); + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) { + backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask); + if (!backing_req->kmem.bvecs) + goto free_backing_req; + } else { + backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs; + } + + backing_req->kmem.n_vecs = n_vecs; + backing_req->type = BACKING_DEV_REQ_TYPE_KMEM; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; + +free_backing_req: + mempool_free(backing_req, &backing_dev->req_pool); + return NULL; +} + +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_alloc(backing_dev, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_alloc(backing_dev, opts); + + BUG(); +} + +static void req_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct bio *clone; + u32 off = opts->req.req_off; + u32 len = opts->req.len; + + clone = &backing_req->bio; + BUG_ON(off & SECTOR_MASK); + BUG_ON(len & SECTOR_MASK); + bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT); + + clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT; + clone->bi_private = backing_req; + clone->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + + pcache_req_get(pcache_req); + backing_req->req.upper_req = pcache_req; + backing_req->req.bio_off = off; +} + +static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + struct bio *backing_bio; + + bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs, + backing_req->kmem.n_vecs, opts->kmem.opf); + + backing_bio = &backing_req->bio; + bio_map(backing_bio, opts->kmem.data, opts->kmem.len); + + backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT; 
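For reference, the kmem-type request being initialized here is how the cache layer does I/O between its own kernel buffers and the backing device. A hedged usage sketch built only from the helpers this patch declares in backing_dev.h (backing_dev_req_create(), backing_dev_req_submit() and the opts structure); write_buf_to_backing() and my_done() are made-up names for illustration.

/* Illustrative only: the opts fields and helpers come from this patch. */
static void my_done(struct pcache_backing_dev_req *backing_req, int ret)
{
	/* ret is the errno derived from the bio status in backing_dev_bio_end() */
	pr_debug("backing write finished: %d\n", ret);
}

static int write_buf_to_backing(struct pcache_backing_dev *backing_dev,
				void *buf, u32 len, u64 backing_off)
{
	struct pcache_backing_dev_req *req;
	struct pcache_backing_dev_req_opts opts = {
		.type		= BACKING_DEV_REQ_TYPE_KMEM,
		.gfp_mask	= GFP_NOIO,
		.end_fn		= my_done,
		.kmem = {
			.data		= buf,	/* kmalloc'd or vmalloc'd; bio_map() handles both */
			.opf		= REQ_OP_WRITE,
			.len		= len,
			.backing_off	= backing_off,	/* byte offset on the backing device */
		},
	};

	req = backing_dev_req_create(backing_dev, &opts);
	if (!req)
		return -ENOMEM;

	/* direct=false defers the submit to the req_submit_work worker */
	backing_dev_req_submit(req, false);
	return 0;
}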
+ backing_bio->bi_private = backing_req; + backing_bio->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + backing_req->priv_data = opts->priv_data; +} + +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_init(backing_req, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_init(backing_req, opts); + + BUG(); +} + +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + + backing_req = backing_dev_req_alloc(backing_dev, opts); + if (!backing_req) + return NULL; + + backing_dev_req_init(backing_req, opts); + + return backing_req; +} + +void backing_dev_flush(struct pcache_backing_dev *backing_dev) +{ + blkdev_issue_flush(backing_dev->dm_dev->bdev); +} + +int pcache_backing_init(void) +{ + u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1; + int ret; + + backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0); + if (!backing_req_cache) { + ret = -ENOMEM; + goto err; + } + + backing_bvec_cache = kmem_cache_create("pcache-bvec-slab", + max_bvecs * sizeof(struct bio_vec), + 0, 0, NULL); + if (!backing_bvec_cache) { + ret = -ENOMEM; + goto destroy_req_cache; + } + + return 0; +destroy_req_cache: + kmem_cache_destroy(backing_req_cache); +err: + return ret; +} + +void pcache_backing_exit(void) +{ + kmem_cache_destroy(backing_bvec_cache); + kmem_cache_destroy(backing_req_cache); +} diff --git a/drivers/md/dm-pcache/backing_dev.h b/drivers/md/dm-pcache/backing_dev.h new file mode 100644 index 000000000000..b371cba483b9 --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _BACKING_DEV_H +#define _BACKING_DEV_H + +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +struct pcache_backing_dev_req; +typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret); + +#define BACKING_DEV_REQ_TYPE_REQ 1 +#define BACKING_DEV_REQ_TYPE_KMEM 2 + +#define BACKING_DEV_REQ_INLINE_BVECS 4 + +struct pcache_request; +struct pcache_backing_dev_req { + u8 type; + struct bio bio; + struct pcache_backing_dev *backing_dev; + + void *priv_data; + backing_req_end_fn_t end_req; + + struct list_head node; + int ret; + + union { + struct { + struct pcache_request *upper_req; + u32 bio_off; + } req; + struct { + struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS]; + struct bio_vec *bvecs; + u32 n_vecs; + } kmem; + }; +}; + +struct pcache_backing_dev { + struct pcache_cache *cache; + + struct dm_dev *dm_dev; + mempool_t req_pool; + mempool_t bvec_pool; + + struct list_head submit_list; + spinlock_t submit_lock; + struct work_struct req_submit_work; + + struct list_head complete_list; + spinlock_t complete_lock; + struct work_struct req_complete_work; + + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; + + u64 dev_size; +}; + +struct dm_pcache; +int backing_dev_start(struct dm_pcache *pcache); +void backing_dev_stop(struct dm_pcache *pcache); + +struct pcache_backing_dev_req_opts { + u32 type; + union { + struct { + struct pcache_request *upper_req; + u32 req_off; + u32 len; + } req; + struct { + void *data; + blk_opf_t opf; + u32 len; + u64 backing_off; + } kmem; + }; + + gfp_t gfp_mask; + backing_req_end_fn_t end_fn; + void 
*priv_data; +}; + +static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len) +{ + const void *p = data; + u32 done = 0, in_page, to_advance; + struct page *first_page, *next_page; + + if (!is_vmalloc_addr(data)) + return len; + + first_page = vmalloc_to_page(p); +advance: + in_page = PAGE_SIZE - offset_in_page(p); + to_advance = min_t(u32, in_page, len - done); + + done += to_advance; + p += to_advance; + + if (done == len) + return done; + + next_page = vmalloc_to_page(p); + if (zone_device_pages_have_same_pgmap(first_page, next_page)) + goto advance; + + return done; +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct); +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req); +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_flush(struct pcache_backing_dev *backing_dev); + +int pcache_backing_init(void); +void pcache_backing_exit(void); +#endif /* _BACKING_DEV_H */ diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c new file mode 100644 index 000000000000..d8e92367d947 --- /dev/null +++ b/drivers/md/dm-pcache/cache.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blk_types.h> + +#include "cache.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +struct kmem_cache *key_cache; + +static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) +{ + return cache->cache_info_addr + cache->info_index; +} + +static void cache_info_write(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq++; + cache_info->header.crc = pcache_meta_crc(&cache_info->header, + sizeof(struct pcache_cache_info)); + + memcpy_flushcache(get_cache_info_addr(cache), cache_info, + sizeof(struct pcache_cache_info)); + + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_info_init_default(struct pcache_cache *cache); +static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_info *cache_info_addr; + + cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header, + sizeof(struct pcache_cache_info), + PCACHE_CACHE_INFO_SIZE, + &cache->cache_info); + if (IS_ERR(cache_info_addr)) + return PTR_ERR(cache_info_addr); + + if (cache_info_addr) { + if (opts->data_crc != + (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) { + pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s", + opts->data_crc ? "true" : "false", + cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? 
"true" : "false"); + return -EINVAL; + } + + return 0; + } + + /* init cache_info for new cache */ + cache_info_init_default(cache); + cache_mode_set(cache, opts->cache_mode); + if (opts->data_crc) + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC; + + return 0; +} + +static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent) +{ + cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK; + cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent) +{ + if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN) + return -EINVAL; + + mutex_lock(&cache->cache_info_lock); + cache_info_set_gc_percent(&cache->cache_info, percent); + + cache_info_write(cache); + mutex_unlock(&cache->cache_info_lock); + + return 0; +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia_base, + struct pcache_cache_pos *pos, u64 seq, u32 *index) +{ + struct pcache_cache_pos_onmedia pos_onmedia; + struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index; + + pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id; + pos_onmedia.seg_off = pos->seg_off; + pos_onmedia.header.seq = seq; + pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); + pmem_wmb(); + + *index = (*index + 1) % PCACHE_META_INDEX_MAX; +} + +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index) +{ + struct pcache_cache_pos_onmedia latest, *latest_addr; + + latest_addr = pcache_meta_find_latest(&pos_onmedia->header, + sizeof(struct pcache_cache_pos_onmedia), + sizeof(struct pcache_cache_pos_onmedia), + &latest); + if (IS_ERR(latest_addr)) + return PTR_ERR(latest_addr); + + if (!latest_addr) + return -EIO; + + pos->cache_seg = &cache->segments[latest.cache_seg_id]; + pos->seg_off = latest.seg_off; + *seq = latest.header.seq; + *index = (latest_addr - pos_onmedia); + + return 0; +} + +static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id) +{ + cache->cache_info.seg_id = seg_id; +} + +static int cache_init(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + int ret; + + cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL); + if (!cache->segments) { + ret = -ENOMEM; + goto err; + } + + cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache->seg_map) { + ret = -ENOMEM; + goto free_segments; + } + + cache->backing_dev = backing_dev; + cache->cache_dev = &pcache->cache_dev; + cache->n_segs = cache_dev->seg_num; + atomic_set(&cache->gc_errors, 0); + spin_lock_init(&cache->seg_map_lock); + spin_lock_init(&cache->key_head_lock); + + mutex_init(&cache->cache_info_lock); + mutex_init(&cache->key_tail_lock); + mutex_init(&cache->dirty_tail_lock); + mutex_init(&cache->writeback_lock); + + INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn); + INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn); + INIT_WORK(&cache->clean_work, clean_fn); + + return 0; + +free_segments: + kvfree(cache->segments); +err: + return ret; +} + +static void cache_exit(struct pcache_cache 
*cache) +{ + kvfree(cache->seg_map); + kvfree(cache->segments); +} + +static void cache_info_init_default(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq = 0; + cache_info->n_segs = cache->cache_dev->seg_num; + cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT); +} + +static int cache_tail_init(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + + if (new_cache) { + __set_bit(0, cache->seg_map); + + cache->key_head.cache_seg = &cache->segments[0]; + cache->key_head.seg_off = 0; + cache_pos_copy(&cache->key_tail, &cache->key_head); + cache_pos_copy(&cache->dirty_tail, &cache->key_head); + + cache_encode_dirty_tail(cache); + cache_encode_key_tail(cache); + } else { + if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) { + pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n"); + return -EIO; + } + } + + return 0; +} + +static int get_seg_id(struct pcache_cache *cache, + struct pcache_cache_segment *prev_cache_seg, + bool new_cache, u32 *seg_id) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_dev *cache_dev = cache->cache_dev; + int ret; + + if (new_cache) { + ret = cache_dev_get_empty_segment_id(cache_dev, seg_id); + if (ret) { + pcache_dev_err(pcache, "no available segment\n"); + goto err; + } + + if (prev_cache_seg) + cache_seg_set_next_seg(prev_cache_seg, *seg_id); + else + cache_info_set_seg_id(cache, *seg_id); + } else { + if (prev_cache_seg) { + struct pcache_segment_info *prev_seg_info; + + prev_seg_info = &prev_cache_seg->cache_seg_info; + if (!segment_info_has_next(prev_seg_info)) { + ret = -EFAULT; + goto err; + } + *seg_id = prev_cache_seg->cache_seg_info.next_seg; + } else { + *seg_id = cache->cache_info.seg_id; + } + } + return 0; +err: + return ret; +} + +static int cache_segs_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *prev_cache_seg = NULL; + struct pcache_cache_info *cache_info = &cache->cache_info; + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + u32 seg_id; + int ret; + u32 i; + + for (i = 0; i < cache_info->n_segs; i++) { + ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id); + if (ret) + goto err; + + ret = cache_seg_init(cache, seg_id, i, new_cache); + if (ret) + goto err; + + prev_cache_seg = &cache->segments[i]; + } + return 0; +err: + return ret; +} + +static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + u32 n_subtrees; + int ret; + u32 i, cpu; + + /* Calculate number of cache trees based on the device size */ + n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE); + ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees); + if (ret) + goto err; + + cache->n_ksets = n_paral; + cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL); + if (!cache->ksets) { + ret = -ENOMEM; + goto req_tree_exit; + } + + /* + * Initialize each kset with a spinlock and delayed work for flushing. + * Each kset is associated with one queue to ensure independent handling + * of cache keys across multiple queues, maximizing multiqueue concurrency. 
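+ *
+ * As a rough illustration of the mapping (see get_kset_id()), a key is
+ * routed to a kset by its logical offset:
+ *
+ *	kset_id = (key->off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT) % n_paral;
+ *
+ * so with n_paral == 4 (for example, four online CPUs), writes to
+ * neighbouring 4MB subtrees go to different ksets and are flushed
+ * independently of each other.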
+ */ + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + kset->cache = cache; + spin_lock_init(&kset->kset_lock); + INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn); + } + + cache->data_heads = alloc_percpu(struct pcache_cache_data_head); + if (!cache->data_heads) { + ret = -ENOMEM; + goto free_kset; + } + + for_each_possible_cpu(cpu) { + struct pcache_cache_data_head *h = + per_cpu_ptr(cache->data_heads, cpu); + h->head_pos.cache_seg = NULL; + } + + /* + * Replay persisted cache keys using cache_replay. + * This function loads and replays cache keys from previously stored + * ksets, allowing the cache to restore its state after a restart. + */ + ret = cache_replay(cache); + if (ret) { + pcache_dev_err(pcache, "failed to replay keys\n"); + goto free_heads; + } + + return 0; + +free_heads: + free_percpu(cache->data_heads); +free_kset: + kvfree(cache->ksets); +req_tree_exit: + cache_tree_exit(&cache->req_key_tree); +err: + return ret; +} + +static void cache_destroy_req_keys(struct pcache_cache *cache) +{ + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + cancel_delayed_work_sync(&kset->flush_work); + } + + free_percpu(cache->data_heads); + kvfree(cache->ksets); + cache_tree_exit(&cache->req_key_tree); +} + +int pcache_cache_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + struct pcache_cache_options *opts = &pcache->opts; + int ret; + + ret = cache_init(pcache); + if (ret) + return ret; + + cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev); + cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev); + backing_dev->cache = cache; + cache->dev_size = backing_dev->dev_size; + + ret = cache_info_init(cache, opts); + if (ret) + goto cache_exit; + + ret = cache_segs_init(cache); + if (ret) + goto cache_exit; + + ret = cache_tail_init(cache); + if (ret) + goto cache_exit; + + ret = cache_init_req_keys(cache, num_online_cpus()); + if (ret) + goto cache_exit; + + ret = cache_writeback_init(cache); + if (ret) + goto destroy_keys; + + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE; + cache_info_write(cache); + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0); + + return 0; + +destroy_keys: + cache_destroy_req_keys(cache); +cache_exit: + cache_exit(cache); + + return ret; +} + +void pcache_cache_stop(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + cache_flush(cache); + + cancel_delayed_work_sync(&cache->gc_work); + flush_work(&cache->clean_work); + cache_writeback_exit(cache); + + if (cache->req_key_tree.n_subtrees) + cache_destroy_req_keys(cache); + + cache_exit(cache); +} + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + return pcache->task_wq; +} + +int pcache_cache_init(void) +{ + key_cache = KMEM_CACHE(pcache_cache_key, 0); + if (!key_cache) + return -ENOMEM; + + return 0; +} + +void pcache_cache_exit(void) +{ + kmem_cache_destroy(key_cache); +} diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h new file mode 100644 index 000000000000..1136d86958c8 --- /dev/null +++ b/drivers/md/dm-pcache/cache.h @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_H +#define _PCACHE_CACHE_H + +#include "segment.h" + +/* Garbage collection thresholds */ +#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* Minimum GC 
percentage */ +#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */ +#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */ + +#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB total tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for tree size */ + +/* Maximum number of keys per key set */ +#define PCACHE_KSET_KEYS_MAX 128 +#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */ +#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX) +#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX) + +/* Maximum number of keys to clean in one round of clean_work */ +#define PCACHE_CLEAN_KEYS_MAX 10 + +/* Writeback and garbage collection intervals in jiffies */ +#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ) +#define PCACHE_CACHE_GC_INTERVAL (5 * HZ) + +/* Macro to get the cache key structure from an rb_node pointer */ +#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node)) + +struct pcache_cache_pos_onmedia { + struct pcache_meta_header header; + __u32 cache_seg_id; + __u32 seg_off; +}; + +/* Offset and size definitions for cache segment control */ +#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX) +#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB) + +struct pcache_cache_seg_gen { + struct pcache_meta_header header; + __u64 gen; +}; + +/* Control structure for cache segments */ +struct pcache_cache_seg_ctrl { + struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX]; + __u64 res[64]; +}; + +#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0) +#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1) + +#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2) +#define PCACHE_CACHE_MODE_WRITEBACK 0 +#define PCACHE_CACHE_MODE_WRITETHROUGH 1 +#define PCACHE_CACHE_MODE_WRITEAROUND 2 +#define PCACHE_CACHE_MODE_WRITEONLY 3 + +#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6) + +struct pcache_cache_info { + struct pcache_meta_header header; + __u32 seg_id; + __u32 n_segs; + __u32 flags; + __u32 reserved; +}; + +struct pcache_cache_pos { + struct pcache_cache_segment *cache_seg; + u32 seg_off; +}; + +struct pcache_cache_segment { + struct pcache_cache *cache; + u32 cache_seg_id; /* Index in cache->segments */ + struct pcache_segment segment; + atomic_t refs; + + struct pcache_segment_info cache_seg_info; + struct mutex info_lock; + u32 info_index; + + spinlock_t gen_lock; + u64 gen; + u64 gen_seq; + u32 gen_index; + + struct pcache_cache_seg_ctrl *cache_seg_ctrl; +}; + +/* rbtree for cache entries */ +struct pcache_cache_subtree { + struct rb_root root; + spinlock_t tree_lock; +}; + +struct pcache_cache_tree { + struct pcache_cache *cache; + u32 n_subtrees; + mempool_t key_pool; + struct pcache_cache_subtree *subtrees; +}; + +extern struct kmem_cache *key_cache; + +struct pcache_cache_key { + struct pcache_cache_tree *cache_tree; + struct pcache_cache_subtree *cache_subtree; + struct kref ref; + struct rb_node rb_node; + struct list_head list_node; + u64 off; + u32 len; + u32 flags; + struct pcache_cache_pos cache_pos; + u64 seg_gen; +}; + +#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0) +#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1) + +struct pcache_cache_key_onmedia { + __u64 off; + __u32 len; + __u32 flags; + __u32 cache_seg_id; + __u32 cache_seg_off; + __u64 seg_gen; + __u32 
data_crc; + __u32 reserved; +}; + +struct pcache_cache_kset_onmedia { + __u32 crc; + union { + __u32 key_num; + __u32 next_cache_seg_id; + }; + __u64 magic; + __u64 flags; + struct pcache_cache_key_onmedia data[]; +}; + +struct pcache_cache { + struct pcache_backing_dev *backing_dev; + struct pcache_cache_dev *cache_dev; + struct pcache_cache_ctrl *cache_ctrl; + u64 dev_size; + + struct pcache_cache_data_head __percpu *data_heads; + + spinlock_t key_head_lock; + struct pcache_cache_pos key_head; + u32 n_ksets; + struct pcache_cache_kset *ksets; + + struct mutex key_tail_lock; + struct pcache_cache_pos key_tail; + u64 key_tail_seq; + u32 key_tail_index; + + struct mutex dirty_tail_lock; + struct pcache_cache_pos dirty_tail; + u64 dirty_tail_seq; + u32 dirty_tail_index; + + struct pcache_cache_tree req_key_tree; + struct work_struct clean_work; + + struct mutex writeback_lock; + char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct pcache_cache_tree writeback_key_tree; + struct delayed_work writeback_work; + struct { + atomic_t pending; + u32 advance; + int ret; + } writeback_ctx; + + char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct delayed_work gc_work; + atomic_t gc_errors; + + struct mutex cache_info_lock; + struct pcache_cache_info cache_info; + struct pcache_cache_info *cache_info_addr; + u32 info_index; + + u32 n_segs; + unsigned long *seg_map; + u32 last_cache_seg; + bool cache_full; + spinlock_t seg_map_lock; + struct pcache_cache_segment *segments; +}; + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache); + +struct dm_pcache; +struct pcache_cache_options { + u32 cache_mode:4; + u32 data_crc:1; +}; +int pcache_cache_start(struct dm_pcache *pcache); +void pcache_cache_stop(struct dm_pcache *pcache); + +struct pcache_cache_ctrl { + /* Updated by gc_thread */ + struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX]; + + /* Updated by writeback_thread */ + struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX]; +}; + +struct pcache_cache_data_head { + struct pcache_cache_pos head_pos; +}; + +static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent); + +/* cache key */ +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask); +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key); +void cache_key_get(struct pcache_cache_key *key); +void cache_key_put(struct pcache_cache_key *key); +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close); +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup); +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key); +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len); + +#define PCACHE_KSET_FLAGS_LAST BIT(0) +#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL + +struct pcache_cache_kset { + struct pcache_cache *cache; + spinlock_t kset_lock; + struct delayed_work flush_work; + struct pcache_cache_kset_onmedia kset_onmedia; +}; + +extern struct pcache_cache_kset_onmedia pcache_empty_kset; + +#define SUBTREE_WALK_RET_OK 0 +#define SUBTREE_WALK_RET_ERR 1 +#define SUBTREE_WALK_RET_NEED_KEY 2 +#define SUBTREE_WALK_RET_NEED_REQ 3 +#define SUBTREE_WALK_RET_RESEARCH 4 + +struct 
pcache_cache_subtree_walk_ctx { + struct pcache_cache_tree *cache_tree; + struct rb_node *start_node; + struct pcache_request *pcache_req; + struct pcache_cache_key *key; + u32 req_done; + int ret; + + /* pre-allocated key and backing_dev_req */ + struct pcache_cache_key *pre_alloc_key; + struct pcache_backing_dev_req *pre_alloc_req; + + struct list_head *delete_key_list; + struct list_head *submit_req_list; + + /* + * |--------| key_tmp + * |====| key + */ + int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------| key_tmp + * |=====| key + */ + int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------------| key_tmp + * |===========| key + */ + int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |--------| key_tmp + * |==========| key + */ + int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----| key_tmp + * |==========| key + */ + int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |-----------| key_tmp + * |====| key + */ + int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret); + bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx); +}; + +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx); +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list); +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset); +void clean_fn(struct work_struct *work); +void kset_flush_fn(struct work_struct *work); +int cache_replay(struct pcache_cache *cache); +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees); +void cache_tree_clear(struct pcache_cache_tree *cache_tree); +void cache_tree_exit(struct pcache_cache_tree *cache_tree); + +/* cache segments */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache); +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache); +void cache_seg_get(struct pcache_cache_segment *cache_seg); +void cache_seg_put(struct pcache_cache_segment *cache_seg); +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id); + +/* cache request*/ +int cache_flush(struct pcache_cache *cache); +void miss_read_end_work_fn(struct work_struct *work); +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req); + +/* gc */ +void pcache_cache_gc_fn(struct work_struct *work); + +/* writeback */ +void cache_writeback_exit(struct pcache_cache *cache); +int cache_writeback_init(struct pcache_cache *cache); +void cache_writeback_fn(struct work_struct *work); + +/* inline functions */ +static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off) +{ + if (cache_tree->n_subtrees == 1) + return &cache_tree->subtrees[0]; + + return &cache_tree->subtrees[off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT]; +} + +static inline 
void *cache_pos_addr(struct pcache_cache_pos *pos) +{ + return (pos->cache_seg->segment.data + pos->seg_off); +} + +static inline void *get_key_head_addr(struct pcache_cache *cache) +{ + return cache_pos_addr(&cache->key_head); +} + +static inline u32 get_kset_id(struct pcache_cache *cache, u64 off) +{ + u32 kset_id; + + div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id); + + return kset_id; +} + +static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id) +{ + return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id; +} + +static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache) +{ + return this_cpu_ptr(cache->data_heads); +} + +static inline bool cache_key_empty(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY; +} + +static inline bool cache_key_clean(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN; +} + +static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src) +{ + memcpy(dst, src, sizeof(struct pcache_cache_pos)); +} + +/** + * cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment. + * @cache_seg_id: ID of the cache segment. + * + * Returns true if the cache segment ID corresponds to a cache ctrl segment. + * + * Note: We extend the segment control of the first cache segment + * (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl) + * for the entire PCACHE cache. This function determines whether the given + * cache segment is the one storing the pcache_cache_ctrl information. + */ +static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id) +{ + return (cache_seg_id == 0); +} + +/** + * cache_key_cutfront - Cuts a specified length from the front of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the front. + * + * Advances the cache key position by cut_len and adjusts offset and length accordingly. + */ +static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len) +{ + if (key->cache_pos.cache_seg) + cache_pos_advance(&key->cache_pos, cut_len); + + key->off += cut_len; + key->len -= cut_len; +} + +/** + * cache_key_cutback - Cuts a specified length from the back of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the back. + * + * Reduces the length of the cache key by cut_len. + */ +static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len) +{ + key->len -= cut_len; +} + +static inline void cache_key_delete(struct pcache_cache_key *key) +{ + struct pcache_cache_subtree *cache_subtree; + + cache_subtree = key->cache_subtree; + BUG_ON(!cache_subtree); + + rb_erase(&key->rb_node, &cache_subtree->root); + key->flags = 0; + cache_key_put(key); +} + +static inline bool cache_data_crc_on(struct pcache_cache *cache) +{ + return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC); +} + +static inline u32 cache_mode_get(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags); +} + +static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode) +{ + cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK; + cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode); +} + +/** + * cache_key_data_crc - Calculates CRC for data in a cache key. + * @key: Pointer to the pcache_cache_key structure. 
+ * + * Returns the CRC-32 checksum of the data within the cache key's position. + */ +static inline u32 cache_key_data_crc(struct pcache_cache_key *key) +{ + void *data; + + data = cache_pos_addr(&key->cache_pos); + + return crc32c(PCACHE_CRC_SEED, data, key->len); +} + +static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + u32 crc_size; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) + crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4; + else + crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4; + + return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size); +} + +static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num); +} + +/** + * cache_seg_remain - Computes remaining space in a cache segment. + * @pos: Pointer to pcache_cache_pos structure. + * + * Returns the amount of remaining space in the segment data starting from + * the current position offset. + */ +static inline u32 cache_seg_remain(struct pcache_cache_pos *pos) +{ + struct pcache_cache_segment *cache_seg; + struct pcache_segment *segment; + u32 seg_remain; + + cache_seg = pos->cache_seg; + segment = &cache_seg->segment; + seg_remain = segment->data_size - pos->seg_off; + + return seg_remain; +} + +/** + * cache_key_invalid - Checks if a cache key is invalid. + * @key: Pointer to pcache_cache_key structure. + * + * Returns true if the cache key is invalid due to its generation being + * less than the generation of its segment; otherwise returns false. + * + * When the GC (garbage collection) thread identifies a segment + * as reclaimable, it increments the segment's generation (gen). However, + * it does not immediately remove all related cache keys. When accessing + * such a cache key, this function can be used to determine if the cache + * key has already become invalid. + */ +static inline bool cache_key_invalid(struct pcache_cache_key *key) +{ + if (cache_key_empty(key)) + return false; + + return (key->seg_gen < key->cache_pos.cache_seg->gen); +} + +/** + * cache_key_lstart - Retrieves the logical start offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical start offset for the cache key. + */ +static inline u64 cache_key_lstart(struct pcache_cache_key *key) +{ + return key->off; +} + +/** + * cache_key_lend - Retrieves the logical end offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical end offset for the cache key. + */ +static inline u64 cache_key_lend(struct pcache_cache_key *key) +{ + return key->off + key->len; +} + +static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src) +{ + key_dst->off = key_src->off; + key_dst->len = key_src->len; + key_dst->seg_gen = key_src->seg_gen; + key_dst->cache_tree = key_src->cache_tree; + key_dst->cache_subtree = key_src->cache_subtree; + key_dst->flags = key_src->flags; + + cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos); +} + +/** + * cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position. + * @pos_om: Pointer to pcache_cache_pos_onmedia structure. + * + * Calculates the CRC-32 checksum of the position, excluding the first 4 bytes. + * Returns the computed CRC value. 
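+ *
+ * Illustrative flow (an inference from cache_pos_encode() and
+ * cache_pos_decode()): each update is stamped with this CRC and an
+ * increasing seq and written to the next of the PCACHE_META_INDEX_MAX
+ * on-media slots, e.g., assuming PCACHE_META_INDEX_MAX == 2,
+ * key_tail_pos[0], then key_tail_pos[1], then back to key_tail_pos[0].
+ * On load, the slot whose CRC verifies and whose seq is newest is taken
+ * as current, so a torn update of a single slot can be tolerated.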
+ */ +static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om) +{ + return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia)); +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 seq, u32 *index); +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index); + +static inline void cache_encode_key_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, ++cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline int cache_decode_key_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, &cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline void cache_encode_dirty_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, ++cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +static inline int cache_decode_dirty_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, &cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +int pcache_cache_init(void); +void pcache_cache_exit(void); +#endif /* _PCACHE_CACHE_H */ diff --git a/drivers/md/dm-pcache/cache_dev.c b/drivers/md/dm-pcache/cache_dev.c new file mode 100644 index 000000000000..ece689e6ce59 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/dax.h> +#include <linux/vmalloc.h> +#include <linux/parser.h> + +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev) +{ + if (cache_dev->use_vmap) + vunmap(cache_dev->mapping); +} + +static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr) +{ + struct page **pages; + long i = 0, chunk; + unsigned long pfn; + int ret; + + pages = vmalloc_array(total_pages, sizeof(struct page *)); + if (!pages) + return -ENOMEM; + + do { + chunk = dax_direct_access(dax_dev, i, total_pages - i, + DAX_ACCESS, NULL, &pfn); + if (chunk <= 0) { + ret = chunk ? 
chunk : -EINVAL; + goto out_free; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto out_free; + } + + while (chunk-- && i < total_pages) { + pages[i++] = pfn_to_page(pfn); + pfn++; + if (!(i & 15)) + cond_resched(); + } + } while (i < total_pages); + + *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL); + if (!*vaddr) { + ret = -ENOMEM; + goto out_free; + } + + ret = 0; + +out_free: + vfree(pages); + return ret; +} + +static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + struct dax_device *dax_dev; + long total_pages, mapped_pages; + u64 bdev_size; + void *vaddr; + int ret; + int id; + unsigned long pfn; + + dax_dev = cache_dev->dm_dev->dax_dev; + /* total size check */ + bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev); + if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + ret = -ENOSPC; + goto out; + } + + total_pages = bdev_size >> PAGE_SHIFT; + /* attempt: direct-map the whole range */ + id = dax_read_lock(); + mapped_pages = dax_direct_access(dax_dev, 0, total_pages, + DAX_ACCESS, &vaddr, &pfn); + if (mapped_pages < 0) { + pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages); + ret = mapped_pages; + goto unlock; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto unlock; + } + + if (mapped_pages == total_pages) { + /* success: contiguous direct mapping */ + cache_dev->mapping = vaddr; + } else { + /* need vmap fallback */ + ret = build_vmap(dax_dev, total_pages, &vaddr); + if (ret) { + pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret); + goto unlock; + } + + cache_dev->mapping = vaddr; + cache_dev->use_vmap = true; + } + dax_read_unlock(id); + + return 0; +unlock: + dax_read_unlock(id); +out: + return ret; +} + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size) +{ + memset(pos, 0, size); + dax_flush(cache_dev->dm_dev->dax_dev, pos, size); +} + +static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb))) + return -EIO; + + return 0; +} + +static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb)); + pmem_wmb(); +} + +static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u64 nr_segs; + u64 cache_dev_size; + u64 magic; + u32 flags = 0; + + magic = le64_to_cpu(sb->magic); + if (magic) + return -EEXIST; + + cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file)); + if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + return -ENOSPC; + } + + nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / ((PCACHE_SEG_SIZE)); + +#if defined(__BYTE_ORDER) ? 
(__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + flags |= PCACHE_SB_F_BIGENDIAN; +#endif + sb->flags = cpu_to_le32(flags); + sb->magic = cpu_to_le64(PCACHE_MAGIC); + sb->seg_num = cpu_to_le32(nr_segs); + sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4)); + + cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev), + PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_CTRL_SIZE); + + return 0; +} + +static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 flags; + u32 crc; + + if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) { + pcache_dev_err(pcache, "unexpected magic: %llx\n", + le64_to_cpu(sb->magic)); + return -EINVAL; + } + + crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4); + if (crc != le32_to_cpu(sb->crc)) { + pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc)); + return -EINVAL; + } + + flags = le32_to_cpu(sb->flags); +#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + if (!(flags & PCACHE_SB_F_BIGENDIAN)) { + pcache_dev_err(pcache, "cache_dev is not big endian\n"); + return -EINVAL; + } +#else + if (flags & PCACHE_SB_F_BIGENDIAN) { + pcache_dev_err(pcache, "cache_dev is big endian\n"); + return -EINVAL; + } +#endif + return 0; +} + +static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num) +{ + cache_dev->seg_num = seg_num; + cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache_dev->seg_bitmap) + return -ENOMEM; + + return 0; +} + +static void cache_dev_exit(struct pcache_cache_dev *cache_dev) +{ + kvfree(cache_dev->seg_bitmap); +} + +void cache_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + + cache_dev_exit(cache_dev); + cache_dev_dax_exit(cache_dev); +} + +int cache_dev_start(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_sb sb; + bool format = false; + int ret; + + mutex_init(&cache_dev->seg_lock); + + ret = cache_dev_dax_init(cache_dev); + if (ret) { + pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.", + cache_dev->dm_dev->name, ret); + goto err; + } + + ret = sb_read(cache_dev, &sb); + if (ret) + goto dax_release; + + if (le64_to_cpu(sb.magic) == 0) { + format = true; + ret = sb_init(cache_dev, &sb); + if (ret < 0) + goto dax_release; + } + + ret = sb_validate(cache_dev, &sb); + if (ret) + goto dax_release; + + cache_dev->sb_flags = le32_to_cpu(sb.flags); + ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num)); + if (ret) + goto dax_release; + + if (format) + sb_write(cache_dev, &sb); + + return 0; + +dax_release: + cache_dev_dax_exit(cache_dev); +err: + return ret; +} + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id) +{ + int ret; + + mutex_lock(&cache_dev->seg_lock); + *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0); + if (*seg_id == cache_dev->seg_num) { + ret = -ENOSPC; + goto unlock; + } + + __set_bit(*seg_id, cache_dev->seg_bitmap); + ret = 0; +unlock: + mutex_unlock(&cache_dev->seg_lock); + return ret; +} diff --git a/drivers/md/dm-pcache/cache_dev.h b/drivers/md/dm-pcache/cache_dev.h new file mode 100644 index 000000000000..6251eb4ebe96 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: 
GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_DEV_H +#define _PCACHE_CACHE_DEV_H + +#include <linux/device.h> +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +#define PCACHE_MAGIC 0x65B05EFA96C596EFULL + +#define PCACHE_SB_OFF (4 * PCACHE_KB) +#define PCACHE_SB_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE) +#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX)) +#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB) + +#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE) +#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */ +#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */ + +#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF)) +#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF) +#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF) +#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF) +#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE) + +/* + * PCACHE SB flags configured during formatting + * + * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev + * formatting. For a machine to register a cache_dev: + * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine. + */ +#define PCACHE_SB_F_BIGENDIAN BIT(0) + +struct pcache_sb { + __le32 crc; + __le32 flags; + __le64 magic; + + __le32 seg_num; +}; + +struct pcache_cache_dev { + u32 sb_flags; + u32 seg_num; + void *mapping; + bool use_vmap; + + struct dm_dev *dm_dev; + + struct mutex seg_lock; + unsigned long *seg_bitmap; +}; + +struct dm_pcache; +int cache_dev_start(struct dm_pcache *pcache); +void cache_dev_stop(struct dm_pcache *pcache); + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size); + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id); + +#endif /* _PCACHE_CACHE_DEV_H */ diff --git a/drivers/md/dm-pcache/cache_gc.c b/drivers/md/dm-pcache/cache_gc.c new file mode 100644 index 000000000000..94f8b276a021 --- /dev/null +++ b/drivers/md/dm-pcache/cache_gc.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +/** + * cache_key_gc - Releases the reference of a cache key segment. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key to be garbage collected. + * + * This function decrements the reference count of the cache segment + * associated with the given key. If the reference count drops to zero, + * the segment may be invalidated and reused. 
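+ *
+ * For example, once gc has processed every key whose data lives in cache
+ * segment 7, all of the references those keys held on the segment have
+ * been dropped here, and segment 7 can be invalidated and reused as
+ * described above.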
+ */ +static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + cache_seg_put(key->cache_pos.cache_seg); +} + +static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + void *dirty_addr, *key_addr; + u32 segs_used, segs_gc_threshold, to_copy; + int ret; + + dirty_addr = cache_pos_addr(dirty_tail); + key_addr = cache_pos_addr(key_tail); + if (dirty_addr == key_addr) { + pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n", + dirty_tail->cache_seg->cache_seg_id, + dirty_tail->seg_off); + return false; + } + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return false; + } + + /* Check if kset_onmedia is corrupted */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n", + key_tail->cache_seg->cache_seg_id, key_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return false; + } + + /* Verify the CRC of the kset_onmedia */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n", + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return false; + } + + segs_used = bitmap_weight(cache->seg_map, cache->n_segs); + segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100; + if (segs_used < segs_gc_threshold) { + pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold); + return false; + } + + return true; +} + +/** + * last_kset_gc - Advances the garbage collection for the last kset. + * @cache: Pointer to the pcache_cache structure. + * @kset_onmedia: Pointer to the kset_onmedia structure for the last kset. 
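+ *
+ * A worked example: if key_tail currently sits in cache segment 5 and the
+ * closing kset there recorded next_cache_seg_id == 9, this helper moves
+ * key_tail to segment 9 at offset 0, persists the new tail, and clears
+ * segment 5 in seg_map so the segment can be reallocated.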
+ */ +static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *cur_seg, *next_seg; + + cur_seg = cache->key_tail.cache_seg; + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->key_tail_lock); + cache->key_tail.cache_seg = next_seg; + cache->key_tail.seg_off = 0; + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + + pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id); + + spin_lock(&cache->seg_map_lock); + __clear_bit(cur_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); +} + +void pcache_cache_gc_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail, key_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_key *key; + int ret; + int i; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + while (true) { + if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors)) + return; + + /* Get new tail positions */ + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + mutex_lock(&cache->key_tail_lock); + cache_pos_copy(&key_tail, &cache->key_tail); + mutex_unlock(&cache->key_tail_lock); + + if (!need_gc(cache, &dirty_tail, &key_tail)) + break; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + /* Don't move to the next segment if dirty_tail has not moved */ + if (dirty_tail.cache_seg == key_tail.cache_seg) + break; + + last_kset_gc(cache, kset_onmedia); + continue; + } + + for (i = 0; i < kset_onmedia->key_num; i++) { + struct pcache_cache_key key_tmp = { 0 }; + + key_onmedia = &kset_onmedia->data[i]; + + key = &key_tmp; + cache_key_init(&cache->req_key_tree, key); + + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + /* return without re-arm gc work, and prevent future + * gc, because we can't retry the partial-gc-ed kset + */ + atomic_inc(&cache->gc_errors); + pcache_dev_err(pcache, "failed to decode cache key in gc\n"); + return; + } + + cache_key_gc(cache, key); + } + + pcache_dev_debug(pcache, "gc advance: %u:%u %u\n", + key_tail.cache_seg->cache_seg_id, + key_tail.seg_off, + get_kset_onmedia_size(kset_onmedia)); + + mutex_lock(&cache->key_tail_lock); + cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia)); + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + } + + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL); +} diff --git a/drivers/md/dm-pcache/cache_key.c b/drivers/md/dm-pcache/cache_key.c new file mode 100644 index 000000000000..2b77e121f89b --- /dev/null +++ b/drivers/md/dm-pcache/cache_key.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 }; + +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key) +{ + kref_init(&key->ref); + key->cache_tree = cache_tree; + INIT_LIST_HEAD(&key->list_node); + RB_CLEAR_NODE(&key->rb_node); +} + +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t 
gfp_mask) +{ + struct pcache_cache_key *key; + + key = mempool_alloc(&cache_tree->key_pool, gfp_mask); + if (!key) + return NULL; + + memset(key, 0, sizeof(struct pcache_cache_key)); + cache_key_init(cache_tree, key); + + return key; +} + +/** + * cache_key_get - Increment the reference count of a cache key. + * @key: Pointer to the pcache_cache_key structure. + * + * This function increments the reference count of the specified cache key, + * ensuring that it is not freed while still in use. + */ +void cache_key_get(struct pcache_cache_key *key) +{ + kref_get(&key->ref); +} + +/** + * cache_key_destroy - Free a cache key structure when its reference count drops to zero. + * @ref: Pointer to the kref structure. + * + * This function is called when the reference count of the cache key reaches zero. + * It frees the allocated cache key back to the slab cache. + */ +static void cache_key_destroy(struct kref *ref) +{ + struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref); + struct pcache_cache_tree *cache_tree = key->cache_tree; + + mempool_free(key, &cache_tree->key_pool); +} + +void cache_key_put(struct pcache_cache_key *key) +{ + kref_put(&key->ref, cache_key_destroy); +} + +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len) +{ + /* Ensure enough space remains in the current segment */ + BUG_ON(cache_seg_remain(pos) < len); + + pos->seg_off += len; +} + +static void cache_key_encode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + key_onmedia->off = key->off; + key_onmedia->len = key->len; + + key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id; + key_onmedia->cache_seg_off = key->cache_pos.seg_off; + + key_onmedia->seg_gen = key->seg_gen; + key_onmedia->flags = key->flags; + + if (cache_data_crc_on(cache)) + key_onmedia->data_crc = cache_key_data_crc(key); +} + +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + key->off = key_onmedia->off; + key->len = key_onmedia->len; + + key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id]; + key->cache_pos.seg_off = key_onmedia->cache_seg_off; + + key->seg_gen = key_onmedia->seg_gen; + key->flags = key_onmedia->flags; + + if (cache_data_crc_on(cache) && + key_onmedia->data_crc != cache_key_data_crc(key)) { + pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n", + key->off, key->len, key->cache_pos.cache_seg->cache_seg_id, + key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc); + return -EIO; + } + + return 0; +} + +static void append_last_kset(struct pcache_cache *cache, u32 next_seg) +{ + struct pcache_cache_kset_onmedia kset_onmedia = { 0 }; + + kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST; + kset_onmedia.next_cache_seg_id = next_seg; + kset_onmedia.magic = PCACHE_KSET_MAGIC; + kset_onmedia.crc = cache_kset_crc(&kset_onmedia); + + memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia)); +} + +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset) +{ + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 kset_onmedia_size; + int ret; + + kset_onmedia = &kset->kset_onmedia; + + if (!kset_onmedia->key_num) + return 0; + + kset_onmedia_size = struct_size(kset_onmedia, data, 
kset_onmedia->key_num); + + spin_lock(&cache->key_head_lock); +again: + /* Reserve space for the last kset */ + if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) { + struct pcache_cache_segment *next_seg; + + next_seg = get_cache_segment(cache); + if (!next_seg) { + ret = -EBUSY; + goto out; + } + + /* clear outdated kset in next seg */ + memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + append_last_kset(cache, next_seg->cache_seg_id); + cache->key_head.cache_seg = next_seg; + cache->key_head.seg_off = 0; + goto again; + } + + kset_onmedia->magic = PCACHE_KSET_MAGIC; + kset_onmedia->crc = cache_kset_crc(kset_onmedia); + + /* clear outdated kset after current kset */ + memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + /* write current kset into segment */ + memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size); + pmem_wmb(); + + /* reset kset_onmedia */ + memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia)); + cache_pos_advance(&cache->key_head, kset_onmedia_size); + + ret = 0; +out: + spin_unlock(&cache->key_head_lock); + + return ret; +} + +/** + * cache_key_append - Append a cache key to the related kset. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key structure to append. + * @force_close: Need to close current kset if true. + * + * This function appends a cache key to the appropriate kset. If the kset + * is full, it closes the kset. If not, it queues a flush work to write + * the kset to media. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close) +{ + struct pcache_cache_kset *kset; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + u32 kset_id = get_kset_id(cache, key->off); + int ret = 0; + + kset = get_kset(cache, kset_id); + kset_onmedia = &kset->kset_onmedia; + + spin_lock(&kset->kset_lock); + key_onmedia = &kset_onmedia->data[kset_onmedia->key_num]; + cache_key_encode(cache, key_onmedia, key); + + /* Check if the current kset has reached the maximum number of keys */ + if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) { + /* If full, close the kset */ + ret = cache_kset_close(cache, kset); + if (ret) { + kset_onmedia->key_num--; + goto out; + } + } else { + /* If not full, queue a delayed work to flush the kset */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ); + } +out: + spin_unlock(&kset->kset_lock); + + return ret; +} + +/** + * cache_subtree_walk - Traverse the cache tree. + * @ctx: Pointer to the context structure for traversal. + * + * This function traverses the cache tree starting from the specified node. + * It calls the appropriate callback functions based on the relationships + * between the keys in the cache tree. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key_tmp, *key; + struct rb_node *node_tmp; + int ret = SUBTREE_WALK_RET_OK; + + key = ctx->key; + node_tmp = ctx->start_node; + + while (node_tmp) { + if (ctx->walk_done && ctx->walk_done(ctx)) + break; + + key_tmp = CACHE_KEY(node_tmp); + /* + * If key_tmp ends before the start of key, continue to the next node. 
+ * |----------| + * |=====| + */ + if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) { + if (ctx->after) { + ret = ctx->after(key, key_tmp, ctx); + if (ret) + goto out; + } + goto next; + } + + /* + * If key_tmp starts after the end of key, stop traversing. + * |--------| + * |====| + */ + if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) { + if (ctx->before) { + ret = ctx->before(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* Handle overlapping keys */ + if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) { + /* + * If key_tmp encompasses key. + * |----------------| key_tmp + * |===========| key + */ + if (cache_key_lend(key_tmp) >= cache_key_lend(key)) { + if (ctx->overlap_tail) { + ret = ctx->overlap_tail(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp is contained within key. + * |----| key_tmp + * |==========| key + */ + if (ctx->overlap_contain) { + ret = ctx->overlap_contain(key, key_tmp, ctx); + if (ret) + goto out; + } + + goto next; + } + + /* + * If key_tmp starts before key ends but ends after key. + * |-----------| key_tmp + * |====| key + */ + if (cache_key_lend(key_tmp) > cache_key_lend(key)) { + if (ctx->overlap_contained) { + ret = ctx->overlap_contained(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp starts before key and ends within key. + * |--------| key_tmp + * |==========| key + */ + if (ctx->overlap_head) { + ret = ctx->overlap_head(key, key_tmp, ctx); + if (ret) + goto out; + } +next: + node_tmp = rb_next(node_tmp); + } + +out: + if (ctx->walk_finally) + ret = ctx->walk_finally(ctx, ret); + + return ret; +} + +/** + * cache_subtree_search - Search for a key in the cache tree. + * @cache_subtree: Pointer to the cache tree structure. + * @key: Pointer to the cache key to search for. + * @parentp: Pointer to store the parent node of the found node. + * @newp: Pointer to store the location where the new node should be inserted. + * @delete_key_list: List to collect invalid keys for deletion. + * + * This function searches the cache tree for a specific key and returns + * the node that is the predecessor of the key, or first node if the key is + * less than all keys in the tree. If any invalid keys are found during + * the search, they are added to the delete_key_list for later cleanup. + * + * Returns a pointer to the previous node. 
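+ *
+ * For instance, with keys starting at offsets 0K, 8K and 16K in the
+ * subtree, a search for a key at offset 10K returns the 8K node (the last
+ * key that starts before 10K), while a search for offset 0K, where no key
+ * starts strictly before it, falls back to rb_first() and returns the
+ * 0K node.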
+ */ +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list) +{ + struct rb_node **new, *parent = NULL; + struct pcache_cache_key *key_tmp; + struct rb_node *prev_node = NULL; + + new = &(cache_subtree->root.rb_node); + while (*new) { + key_tmp = container_of(*new, struct pcache_cache_key, rb_node); + if (cache_key_invalid(key_tmp)) + list_add(&key_tmp->list_node, delete_key_list); + + parent = *new; + if (key_tmp->off >= key->off) { + new = &((*new)->rb_left); + } else { + prev_node = *new; + new = &((*new)->rb_right); + } + } + + if (!prev_node) + prev_node = rb_first(&cache_subtree->root); + + if (parentp) + *parentp = parent; + + if (newp) + *newp = new; + + return prev_node; +} + +static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key; + + if (ctx->pre_alloc_key) { + key = ctx->pre_alloc_key; + ctx->pre_alloc_key = NULL; + + return key; + } + + return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT); +} + +/** + * fixup_overlap_tail - Adjust the key when it overlaps at the tail. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that overlaps. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function modifies the existing key (key_tmp) when there is an + * overlap at the tail with the new key. If the modified key becomes + * empty, it is deleted. + */ +static int fixup_overlap_tail(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----------------| key_tmp + * |===========| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_contain - Handle case where new key completely contains an existing key. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that is being contained. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function deletes the existing key (key_tmp) when the new key + * completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the + * tree structure may have changed, necessitating a re-insertion of + * the new key. + */ +static int fixup_overlap_contain(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + cache_key_delete(key_tmp); + + return SUBTREE_WALK_RET_RESEARCH; +} + +/** + * fixup_overlap_contained - Handle overlap when a new key is contained in an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key is contained + * within it. If the existing key is empty, it indicates a placeholder key + * that was inserted during a miss read. This placeholder will later be + * updated with real data from the backing_dev, making it no longer an empty key. 
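+ *
+ * A concrete example for a non-empty key_tmp: key_tmp caches [0K, 16K)
+ * and the new key covers [4K, 8K). key_tmp is cut back to [0K, 4K), a
+ * copy of the original key_tmp is cut at the front to [8K, 16K) and
+ * re-inserted, and the new key then owns the middle [4K, 8K) range.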
+ * + * If we delete key or insert a key, the structure of the entire cache tree may change, + * requiring a full research of the tree to find a new insertion point. + */ +static int fixup_overlap_contained(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_tree *cache_tree = ctx->cache_tree; + + /* + * |-----------| key_tmp + * |====| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + /* If key_tmp is empty, don't split it; + * it's a placeholder key for miss reads that will be updated later. + */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + } else { + struct pcache_cache_key *key_fixup; + bool need_research = false; + + key_fixup = get_pre_alloc_key(ctx); + if (!key_fixup) + return SUBTREE_WALK_RET_NEED_KEY; + + cache_key_copy(key_fixup, key_tmp); + + /* Split key_tmp based on the new key's range */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + need_research = true; + } + + /* Create a new portion for key_fixup */ + cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_fixup->len == 0) { + cache_key_put(key_fixup); + } else { + /* Insert the new key into the cache */ + cache_key_insert(cache_tree, key_fixup, false); + need_research = true; + } + + if (need_research) + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_head - Handle overlap when a new key overlaps with the head of an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key overlaps + * with the beginning of it. If the resulting key length is zero + * after the adjustment, the key is deleted. This indicates that + * the key no longer holds valid data and requires the tree to be + * re-researched for a new insertion point. + */ +static int fixup_overlap_head(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |--------| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + /* Adjust key_tmp by cutting back based on the new key's start */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + /* If the adjusted key_tmp length is zero, delete it */ + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * cache_key_insert - Insert a new cache key into the cache tree. + * @cache_tree: Pointer to the cache_tree structure. + * @key: The cache key to insert. + * @fixup: Indicates if this is a new key being inserted. + * + * This function searches for the appropriate location to insert + * a new cache key into the cache tree. It handles key overlaps + * and ensures any invalid keys are removed before insertion. 
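+ *
+ * Typical usage (simplified from the write path in this file), with the
+ * subtree lock held by the caller:
+ *
+ *	cache_subtree = get_subtree(cache_tree, key->off);
+ *	spin_lock(&cache_subtree->tree_lock);
+ *	cache_key_insert(cache_tree, key, true);
+ *	spin_unlock(&cache_subtree->tree_lock);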
+ */ +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup) +{ + struct pcache_cache *cache = cache_tree->cache; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct rb_node **new, *parent = NULL; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + LIST_HEAD(delete_key_list); + int ret; + + cache_subtree = get_subtree(cache_tree, key->off); + key->cache_subtree = cache_subtree; +search: + prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list); + if (!list_empty(&delete_key_list)) { + /* Remove invalid keys from the delete list */ + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + if (fixup) { + /* Set up the context with the cache, start node, and new key */ + walk_ctx.cache_tree = cache_tree; + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + /* Assign overlap handling functions for different scenarios */ + walk_ctx.overlap_tail = fixup_overlap_tail; + walk_ctx.overlap_head = fixup_overlap_head; + walk_ctx.overlap_contain = fixup_overlap_contain; + walk_ctx.overlap_contained = fixup_overlap_contained; + + ret = cache_subtree_walk(&walk_ctx); + switch (ret) { + case SUBTREE_WALK_RET_OK: + break; + case SUBTREE_WALK_RET_RESEARCH: + goto search; + case SUBTREE_WALK_RET_NEED_KEY: + spin_unlock(&cache_subtree->tree_lock); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO"); + walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO); + spin_lock(&cache_subtree->tree_lock); + goto search; + default: + BUG(); + } + } + + if (walk_ctx.pre_alloc_key) + cache_key_put(walk_ctx.pre_alloc_key); + + /* Link and insert the new key into the red-black tree */ + rb_link_node(&key->rb_node, parent, new); + rb_insert_color(&key->rb_node, &cache_subtree->root); +} + +/** + * clean_fn - Cleanup function to remove invalid keys from the cache tree. + * @work: Pointer to the work_struct associated with the cleanup. + * + * This function cleans up invalid keys from the cache tree in the background + * after a cache segment has been invalidated during cache garbage collection. + * It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds + * the tree lock to ensure thread safety. + */ +void clean_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work); + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + int i, count; + + for (i = 0; i < cache->req_key_tree.n_subtrees; i++) { + cache_subtree = &cache->req_key_tree.subtrees[i]; + +again: + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + /* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */ + count = 0; + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + if (cache_key_invalid(key)) { + count++; + cache_key_delete(key); + } + + if (count >= PCACHE_CLEAN_KEYS_MAX) { + /* Unlock and pause before continuing cleanup */ + spin_unlock(&cache_subtree->tree_lock); + usleep_range(1000, 2000); + goto again; + } + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +/* + * kset_flush_fn - Flush work for a cache kset. 
+ * + * This function is called when a kset flush work is queued from + * cache_key_append(). If the kset is full, it will be closed + * immediately. If not, the flush work will be queued for later closure. + * + * If cache_kset_close detects that a new segment is required to store + * the kset and there are no available segments, it will return an error. + * In this scenario, a retry will be attempted. + */ +void kset_flush_fn(struct work_struct *work) +{ + struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work); + struct pcache_cache *cache = kset->cache; + int ret; + + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) { + /* Failed to flush kset, schedule a retry. */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100)); + } +} + +static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + int i; + + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto err; + } + + __set_bit(key->cache_pos.cache_seg->cache_seg_id, cache->seg_map); + + /* Check if the segment generation is valid for insertion. */ + if (key->seg_gen < key->cache_pos.cache_seg->gen) { + cache_key_put(key); + } else { + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + cache_seg_get(key->cache_pos.cache_seg); + } + + return 0; +err: + return ret; +} + +int cache_replay(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos pos_tail; + struct pcache_cache_pos *pos; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy, count = 0; + int ret = 0; + + kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL); + if (!kset_onmedia) + return -ENOMEM; + + cache_pos_copy(&pos_tail, &cache->key_tail); + pos = &pos_tail; + + /* + * In cache replaying stage, there is no other one will access + * cache->seg_map, so we can set bit here without cache->seg_map_lock. + */ + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + + while (true) { + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy); + if (ret) { + ret = -EIO; + goto out; + } + + if (kset_onmedia->magic != PCACHE_KSET_MAGIC || + kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + break; + } + + /* Process the last kset and prepare for the next segment. */ + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + pos->cache_seg = next_seg; + pos->seg_off = 0; + + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + continue; + } + + /* Replay the kset and check for errors. */ + ret = kset_replay(cache, kset_onmedia); + if (ret) + goto out; + + /* Advance the position after processing the kset. 
*/ + cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia)); + if (++count > 512) { + cond_resched(); + count = 0; + } + } + + /* Update the key_head position after replaying. */ + spin_lock(&cache->key_head_lock); + cache_pos_copy(&cache->key_head, pos); + spin_unlock(&cache->key_head_lock); +out: + kfree(kset_onmedia); + return ret; +} + +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees) +{ + int ret; + u32 i; + + cache_tree->cache = cache; + cache_tree->n_subtrees = n_subtrees; + + ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache); + if (ret) + goto err; + + /* + * Allocate and initialize the subtrees array. + * Each element is a cache tree structure that contains + * an RB tree root and a spinlock for protecting its contents. + */ + cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL); + if (!cache_tree->subtrees) { + ret = -ENOMEM; + goto key_pool_exit; + } + + for (i = 0; i < cache_tree->n_subtrees; i++) { + struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i]; + + cache_subtree->root = RB_ROOT; + spin_lock_init(&cache_subtree->tree_lock); + } + + return 0; + +key_pool_exit: + mempool_exit(&cache_tree->key_pool); +err: + return ret; +} + +void cache_tree_clear(struct pcache_cache_tree *cache_tree) +{ + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_delete(key); + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +void cache_tree_exit(struct pcache_cache_tree *cache_tree) +{ + cache_tree_clear(cache_tree); + kvfree(cache_tree->subtrees); + mempool_exit(&cache_tree->key_pool); +} diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c new file mode 100644 index 000000000000..27f94c1fa968 --- /dev/null +++ b/drivers/md/dm-pcache/cache_req.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static int cache_data_head_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *next_seg; + struct pcache_cache_data_head *data_head; + + data_head = get_data_head(cache); + next_seg = get_cache_segment(cache); + if (!next_seg) + return -EBUSY; + + cache_seg_get(next_seg); + data_head->head_pos.cache_seg = next_seg; + data_head->head_pos.seg_off = 0; + + return 0; +} + +/** + * cache_data_alloc - Allocate data for a cache key. + * @cache: Pointer to the cache structure. + * @key: Pointer to the cache key to allocate data for. + * + * This function tries to allocate space from the cache segment specified by the + * data head. If the remaining space in the segment is insufficient to allocate + * the requested length for the cache key, it will allocate whatever is available + * and adjust the key's length accordingly. This function does not allocate + * space that crosses segment boundaries. 
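+ *
+ * Illustrative example (sizes are hypothetical): if the data head segment
+ * has only 4KiB remaining and the key requests 16KiB, the remaining 4KiB
+ * is allocated and key->len is shrunk to 4KiB; callers such as the write
+ * path then loop and allocate a new key for the rest of the range.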
+ */ +static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_cache_data_head *data_head; + struct pcache_cache_pos *head_pos; + struct pcache_cache_segment *cache_seg; + u32 seg_remain; + u32 allocated = 0, to_alloc; + int ret = 0; + + preempt_disable(); + data_head = get_data_head(cache); +again: + to_alloc = key->len - allocated; + if (!data_head->head_pos.cache_seg) { + seg_remain = 0; + } else { + cache_pos_copy(&key->cache_pos, &data_head->head_pos); + key->seg_gen = key->cache_pos.cache_seg->gen; + + head_pos = &data_head->head_pos; + cache_seg = head_pos->cache_seg; + seg_remain = cache_seg_remain(head_pos); + } + + if (seg_remain > to_alloc) { + /* If remaining space in segment is sufficient for the cache key, allocate it. */ + cache_pos_advance(head_pos, to_alloc); + allocated += to_alloc; + cache_seg_get(cache_seg); + } else if (seg_remain) { + /* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */ + cache_pos_advance(head_pos, seg_remain); + key->len = seg_remain; + + /* Get for key: obtain a reference to the cache segment for the key. */ + cache_seg_get(cache_seg); + /* Put for head_pos->cache_seg: release the reference for the current head's segment. */ + cache_seg_put(head_pos->cache_seg); + head_pos->cache_seg = NULL; + } else { + /* Initialize a new data head if no segment is available. */ + ret = cache_data_head_init(cache); + if (ret) + goto out; + + goto again; + } + +out: + preempt_enable(); + + return ret; +} + +static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key, + struct pcache_request *pcache_req, u32 bio_off) +{ + struct pcache_cache_pos *pos = &key->cache_pos; + struct pcache_segment *segment; + + segment = &pos->cache_seg->segment; + + return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off); +} + +static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req, + u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen) +{ + struct pcache_cache_segment *cache_seg = pos->cache_seg; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + spin_lock(&cache_seg->gen_lock); + if (key_gen < cache_seg->gen) { + spin_unlock(&cache_seg->gen_lock); + return -EINVAL; + } + + ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off); + spin_unlock(&cache_seg->gen_lock); + + return ret; +} + +/** + * miss_read_end_req - Handle the end of a miss read request. + * @backing_req: Pointer to the request structure. + * @read_ret: Return value of read. + * + * This function is called when a backing request to read data from + * the backing_dev is completed. If the key associated with the request + * is empty (a placeholder), it allocates cache space for the key, + * copies the data read from the bio into the cache, and updates + * the key's status. If the key has been overwritten by a write + * request during this process, it will be deleted from the cache + * tree and no further action will be taken. 
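+ *
+ * In short, a successful miss read turns the placeholder from an empty
+ * key into a clean key and persists it via cache_key_append(); if any
+ * step fails, the placeholder is deleted so a later read of this range
+ * misses again and re-reads from the backing_dev.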
+ */ +static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret) +{ + void *priv_data = backing_req->priv_data; + struct pcache_request *pcache_req = backing_req->req.upper_req; + struct pcache_cache *cache = backing_req->backing_dev->cache; + int ret; + + if (priv_data) { + struct pcache_cache_key *key; + struct pcache_cache_subtree *cache_subtree; + + key = (struct pcache_cache_key *)priv_data; + cache_subtree = key->cache_subtree; + + /* if this key was deleted from cache_subtree by a write, key->flags should be cleared, + * so if cache_key_empty() return true, this key is still in cache_subtree + */ + spin_lock(&cache_subtree->tree_lock); + if (cache_key_empty(key)) { + /* Check if the backing request was successful. */ + if (read_ret) { + cache_key_delete(key); + goto unlock; + } + + /* Allocate cache space for the key and copy data from the backing_dev. */ + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_delete(key); + goto unlock; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY; + key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN; + + /* Append the key to the cache. */ + ret = cache_key_append(cache, key, false); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + } +unlock: + spin_unlock(&cache_subtree->tree_lock); + cache_key_put(key); + } +} + +/** + * submit_cache_miss_req - Submit a backing request when cache data is missing + * @cache: The cache context that manages cache operations + * @backing_req: The cache request containing information about the read request + * + * This function is used to handle cases where a cache read request cannot locate + * the required data in the cache. When such a miss occurs during `cache_subtree_walk`, + * it triggers a backing read request to fetch data from the backing storage. + * + * If `pcache_req->priv_data` is set, it points to a `pcache_cache_key`, representing + * a new cache key to be inserted into the cache. The function calls `cache_key_insert` + * to attempt adding the key. On insertion failure, it releases the key reference and + * clears `priv_data` to avoid further processing. 
+ */ +static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req) +{ + if (backing_req->priv_data) { + struct pcache_cache_key *key; + + /* Attempt to insert the key into the cache if priv_data is set */ + key = (struct pcache_cache_key *)backing_req->priv_data; + cache_key_insert(&cache->req_key_tree, key, true); + } + backing_dev_req_submit(backing_req, false); +} + +static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_cache_key *key; + + if (backing_req->priv_data) { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); /* for ->priv_data */ + cache_key_put(key); /* for init ref in alloc */ + } + + backing_dev_req_end(backing_req); +} + +static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache, + struct pcache_request *parent, + gfp_t gfp_mask) +{ + struct pcache_backing_dev *backing_dev = cache->backing_dev; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_key *key = NULL; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.gfp_mask = gfp_mask; + req_opts.req.upper_req = parent; + + backing_req = backing_dev_req_alloc(backing_dev, &req_opts); + if (!backing_req) + return NULL; + + key = cache_key_alloc(&cache->req_key_tree, gfp_mask); + if (!key) + goto free_backing_req; + + cache_key_get(key); + backing_req->priv_data = key; + + return backing_req; + +free_backing_req: + cache_miss_req_free(backing_req); + return NULL; +} + +static void cache_miss_req_init(struct pcache_cache *cache, + struct pcache_backing_dev_req *backing_req, + struct pcache_request *parent, + u32 off, u32 len, bool insert_key) +{ + struct pcache_cache_key *key; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.req.upper_req = parent; + req_opts.req.req_off = off; + req_opts.req.len = len; + req_opts.end_fn = miss_read_end_req; + + backing_dev_req_init(backing_req, &req_opts); + + if (insert_key) { + key = backing_req->priv_data; + key->off = parent->off + off; + key->len = len; + key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY; + } else { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); + cache_key_put(key); + } +} + +static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_request *pcache_req = ctx->pcache_req; + struct pcache_backing_dev_req *backing_req; + + if (ctx->pre_alloc_req) { + backing_req = ctx->pre_alloc_req; + ctx->pre_alloc_req = NULL; + + return backing_req; + } + + return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT); +} + +/* + * In the process of walking the cache tree to locate cached data, this + * function handles the situation where the requested data range lies + * entirely before an existing cache node (`key_tmp`). This outcome + * signifies that the target data is absent from the cache (cache miss). + * + * To fulfill this portion of the read request, the function creates a + * backing request (`backing_req`) for the missing data range represented + * by `key`. It then appends this request to the submission list in the + * `ctx`, which will later be processed to retrieve the data from backing + * storage. 
After setting up the backing request, `req_done` in `ctx` is + * updated to reflect the length of the handled range, and the range + * in `key` is adjusted by trimming off the portion that is now handled. + * + * The scenario handled here: + * + * |--------| key_tmp (existing cached range) + * |====| key (requested range, preceding key_tmp) + * + * Since `key` is before `key_tmp`, it signifies that the requested data + * range is missing in the cache (cache miss) and needs retrieval from + * backing storage. + */ +static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_backing_dev_req *backing_req; + struct pcache_cache *cache = ctx->cache_tree->cache; + + /* + * In this scenario, `key` represents a range that precedes `key_tmp`, + * meaning the requested data range is missing from the cache tree + * and must be retrieved from the backing_dev. + */ + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * During cache_subtree_walk, this function manages a scenario where part of the + * requested data range overlaps with an existing cache node (`key_tmp`). + * + * |----------------| key_tmp (existing cached range) + * |===========| key (requested range, overlapping the tail of key_tmp) + */ +static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the length of the non-overlapping portion of `key` + * before `key_tmp`, representing the data missing in the cache. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion by calculating the length of + * the remaining data in `key` that coincides with `key_tmp`. 
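+ * If key_tmp is an empty placeholder, the data is not cached yet, so
+ * this portion is also served by a backing read (without inserting
+ * another placeholder key); otherwise it is copied to the bio directly
+ * from the cache segment referenced by key_tmp.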
+ */ + io_len = cache_key_lend(key) - cache_key_lstart(key_tmp); + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |----| key_tmp (existing cached range) + * |==========| key (requested range) + */ +static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the non-overlapping part of `key` before `key_tmp` + * to identify the missing data length. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion between `key` and `key_tmp`. + */ + io_len = key_tmp->len; + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |-----------| key_tmp (existing cached range) + * |====| key (requested range, fully within key_tmp) + * + * If `key_tmp` contains valid cached data, this function copies the relevant + * portion to the request's bio. Otherwise, it sends a backing request to + * fetch the required data range. + */ +static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + int ret; + + /* + * Check if `key_tmp` is empty, indicating a miss. If so, initiate + * a backing request to fetch the required data for `key`. 
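+ * Otherwise the requested range lies entirely within cached data and is
+ * copied to the bio from a position advanced into key_tmp's segment by
+ * the offset of key within key_tmp.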
+ */ + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + key->len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |--------| key_tmp (existing cached range) + * |==========| key (requested range, overlapping the head of key_tmp) + */ +static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + u32 io_len; + int ret; + + io_len = cache_key_lend(key_tmp) - cache_key_lstart(key); + + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/** + * read_walk_finally - Finalizes the cache read tree walk by submitting any + * remaining backing requests + * @ctx: Context structure holding information about the cache, + * read request, and submission list + * @ret: the return value after this walk. + * + * This function is called at the end of the `cache_subtree_walk` during a + * cache read operation. It completes the walk by checking if any data + * requested by `key` was not found in the cache tree, and if so, it sends + * a backing request to retrieve that data. Then, it iterates through the + * submission list of backing requests created during the walk, removing + * each request from the list and submitting it. + * + * The scenario managed here includes: + * - Sending a backing request for the remaining length of `key` if it was + * not fulfilled by existing cache entries. + * - Iterating through `ctx->submit_req_list` to submit each backing request + * enqueued during the walk. + * + * This ensures all necessary backing requests for cache misses are submitted + * to the backing storage to retrieve any data that could not be found in + * the cache. 
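+ *
+ * Note that the requests queued on ctx->submit_req_list are submitted
+ * before the walk status is checked, so backing requests prepared for
+ * earlier portions of the read are issued even when the walk ends early
+ * and has to be researched or retried with a pre-allocated request.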
+ */ +static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req, *next_req; + struct pcache_cache_key *key = ctx->key; + + list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) { + list_del_init(&backing_req->node); + submit_cache_miss_req(ctx->cache_tree->cache, backing_req); + } + + if (ret != SUBTREE_WALK_RET_OK) + return ret; + + if (key->len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + submit_cache_miss_req(cache, backing_req); + ctx->req_done += key->len; + } + + return SUBTREE_WALK_RET_OK; +} + +/* + * This function is used within `cache_subtree_walk` to determine whether the + * read operation has covered the requested data length. It compares the + * amount of data processed (`ctx->req_done`) with the total data length + * specified in the original request (`ctx->pcache_req->data_len`). + * + * If `req_done` meets or exceeds the required data length, the function + * returns `true`, indicating the walk is complete. Otherwise, it returns `false`, + * signaling that additional data processing is needed to fulfill the request. + */ +static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx) +{ + return (ctx->req_done >= ctx->pcache_req->data_len); +} + +/** + * cache_read - Process a read request by traversing the cache tree + * @cache: Cache structure holding cache trees and related configurations + * @pcache_req: Request structure with information about the data to read + * + * This function attempts to fulfill a read request by traversing the cache tree(s) + * to locate cached data for the requested range. If parts of the data are missing + * in the cache, backing requests are generated to retrieve the required segments. + * + * The function operates by initializing a key for the requested data range and + * preparing a context (`walk_ctx`) to manage the cache tree traversal. The context + * includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle + * specific conditions encountered during the traversal. The `walk_finally` and `walk_done` + * functions manage the end stages of the traversal, while the `delete_key_list` and + * `submit_req_list` lists track any keys to be deleted or requests to be submitted. + * + * The function first calculates the requested range and checks if it fits within the + * current cache tree (based on the tree's size limits). It then locks the cache tree + * and performs a search to locate any matching keys. If there are outdated keys, + * these are deleted, and the search is restarted to ensure accurate data retrieval. + * + * If the requested range spans multiple cache trees, the function moves on to the + * next tree once the current range has been processed. This continues until the + * entire requested data length has been handled. 
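+ *
+ * Illustrative example: a read that crosses a subtree boundary is first
+ * clamped to the end of the current subtree (PCACHE_CACHE_SUBTREE_SIZE);
+ * once req_done covers that portion, the loop continues with a new key
+ * in the next subtree until the whole request has been served.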
+ */ +static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len }; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + struct pcache_cache_key *key = &key_data; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct pcache_backing_dev_req *backing_req, *next_req; + LIST_HEAD(delete_key_list); + LIST_HEAD(submit_req_list); + int ret; + + walk_ctx.cache_tree = &cache->req_key_tree; + walk_ctx.req_done = 0; + walk_ctx.pcache_req = pcache_req; + walk_ctx.before = read_before; + walk_ctx.overlap_tail = read_overlap_tail; + walk_ctx.overlap_head = read_overlap_head; + walk_ctx.overlap_contain = read_overlap_contain; + walk_ctx.overlap_contained = read_overlap_contained; + walk_ctx.walk_finally = read_walk_finally; + walk_ctx.walk_done = read_walk_done; + walk_ctx.delete_key_list = &delete_key_list; + walk_ctx.submit_req_list = &submit_req_list; + +next: + key->off = pcache_req->off + walk_ctx.req_done; + key->len = pcache_req->data_len - walk_ctx.req_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); +search: + prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list); + if (!list_empty(&delete_key_list)) { + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + ret = cache_subtree_walk(&walk_ctx); + if (ret == SUBTREE_WALK_RET_RESEARCH) + goto search; + spin_unlock(&cache_subtree->tree_lock); + + if (ret == SUBTREE_WALK_RET_ERR) { + ret = walk_ctx.ret; + goto out; + } + + if (ret == SUBTREE_WALK_RET_NEED_REQ) { + walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO"); + } + + if (walk_ctx.req_done < pcache_req->data_len) + goto next; + ret = 0; +out: + if (walk_ctx.pre_alloc_req) + cache_miss_req_free(walk_ctx.pre_alloc_req); + + list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) { + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } + + return ret; +} + +static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + u64 offset = pcache_req->off; + u32 length = pcache_req->data_len; + u32 io_done = 0; + int ret; + + while (true) { + if (io_done >= length) + break; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + key->off = offset + io_done; + key->len = length - io_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_put(key); + goto err; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_put(key); + goto err; + } + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + 
spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + + io_done += key->len; + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +unlock: + spin_unlock(&cache_subtree->tree_lock); +err: + return ret; +} + +/** + * cache_flush - Flush all ksets to persist any pending cache data + * @cache: Pointer to the cache structure + * + * This function iterates through all ksets associated with the provided `cache` + * and ensures that any data marked for persistence is written to media. For each + * kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles + * the persistence logic for that kset. + * + * If `cache_kset_close` encounters an error, the function exits immediately with + * the respective error code, preventing the flush operation from proceeding to + * subsequent ksets. + */ +int cache_flush(struct pcache_cache *cache) +{ + struct pcache_cache_kset *kset; + int ret; + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + kset = get_kset(cache, i); + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) + return ret; + } + + return 0; +} + +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct bio *bio = pcache_req->bio; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + return cache_flush(cache); + + if (bio_data_dir(bio) == READ) + return cache_read(cache, pcache_req); + + return cache_write(cache, pcache_req); +} diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c new file mode 100644 index 000000000000..f0b58980806e --- /dev/null +++ b/drivers/md/dm-pcache/cache_segment.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache_dev.h" +#include "cache.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + u32 seg_id = cache_seg->segment.seg_id; + void *seg_addr; + + seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id); + seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index; + + return seg_info_addr; +} + +static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info; + + mutex_lock(&cache_seg->info_lock); + seg_info->header.seq++; + seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + + seg_info_addr = get_seg_info_addr(cache_seg); + memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); + pmem_wmb(); + + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + mutex_unlock(&cache_seg->info_lock); +} + +static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr; + struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev; + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 seg_id = cache_seg->segment.seg_id; + int ret = 0; + + cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id); + + mutex_lock(&cache_seg->info_lock); + cache_seg_info_addr = 
pcache_meta_find_latest(&cache_seg_info_addr_base->header, + sizeof(struct pcache_segment_info), + PCACHE_SEG_INFO_SIZE, + &cache_seg->cache_seg_info); + if (IS_ERR(cache_seg_info_addr)) { + ret = PTR_ERR(cache_seg_info_addr); + goto out; + } else if (!cache_seg_info_addr) { + ret = -EIO; + goto out; + } + cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; +out: + mutex_unlock(&cache_seg->info_lock); + + if (ret) + pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n", + cache_seg->segment.seg_id, ret); + return ret; +} + +static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr; + int ret = 0; + + cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header, + sizeof(struct pcache_cache_seg_gen), + sizeof(struct pcache_cache_seg_gen), + &cache_seg_gen); + if (IS_ERR(cache_seg_gen_addr)) { + ret = PTR_ERR(cache_seg_gen_addr); + goto out; + } + + if (!cache_seg_gen_addr) { + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + goto out; + } + + cache_seg->gen = cache_seg_gen.gen; + cache_seg->gen_seq = cache_seg_gen.header.seq; + cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen); +out: + + return ret; +} + +static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + + return (cache_seg_ctrl->gen + cache_seg->gen_index); +} + +/* + * cache_seg_ctrl_write - write cache segment control information + * @seg: the cache segment to update + * + * This function writes the control information of a cache segment to media. + * + * Although this updates shared control data, we intentionally do not use + * any locking here. All accesses to control information are single-threaded: + * + * - All reads occur during the init phase, where no concurrent writes + * can happen. + * - Writes happen once during init and once when the last reference + * to the segment is dropped in cache_seg_put(). + * + * Both cases are guaranteed to be single-threaded, so there is no risk + * of concurrent read/write races. + */ +static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_gen cache_seg_gen; + + cache_seg_gen.gen = cache_seg->gen; + cache_seg_gen.header.seq = ++cache_seg->gen_seq; + cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, + sizeof(struct pcache_cache_seg_gen)); + + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); + pmem_wmb(); + + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) +{ + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + cache_seg_ctrl_write(cache_seg); +} + +static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg) +{ + int ret; + + ret = cache_seg_info_load(cache_seg); + if (ret) + goto err; + + ret = cache_seg_ctrl_load(cache_seg); + if (ret) + goto err; + + return 0; +err: + return ret; +} + +/** + * cache_seg_set_next_seg - Sets the ID of the next segment + * @cache_seg: Pointer to the cache segment structure. + * @seg_id: The segment ID to set as the next segment. 
+ * + * A pcache_cache allocates multiple cache segments, which are linked together + * through next_seg. When loading a pcache_cache, the first cache segment can + * be found using cache->seg_id, which allows access to all the cache segments. + */ +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id) +{ + cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT; + cache_seg->cache_seg_info.next_seg = seg_id; + cache_seg_info_write(cache_seg); +} + +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache) +{ + struct pcache_cache_dev *cache_dev = cache->cache_dev; + struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id]; + struct pcache_segment_init_options seg_options = { 0 }; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + cache_seg->cache = cache; + cache_seg->cache_seg_id = cache_seg_id; + spin_lock_init(&cache_seg->gen_lock); + atomic_set(&cache_seg->refs, 0); + mutex_init(&cache_seg->info_lock); + + /* init pcache_segment */ + seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA; + seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE; + seg_options.seg_id = seg_id; + seg_options.seg_info = &cache_seg->cache_seg_info; + pcache_segment_init(cache_dev, segment, &seg_options); + + cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF; + + if (new_cache) { + cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id), + PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_SEG_CTRL_SIZE); + + cache_seg_ctrl_init(cache_seg); + + cache_seg->info_index = 0; + cache_seg_info_write(cache_seg); + + /* clear outdated kset in segment */ + memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + } else { + ret = cache_seg_meta_load(cache_seg); + if (ret) + goto err; + } + + return 0; +err: + return ret; +} + +/** + * get_cache_segment - Retrieves a free cache segment from the cache. + * @cache: Pointer to the cache structure. + * + * This function attempts to find a free cache segment that can be used. + * It locks the segment map and checks for the next available segment ID. + * If a free segment is found, it initializes it and returns a pointer to the + * cache segment structure. Returns NULL if no segments are available. 
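+ *
+ * The search starts from the ->last_cache_seg hint and wraps around to
+ * the start of the bitmap once before giving up; when nothing is free,
+ * ->cache_full is set and NULL is returned, so callers such as
+ * cache_data_head_init() fail with -EBUSY and the request is deferred
+ * for a later retry.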
+ */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache) +{ + struct pcache_cache_segment *cache_seg; + u32 seg_id; + + spin_lock(&cache->seg_map_lock); +again: + seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg); + if (seg_id == cache->n_segs) { + /* reset the hint of ->last_cache_seg and retry */ + if (cache->last_cache_seg) { + cache->last_cache_seg = 0; + goto again; + } + cache->cache_full = true; + spin_unlock(&cache->seg_map_lock); + return NULL; + } + + /* + * found an available cache_seg, mark it used in seg_map + * and update the search hint ->last_cache_seg + */ + __set_bit(seg_id, cache->seg_map); + cache->last_cache_seg = seg_id; + spin_unlock(&cache->seg_map_lock); + + cache_seg = &cache->segments[seg_id]; + cache_seg->cache_seg_id = seg_id; + + return cache_seg; +} + +static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg) +{ + spin_lock(&cache_seg->gen_lock); + cache_seg->gen++; + spin_unlock(&cache_seg->gen_lock); + + cache_seg_ctrl_write(cache_seg); +} + +void cache_seg_get(struct pcache_cache_segment *cache_seg) +{ + atomic_inc(&cache_seg->refs); +} + +static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache *cache; + + cache = cache_seg->cache; + cache_seg_gen_increase(cache_seg); + + spin_lock(&cache->seg_map_lock); + if (cache->cache_full) + cache->cache_full = false; + __clear_bit(cache_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); + + pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache)); + /* clean_work will clean the bad key in key_tree*/ + queue_work(cache_get_wq(cache), &cache->clean_work); +} + +void cache_seg_put(struct pcache_cache_segment *cache_seg) +{ + if (atomic_dec_and_test(&cache_seg->refs)) + cache_seg_invalidate(cache_seg); +} diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c new file mode 100644 index 000000000000..87a82b3fe836 --- /dev/null +++ b/drivers/md/dm-pcache/cache_writeback.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/bio.h> + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static void writeback_ctx_end(struct pcache_cache *cache, int ret) +{ + if (ret && !cache->writeback_ctx.ret) { + pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret); + cache->writeback_ctx.ret = ret; + } + + if (!atomic_dec_and_test(&cache->writeback_ctx.pending)) + return; + + if (!cache->writeback_ctx.ret) { + backing_dev_flush(cache->backing_dev); + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance); + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); + } + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); +} + +static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret) +{ + struct pcache_cache *cache = backing_req->priv_data; + + mutex_lock(&cache->writeback_lock); + writeback_ctx_end(cache, ret); + mutex_unlock(&cache->writeback_lock); +} + +static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy; + void *addr; + int ret; + + addr = cache_pos_addr(dirty_tail); + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - 
dirty_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return true; + } + + /* Check if the magic number matches the expected value */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return true; + } + + /* Verify the CRC checksum for data integrity */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return true; + } + + return false; +} + +void cache_writeback_exit(struct pcache_cache *cache) +{ + cancel_delayed_work_sync(&cache->writeback_work); + backing_dev_flush(cache->backing_dev); + cache_tree_exit(&cache->writeback_key_tree); +} + +int cache_writeback_init(struct pcache_cache *cache) +{ + int ret; + + ret = cache_tree_init(cache, &cache->writeback_key_tree, 1); + if (ret) + goto err; + + atomic_set(&cache->writeback_ctx.pending, 0); + + /* Queue delayed work to start writeback handling */ + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); + + return 0; +err: + return ret; +} + +static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_backing_dev_req *writeback_req; + struct pcache_backing_dev_req_opts writeback_req_opts = { 0 }; + struct pcache_cache_pos *pos; + void *addr; + u32 seg_remain, req_len, done = 0; + + if (cache_key_clean(key)) + return; + + pos = &key->cache_pos; + + seg_remain = cache_seg_remain(pos); + BUG_ON(seg_remain < key->len); +next_req: + addr = cache_pos_addr(pos) + done; + req_len = backing_dev_req_coalesced_max_len(addr, key->len - done); + + writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM; + writeback_req_opts.gfp_mask = GFP_NOIO; + writeback_req_opts.end_fn = writeback_end_req; + writeback_req_opts.priv_data = cache; + + writeback_req_opts.kmem.data = addr; + writeback_req_opts.kmem.opf = REQ_OP_WRITE; + writeback_req_opts.kmem.len = req_len; + writeback_req_opts.kmem.backing_off = key->off + done; + + writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts); + + atomic_inc(&cache->writeback_ctx.pending); + backing_dev_req_submit(writeback_req, true); + + done += req_len; + if (done < key->len) + goto next_req; +} + +static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance) +{ + struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree; + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + cache->writeback_ctx.ret = 0; + cache->writeback_ctx.advance = advance; + atomic_set(&cache->writeback_ctx.pending, 1); + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_writeback(cache, key); + cache_key_delete(key); + } + } + writeback_ctx_end(cache, 0); +} + +static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + u32 i; + + /* Iterate through all keys 
in the kset and write each back to storage */ + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto clear_tree; + } + + cache_subtree = get_subtree(&cache->writeback_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->writeback_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +clear_tree: + cache_tree_clear(&cache->writeback_key_tree); + return ret; +} + +static void last_kset_writeback(struct pcache_cache *cache, + struct pcache_cache_kset_onmedia *last_kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->dirty_tail_lock); + cache->dirty_tail.cache_seg = next_seg; + cache->dirty_tail.seg_off = 0; + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); +} + +void cache_writeback_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 delay; + int ret; + + mutex_lock(&cache->writeback_lock); + if (atomic_read(&cache->writeback_ctx.pending)) + goto unlock; + + if (pcache_is_stopping(pcache)) + goto unlock; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + if (is_cache_clean(cache, &dirty_tail)) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + last_kset_writeback(cache, kset_onmedia); + delay = 0; + goto queue_work; + } + + ret = cache_kset_insert_tree(cache, kset_onmedia); + if (ret) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia)); + delay = 0; +queue_work: + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay); +unlock: + mutex_unlock(&cache->writeback_lock); +} diff --git a/drivers/md/dm-pcache/dm_pcache.c b/drivers/md/dm-pcache/dm_pcache.c new file mode 100644 index 000000000000..e5f5936fa6f0 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include "../dm-core.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +void pcache_defer_reqs_kick(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + spin_lock(&cache->seg_map_lock); + if (!cache->cache_full) + queue_work(pcache->task_wq, &pcache->defered_req_work); + spin_unlock(&cache->seg_map_lock); +} + +static void defer_req(struct pcache_request *pcache_req) +{ + struct dm_pcache *pcache = pcache_req->pcache; + + BUG_ON(!list_empty(&pcache_req->list_node)); + + spin_lock(&pcache->defered_req_list_lock); + list_add(&pcache_req->list_node, &pcache->defered_req_list); + pcache_defer_reqs_kick(pcache); + spin_unlock(&pcache->defered_req_list_lock); 
+} + +static void defered_req_fn(struct work_struct *work) +{ + struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work); + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + int ret; + + if (pcache_is_stopping(pcache)) + return; + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req->ret = 0; + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + } +} + +void pcache_req_get(struct pcache_request *pcache_req) +{ + kref_get(&pcache_req->ref); +} + +static void end_req(struct kref *ref) +{ + struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref); + struct dm_pcache *pcache = pcache_req->pcache; + struct bio *bio = pcache_req->bio; + int ret = pcache_req->ret; + + if (ret == -EBUSY) { + pcache_req_get(pcache_req); + defer_req(pcache_req); + } else { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + + if (atomic_dec_and_test(&pcache->inflight_reqs)) + wake_up(&pcache->inflight_wq); + } +} + +void pcache_req_put(struct pcache_request *pcache_req, int ret) +{ + /* Set the return status if it is not already set */ + if (ret && !pcache_req->ret) + pcache_req->ret = ret; + + kref_put(&pcache_req->ref, end_req); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->cache_dev.dm_dev); + if (ret) { + *error = "Error opening cache device"; + return ret; + } + + return 0; +} + +static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->backing_dev.dm_dev); + if (ret) { + *error = "Error opening backing device"; + return ret; + } + + return 0; +} + +static void pcache_init_opts(struct pcache_cache_options *opts) +{ + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + opts->data_crc = false; +} + +static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + struct pcache_cache_options *opts = &pcache->opts; + static const struct dm_arg _args[] = { + {0, 4, "Invalid number of cache option arguments"}, + }; + unsigned int argc; + const char *arg; + int ret; + + pcache_init_opts(opts); + if (!as->argc) + return 0; + + ret = dm_read_arg_group(_args, as, &argc, error); + if (ret) + return -EINVAL; + + while (argc) { + arg = dm_shift_arg(as); + argc--; + + if (!strcmp(arg, "cache_mode")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "writeback")) { + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + } else { + *error = "Invalid cache mode parameter"; + return -EINVAL; + } + argc--; + } else if (!strcmp(arg, "data_crc")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "true")) { + opts->data_crc = true; + } else if (!strcmp(arg, "false")) { + opts->data_crc = false; + } else { + 
*error = "Invalid data crc parameter"; + return -EINVAL; + } + argc--; + } else { + *error = "Unrecognised cache option requested"; + return -EINVAL; + } + } + + return 0; +} + +static int pcache_start(struct dm_pcache *pcache, char **error) +{ + int ret; + + ret = cache_dev_start(pcache); + if (ret) { + *error = "Failed to start cache dev"; + return ret; + } + + ret = backing_dev_start(pcache); + if (ret) { + *error = "Failed to start backing dev"; + goto stop_cache; + } + + ret = pcache_cache_start(pcache); + if (ret) { + *error = "Failed to start pcache"; + goto stop_backing; + } + + return 0; +stop_backing: + backing_dev_stop(pcache); +stop_cache: + cache_dev_stop(pcache); + + return ret; +} + +static void pcache_destroy_args(struct dm_pcache *pcache) +{ + if (pcache->cache_dev.dm_dev) + dm_put_device(pcache->ti, pcache->cache_dev.dm_dev); + if (pcache->backing_dev.dm_dev) + dm_put_device(pcache->ti, pcache->backing_dev.dm_dev); +} + +static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv, + char **error) +{ + struct dm_arg_set as; + int ret; + + as.argc = argc; + as.argv = argv; + + /* + * Parse cache device + */ + ret = parse_cache_dev(pcache, &as, error); + if (ret) + return ret; + /* + * Parse backing device + */ + ret = parse_backing_dev(pcache, &as, error); + if (ret) + goto out; + /* + * Parse optional arguments + */ + ret = parse_cache_opts(pcache, &as, error); + if (ret) + goto out; + + return 0; +out: + pcache_destroy_args(pcache); + return ret; +} + +static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct mapped_device *md = ti->table->md; + struct dm_pcache *pcache; + int ret; + + if (md->map) { + ti->error = "Don't support table loading for live md"; + return -EOPNOTSUPP; + } + + /* Allocate memory for the cache structure */ + pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL); + if (!pcache) + return -ENOMEM; + + pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM, + 0, md->name); + if (!pcache->task_wq) { + ret = -ENOMEM; + goto free_pcache; + } + + spin_lock_init(&pcache->defered_req_list_lock); + INIT_LIST_HEAD(&pcache->defered_req_list); + INIT_WORK(&pcache->defered_req_work, defered_req_fn); + pcache->ti = ti; + + ret = pcache_parse_args(pcache, argc, argv, &ti->error); + if (ret) + goto destroy_wq; + + ret = pcache_start(pcache, &ti->error); + if (ret) + goto destroy_args; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->per_io_data_size = sizeof(struct pcache_request); + ti->private = pcache; + atomic_set(&pcache->inflight_reqs, 0); + atomic_set(&pcache->state, PCACHE_STATE_RUNNING); + init_waitqueue_head(&pcache->inflight_wq); + + return 0; +destroy_args: + pcache_destroy_args(pcache); +destroy_wq: + destroy_workqueue(pcache->task_wq); +free_pcache: + kfree(pcache); + + return ret; +} + +static void defer_req_stop(struct dm_pcache *pcache) +{ + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + + flush_work(&pcache->defered_req_work); + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req_put(pcache_req, -EIO); + } +} + +static void dm_pcache_dtr(struct dm_target *ti) +{ + struct dm_pcache *pcache; + + pcache = ti->private; + atomic_set(&pcache->state, PCACHE_STATE_STOPPING); + 
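/*
 * Userspace sketch of the optional-argument handling in parse_cache_opts()
 * above. Judging only from that parser, a dmsetup table line would look
 * roughly like the following (hypothetical example, inferred rather than
 * taken from documentation):
 *
 *   0 <sectors> pcache <cache_dev> <backing_dev> 4 cache_mode writeback data_crc true
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static int parse_opts(int argc, char **argv, bool *data_crc)
{
        int i = 0;

        while (i < argc) {
                if (!strcmp(argv[i], "cache_mode")) {
                        /* only "writeback" is accepted by the parser above */
                        if (i + 1 >= argc || strcmp(argv[i + 1], "writeback"))
                                return -1;
                        i += 2;
                } else if (!strcmp(argv[i], "data_crc")) {
                        if (i + 1 >= argc)
                                return -1;
                        if (!strcmp(argv[i + 1], "true"))
                                *data_crc = true;
                        else if (!strcmp(argv[i + 1], "false"))
                                *data_crc = false;
                        else
                                return -1;
                        i += 2;
                } else {
                        return -1;      /* unrecognised option */
                }
        }
        return 0;
}

int main(void)
{
        char *argv[] = { "cache_mode", "writeback", "data_crc", "true" };
        bool data_crc = false;

        if (!parse_opts(4, argv, &data_crc))
                printf("data_crc=%d\n", data_crc);
        return 0;
}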
defer_req_stop(pcache); + + wait_event(pcache->inflight_wq, + atomic_read(&pcache->inflight_reqs) == 0); + + pcache_cache_stop(pcache); + backing_dev_stop(pcache); + cache_dev_stop(pcache); + + pcache_destroy_args(pcache); + drain_workqueue(pcache->task_wq); + destroy_workqueue(pcache->task_wq); + + kfree(pcache); +} + +static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request)); + struct dm_pcache *pcache = ti->private; + int ret; + + pcache_req->pcache = pcache; + kref_init(&pcache_req->ref); + pcache_req->ret = 0; + pcache_req->bio = bio; + pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + pcache_req->data_len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&pcache_req->list_node); + atomic_inc(&pcache->inflight_reqs); + + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + + return DM_MAPIO_SUBMITTED; +} + +static void dm_pcache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u", + cache_dev->sb_flags, + cache_dev->seg_num, + cache->n_segs, + bitmap_weight(cache->seg_map, cache->n_segs), + pcache_cache_get_gc_percent(cache), + cache->cache_info.flags, + cache->key_head.cache_seg->cache_seg_id, + cache->key_head.seg_off, + cache->dirty_tail.cache_seg->cache_seg_id, + cache->dirty_tail.seg_off, + cache->key_tail.cache_seg->cache_seg_id, + cache->key_tail.seg_off); + break; + case STATUSTYPE_TABLE: + DMEMIT("%s %s 4 cache_mode writeback crc %s", + cache_dev->dm_dev->name, + backing_dev->dm_dev->name, + cache_data_crc_on(cache) ? 
"true" : "false"); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int dm_pcache_message(struct dm_target *ti, unsigned int argc, + char **argv, char *result, unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + unsigned long val; + + if (argc != 2) + goto err; + + if (!strcasecmp(argv[0], "gc_percent")) { + if (kstrtoul(argv[1], 10, &val)) + goto err; + + return pcache_cache_set_gc_percent(&pcache->cache, val); + } +err: + return -EINVAL; +} + +static struct target_type dm_pcache_target = { + .name = "pcache", + .version = {0, 1, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON, + .ctr = dm_pcache_ctr, + .dtr = dm_pcache_dtr, + .map = dm_pcache_map_bio, + .status = dm_pcache_status, + .message = dm_pcache_message, +}; + +static int __init dm_pcache_init(void) +{ + int ret; + + ret = pcache_backing_init(); + if (ret) + goto err; + + ret = pcache_cache_init(); + if (ret) + goto backing_exit; + + ret = dm_register_target(&dm_pcache_target); + if (ret) + goto cache_exit; + return 0; + +cache_exit: + pcache_cache_exit(); +backing_exit: + pcache_backing_exit(); +err: + return ret; +} +module_init(dm_pcache_init); + +static void __exit dm_pcache_exit(void) +{ + dm_unregister_target(&dm_pcache_target); + pcache_cache_exit(); + pcache_backing_exit(); +} +module_exit(dm_pcache_exit); + +MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device"); +MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-pcache/dm_pcache.h b/drivers/md/dm-pcache/dm_pcache.h new file mode 100644 index 000000000000..b4e06be0c0b9 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _DM_PCACHE_H +#define _DM_PCACHE_H +#include <linux/device-mapper.h> + +#include "../dm-core.h" + +#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev)) +#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev)) +#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache)) + +#define PCACHE_STATE_RUNNING 1 +#define PCACHE_STATE_STOPPING 2 + +struct pcache_cache_dev; +struct pcache_backing_dev; +struct pcache_cache; +struct pcache_cache_options; +struct dm_pcache { + struct dm_target *ti; + struct pcache_cache_dev cache_dev; + struct pcache_backing_dev backing_dev; + struct pcache_cache cache; + struct pcache_cache_options opts; + + spinlock_t defered_req_list_lock; + struct list_head defered_req_list; + struct workqueue_struct *task_wq; + + struct work_struct defered_req_work; + + atomic_t state; + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; +}; + +static inline bool pcache_is_stopping(struct dm_pcache *pcache) +{ + return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING); +} + +#define pcache_dev_err(pcache, fmt, ...) \ + pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_info(pcache, fmt, ...) \ + pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_debug(pcache, fmt, ...) 
\ + pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) + +struct pcache_request { + struct dm_pcache *pcache; + struct bio *bio; + + u64 off; + u32 data_len; + + struct kref ref; + int ret; + + struct list_head list_node; +}; + +void pcache_req_get(struct pcache_request *pcache_req); +void pcache_req_put(struct pcache_request *pcache_req, int ret); + +void pcache_defer_reqs_kick(struct dm_pcache *pcache); + +#endif /* _DM_PCACHE_H */ diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h new file mode 100644 index 000000000000..d427e534727c --- /dev/null +++ b/drivers/md/dm-pcache/pcache_internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_INTERNAL_H +#define _PCACHE_INTERNAL_H + +#include <linux/delay.h> +#include <linux/crc32c.h> + +#define pcache_err(fmt, ...) \ + pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_info(fmt, ...) \ + pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_debug(fmt, ...) \ + pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define PCACHE_KB (1024ULL) +#define PCACHE_MB (1024 * PCACHE_KB) + +/* Maximum number of metadata indices */ +#define PCACHE_META_INDEX_MAX 2 + +#define PCACHE_CRC_SEED 0x3B15A +/* + * struct pcache_meta_header - PCACHE metadata header structure + * @crc: CRC checksum for validating metadata integrity. + * @seq: Sequence number to track metadata updates. + * @version: Metadata version. + * @res: Reserved space for future use. + */ +struct pcache_meta_header { + __u32 crc; + __u8 seq; + __u8 version; + __u16 res; +}; + +/* + * pcache_meta_crc - Calculate CRC for the given metadata header. + * @header: Pointer to the metadata header. + * @meta_size: Size of the metadata structure. + * + * Returns the CRC checksum calculated by excluding the CRC field itself. + */ +static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size) +{ + return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4); +} + +/* + * pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow. + * @seq1: First sequence number. + * @seq2: Second sequence number. + * + * Determines if @seq1 is more recent than @seq2 by calculating the signed + * difference between them. This approach allows handling sequence number + * overflow correctly because the difference wraps naturally, and any value + * greater than zero indicates that @seq1 is "after" @seq2. This method + * assumes 8-bit unsigned sequence numbers, where the difference wraps + * around if seq1 overflows past seq2. + * + * Returns: + * - true if @seq1 is more recent than @seq2, indicating it comes "after" + * - false otherwise. + */ +static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2) +{ + return (s8)(seq1 - seq2) > 0; +} + +/* + * pcache_meta_find_latest - Find the latest valid metadata. + * @header: Pointer to the metadata header. + * @meta_size: Size of each metadata block. + * + * Finds the latest valid metadata by checking sequence numbers. If a + * valid entry with the highest sequence number is found, its pointer + * is returned. Returns NULL if no valid metadata is found. 
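/*
 * Userspace sketch of the wrapping sequence-number comparison used by
 * pcache_meta_seq_after() above and the slot selection in
 * pcache_meta_find_latest() below. With 8-bit sequence numbers, the signed
 * difference keeps working across overflow: 0x01 is "after" 0xff.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool seq_after(uint8_t seq1, uint8_t seq2)
{
        return (int8_t)(seq1 - seq2) > 0;
}

int main(void)
{
        /* two metadata copies; the one with the newer sequence wins */
        uint8_t slot_seq[2] = { 0xff, 0x01 };   /* slot 1 wrapped past slot 0 */
        int latest = 0;

        for (int i = 1; i < 2; i++) {
                if (seq_after(slot_seq[i], slot_seq[latest]))
                        latest = i;
        }

        printf("latest slot: %d (seq 0x%02x)\n", latest, slot_seq[latest]);
        printf("seq_after(0x01, 0xff) = %d\n", seq_after(0x01, 0xff));
        return 0;
}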
+ */ +static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header, + u32 meta_size, u32 meta_max_size, + void *meta_ret) +{ + struct pcache_meta_header *meta, *latest = NULL; + u32 i, seq_latest = 0; + void *meta_addr; + + meta = meta_ret; + + for (i = 0; i < PCACHE_META_INDEX_MAX; i++) { + meta_addr = (void *)header + (i * meta_max_size); + if (copy_mc_to_kernel(meta, meta_addr, meta_size)) { + pcache_err("hardware memory error when copy meta"); + return ERR_PTR(-EIO); + } + + /* Skip if CRC check fails, which means corrupted */ + if (meta->crc != pcache_meta_crc(meta, meta_size)) + continue; + + /* Update latest if a more recent sequence is found */ + if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) { + seq_latest = meta->seq; + latest = (void *)header + (i * meta_max_size); + } + } + + if (!latest) + return NULL; + + if (copy_mc_to_kernel(meta_ret, latest, meta_size)) { + pcache_err("hardware memory error"); + return ERR_PTR(-EIO); + } + + return latest; +} + +#endif /* _PCACHE_INTERNAL_H */ diff --git a/drivers/md/dm-pcache/segment.c b/drivers/md/dm-pcache/segment.c new file mode 100644 index 000000000000..7e9818701445 --- /dev/null +++ b/drivers/md/dm-pcache/segment.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/dax.h> + +#include "pcache_internal.h" +#include "cache_dev.h" +#include "segment.h" + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *src; + + iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + src = segment->data + data_off; + copied = _copy_mc_to_iter(src, data_len, &iter); + if (copied != data_len) + return -EIO; + + return 0; +} + +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *dst; + + iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + dst = segment->data + data_off; + copied = _copy_from_iter_flushcache(dst, data_len, &iter); + if (copied != data_len) + return -EIO; + pmem_wmb(); + + return 0; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options) +{ + segment->seg_info = options->seg_info; + segment_info_set_type(segment->seg_info, options->type); + + segment->cache_dev = cache_dev; + segment->seg_id = options->seg_id; + segment->data_size = PCACHE_SEG_SIZE - options->data_off; + segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off; +} diff --git a/drivers/md/dm-pcache/segment.h b/drivers/md/dm-pcache/segment.h new file mode 100644 index 000000000000..deca1ddcb02b --- /dev/null +++ b/drivers/md/dm-pcache/segment.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_SEGMENT_H +#define _PCACHE_SEGMENT_H + +#include <linux/bio.h> +#include <linux/bitfield.h> + +#include "pcache_internal.h" + +struct pcache_segment_info { + struct pcache_meta_header header; + __u32 flags; + __u32 next_seg; +}; + +#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT BIT(0) + +#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK 
GENMASK(4, 1) +#define PCACHE_SEGMENT_TYPE_CACHE_DATA 1 + +static inline bool segment_info_has_next(struct pcache_segment_info *seg_info) +{ + return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT); +} + +static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type) +{ + seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK; + seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type); +} + +static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info) +{ + return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags); +} + +struct pcache_segment_pos { + struct pcache_segment *segment; /* Segment associated with the position */ + u32 off; /* Offset within the segment */ +}; + +struct pcache_segment_init_options { + u8 type; + u32 seg_id; + u32 data_off; + + struct pcache_segment_info *seg_info; +}; + +struct pcache_segment { + struct pcache_cache_dev *cache_dev; + + void *data; + u32 data_size; + u32 seg_id; + + struct pcache_segment_info *seg_info; +}; + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); + +static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len) +{ + BUG_ON(seg_pos->off + len > seg_pos->segment->data_size); + + seg_pos->off += len; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options); +#endif /* _PCACHE_SEGMENT_H */ diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 79ea85d18e24..c6f7129e43d3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3247,7 +3247,7 @@ size_check: rs_reset_inconclusive_reshape(rs); /* Start raid set read-only and assumed clean to change in raid_resume() */ - rs->md.ro = 1; + rs->md.ro = MD_RDONLY; rs->md.in_sync = 1; /* Has to be held on running the array */ @@ -3385,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r /* The MD sync thread can be done with io or be interrupted but still be running */ if (!test_bit(MD_RECOVERY_DONE, &recovery) && (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return st_reshape; @@ -3775,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, } else return -EINVAL; } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } @@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) @@ -3858,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti) */ md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); + rs->md.ro = MD_RDONLY; } } @@ -3953,9 +3956,11 @@ static int 
__load_dirty_region_bitmap(struct raid_set *rs) !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { struct mddev *mddev = &rs->md; - r = mddev->bitmap_ops->load(mddev); - if (r) - DMERR("Failed to load bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->load(mddev); + if (r) + DMERR("Failed to load bitmap"); + } } return r; @@ -3968,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs) int ro = mddev->ro; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; md_update_sb(mddev, 1); mddev->ro = ro; } @@ -4070,10 +4075,12 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize, false); - if (r) - DMERR("Failed to resize bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize); + if (r) + DMERR("Failed to resize bitmap"); + } } /* Check for any resize/reshape on @rs and adjust/initiate */ @@ -4125,7 +4132,7 @@ static void raid_resume(struct dm_target *ti) WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, lockdep_is_held(&mddev->reconfig_mutex))); clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); mddev_unlock_and_resume(mddev); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index a4550975c27d..e9b47b659976 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create( rh->shift = RH_HASH_SHIFT; rh->prime = RH_HASH_MULT; - rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets))); + rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets)); if (!rh->buckets) { DMERR("unable to allocate region hash bucket memory"); kfree(rh); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 58902091bf79..1461dc740dae 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; limits->chunk_sectors = sc->chunk_size; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index bb1a70b5a215..50a52ca50b34 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) return -EINVAL; } - sctx->region_table = vmalloc(array_size(nr_slots, - sizeof(region_table_slot_t))); + sctx->region_table = vmalloc_array(nr_slots, + sizeof(region_table_slot_t)); if (!sctx->region_table) { ti->error = "Cannot allocate region table"; return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 2af5a9514c05..8fede41adec0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, 
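/*
 * Userspace sketch of the overflow-checked limit calculation introduced in
 * stripe_io_hints() above: io_min/io_opt are only updated when neither the
 * shift nor the multiplication overflows, otherwise the stacked defaults are
 * kept. __builtin_mul_overflow stands in here for the kernel's
 * check_shl_overflow()/check_mul_overflow() helpers.
 */
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
        unsigned int chunk_sectors = 1U << 28;  /* deliberately large chunk */
        unsigned int stripes = 16;
        unsigned int io_min, io_opt;

        if (!__builtin_mul_overflow(chunk_sectors, 1U << SECTOR_SHIFT, &io_min) &&
            !__builtin_mul_overflow(io_min, stripes, &io_opt))
                printf("io_min=%u io_opt=%u\n", io_min, io_opt);
        else
                printf("overflow: keeping default io_min/io_opt\n");

        return 0;
}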
static struct target_type error_target = { .name = "error", .version = {1, 7, 0}, - .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM, + .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM | + DM_TARGET_PASSES_INTEGRITY, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 007bb93e5fca..c84149ba4e38 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3031,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, } pool->cell_sort_array = - vmalloc(array_size(CELL_SORT_ARRAY_SIZE, - sizeof(*pool->cell_sort_array))); + vmalloc_array(CELL_SORT_ARRAY_SIZE, + sizeof(*pool->cell_sort_array)); if (!pool->cell_sort_array) { *error = "Error allocating cell sort array"; err_p = ERR_PTR(-ENOMEM); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 810002747091..262e11581f2d 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -17,6 +17,7 @@ #include <linux/minmax.h> #include <linux/sched.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/wait.h> #include "logger.h" @@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY); } -static bool is_zero_block(char *block) -{ - int i; - - for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) { - if (*((u64 *) &block[i])) - return false; - } - - return true; -} - static void copy_from_bio(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b * we acknowledge the bio. */ copy_from_bio(bio, data_vio->vio.data); - data_vio->is_zero = is_zero_block(data_vio->vio.data); + data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE); data_vio->write = true; } @@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion) copy_from_bio(bio, data + data_vio->offset); } - data_vio->is_zero = is_zero_block(data); + data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE); data_vio->read = false; launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index 12f954a0c532..afb062e1f1fb 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { return vdo_log_warning_strerror(UDS_CORRUPT_DATA, @@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) return vdo_log_warning_strerror(UDS_CORRUPT_DATA, diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS; bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = 
bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a44e8c2dccee..f5e5e59b232b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } @@ -490,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio) } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); -static inline bool bio_is_flush_with_data(struct bio *bio) -{ - return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); -} - static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. */ - if (bio_is_flush_with_data(bio)) + if (io->requeue_flush_with_data) return 0; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) return io->sectors; @@ -590,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; + io->requeue_flush_with_data = false; /* one ref is for submission, the other is for completion */ atomic_set(&io->io_count, 2); @@ -948,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) struct mapped_device *md = io->md; blk_status_t io_error; bool requeued; + bool requeue_flush_with_data; requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) @@ -964,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) __dm_start_io_acct(io); dm_end_io_acct(io); } + requeue_flush_with_data = io->requeue_flush_with_data; free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); @@ -976,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) if (requeued) return; - if (bio_is_flush_with_data(bio)) { + if (unlikely(requeue_flush_with_data)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. @@ -1996,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md, } init_clone_info(&ci, io, map, bio, is_abnormal); - if (bio->bi_opf & REQ_PREFLUSH) { + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) { + /* + * The "flush_bypasses_map" is set on targets where it is safe + * to skip the map function and submit bios directly to the + * underlying block devices - currently, it is set for dm-linear + * and dm-stripe. + * + * If we have just one underlying device (i.e. there is one + * linear target or multiple linear targets pointing to the same + * device), we can send the flush with data directly to it. 
+ */ + if (map->flush_bypasses_map) { + struct list_head *devices = dm_table_get_devices(map); + if (devices->next == devices->prev) + goto send_preflush_with_data; + } + if (bio->bi_iter.bi_size) + io->requeue_flush_with_data = true; __send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out; } +send_preflush_with_data: if (static_branch_unlikely(&zoned_enabled) && (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { error = __send_zone_reset_all(&ci); @@ -2908,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; - int r; + int r = 0; lockdep_assert_held(&md->suspend_lock); @@ -2960,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) + if (map && dm_request_based(md)) { dm_stop_queue(md->queue); + set_bit(DMF_QUEUE_STOPPED, &md->flags); + } flush_workqueue(md->wq); @@ -2970,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, task_state); + if (map) + r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); @@ -2983,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (r < 0) { dm_queue_flush(md); - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); @@ -3067,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * so that mapping of targets can work correctly. * Request-based dm is queueing the deferred I/Os in its request_queue. */ - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 334b71404930..84b7e2af6dba 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -34,15 +34,6 @@ #include "md-bitmap.h" #include "md-cluster.h" -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - /* * in-memory bitmap: * @@ -224,6 +215,8 @@ struct bitmap { int cluster_slot; }; +static struct workqueue_struct *md_bitmap_wq; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } -static bool __bitmap_enabled(struct bitmap *bitmap) -{ - return bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - -static bool bitmap_enabled(struct mddev *mddev) +static bool bitmap_enabled(void *data, bool flush) { - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap = data; - if (!bitmap) - return false; + if (!flush) + return true; - return __bitmap_enabled(bitmap); + /* + * If caller want to flush bitmap pages to underlying disks, check if + * there are cached pages in filemap. 
+ */ + return !test_bit(BITMAP_STALE, &bitmap->flags) && + bitmap->storage.filemap != NULL; } /* @@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } - md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); + md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), + page, 0); return 0; } @@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!__bitmap_enabled(bitmap)) + if (!bitmap_enabled(bitmap, true)) return; /* look at each page to see if there are any set bits that need to be @@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - bool rv; + bool rv = false; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return true; /* always resync if no bitmap */ - } spin_lock_irq(&bitmap->counts.lock); - - rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc) { /* locked */ @@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, bitmap_counter_t *bmc; unsigned long flags; - if (bitmap == NULL) { - *blocks = 1024; - return; - } spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) @@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev) struct bitmap *bitmap = mddev->bitmap; int bw; - if (!bitmap) - return; - atomic_inc(&bitmap->behind_writes); bw = atomic_read(&bitmap->behind_writes); if (bw > bitmap->behind_writes_used) @@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; - if (!bitmap) - return; - if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); pr_debug("dec write-behind count %d/%lu\n", @@ -2593,15 +2570,14 @@ err: return ret; } -static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - bool init) +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; - return __bitmap_resize(bitmap, blocks, chunksize, init); + return __bitmap_resize(bitmap, blocks, chunksize, false); } static ssize_t @@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -const struct attribute_group md_bitmap_group = { + +static struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; static struct bitmap_operations bitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_BITMAP, + .name = "bitmap", + }, + .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, @@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = { .start_write = bitmap_start_write, .end_write = bitmap_end_write, + .start_discard = bitmap_start_write, + .end_discard = bitmap_end_write, + .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, @@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = { .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, .free = md_bitmap_free, + + .group = &md_bitmap_group, }; -void mddev_set_bitmap_ops(struct mddev *mddev) +int md_bitmap_init(void) +{ + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + return -ENOMEM; + + 
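/*
 * Userspace sketch of the refactoring pattern around __bitmap_start_sync()
 * above: the "no bitmap" fallback is moved out of the low-level
 * implementation and into an inline wrapper (md_bitmap_start_sync() in
 * md-bitmap.h below), so the ops implementation can assume a bitmap exists.
 * All types, names and values here are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct bitmap_ops_model {
        bool (*start_sync)(void *bitmap, unsigned long offset,
                           unsigned long *blocks);
};

struct mddev_model {
        void *bitmap;                   /* NULL when no bitmap is configured */
        const struct bitmap_ops_model *ops;
};

static bool ops_start_sync(void *bitmap, unsigned long offset,
                           unsigned long *blocks)
{
        /* the implementation no longer needs a NULL check */
        *blocks = 64;
        return false;
}

static bool model_start_sync(struct mddev_model *mddev, unsigned long offset,
                             unsigned long *blocks)
{
        if (!mddev->bitmap) {           /* always resync if no bitmap */
                *blocks = 1024;
                return true;
        }
        return mddev->ops->start_sync(mddev->bitmap, offset, blocks);
}

int main(void)
{
        static const struct bitmap_ops_model ops = { .start_sync = ops_start_sync };
        struct mddev_model with = { .bitmap = (void *)1, .ops = &ops };
        struct mddev_model without = { .bitmap = NULL, .ops = &ops };
        unsigned long blocks;

        printf("with bitmap: %d\n", model_start_sync(&with, 0, &blocks));
        printf("without bitmap: %d (blocks=%lu)\n",
               model_start_sync(&without, 0, &blocks), blocks);
        return 0;
}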
return register_md_submodule(&bitmap_ops.head); +} + +void md_bitmap_exit(void) { - mddev->bitmap_ops = &bitmap_ops; + destroy_workqueue(md_bitmap_wq); + unregister_md_submodule(&bitmap_ops.head); } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 59e9dd45cfde..b42a28fa83a0 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,10 +9,26 @@ #define BITMAP_MAGIC 0x6d746962 +/* + * version 3 is host-endian order, this is deprecated and not used for new + * array + */ +#define BITMAP_MAJOR_LO 3 +#define BITMAP_MAJOR_HOSTENDIAN 3 +/* version 4 is little-endian order, the default value */ +#define BITMAP_MAJOR_HI 4 +/* version 5 is only used for cluster */ +#define BITMAP_MAJOR_CLUSTERED 5 +/* version 6 is only used for lockless bitmap */ +#define BITMAP_MAJOR_LOCKLESS 6 + /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_FIRST_USE = 3, /* llbitmap is just created */ + BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ + BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */ BITMAP_HOSTENDIAN =15, }; @@ -61,11 +77,15 @@ struct md_bitmap_stats { struct file *file; }; +typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + struct bitmap_operations { - bool (*enabled)(struct mddev *mddev); + struct md_submodule_head head; + + bool (*enabled)(void *data, bool flush); int (*create)(struct mddev *mddev); - int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, - bool init); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); @@ -80,10 +100,13 @@ struct bitmap_operations { void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - void (*start_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); - void (*end_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); + md_bitmap_fn *start_write; + md_bitmap_fn *end_write; + md_bitmap_fn *start_discard; + md_bitmap_fn *end_discard; + + sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); + bool (*blocks_synced)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); @@ -101,9 +124,75 @@ struct bitmap_operations { sector_t *hi, bool clear_bits); void (*set_pages)(void *data, unsigned long pages); void (*free)(void *data); + + struct attribute_group *group; }; /* the bitmap API */ -void mddev_set_bitmap_ops(struct mddev *mddev); +static inline bool md_bitmap_registered(struct mddev *mddev) +{ + return mddev->bitmap_ops != NULL; +} + +static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) +{ + /* bitmap_ops must be registered before creating bitmap. 
*/ + if (!md_bitmap_registered(mddev)) + return false; + + if (!mddev->bitmap) + return false; + + return mddev->bitmap_ops->enabled(mddev->bitmap, flush); +} + +static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + /* always resync if no bitmap */ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return true; + } + + return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); +} + +static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return; + } + + mddev->bitmap_ops->end_sync(mddev, offset, blocks); +} + +#ifdef CONFIG_MD_BITMAP +int md_bitmap_init(void); +void md_bitmap_exit(void); +#else +static inline int md_bitmap_init(void) +{ + return 0; +} +static inline void md_bitmap_exit(void) +{ +} +#endif + +#ifdef CONFIG_MD_LLBITMAP +int md_llbitmap_init(void); +void md_llbitmap_exit(void); +#else +static inline int md_llbitmap_init(void) +{ + return 0; +} +static inline void md_llbitmap_exit(void) +{ +} +#endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7..11f1e91d387d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) ret = mddev->bitmap_ops->resize(mddev, le64_to_cpu(msg->high), - 0, false); + 0); break; default: ret = -1; @@ -979,7 +979,7 @@ err: lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b08115375..7033d982d377 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) @@ -256,18 +257,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, + &mddev->bio_set); + if (!bio) return true; - } - - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; } md_account_bio(mddev, &bio); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c new file mode 100644 index 000000000000..1eb434306162 --- /dev/null +++ b/drivers/md/md-llbitmap.c @@ -0,0 +1,1626 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include 
<linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <trace/events/block.h> + +#include "md.h" +#include "md-bitmap.h" + +/* + * #### Background + * + * Redundant data is used to enhance data fault tolerance, and the storage + * methods for redundant data vary depending on the RAID levels. And it's + * important to maintain the consistency of redundant data. + * + * Bitmap is used to record which data blocks have been synchronized and which + * ones need to be resynchronized or recovered. Each bit in the bitmap + * represents a segment of data in the array. When a bit is set, it indicates + * that the multiple redundant copies of that data segment may not be + * consistent. Data synchronization can be performed based on the bitmap after + * power failure or readding a disk. If there is no bitmap, a full disk + * synchronization is required. + * + * #### Key Features + * + * - IO fastpath is lockless, if user issues lots of write IO to the same + * bitmap bit in a short time, only the first write has additional overhead + * to update bitmap bit, no additional overhead for the following writes; + * - support only resync or recover written data, means in the case creating + * new array or replacing with a new disk, there is no need to do a full disk + * resync/recovery; + * + * #### Key Concept + * + * ##### State Machine + * + * Each bit is one byte, contain 6 different states, see llbitmap_state. And + * there are total 8 different actions, see llbitmap_action, can change state: + * + * llbitmap state machine: transitions between states + * + * | | Startwrite | Startsync | Endsync | Abortsync| + * | --------- | ---------- | --------- | ------- | ------- | + * | Unwritten | Dirty | x | x | x | + * | Clean | Dirty | x | x | x | + * | Dirty | x | x | x | x | + * | NeedSync | x | Syncing | x | x | + * | Syncing | x | Syncing | Dirty | NeedSync | + * + * | | Reload | Daemon | Discard | Stale | + * | --------- | -------- | ------ | --------- | --------- | + * | Unwritten | x | x | x | x | + * | Clean | x | x | Unwritten | NeedSync | + * | Dirty | NeedSync | Clean | Unwritten | NeedSync | + * | NeedSync | x | x | Unwritten | x | + * | Syncing | NeedSync | x | Unwritten | NeedSync | + * + * Typical scenarios: + * + * 1) Create new array + * All bits will be set to Unwritten by default, if --assume-clean is set, + * all bits will be set to Clean instead. 
+ * + * 2) write data: raid1/raid10 have a full copy of the data, while raid456 doesn't and + * relies on xor data + * + * 2.1) write new data to raid1/raid10: + * Unwritten --StartWrite--> Dirty + * + * 2.2) write new data to raid456: + * Unwritten --StartWrite--> NeedSync + * + * Because the initial recovery for raid456 is skipped, the xor data is not built + * yet, so the bit must be set to NeedSync first; after the lazy initial recovery is + * finished, the bit will finally be set to Dirty (see 5.1 and 5.4); + * + * 2.3) overwrite existing data + * Clean --StartWrite--> Dirty + * + * 3) daemon, if the array is not degraded: + * Dirty --Daemon--> Clean + * + * 4) discard + * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten + * + * 5) resync and recover + * + * 5.1) common process + * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean + * + * 5.2) resync after power failure + * Dirty --Reload--> NeedSync + * + * 5.3) recovery while replacing with a new disk + * By default, the old bitmap framework will recover all data, and llbitmap + * implements this with a new helper, llbitmap_skip_sync_blocks: + * + * recovery is skipped for bits other than Dirty or Clean; + * + * 5.4) lazy initial recovery for raid5: + * By default, the old bitmap framework only allows a new recovery when there + * are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER is added + * to perform raid456 lazy recovery for set bits (from 2.2). + * + * 6) special handling for degraded arrays: + * + * - Dirty bits will never be cleared, the daemon will just do nothing, so that if + * a disk is readded, Clean bits can be skipped during recovery; + * - Dirty bits will convert to Syncing on start write, to do data recovery + * for newly added disks; + * - New writes will convert bits to NeedSync directly; + * + * ##### Bitmap IO + * + * ##### Chunksize + * + * The default bitmap size is 128k, including a 1k bitmap super block, and + * the default size of the data segment covered by each bit (chunksize) is 64k; + * the chunksize will double each time the total number of + * bits would otherwise be no less than 127k (see llbitmap_init). + * + * ##### READ + * + * While creating the bitmap, all pages will be allocated and read for llbitmap; + * there won't be any reads afterwards. + * + * ##### WRITE + * + * WRITE IO is divided into blocks of the array's logical_block_size, and the dirty + * state of each block is tracked independently, for example: + * + * each page is 4k and contains 8 blocks; each block is 512 bytes and contains 512 bits; + * + * | page0 | page1 | ... | page 31 | + * | | + * | \-----------------------\ + * | | + * | block0 | block1 | ... | block7 | + * | | + * | \-----------------\ + * | | + * | bit0 | bit1 | ... | bit511 | + * + * From the IO path, if one bit is changed to Dirty or NeedSync, the corresponding + * subpage will be marked dirty, and such a block must be written first before the IO is + * issued. This behaviour will affect IO performance; to reduce the impact, if + * multiple bits are changed in the same block in a short time, all bits in this + * block will be changed to Dirty/NeedSync, so that there won't be any overhead + * until the daemon clears dirty bits. + * + * ##### Dirty Bits synchronization + * + * The IO fast path will set bits to Dirty, and those dirty bits will be cleared + * by the daemon after IO is done.
llbitmap_page_ctl is used to synchronize between + * IO path and daemon; + * + * IO path: + * 1) try to grab a reference, if succeed, set expire time after 5s and return; + * 2) if failed to grab a reference, wait for daemon to finish clearing dirty + * bits; + * + * Daemon (Daemon will be woken up every daemon_sleep seconds): + * For each page: + * 1) check if page expired, if not skip this page; for expired page: + * 2) suspend the page and wait for inflight write IO to be done; + * 3) change dirty page to clean; + * 4) resume the page; + */ + +#define BITMAP_DATA_OFFSET 1024 + +/* 64k is the max IO size of sync IO for raid1/raid10 */ +#define MIN_CHUNK_SIZE (64 * 2) + +/* By default, daemon will be woken up every 30s */ +#define DEFAULT_DAEMON_SLEEP 30 + +/* + * Dirtied bits that have not been accessed for more than 5s will be cleared + * by daemon. + */ +#define DEFAULT_BARRIER_IDLE 5 + +enum llbitmap_state { + /* No valid data, init state after assemble the array */ + BitUnwritten = 0, + /* data is consistent */ + BitClean, + /* data will be consistent after IO is done, set directly for writes */ + BitDirty, + /* + * data need to be resynchronized: + * 1) set directly for writes if array is degraded, prevent full disk + * synchronization after readding a disk; + * 2) reassemble the array after power failure, and dirty bits are + * found after reloading the bitmap; + * 3) set for first write for raid5, to build initial xor data lazily + */ + BitNeedSync, + /* data is synchronizing */ + BitSyncing, + BitStateCount, + BitNone = 0xff, +}; + +enum llbitmap_action { + /* User write new data, this is the only action from IO fast path */ + BitmapActionStartwrite = 0, + /* Start recovery */ + BitmapActionStartsync, + /* Finish recovery */ + BitmapActionEndsync, + /* Failed recovery */ + BitmapActionAbortsync, + /* Reassemble the array */ + BitmapActionReload, + /* Daemon thread is trying to clear dirty bits */ + BitmapActionDaemon, + /* Data is deleted */ + BitmapActionDiscard, + /* + * Bitmap is stale, mark all bits in addition to BitUnwritten to + * BitNeedSync. + */ + BitmapActionStale, + BitmapActionCount, + /* Init state is BitUnwritten */ + BitmapActionInit, +}; + +enum llbitmap_page_state { + LLPageFlush = 0, + LLPageDirty, +}; + +struct llbitmap_page_ctl { + char *state; + struct page *page; + unsigned long expire; + unsigned long flags; + wait_queue_head_t wait; + struct percpu_ref active; + /* Per block size dirty state, maximum 64k page / 1 sector = 128 */ + unsigned long dirty[]; +}; + +struct llbitmap { + struct mddev *mddev; + struct llbitmap_page_ctl **pctl; + + unsigned int nr_pages; + unsigned int io_size; + unsigned int blocks_per_page; + + /* shift of one chunk */ + unsigned long chunkshift; + /* size of one chunk in sector */ + unsigned long chunksize; + /* total number of chunks */ + unsigned long chunks; + unsigned long last_end_sync; + /* + * time in seconds that dirty bits will be cleared if the page is not + * accessed. 
+ */ + unsigned long barrier_idle; + /* fires on first BitDirty state */ + struct timer_list pending_timer; + struct work_struct daemon_work; + + unsigned long flags; + __u64 events_cleared; + + /* for slow disks */ + atomic_t behind_writes; + wait_queue_head_t behind_wait; +}; + +struct llbitmap_unplug_work { + struct work_struct work; + struct llbitmap *llbitmap; + struct completion *done; +}; + +static struct workqueue_struct *md_llbitmap_io_wq; +static struct workqueue_struct *md_llbitmap_unplug_wq; + +static char state_machine[BitStateCount][BitmapActionCount] = { + [BitUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitNone, + [BitmapActionStale] = BitNone, + }, + [BitClean] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitDirty] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitClean, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitNeedSync] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNone, + }, + [BitSyncing] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitDirty, + [BitmapActionAbortsync] = BitNeedSync, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, +}; + +static void __llbitmap_flush(struct mddev *mddev); + +static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) +{ + unsigned int idx; + unsigned int offset; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + offset = offset_in_page(pos); + + return llbitmap->pctl[idx]->state[offset]; +} + +/* set all the bits in the subpage as dirty */ +static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, + struct llbitmap_page_ctl *pctl, + unsigned int block) +{ + bool level_456 = raid_is_456(llbitmap->mddev); + unsigned int io_size = llbitmap->io_size; + int pos; + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + switch (pctl->state[pos]) { + case BitUnwritten: + pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; + break; + case BitClean: + pctl->state[pos] = BitDirty; + break; + }; + } +} + +static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, + int offset) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + unsigned int io_size = llbitmap->io_size; + int block = offset / io_size; + int pos; + + if (!test_bit(LLPageDirty, &pctl->flags)) + set_bit(LLPageDirty, &pctl->flags); + + /* + * For degraded array, dirty bits will never be cleared, and we must + * resync all the dirty bits, hence skip infect new dirty bits to + * prevent resync unnecessary data. 
+ */ + if (llbitmap->mddev->degraded) { + set_bit(block, pctl->dirty); + return; + } + + /* + * The subpage usually contains a total of 512 bits. If any single bit + * within the subpage is marked as dirty, the entire sector will be + * written. To avoid impacting write performance, when multiple bits + * within the same sector are modified within llbitmap->barrier_idle, + * all bits in the sector will be collectively marked as dirty at once. + */ + if (test_and_set_bit(block, pctl->dirty)) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + if (pos == offset) + continue; + if (pctl->state[pos] == BitDirty || + pctl->state[pos] == BitNeedSync) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + } +} + +static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, + loff_t pos) +{ + unsigned int idx; + unsigned int bit; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + bit = offset_in_page(pos); + + llbitmap->pctl[idx]->state[bit] = state; + if (state == BitDirty || state == BitNeedSync) + llbitmap_set_page_dirty(llbitmap, idx, bit); +} + +static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) +{ + struct mddev *mddev = llbitmap->mddev; + struct page *page = NULL; + struct md_rdev *rdev; + + if (llbitmap->pctl && llbitmap->pctl[idx]) + page = llbitmap->pctl[idx]->page; + if (page) + return page; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return ERR_PTR(-ENOMEM); + + rdev_for_each(rdev, mddev) { + sector_t sector; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + + (idx << PAGE_SECTORS_SHIFT); + + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, + true)) + return page; + + md_error(mddev, rdev); + } + + __free_page(page); + return ERR_PTR(-EIO); +} + +static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) +{ + struct page *page = llbitmap->pctl[idx]->page; + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + int block; + + for (block = 0; block < llbitmap->blocks_per_page; block++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (!test_and_clear_bit(block, pctl->dirty)) + continue; + + rdev_for_each(rdev, mddev) { + sector_t sector; + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + rdev->sb_start + + (idx << PAGE_SECTORS_SHIFT) + + block * bit_sector; + md_write_metadata(mddev, rdev, sector, + llbitmap->io_size, page, + block * llbitmap->io_size); + } + } +} + +static void active_release(struct percpu_ref *ref) +{ + struct llbitmap_page_ctl *pctl = + container_of(ref, struct llbitmap_page_ctl, active); + + wake_up(&pctl->wait); +} + +static void llbitmap_free_pages(struct llbitmap *llbitmap) +{ + int i; + + if (!llbitmap->pctl) + return; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + if (!pctl || !pctl->page) + break; + + __free_page(pctl->page); + percpu_ref_exit(&pctl->active); + } + + kfree(llbitmap->pctl[0]); + kfree(llbitmap->pctl); + llbitmap->pctl = NULL; +} + +static int llbitmap_cache_pages(struct llbitmap *llbitmap) +{ + struct llbitmap_page_ctl *pctl; + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + + BITMAP_DATA_OFFSET, PAGE_SIZE); + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( + 
llbitmap->blocks_per_page)); + int i; + + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!llbitmap->pctl) + return -ENOMEM; + + size = round_up(size, cache_line_size()); + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); + if (!pctl) { + kfree(llbitmap->pctl); + return -ENOMEM; + } + + llbitmap->nr_pages = nr_pages; + + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { + struct page *page = llbitmap_read_page(llbitmap, i); + + llbitmap->pctl[i] = pctl; + + if (IS_ERR(page)) { + llbitmap_free_pages(llbitmap); + return PTR_ERR(page); + } + + if (percpu_ref_init(&pctl->active, active_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + __free_page(page); + llbitmap_free_pages(llbitmap); + return -ENOMEM; + } + + pctl->page = page; + pctl->state = page_address(page); + init_waitqueue_head(&pctl->wait); + } + + return 0; +} + +static void llbitmap_init_state(struct llbitmap *llbitmap) +{ + enum llbitmap_state state = BitUnwritten; + unsigned long i; + + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + state = BitClean; + + for (i = 0; i < llbitmap->chunks; i++) + llbitmap_write(llbitmap, state, i); +} + +/* The return value is only used from resync, where @start == @end. */ +static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, + unsigned long start, + unsigned long end, + enum llbitmap_action action) +{ + struct mddev *mddev = llbitmap->mddev; + enum llbitmap_state state = BitNone; + bool level_456 = raid_is_456(llbitmap->mddev); + bool need_resync = false; + bool need_recovery = false; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return BitNone; + + if (action == BitmapActionInit) { + llbitmap_init_state(llbitmap); + return BitNone; + } + + while (start <= end) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) { + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", + __func__, start, c, action); + state = BitNeedSync; + goto write_bitmap; + } + + if (c == BitNeedSync) + need_resync = !mddev->degraded; + + state = state_machine[c][action]; + +write_bitmap: + if (unlikely(mddev->degraded)) { + /* For degraded array, mark new data as need sync. */ + if (state == BitDirty && + action == BitmapActionStartwrite) + state = BitNeedSync; + /* + * For degraded array, resync dirty data as well, noted + * if array is still degraded after resync is done, all + * new data will still be dirty until array is clean. + */ + else if (c == BitDirty && + action == BitmapActionStartsync) + state = BitSyncing; + } else if (c == BitUnwritten && state == BitDirty && + action == BitmapActionStartwrite && level_456) { + /* Delay raid456 initial recovery to first write. 
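+ * Mark the region BitNeedSync so that the initial xor data is built + * lazily by recovery (MD_RECOVERY_LAZY_RECOVER) instead of dirtying + * the bit directly.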
*/ + state = BitNeedSync; + } + + if (state == BitNone) { + start++; + continue; + } + + llbitmap_write(llbitmap, state, start); + + if (state == BitNeedSync) + need_resync = !mddev->degraded; + else if (state == BitDirty && + !timer_pending(&llbitmap->pending_timer)) + mod_timer(&llbitmap->pending_timer, + jiffies + mddev->bitmap_info.daemon_sleep * HZ); + + start++; + } + + if (need_resync && level_456) + need_recovery = true; + + if (need_recovery) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else if (need_resync) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + + return state; +} + +static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + +retry: + if (likely(percpu_ref_tryget_live(&pctl->active))) { + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); + return; + } + + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); + goto retry; +} + +static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_put(&pctl->active); +} + +static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_kill(&pctl->active); + + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + return -ETIMEDOUT; + + return 0; +} + +static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + pctl->expire = LONG_MAX; + percpu_ref_resurrect(&pctl->active); + wake_up(&pctl->wait); +} + +static int llbitmap_check_support(struct mddev *mddev) +{ + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", + mdname(mddev)); + return -EBUSY; + } + + if (mddev->bitmap_info.space == 0) { + if (mddev->bitmap_info.default_space == 0) { + pr_notice("md/llbitmap: %s: no space for bitmap\n", + mdname(mddev)); + return -ENOSPC; + } + } + + if (!mddev->persistent) { + pr_notice("md/llbitmap: %s: array must be persistent\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.file) { + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.external) { + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev_is_dm(mddev)) { + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int llbitmap_init(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunksize = MIN_CHUNK_SIZE; + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; + int ret; + + while (chunks > space) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; + + ret 
= llbitmap_cache_pages(llbitmap); + if (ret) + return ret; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionInit); + /* flush initial llbitmap to disk */ + __llbitmap_flush(mddev); + + return 0; +} + +static int llbitmap_read_sb(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + unsigned long daemon_sleep; + unsigned long chunksize; + unsigned long events; + struct page *sb_page; + bitmap_super_t *sb; + int ret = -EINVAL; + + if (!mddev->bitmap_info.offset) { + pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); + return -EINVAL; + } + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("md/llbitmap: %s: read super block failed", + mdname(mddev)); + return -EIO; + } + + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_err("md/llbitmap: %s: invalid super block magic number", + mdname(mddev)); + goto out_put_page; + } + + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { + pr_err("md/llbitmap: %s: invalid super block version", + mdname(mddev)); + goto out_put_page; + } + + if (memcmp(sb->uuid, mddev->uuid, 16)) { + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", + mdname(mddev)); + goto out_put_page; + } + + if (mddev->bitmap_info.space == 0) { + int room = le32_to_cpu(sb->sectors_reserved); + + if (room) + mddev->bitmap_info.space = room; + else + mddev->bitmap_info.space = mddev->bitmap_info.default_space; + } + llbitmap->flags = le32_to_cpu(sb->state); + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { + ret = llbitmap_init(llbitmap); + goto out_put_page; + } + + chunksize = le32_to_cpu(sb->chunksize); + if (!is_power_of_2(chunksize)) { + pr_err("md/llbitmap: %s: chunksize not a power of 2", + mdname(mddev)); + goto out_put_page; + } + + if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", + mdname(mddev), chunksize, mddev->resync_max_sectors, + mddev->bitmap_info.space); + goto out_put_page; + } + + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", + mdname(mddev), daemon_sleep); + goto out_put_page; + } + + events = le64_to_cpu(sb->events); + if (events < mddev->events) { + pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", + mdname(mddev), events, mddev->events); + set_bit(BITMAP_STALE, &llbitmap->flags); + } + + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + mddev->bitmap_info.chunksize = chunksize; + mddev->bitmap_info.daemon_sleep = daemon_sleep; + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunksize = chunksize; + llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); + llbitmap->chunkshift = ffz(~chunksize); + ret = llbitmap_cache_pages(llbitmap); + +out_put_page: + __free_page(sb_page); + kunmap_local(sb); + return ret; +} + +static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) +{ + struct llbitmap *llbitmap = + container_of(pending_timer, struct llbitmap, pending_timer); + + if (work_busy(&llbitmap->daemon_work)) { + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", + mdname(llbitmap->mddev), + llbitmap->mddev->bitmap_info.daemon_sleep); + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); + return; + } + + queue_work(md_llbitmap_io_wq, 
&llbitmap->daemon_work); +} + +static void md_llbitmap_daemon_fn(struct work_struct *work) +{ + struct llbitmap *llbitmap = + container_of(work, struct llbitmap, daemon_work); + unsigned long start; + unsigned long end; + bool restart; + int idx; + + if (llbitmap->mddev->degraded) + return; +retry: + start = 0; + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; + restart = false; + + for (idx = 0; idx < llbitmap->nr_pages; idx++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (idx > 0) { + start = end + 1; + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); + } + + if (!test_bit(LLPageFlush, &pctl->flags) && + time_before(jiffies, pctl->expire)) { + restart = true; + continue; + } + + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", + mdname(llbitmap->mddev), __func__, idx); + continue; + } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); + llbitmap_resume(llbitmap, idx); + } + + /* + * If the daemon took a long time to finish, retry to prevent missing + * clearing dirty bits. + */ + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) + goto retry; + + /* If some page is dirty but not expired, setup timer again */ + if (restart) + mod_timer(&llbitmap->pending_timer, + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); +} + +static int llbitmap_create(struct mddev *mddev) +{ + struct llbitmap *llbitmap; + int ret; + + ret = llbitmap_check_support(mddev); + if (ret) + return ret; + + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); + if (!llbitmap) + return -ENOMEM; + + llbitmap->mddev = mddev; + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; + + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); + atomic_set(&llbitmap->behind_writes, 0); + init_waitqueue_head(&llbitmap->behind_wait); + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = llbitmap; + ret = llbitmap_read_sb(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); + if (ret) { + kfree(llbitmap); + mddev->bitmap = NULL; + } + + return ret; +} + +static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long chunks; + + if (chunksize == 0) + chunksize = llbitmap->chunksize; + + /* If there is enough space, leave the chunksize unchanged. 
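+ * Otherwise, double the chunksize until the bitmap fits into the + * reserved space.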
*/ + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + + return 0; +} + +static int llbitmap_load(struct mddev *mddev) +{ + enum llbitmap_action action = BitmapActionReload; + struct llbitmap *llbitmap = mddev->bitmap; + + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) + action = BitmapActionStale; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); + return 0; +} + +static void llbitmap_destroy(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + mutex_lock(&mddev->bitmap_info.mutex); + + timer_delete_sync(&llbitmap->pending_timer); + flush_workqueue(md_llbitmap_io_wq); + flush_workqueue(md_llbitmap_unplug_wq); + + mddev->bitmap = NULL; + llbitmap_free_pages(llbitmap); + kfree(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static void llbitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_unplug_fn(struct work_struct *work) +{ + struct llbitmap_unplug_work *unplug_work = + container_of(work, struct llbitmap_unplug_work, work); + struct llbitmap *llbitmap = unplug_work->llbitmap; + struct blk_plug plug; + int i; + + 
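+ /* + * Write out all dirty bitmap pages in one plugged batch, then wait for + * the IO to complete before signalling the waiter. + */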
blk_start_plug(&plug); + + for (i = 0; i < llbitmap->nr_pages; i++) { + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + continue; + + llbitmap_write_page(llbitmap, i); + } + + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); + complete(unplug_work->done); +} + +static bool llbitmap_dirty(struct llbitmap *llbitmap) +{ + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + return true; + + return false; +} + +static void llbitmap_unplug(struct mddev *mddev, bool sync) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct llbitmap *llbitmap = mddev->bitmap; + struct llbitmap_unplug_work unplug_work = { + .llbitmap = llbitmap, + .done = &done, + }; + + if (!llbitmap_dirty(llbitmap)) + return; + + /* + * Issue new bitmap IO under submit_bio() context will deadlock: + * - the bio will wait for bitmap bio to be done, before it can be + * issued; + * - bitmap bio will be added to current->bio_list and wait for this + * bio to be issued; + */ + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); + wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); +} + +/* + * Force to write all bitmap pages to disk, called when stopping the array, or + * every daemon_sleep seconds when sync_thread is running. + */ +static void __llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* mark all blocks as dirty */ + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + llbitmap_write_page(llbitmap, i); + } + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); + + timer_delete_sync(&llbitmap->pending_timer); + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); + flush_work(&llbitmap->daemon_work); + + __llbitmap_flush(mddev); +} + +/* This is used for raid5 lazy initial recovery */ +static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + return c == BitClean || c == BitDirty; +} + +static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + /* always skip unwritten blocks */ + if (c == BitUnwritten) + return blocks; + + /* For degraded array, don't skip */ + if (mddev->degraded) + return 0; + + /* For resync also skip clean/dirty blocks */ + if ((c == BitClean || c == BitDirty) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return blocks; + + return 0; +} + +static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + /* + * Handle one 
bit at a time; this is much simpler, and it doesn't matter + * if md_do_sync() loops more times. + */ + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + return llbitmap_state_machine(llbitmap, p, p, + BitmapActionStartsync) == BitSyncing; +} + +/* Something is wrong, sync_thread stopped at @offset */ +static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, + BitmapActionAbortsync); +} + +/* A full sync_thread is finished */ +static void llbitmap_close_sync(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* let daemon_fn clear dirty bits immediately */ + WRITE_ONCE(pctl->expire, jiffies); + } + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionEndsync); +} + +/* + * sync_thread has reached @sector; update metadata every daemon_sleep seconds, + * just in case sync_thread has to restart after a power failure. + */ +static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (sector == 0) { + llbitmap->last_end_sync = jiffies; + return; + } + + if (time_before(jiffies, llbitmap->last_end_sync + + HZ * mddev->bitmap_info.daemon_sleep)) + return; + + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, + BitmapActionEndsync); + __llbitmap_flush(mddev); + + llbitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(mddev->sysfs_completed); +} + +static bool llbitmap_enabled(void *data, bool flush) +{ + struct llbitmap *llbitmap = data; + + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); +} + +static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) +{ + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); +} + +static void llbitmap_write_sb(struct llbitmap *llbitmap) +{ + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); + + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); + llbitmap_write_page(llbitmap, 0); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_update_sb(void *data) +{ + struct llbitmap *llbitmap = data; + struct mddev *mddev = llbitmap->mddev; + struct page *sb_page; + bitmap_super_t *sb; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return; + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("%s: %s: read super block failed", __func__, + mdname(mddev)); + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); + return; + } + + if (mddev->events < llbitmap->events_cleared) + llbitmap->events_cleared = mddev->events; + + sb = kmap_local_page(sb_page); + sb->events = cpu_to_le64(mddev->events); + sb->state = cpu_to_le32(llbitmap->flags); + sb->chunksize = cpu_to_le32(llbitmap->chunksize); + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); + + kunmap_local(sb); + 
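+ /* write the updated super block back to disk and wait for completion */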
llbitmap_write_sb(llbitmap); +} + +static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct llbitmap *llbitmap = data; + + memset(stats, 0, sizeof(*stats)); + + stats->missing_pages = 0; + stats->pages = llbitmap->nr_pages; + stats->file_pages = llbitmap->nr_pages; + + stats->behind_writes = atomic_read(&llbitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); + stats->events_cleared = llbitmap->events_cleared; + + return 0; +} + +/* just flag all pages as needing to be written */ +static void llbitmap_write_all(struct mddev *mddev) +{ + int i; + struct llbitmap *llbitmap = mddev->bitmap; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + } +} + +static void llbitmap_start_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + atomic_inc(&llbitmap->behind_writes); +} + +static void llbitmap_end_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (atomic_dec_and_test(&llbitmap->behind_writes)) + wake_up(&llbitmap->behind_wait); +} + +static void llbitmap_wait_behind_writes(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + wait_event(llbitmap->behind_wait, + atomic_read(&llbitmap->behind_writes) == 0); + +} + +static ssize_t bits_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + int bits[BitStateCount] = {0}; + loff_t start = 0; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "bitmap io error\n"); + } + + while (start < llbitmap->chunks) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) + pr_err("%s: invalid bit %llu state %d\n", + __func__, start, c); + else + bits[c]++; + start++; + } + + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + bits[BitUnwritten], bits[BitClean], bits[BitDirty], + bits[BitNeedSync], bits[BitSyncing]); +} + +static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + ssize_t ret; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", + llbitmap->chunksize, llbitmap->chunkshift, + llbitmap->chunks, mddev->bitmap_info.offset, + llbitmap->mddev->bitmap_info.daemon_sleep); + mutex_unlock(&mddev->bitmap_info.mutex); + + return ret; +} + +static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); + +static ssize_t +daemon_sleep_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); +} + +static ssize_t +daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + mddev->bitmap_info.daemon_sleep = timeout; + return len; +} + +static struct md_sysfs_entry 
llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); + +static ssize_t +barrier_idle_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + return sprintf(page, "%lu\n", llbitmap->barrier_idle); +} + +static ssize_t +barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + llbitmap->barrier_idle = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); + +static struct attribute *md_llbitmap_attrs[] = { + &llbitmap_bits.attr, + &llbitmap_metadata.attr, + &llbitmap_daemon_sleep.attr, + &llbitmap_barrier_idle.attr, + NULL +}; + +static struct attribute_group md_llbitmap_group = { + .name = "llbitmap", + .attrs = md_llbitmap_attrs, +}; + +static struct bitmap_operations llbitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_LLBITMAP, + .name = "llbitmap", + }, + + .enabled = llbitmap_enabled, + .create = llbitmap_create, + .resize = llbitmap_resize, + .load = llbitmap_load, + .destroy = llbitmap_destroy, + + .start_write = llbitmap_start_write, + .end_write = llbitmap_end_write, + .start_discard = llbitmap_start_discard, + .end_discard = llbitmap_end_discard, + .unplug = llbitmap_unplug, + .flush = llbitmap_flush, + + .start_behind_write = llbitmap_start_behind_write, + .end_behind_write = llbitmap_end_behind_write, + .wait_behind_writes = llbitmap_wait_behind_writes, + + .blocks_synced = llbitmap_blocks_synced, + .skip_sync_blocks = llbitmap_skip_sync_blocks, + .start_sync = llbitmap_start_sync, + .end_sync = llbitmap_end_sync, + .close_sync = llbitmap_close_sync, + .cond_end_sync = llbitmap_cond_end_sync, + + .update_sb = llbitmap_update_sb, + .get_stats = llbitmap_get_stats, + .dirty_bits = llbitmap_dirty_bits, + .write_all = llbitmap_write_all, + + .group = &md_llbitmap_group, +}; + +int md_llbitmap_init(void) +{ + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_io_wq) + return -ENOMEM; + + md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_unplug_wq) { + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + return -ENOMEM; + } + + return register_md_submodule(&llbitmap_ops.head); +} + +void md_llbitmap_exit(void) +{ + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + destroy_workqueue(md_llbitmap_unplug_wq); + md_llbitmap_unplug_wq = NULL; + unregister_md_submodule(&llbitmap_ops.head); +} diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e033c26fdd4..41c476b40c7a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. 
*/ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -677,8 +676,64 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. + */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } + return true; + +err: + xa_unlock(&md_submodule); + return false; +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) @@ -713,7 +768,6 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -1020,15 +1074,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. 
- * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -1046,7 +1111,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1356,6 +1421,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -1653,8 +1721,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -2302,8 +2370,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2313,13 +2381,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2331,8 +2401,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2804,24 +2873,24 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } @@ -4150,6 +4219,86 @@ static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, 
"none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4680,6 +4829,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if (err) return err; @@ -5752,6 +5904,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5801,7 +5954,6 @@ static const struct attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -6133,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; @@ -6299,7 +6471,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6372,7 +6544,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; put_pers(pers); - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6392,10 +6564,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6546,7 +6720,8 
@@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6573,7 +6748,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6589,7 +6765,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7307,6 +7483,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7363,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7679,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7710,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7747,9 +7926,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -8491,6 +8670,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8873,18 +9055,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_bitmap_start(struct mddev *mddev, struct md_io_clone *md_io_clone) { + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
+ mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + if (mddev->pers->bitmap_sector) mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) @@ -8893,7 +9081,7 @@ static void md_end_clone_io(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -8920,9 +9108,10 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); - if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { md_io_clone->offset = (*bio)->bi_iter.bi_sector; md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); md_bitmap_start(mddev, md_io_clone); } @@ -8944,7 +9133,7 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -9010,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev + * with the highest index to perform recovery on, to build the initial xor + * data; this is the same as the old bitmap. + */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -9041,6 +9263,14 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares and raid456 lazy initial recovery is + * requested, start recovering the selected rdev from the beginning. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. 
@@ -9061,19 +9291,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) static bool sync_io_within_limit(struct mddev *mddev) { - int io_sectors; - /* * For raid456, sync IO is stripe(4k) per IO, for other levels, it's * RESYNC_PAGES(64k) per IO. */ - if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) - io_sectors = 8; - else - io_sectors = 128; - return atomic_read(&mddev->recovery_active) < - io_sectors * sync_io_depth(mddev); + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } #define SYNC_MARKS 10 @@ -9283,6 +9506,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9298,6 +9527,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ @@ -9607,6 +9837,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9615,6 +9846,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9624,7 +9856,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -9682,7 +9914,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 
@@ -9770,7 +10002,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -9837,6 +10069,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9947,6 +10180,7 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, @@ -10094,8 +10328,16 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + + if (ret) + return ret; + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -10104,11 +10346,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -10127,12 +10364,13 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } @@ -10150,7 +10388,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } @@ -10438,8 +10676,8 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); diff --git a/drivers/md/md.h b/drivers/md/md.h index 51af29a03079..1979c2d4fe89 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -26,7 +26,7 @@ enum md_submodule_type { MD_PERSONALITY = 0, MD_CLUSTER, - MD_BITMAP, /* TODO */ + MD_BITMAP, }; enum md_submodule_id { @@ -38,8 +38,9 @@ enum md_submodule_id { ID_RAID6 = 6, ID_RAID10 = 10, ID_CLUSTER, - ID_BITMAP, /* TODO */ - ID_LLBITMAP, /* TODO */ + ID_BITMAP, + ID_LLBITMAP, + ID_BITMAP_NONE, }; struct md_submodule_head { @@ -565,6 +566,7 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ + enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { @@ -665,6 +667,8 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, + /* raid456 lazy initial recover */ + MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { @@ -796,7 +800,6 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern const struct 
attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { @@ -873,6 +876,7 @@ struct md_io_clone { unsigned long start_time; sector_t offset; unsigned long sectors; + enum stat_group rw; struct bio bio_clone; }; @@ -909,8 +913,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page); +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); @@ -1013,7 +1018,6 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; -extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); @@ -1034,6 +1038,12 @@ static inline bool mddev_is_dm(struct mddev *mddev) return !mddev->gendisk; } +static inline bool raid_is_456(struct mddev *mddev) +{ + return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || + mddev->level == ID_RAID6; +} + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542a..e443e478645a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,6 +382,7 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; @@ -463,21 +464,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, + zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + end = zone->zone_end; - } else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) @@ -612,17 +608,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - bio_chain(split, bio); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 52881e6032da..521625756128 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev 
*mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!mddev->bitmap_ops->enabled(mddev)) { + if (!md_bitmap_enabled(mddev, true)) { raid1_submit_write(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bf44878ec640..592a40233004 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* @@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1366,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags)) { + if (test_bit(WriteMostly, &mirror->rdev->flags) && + md_bitmap_enabled(mddev, false)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' @@ -1376,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1413,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1452,12 +1448,36 @@ retry: return true; } +static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio, + struct bio *bio) +{ + unsigned long max_write_behind = mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + + /* behind write rely on bitmap, see bitmap_operations */ + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + /* Don't do behind IO if reader is waiting, or there are too many. */ + if (!stats.behind_wait && stats.behind_writes < max_write_behind) + alloc_behind_master_bio(r1_bio, bio); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1561,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) @@ -1584,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1612,22 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { - unsigned long max_write_behind = - mddev->bitmap_info.max_write_behind; - struct md_bitmap_stats stats; - int err; - - /* do behind I/O ? - * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); - if (!err && write_behind && !stats.behind_wait && - stats.behind_writes < max_write_behind) - alloc_behind_master_bio(r1_bio, bio); - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) - mddev->bitmap_ops->start_behind_write(mddev); + if (write_behind) + raid1_start_write_behind(mddev, r1_bio, bio); first_clone = 0; } @@ -1683,8 +1684,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -2057,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); + md_bitmap_end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2804,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2829,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2846,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* we are incrementing sector_nr below. 
To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ - - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -3004,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3211,6 +3212,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) @@ -3324,15 +3326,17 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); - int ret; if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, newsize, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index d236ef179cfb..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -178,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c7..14dcd5142eb4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* @@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. 
@@ -1154,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1203,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1241,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1351,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && mddev->cluster_ops->area_resyncing(mddev, WRITE, @@ -1465,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. */ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) @@ -1489,17 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; } @@ -1531,8 +1523,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1679,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1694,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); @@ -3221,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - mddev->bitmap_ops->end_sync(mddev, - mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - 
mddev->bitmap_ops->end_sync(mddev, sect, - &sync_blocks); + md_bitmap_end_sync(mddev, sect, &sync_blocks); } } else { /* completed sync */ @@ -3249,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3351,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, - true); + must_sync = md_bitmap_start_sync(mddev, sect, + &sync_blocks, true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3396,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, still_degraded); - + md_bitmap_start_sync(mddev, sect, &sync_blocks, + still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { int k; @@ -3570,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, - mddev->degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4008,6 +3999,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); @@ -4225,7 +4217,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; - int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4239,9 +4230,12 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, size, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && @@ -4507,8 +4501,9 @@ static int raid10_start_reshape(struct mddev *mddev) oldsize = raid10_size(mddev, 0, 0); newsize = raid10_size(mddev, 0, conf->geo.raid_disks); - if (!mddev_is_clustered(mddev)) { - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (!mddev_is_clustered(mddev) && + md_bitmap_enabled(mddev, false)) { + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; else @@ -4530,13 +4525,14 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + /* cluster can't be setup without bitmap */ + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; ret = 
mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - mddev->bitmap_ops->resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0); goto abort; } } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3f16ad6904a9..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -165,6 +165,8 @@ enum r10bio_state { * so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476..24b32a0c95b4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t resync_offset = conf->mddev->resync_offset; + struct mddev *mddev = conf->mddev; + sector_t resync_offset = mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4116,6 +4117,12 @@ static int handle_stripe_dirtying(struct r5conf *conf, pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); + } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { + /* The initial recover is not done, must read everything */ + rcw = 1; rmw = 2; + pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", + sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", sh->sector, rmw); for (i = disks; i--; ) { @@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && !mddev_is_dm(conf->mddev)) - blk_add_trace_msg(conf->mddev->gendisk->queue, + if (rcw && !mddev_is_dm(mddev)) + blk_add_trace_msg(mddev->gendisk->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -4698,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= + rdev->recovery_offset) { + /* + * in sync if: + * - normal IO, or + * - resync IO that is not lazy recovery + * + * For lazy recovery, we have to mark the rdev without + * In_sync as failed, to build initial xor data. + */ + if (!test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(MD_RECOVERY_LAZY_RECOVER, + &conf->mddev->recovery)) + set_bit(R5_Insync, &dev->flags); + } else if (test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Expanded, &dev->flags)) /* If we've reshaped into here, we assume it is Insync. 
* We will shortly update recovery_offset to make @@ -5468,17 +5486,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - submit_bio_noacct(raid_bio); - raid_bio = split; + + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) @@ -6492,11 +6510,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); return 0; } @@ -6525,8 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - true) && + !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6535,7 +6553,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6557,9 +6576,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n still_degraded = true; } - mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - still_degraded); - + md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6763,7 +6780,8 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - mddev->bitmap_ops->unplug(mddev, true); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7732,6 +7750,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, @@ -8312,7 +8331,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; - int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8322,9 +8340,12 @@ static int raid5_resize(struct mddev *mddev, 
sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors &&
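
The guard around bitmap_ops recurs throughout the hunks above: md_check_recovery() and check_sb_changes() now test md_bitmap_enabled() before dereferencing the ops table, and optional hooks such as daemon_work are additionally checked for presence. A minimal sketch of that call shape, assuming the md driver headers touched by this series (md_bitmap_enabled() itself is declared outside this diff; its arguments are inferred from the call sites here):

/*
 * Sketch of the guarded-hook pattern used by md_check_recovery() in
 * this series.  daemon_work is optional, so both the "bitmap attached"
 * state and the callback pointer are checked before the call.
 */
static void demo_bitmap_daemon_work(struct mddev *mddev)
{
	if (md_bitmap_enabled(mddev, false) &&
	    mddev->bitmap_ops->daemon_work)
		mddev->bitmap_ops->daemon_work(mddev);
}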
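
Two call styles of md_bitmap_enabled() appear in the series: most sites pass false, while the hot write paths (raid1_add_bio_to_plug() and raid5d()'s unplug) pass true where the old code used bitmap_ops->enabled(). The second argument therefore looks like an "also check the bitmap is actually in use" flag; the sketch below is written under that assumption, with demo_flush_bitmap() as a hypothetical stand-in for those call sites:

/*
 * Assumption: md_bitmap_enabled(mddev, true) is the stricter fast-path
 * check that replaces the old mddev->bitmap_ops->enabled(mddev).
 */
static void demo_flush_bitmap(struct mddev *mddev, bool from_schedule)
{
	if (md_bitmap_enabled(mddev, true))
		mddev->bitmap_ops->unplug(mddev, from_schedule);
}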
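
raid0, raid1, raid10 and raid5 all drop the open-coded bio_split()/bio_chain()/submit_bio_noacct() sequence in favour of bio_submit_split_bioset(). From the call sites above, the helper appears to split off the first @sectors, queue the remainder itself, and return the front part, or NULL after completing the bio when the split fails. A hedged sketch of a caller written against that contract; demo_make_request() and demo_map_submit() are illustrative stand-ins, not driver functions:

/* Stand-in for the personality's real mapping/submission routine. */
static void demo_map_submit(struct mddev *mddev, struct bio *bio)
{
	submit_bio_noacct(bio);
}

static bool demo_make_request(struct mddev *mddev, struct bio *bio,
			      unsigned int sectors)
{
	if (sectors < bio_sectors(bio)) {
		bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set);
		if (!bio)	/* split failed, bio already ended */
			return true;
	}

	demo_map_submit(mddev, bio);
	return true;
}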
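
The resync buffer pools switch from bio_init() with an explicit bi_inline_vecs pointer to bio_init_inline(). A small sketch of the allocation pairing used by r1buf_pool_alloc() and r10buf_pool_alloc(); RESYNC_PAGES_DEMO is a stand-in for the drivers' RESYNC_PAGES constant:

#define RESYNC_PAGES_DEMO 16	/* stand-in for RESYNC_PAGES */

static struct bio *demo_alloc_resync_bio(gfp_t gfp)
{
	struct bio *bio = bio_kmalloc(RESYNC_PAGES_DEMO, gfp);

	if (!bio)
		return NULL;
	/* previously: bio_init(bio, NULL, bio->bi_inline_vecs, n, 0) */
	bio_init_inline(bio, NULL, RESYNC_PAGES_DEMO, 0);
	return bio;
}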
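
The sync paths stop calling bitmap_ops->start_sync()/end_sync() directly and go through md_bitmap_start_sync()/md_bitmap_end_sync(), while the calls that stay direct (close_sync(), cond_end_sync()) gain an explicit md_bitmap_enabled() guard. The wrappers themselves are not shown in this diff; below is only a guess at their shape, consistent with those guards:

/*
 * Assumed shape of md_bitmap_end_sync() (the real helper lives in a
 * header outside this diff).  What it reports in *blocks when no
 * bitmap is attached is not visible here; this sketch returns a fixed
 * chunk so callers that loop over a range still make progress.
 */
static inline void demo_bitmap_end_sync(struct mddev *mddev,
					sector_t offset, sector_t *blocks)
{
	if (!md_bitmap_enabled(mddev, false)) {
		*blocks = 1024;	/* assumption, not taken from this diff */
		return;
	}
	mddev->bitmap_ops->end_sync(mddev, offset, blocks);
}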
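
raid10 gains R10BIO_Returned so that a master bio already completed by a failed split is not completed again from raid_end_bio_io(); raid1 reuses its existing R1BIO_Returned flag the same way. The idiom in isolation, with illustrative stand-ins for the r1bio/r10bio state bits:

/* "Complete the master bio only once" idiom from raid_end_bio_io(). */
enum { DEMO_UPTODATE, DEMO_RETURNED };

struct demo_rbio {
	unsigned long state;
	struct bio *master_bio;
};

static void demo_end_rbio(struct demo_rbio *rb)
{
	if (test_and_set_bit(DEMO_RETURNED, &rb->state))
		return;	/* already ended, e.g. by a failed bio split */

	if (!test_bit(DEMO_UPTODATE, &rb->state))
		rb->master_bio->bi_status = BLK_STS_IOERR;
	bio_endio(rb->master_bio);
}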
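
In raid10 the split is hoisted outside the device barrier: allow_barrier() is dropped before bio_submit_split_bioset() and wait_barrier() re-entered afterwards, presumably because the remainder resubmitted by the helper must be able to take the barrier itself, just as the old open-coded submit did. A hypothetical wrapper condensing the pattern shared by the read and write paths above (assumes raid10.h from this diff):

/* Hypothetical helper; no such function exists in the driver. */
static struct bio *demo_r10_split(struct r10conf *conf,
				  struct r10bio *r10_bio,
				  struct bio *bio, int sectors)
{
	allow_barrier(conf);
	bio = bio_submit_split_bioset(bio, sectors, &conf->bio_split);
	wait_barrier(conf, false);

	if (!bio) {	/* bio already ended by the failed split */
		set_bit(R10BIO_Returned, &r10_bio->state);
		return NULL;
	}

	r10_bio->master_bio = bio;
	return bio;
}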
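
Every personality that configures queue limits now also initialises max_hw_wzeroes_unmap_sectors: raid0 forwards its chunk size, while raid1, raid10 and raid5 clear it alongside max_write_zeroes_sectors. A compact sketch of the two cases; the field names come from the diff, the helper name is hypothetical:

static void demo_set_wzeroes_limits(struct queue_limits *lim,
				    unsigned int chunk_sectors,
				    bool raid0_style)
{
	if (raid0_style) {
		lim->max_write_zeroes_sectors = chunk_sectors;
		lim->max_hw_wzeroes_unmap_sectors = chunk_sectors;
	} else {	/* raid1, raid10, raid456 */
		lim->max_write_zeroes_sectors = 0;
		lim->max_hw_wzeroes_unmap_sectors = 0;
	}
}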
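
For the lazy initial recovery added for raid4/5/6, handle_stripe_dirtying() forces reconstruct-write whenever the new optional blocks_synced() callback reports that the stripe has not been through its initial resync, since parity for that stripe cannot yet be trusted. The decision reduced to a predicate:

/*
 * Reduced form of the RCW override in handle_stripe_dirtying(): true
 * when parity for this stripe has never been initialised (lazy
 * recovery, only offered by bitmaps that implement blocks_synced()).
 */
static bool demo_force_rcw(struct mddev *mddev, sector_t stripe_sector)
{
	return mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
	       !mddev->bitmap_ops->blocks_synced(mddev, stripe_sector);
}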
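
Finally, the resize paths of raid1, raid10 and raid5 converge on one shape: skip the bitmap when none is attached, and call ->resize() with its trailing bool argument dropped (the clustered raid10 reshape keeps an unconditional call, since a cluster cannot be set up without a bitmap). A sketch of that common pattern, with demo_resize_bitmap() as an illustrative helper name:

/* Common resize pattern now shared by raid1, raid10 and raid5. */
static int demo_resize_bitmap(struct mddev *mddev, sector_t newsize)
{
	if (!md_bitmap_enabled(mddev, false))
		return 0;

	return mddev->bitmap_ops->resize(mddev, newsize, 0);
}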