diff options
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--  drivers/md/bcache/alloc.c      | 25
-rw-r--r--  drivers/md/bcache/bcache.h     |  6
-rw-r--r--  drivers/md/bcache/bset.h       |  8
-rw-r--r--  drivers/md/bcache/btree.c      | 53
-rw-r--r--  drivers/md/bcache/journal.c    | 93
-rw-r--r--  drivers/md/bcache/journal.h    | 13
-rw-r--r--  drivers/md/bcache/super.c      | 33
-rw-r--r--  drivers/md/bcache/sysfs.c      | 15
-rw-r--r--  drivers/md/bcache/writeback.c  |  5
9 files changed, 69 insertions, 182 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 48ce750bf70a..7708d92df23e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -24,21 +24,18 @@ * Since the gens and priorities are all stored contiguously on disk, we can * batch this up: We fill up the free_inc list with freshly invalidated buckets, * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. + * free_inc list. * * free_inc isn't the only freelist - if it was, we'd often to sleep while * priorities and gens were being written before we could allocate. c->free is a * smaller freelist, and buckets on that list are always ready to be used. * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * * There is another freelist, because sometimes we have buckets that we know * have nothing pointing into them - these we can reuse without waiting for * priorities to be rewritten. These come from freed btree nodes and buckets * that garbage collection discovered no longer had valid keys pointing into * them (because they were overwritten). That's the unused list - buckets on the - * unused list move to the free list, optionally being discarded in the process. + * unused list move to the free list. * * It's also important to ensure that gens don't wrap around - with respect to * either the oldest gen in the btree or the gen on disk. This is quite @@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) /* * Background allocation thread: scans for buckets to be invalidated, * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. + * then puts them on the various freelists. 
*/ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -321,8 +317,7 @@ static int bch_allocator_thread(void *arg) while (1) { /* * First, we pull buckets off of the unused and free_inc lists, - * possibly issue discards to them, then we add the bucket to - * the free list: + * then we add the bucket to the free list: */ while (1) { long bucket; @@ -330,14 +325,6 @@ static int bch_allocator_thread(void *arg) if (!fifo_pop(&ca->free_inc, bucket)) break; - if (ca->discard) { - mutex_unlock(&ca->set->bucket_lock); - blkdev_issue_discard(ca->bdev, - bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL); - mutex_lock(&ca->set->bucket_lock); - } - allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); @@ -412,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); + + atomic_inc(&ca->set->bucket_wait_cnt); schedule(); + atomic_dec(&ca->set->bucket_wait_cnt); + mutex_lock(&ca->set->bucket_lock); } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && !fifo_pop(&ca->free[reserve], r)); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d33e40d26ea..8ccacba85547 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -447,8 +447,7 @@ struct cache { * free_inc: Incoming buckets - these are buckets that currently have * cached data in them, and we can't reuse them until after we write * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) + * gens/prios, they'll be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); @@ -467,8 +466,6 @@ struct cache { */ unsigned int invalidate_needs_gc; - bool discard; /* Get rid of? 
*/ - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -607,6 +604,7 @@ struct cache_set { */ atomic_t prio_blocked; wait_queue_head_t bucket_wait; + atomic_t bucket_wait_cnt; /* * For any bio we don't skip we subtract the number of sectors from diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 011f6062c4c0..6ee2c6a506a2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -327,9 +327,13 @@ struct btree_iter { /* Fixed-size btree_iter that can be allocated on the stack */ struct btree_iter_stack { - struct btree_iter iter; - struct btree_iter_set stack_data[MAX_BSETS]; + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct btree_iter, iter, data, + struct btree_iter_set stack_data[MAX_BSETS]; + ); }; +static_assert(offsetof(struct btree_iter_stack, iter.data) == + offsetof(struct btree_iter_stack, stack_data)); typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 210b59007d98..3ed39c823826 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -89,8 +89,9 @@ * Test module load/unload */ -#define MAX_GC_TIMES 100 -#define MIN_GC_NODES 100 +#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */ +#define GC_NODES_MIN 10 +#define GC_SLEEP_MS_MIN 10 #define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -371,7 +372,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) { struct bio_vec *bv; void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; @@ -1578,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b) static size_t btree_gc_min_nodes(struct cache_set *c) { - size_t min_nodes; + size_t min_nodes = 
GC_NODES_MIN; - /* - * Since incremental GC would stop 100ms when front - * side I/O comes, so when there are many btree nodes, - * if GC only processes constant (100) nodes each time, - * GC would last a long time, and the front side I/Os - * would run out of the buckets (since no new bucket - * can be allocated during GC), and be blocked again. - * So GC should not process constant nodes, but varied - * nodes according to the number of btree nodes, which - * realized by dividing GC into constant(100) times, - * so when there are many btree nodes, GC can process - * more nodes each time, otherwise, GC will process less - * nodes each time (but no less than MIN_GC_NODES) - */ - min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; - if (min_nodes < MIN_GC_NODES) - min_nodes = MIN_GC_NODES; + if (atomic_read(&c->search_inflight) == 0) { + size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT; + + if (min_nodes < n) + min_nodes = n; + } return min_nodes; } +static uint64_t btree_gc_sleep_ms(struct cache_set *c) +{ + uint64_t sleep_ms; + + if (atomic_read(&c->bucket_wait_cnt) > 0) + sleep_ms = GC_SLEEP_MS_MIN; + else + sleep_ms = GC_SLEEP_MS; + + return sleep_ms; +} static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) @@ -1668,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; - if (atomic_read(&b->c->search_inflight) && - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; @@ -1846,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c) cond_resched(); if (ret == -EAGAIN) - schedule_timeout_interruptible(msecs_to_jiffies - (GC_SLEEP_MS)); + schedule_timeout_interruptible( + msecs_to_jiffies(btree_gc_sleep_ms(c))); else if (ret) pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, 
&c->flags)); @@ -2822,7 +2822,8 @@ void bch_btree_exit(void) int __init bch_btree_init(void) { - btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + btree_io_wq = alloc_workqueue("bch_btree_io", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!btree_io_wq) return -ENOMEM; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index d50eb82ccb4f..144693b7c46a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -275,8 +275,7 @@ bsearch: * ja->cur_idx */ ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + ja->last_idx = (i + 1) % ca->sb.njournal_buckets; } @@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -static bool is_discard_enabled(struct cache_set *s) -{ - struct cache *ca = s->cache; - - if (ca->discard) - return true; - - return false; -} - int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) BUG_ON(i->pin && atomic_read(i->pin) != 1); if (n != i->j.seq) { - if (n == start && is_discard_enabled(s)) - pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - else { - pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - ret = -EIO; - goto err; - } + pr_err("journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; } for (k = i->j.start; @@ -568,65 +552,6 @@ out: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio) -{ - struct journal_device *ja = - container_of(bio, struct journal_device, discard_bio); - struct cache *ca = container_of(ja, struct cache, journal); - - atomic_set(&ja->discard_in_flight, DISCARD_DONE); - - closure_wake_up(&ca->set->journal.wait); - closure_put(&ca->set->cl); -} - -static void journal_discard_work(struct work_struct *work) -{ - struct journal_device *ja = - container_of(work, struct journal_device, discard_work); - - submit_bio(&ja->discard_bio); -} - -static void do_journal_discard(struct cache *ca) -{ - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->discard_bio; - - if (!ca->discard) { - ja->discard_idx = ja->last_idx; - return; - } - - switch (atomic_read(&ja->discard_in_flight)) { - case DISCARD_IN_FLIGHT: - return; - - case DISCARD_DONE: - ja->discard_idx = (ja->discard_idx + 1) % - ca->sb.njournal_buckets; - - atomic_set(&ja->discard_in_flight, DISCARD_READY); - fallthrough; - - case DISCARD_READY: - if (ja->discard_idx == ja->last_idx) - return; - - atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - - bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); - bio->bi_iter.bi_sector = bucket_to_sector(ca->set, - ca->sb.d[ja->discard_idx]); - bio->bi_iter.bi_size = bucket_bytes(ca); - bio->bi_end_io = journal_discard_endio; - - closure_get(&ca->set->cl); - INIT_WORK(&ja->discard_work, journal_discard_work); - queue_work(bch_journal_wq, &ja->discard_work); - } -} - static unsigned int free_journal_buckets(struct cache_set *c) { struct journal *j = &c->journal; @@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c) unsigned int n; /* In case njournal_buckets is not power of 2 */ - if (ja->cur_idx >= ja->discard_idx) - n = ca->sb.njournal_buckets + 
ja->discard_idx - ja->cur_idx; + if (ja->cur_idx >= ja->last_idx) + n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx; else - n = ja->discard_idx - ja->cur_idx; + n = ja->last_idx - ja->cur_idx; if (n > (1 + j->do_reserve)) return n - (1 + j->do_reserve); @@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c) ja->last_idx = (ja->last_idx + 1) % ca->sb.njournal_buckets; - do_journal_discard(ca); - if (c->journal.blocks_free) goto out; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index cd316b4a1e95..9e9d1b3016a5 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -139,19 +139,6 @@ struct journal_device { /* Last journal bucket that still contains an open journal entry */ unsigned int last_idx; - /* Next journal bucket to be discarded */ - unsigned int discard_idx; - -#define DISCARD_READY 0 -#define DISCARD_IN_FLIGHT 1 -#define DISCARD_DONE 2 - /* 1 - discard in flight, -1 - discard completed */ - atomic_t discard_in_flight; - - struct work_struct discard_work; - struct bio discard_bio; - struct bio_vec discard_bv; - /* Bio for journal reads/writes to this device */ struct bio bio; struct bio_vec bv[8]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6d250e366412..c17d4517af22 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1388,7 +1388,7 @@ static CLOSURE_CALLBACK(cached_dev_flush) bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); - continue_at(cl, cached_dev_free, system_wq); + continue_at(cl, cached_dev_free, system_percpu_wq); } static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) @@ -1400,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq); 
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); sema_init(&dc->sb_write_mutex, 1); @@ -1513,7 +1513,7 @@ static CLOSURE_CALLBACK(flash_dev_flush) bcache_device_unlink(d); mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); - continue_at(cl, flash_dev_free, system_wq); + continue_at(cl, flash_dev_free, system_percpu_wq); } static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) @@ -1525,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err_ret; closure_init(&d->cl, NULL); - set_closure_fn(&d->cl, flash_dev_flush, system_wq); + set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq); kobject_init(&d->kobj, &bch_flash_dev_ktype); @@ -1833,7 +1833,7 @@ static CLOSURE_CALLBACK(__cache_set_unregister) mutex_unlock(&bch_register_lock); - continue_at(cl, cache_set_flush, system_wq); + continue_at(cl, cache_set_flush, system_percpu_wq); } void bch_cache_set_stop(struct cache_set *c) @@ -1863,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) __module_get(THIS_MODULE); closure_init(&c->cl, NULL); - set_closure_fn(&c->cl, cache_set_free, system_wq); + set_closure_fn(&c->cl, cache_set_free, system_percpu_wq); closure_init(&c->caching, &c->cl); - set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq); /* Maybe create continue_at_noreturn() and use it here? 
*/ closure_set_stopped(&c->cl); @@ -1939,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!c->uuids) goto err; - c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + c->moving_gc_wq = alloc_workqueue("bcache_gc", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!c->moving_gc_wq) goto err; @@ -2382,9 +2383,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, ca->bdev = file_bdev(bdev_file); ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors(file_bdev(bdev_file))) - ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(ca); if (ret != 0) { if (ret == -ENOMEM) @@ -2531,7 +2529,7 @@ static void register_device_async(struct async_reg_args *args) INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); /* 10 jiffies is enough for a delay */ - queue_delayed_work(system_wq, &args->reg_work, 10); + queue_delayed_work(system_percpu_wq, &args->reg_work, 10); } static void *alloc_holder_object(struct cache_sb *sb) @@ -2905,24 +2903,25 @@ static int __init bcache_init(void) if (bch_btree_init()) goto err; - bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bcache_wq) goto err; /* * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: * - * 1. It used `system_wq` before which also does no memory reclaim. + * 1. It used `system_percpu_wq` before which also does no memory reclaim. * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and * reduced throughput can be observed. * - * We still want to user our own queue to not congest the `system_wq`. + * We still want to user our own queue to not congest the `system_percpu_wq`. 
*/ - bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0); if (!bch_flush_wq) goto err; - bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + bch_journal_wq = alloc_workqueue("bch_journal", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bch_journal_wq) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 826b14cae4e5..72f38e5b6f5c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive); rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(io_disable); -rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(errors); @@ -1036,7 +1035,6 @@ SHOW(__bch_cache) sysfs_hprint(bucket_size, bucket_bytes(ca)); sysfs_hprint(block_size, block_bytes(ca)); sysfs_print(nbuckets, ca->sb.nbuckets); - sysfs_print(discard, ca->discard); sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); sysfs_hprint(btree_written, atomic_long_read(&ca->btree_sectors_written) << 9); @@ -1142,18 +1140,6 @@ STORE(__bch_cache) if (bcache_is_reboot) return -EBUSY; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - if (bdev_max_discard_sectors(ca->bdev)) - ca->discard = v; - - if (v != CACHE_DISCARD(&ca->sb)) { - SET_CACHE_DISCARD(&ca->sb, v); - bcache_write_super(ca->set); - } - } - if (attr == &sysfs_cache_replacement_policy) { v = __sysfs_match_string(cache_replacement_policies, -1, buf); if (v < 0) @@ -1185,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = { &sysfs_block_size, &sysfs_nbuckets, &sysfs_priority_stats, - &sysfs_discard, &sysfs_written, &sysfs_btree_written, &sysfs_metadata_written, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ba73dc1a3df..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg) * 
may set BCH_ENABLE_AUTO_GC via sysfs, then when * BCH_DO_AUTO_GC is set, garbage collection thread * will be wake up here. After moving gc, the shrunk - * btree and discarded free buckets SSD space may be - * helpful for following write requests. + * btree may be helpful for following write requests. */ if (c->gc_after_writeback == (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { @@ -1076,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dc->writeback_write_wq) return -ENOMEM; |
