summaryrefslogtreecommitdiff
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c141
1 files changed, 90 insertions, 51 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..2b35e80037fe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct rcu_work work;
+ /* List of queued switching contexts for the wb */
+ struct llist_node list;
/*
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
- struct bdi_writeback *new_wb;
struct inode *inodes[];
};
@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
* the specific list @inode was on is ignored and the @inode is put on
* ->b_dirty which is always correct including from ->b_dirty_time.
- * The transfer preserves @inode->dirtied_when ordering. If the @inode
- * was clean, it means it was on the b_attached list, so move it onto
- * the b_attached list of @new_wb.
+ * If the @inode was clean, it means it was on the b_attached list, so
+ * move it onto the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
if (inode->i_state & I_DIRTY_ALL) {
- struct inode *pos;
-
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
+ /*
+ * We need to keep b_dirty list sorted by
+ * dirtied_time_when. However properly sorting the
+ * inode in the list gets too expensive when switching
+ * many inodes. So just attach inode at the end of the
+ * dirty list and clobber the dirtied_time_when.
+ */
+ inode->dirtied_time_when = jiffies;
inode_io_list_move_locked(inode, new_wb,
- pos->i_io_list.prev);
+ &new_wb->b_dirty);
} else {
inode_cgwb_move_to_attached(inode, new_wb);
}
@@ -486,13 +487,11 @@ skip_switch:
return switched;
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw)
{
- struct inode_switch_wbs_context *isw =
- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;
@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
*/
down_read(&bdi->wb_switch_rwsem);
+ inodep = isw->inodes;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
+relock:
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
- for (inodep = isw->inodes; *inodep; inodep++) {
+ while (*inodep) {
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
nr_switched++;
+ inodep++;
+ if (*inodep && need_resched()) {
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+ cond_resched();
+ goto relock;
+ }
}
spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
atomic_dec(&isw_nr_in_flight);
}
+void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
+ switch_work);
+ struct inode_switch_wbs_context *isw, *next_isw;
+ struct llist_node *list;
+
+ /*
+ * Grab out reference to wb so that it cannot get freed under us
+ * after we process all the isw items.
+ */
+ wb_get(new_wb);
+ while (1) {
+ list = llist_del_all(&new_wb->switch_wbs_ctxs);
+ /* Nothing to do? */
+ if (!list)
+ break;
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH
+ * tells the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be
+ * visible.
+ */
+ synchronize_rcu();
+
+ llist_for_each_entry_safe(isw, next_isw, list, list)
+ process_inode_switch_wbs(new_wb, isw);
+ }
+ wb_put(new_wb);
+}
+
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
return true;
}
+static void wb_queue_isw(struct bdi_writeback *wb,
+ struct inode_switch_wbs_context *isw)
+{
+ if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
+ queue_work(isw_wq, &wb->switch_work);
+}
+
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!memcg_css)
goto out_free;
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
- if (!isw->new_wb)
+ if (!new_wb)
goto out_free;
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
goto out_free;
isw->inodes[0] = inode;
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
+ wb_queue_isw(new_wb, isw);
return;
out_free:
atomic_dec(&isw_nr_in_flight);
- if (isw->new_wb)
- wb_put(isw->new_wb);
+ if (new_wb)
+ wb_put(new_wb);
kfree(isw);
}
-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw,
struct list_head *list, int *nr)
{
struct inode *inode;
list_for_each_entry(inode, list, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
continue;
isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb;
int nr;
bool restart = false;
@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
- isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
- if (isw->new_wb)
+ new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (new_wb)
break;
}
- if (unlikely(!isw->new_wb))
- isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+ if (unlikely(!new_wb))
+ new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
nr = 0;
spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* bandwidth restrictions, as writeback of inode metadata is not
* accounted for.
*/
- restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
if (!restart)
- restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
+ &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
if (nr == 0) {
atomic_dec(&isw_nr_in_flight);
- wb_put(isw->new_wb);
+ wb_put(new_wb);
kfree(isw);
return restart;
}
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(wb, new_wb, nr);
+ wb_queue_isw(new_wb, isw);
return restart;
}
@@ -1123,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
dirty = dirty * 10 / 8;
/* issue the writeback work */
- work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ work = kzalloc(sizeof(*work), GFP_NOWAIT);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
@@ -1180,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb)
static int __init cgroup_writeback_init(void)
{
- isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
@@ -1767,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode,
int ret = 0;
spin_lock(&inode->i_lock);
- if (!atomic_read(&inode->i_count))
+ if (!icount_read(inode))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE);
@@ -2442,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
- mod_delayed_work(system_wq, &dirtytime_work, 0);
+ mod_delayed_work(system_percpu_wq, &dirtytime_work, 0);
return ret;
}