/*
* mm/page-writeback.c.
*
* Copyright (C) 2002, Linus Torvalds.
*
* Contains functions related to writing back dirty pages at the
* address_space level.
*
* 10Apr2002 akpm@zip.com.au
* Initial version
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
/*
 * The maximum number of pages to write out in a single bdflush/kupdate
 * operation. We do this so we don't hold I_LOCK against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode. Also, the code re-evaluates
 * the dirty state each time it has written this many pages.
*/
#define MAX_WRITEBACK_PAGES 1024
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling. This
 * should probably be scaled by memory size.
*/
#define RATELIMIT_PAGES ((512 * 1024) / PAGE_SIZE)
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
#define SYNC_WRITEBACK_PAGES ((RATELIMIT_PAGES * 3) / 2)
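/*
 * Worked example (illustrative only): with 4096-byte pages,
 * RATELIMIT_PAGES is (512 * 1024) / 4096 = 128, so each CPU re-checks the
 * dirty state after dirtying 128 pages, and SYNC_WRITEBACK_PAGES is
 * (128 * 3) / 2 = 192 pages per non-background writeback attempt.
 */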
/* The following parameters are exported via /proc/sys/vm */
/*
* Dirty memory thresholds, in percentages
*/
/*
* Start background writeback (via pdflush) at this level
*/
int dirty_background_ratio = 40;
/*
* The generator of dirty data starts async writeback at this level
*/
int dirty_async_ratio = 50;
/*
* The generator of dirty data performs sync writeout at this level
*/
int dirty_sync_ratio = 60;
/*
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
*/
int dirty_writeback_centisecs = 5 * 100;
/*
* The longest amount of time for which data is allowed to remain dirty
*/
int dirty_expire_centisecs = 30 * 100;
/* End of sysctl-exported parameters */
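/*
 * Worked example (illustrative only): balance_dirty_pages() below applies
 * these ratios to nr_free_pagecache_pages(). On a machine where that
 * returns 100000 pages, background writeback is kicked off at 40000 dirty
 * pages, the dirtying process itself starts async writeback at 50000
 * dirty+writeback pages and switches to synchronous writeback at 60000.
 */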
static void background_writeout(unsigned long _min_pages);
/*
* balance_dirty_pages() must be called by processes which are
* generating dirty data. It looks at the number of dirty pages
* in the machine and either:
*
* - Starts background writeback or
* - Causes the caller to perform async writeback or
* - Causes the caller to perform synchronous writeback, then
* tells a pdflush thread to perform more writeback or
* - Does nothing at all.
*
* balance_dirty_pages() can sleep.
*
* FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty
* inode on the superblock list. It should wait when nr_to_write is
* exhausted. Doesn't seem to matter.
*/
void balance_dirty_pages(struct address_space *mapping)
{
const int tot = nr_free_pagecache_pages();
struct page_state ps;
int background_thresh, async_thresh, sync_thresh;
unsigned long dirty_and_writeback;
struct backing_dev_info *bdi;
get_page_state(&ps);
dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
background_thresh = (dirty_background_ratio * tot) / 100;
async_thresh = (dirty_async_ratio * tot) / 100;
sync_thresh = (dirty_sync_ratio * tot) / 100;
bdi = mapping->backing_dev_info;
if (dirty_and_writeback > sync_thresh) {
int nr_to_write = SYNC_WRITEBACK_PAGES;
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
} else if (dirty_and_writeback > async_thresh) {
int nr_to_write = SYNC_WRITEBACK_PAGES;
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
get_page_state(&ps);
}
if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
pdflush_operation(background_writeout, 0);
}
/**
* balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* balance_dirty_pages_ratelimited() may sleep.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
static struct rate_limit_struct {
int count;
} ____cacheline_aligned ratelimits[NR_CPUS];
int cpu;
cpu = get_cpu();
if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) {
ratelimits[cpu].count = 0;
put_cpu();
balance_dirty_pages(mapping);
return;
}
put_cpu();
}
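/*
 * Illustrative sketch only (not part of this file): how a write path might
 * use the ratelimited entry point, calling in here once for each newly
 * dirtied page as described above.  example_dirty_one_page() is an
 * invented name.
 */
#if 0
static void example_dirty_one_page(struct page *page)
{
	struct address_space *mapping = page->mapping;

	set_page_dirty(page);			/* mark the page dirty */
	balance_dirty_pages_ratelimited(mapping); /* throttle if over the limits */
}
#endif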
/*
 * Write back at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
static void background_writeout(unsigned long _min_pages)
{
const int tot = nr_free_pagecache_pages();
const int background_thresh = (dirty_background_ratio * tot) / 100;
long min_pages = _min_pages;
int nr_to_write;
CHECK_EMERGENCY_SYNC
do {
struct page_state ps;
get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0)
break;
nr_to_write = MAX_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
} while (nr_to_write <= 0);
blk_run_queues();
}
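/*
 * Worked example (illustrative only): a call with _min_pages = 5000 (say,
 * from wakeup_bdflush() when 5000 pages are dirty) issues writeback in
 * MAX_WRITEBACK_PAGES batches. The loop keeps going as long as each batch
 * is written in full, and stops once a batch comes back short (nothing
 * left to write) or once the dirty count is below the background threshold
 * and the 5000-page quota has been met.
 */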
/*
* Start heavy writeback of everything.
*/
void wakeup_bdflush(void)
{
struct page_state ps;
get_page_state(&ps);
pdflush_operation(background_writeout, ps.nr_dirty);
}
static struct timer_list wb_timer;
/*
* Periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_centisecs. But if a writeback event
* takes longer than a dirty_writeback_centisecs interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
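/*
 * Worked example (illustrative only): with the default settings above and
 * assuming HZ = 100, oldest_jif is jiffies - (3000 * 100) / 100 = jiffies -
 * 3000 ticks, i.e. data dirtied more than 30 seconds ago is written back,
 * and the next run is scheduled (500 * 100) / 100 = 500 ticks (5 seconds)
 * after this one started.
 */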
static void wb_kupdate(unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
struct page_state ps;
int nr_to_write;
sync_supers();
get_page_state(&ps);
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
nr_to_write = ps.nr_dirty;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
blk_run_queues();
yield();
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
mod_timer(&wb_timer, next_jif);
}
static void wb_timer_fn(unsigned long unused)
{
if (pdflush_operation(wb_kupdate, 0) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
static int __init wb_timer_init(void)
{
init_timer(&wb_timer);
wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
wb_timer.data = 0;
wb_timer.function = wb_timer_fn;
add_timer(&wb_timer);
return 0;
}
module_init(wb_timer_init);
/*
* A library function, which implements the vm_writeback a_op. It's fairly
* lame at this time. The idea is: the VM wants to liberate this page,
* so we pass the page to the address_space and give the fs the opportunity
* to write out lots of pages around this one. It allows extent-based
 * filesystems to do intelligent things. It lets delayed-allocate filesystems
* perform better file layout. It lets the address_space opportunistically
* write back disk-contiguous pages which are in other zones.
*
 * FIXME: the VM wants to start I/O against *this* page, because its zone
 * is under pressure. But this function may start writeout against a
* totally different set of pages. Unlikely to be a huge problem, but if it
* is, we could just writepage the page if it is still (PageDirty &&
* !PageWriteback) (See below).
*
* Another option is to just reposition page->mapping->dirty_pages so we
* *know* that the page will be written. That will work fine, but seems
* unpleasant. (If the page is not for-sure on ->dirty_pages we're dead).
* Plus it assumes that the address_space is performing writeback in
* ->dirty_pages order.
*
* So. The proper fix is to leave the page locked-and-dirty and to pass
* it all the way down.
*/
int generic_vm_writeback(struct page *page, int *nr_to_write)
{
struct inode *inode = page->mapping->host;
/*
* We don't own this inode, and we don't want the address_space
* vanishing while writeback is walking its pages.
*/
inode = igrab(inode);
unlock_page(page);
if (inode) {
do_writepages(inode->i_mapping, nr_to_write);
/*
* This iput() will internally call ext2_discard_prealloc(),
* which is rather bogus. But there is no other way of
* dropping our ref to the inode. However, there's no harm
* in dropping the prealloc, because there probably isn't any.
* Just a waste of cycles.
*/
iput(inode);
#if 0
if (!PageWriteback(page) && PageDirty(page)) {
lock_page(page);
if (!PageWriteback(page) && TestClearPageDirty(page))
page->mapping->a_ops->writepage(page);
else
unlock_page(page);
}
#endif
}
return 0;
}
EXPORT_SYMBOL(generic_vm_writeback);
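/*
 * Illustrative sketch only (not part of this file): a filesystem with no
 * smarter clustering strategy could plug the library helper straight into
 * its address_space_operations.  example_aops and example_writepage are
 * invented names, and the exact vm_writeback member layout is an
 * assumption based on the a_op this helper is described as implementing.
 */
#if 0
static struct address_space_operations example_aops = {
	.writepage	= example_writepage,	/* fs-specific writepage */
	.vm_writeback	= generic_vm_writeback,	/* library clustering helper */
};
#endif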
int do_writepages(struct address_space *mapping, int *nr_to_write)
{
if (mapping->a_ops->writepages)
return mapping->a_ops->writepages(mapping, nr_to_write);
return generic_writepages(mapping, nr_to_write);
}
/**
* write_one_page - write out a single page and optionally wait on I/O
*
 * @page: the page to write
 * @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
* write_one_page() returns a negative error code if I/O failed.
*/
int write_one_page(struct page *page, int wait)
{
struct address_space *mapping = page->mapping;
int ret = 0;
BUG_ON(!PageLocked(page));
if (wait && PageWriteback(page))
wait_on_page_writeback(page);
write_lock(&mapping->page_lock);
list_del(&page->list);
if (TestClearPageDirty(page)) {
list_add(&page->list, &mapping->locked_pages);
page_cache_get(page);
write_unlock(&mapping->page_lock);
ret = mapping->a_ops->writepage(page);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
}
page_cache_release(page);
} else {
list_add(&page->list, &mapping->clean_pages);
write_unlock(&mapping->page_lock);
unlock_page(page);
}
return ret;
}
EXPORT_SYMBOL(write_one_page);
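/*
 * Illustrative sketch only (not part of this file): write_one_page()
 * expects a locked page and unlocks it itself, so a caller wanting a
 * synchronous write of a single page might look like this.
 * example_sync_one_page() is an invented name.
 */
#if 0
static int example_sync_one_page(struct page *page)
{
	lock_page(page);		/* write_one_page() requires the page lock */
	return write_one_page(page, 1);	/* write it out and wait for I/O */
}
#endif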
/*
* Add a page to the dirty page list.
*
* It is a sad fact of life that this function is called from several places
* deeply under spinlocking. It may not sleep.
*
* If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers. If the page does
 * not have buffers then when they are later attached they will all be set
* dirty.
*
* The buffers are dirtied before the page is dirtied. There's a small race
* window in which a writepage caller may see the page cleanness but not the
* buffer dirtiness. That's fine. If this code were to set the page dirty
* before the buffers, a concurrent writepage caller could clear the page dirty
* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
* page on the dirty page list.
*
 * There is also a small window where the page is dirty but not yet on
 * ->dirty_pages, and a possibility that by the time the page is added to
 * ->dirty_pages it has been cleaned again. The page lists are somewhat
 * approximate in this regard.
* It's better to have clean pages accidentally attached to dirty_pages than to
* leave dirty pages attached to clean_pages.
*
* We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list. We also use it to protect against clean buffers being
* added to the page after it was set dirty.
*
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*
* For now, we treat swapper_space specially. It doesn't use the normal
* block a_ops.
*
* FIXME: this should move over to fs/buffer.c - buffer_heads have no business in mm/
*/
#include <linux/buffer_head.h>
int __set_page_dirty_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
int ret = 0;
if (mapping == NULL) {
SetPageDirty(page);
goto out;
}
if (!PageUptodate(page))
buffer_error();
spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
do {
if (buffer_uptodate(bh))
set_buffer_dirty(bh);
else
buffer_error();
bh = bh->b_this_page;
} while (bh != head);
}
if (!TestSetPageDirty(page)) {
write_lock(&mapping->page_lock);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
write_unlock(&mapping->page_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
spin_unlock(&mapping->private_lock);
out:
return ret;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
/*
* For address_spaces which do not use buffers. Just set the page's dirty bit
* and move it to the dirty_pages list. Also perform space reservation if
* required.
*
* __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page
* is still safe, as long as it actually manages to find some blocks at
* writeback time.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
int ret = 0;
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping) {
write_lock(&mapping->page_lock);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
write_unlock(&mapping->page_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
return ret;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
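/*
 * Illustrative sketch only (not part of this file): the "bottom-up"
 * dirtying described above.  A buffer-level dirtier marks just its own
 * buffer dirty and then propagates the dirtiness to the page, leaving the
 * page's other buffers alone.  example_dirty_buffer() is an invented name
 * and the exact buffer-flag helper used is an assumption.
 */
#if 0
static void example_dirty_buffer(struct buffer_head *bh)
{
	if (!test_set_buffer_dirty(bh))			/* dirty the buffer itself */
		__set_page_dirty_nobuffers(bh->b_page);	/* then dirty the page */
}
#endif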