summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorAndrew Morton <akpm@zip.com.au>2002-05-19 02:22:50 -0700
committerArnaldo Carvalho de Melo <acme@conectiva.com.br>2002-05-19 02:22:50 -0700
commit799391cc6d6ff6b37192eb49d5ea3e3aa1137e31 (patch)
tree1b6ae5848a6380a7312a32cb303bd2017fce3b3f /include/linux
parenta9f525e6819d2c136e469d13c8b2e8097100930c (diff)
[PATCH] improved I/O scheduling for indirect blocks
Fixes a performance problem with many-small-file writeout. At present, files are written out via their mapping and their indirect blocks are written out via the blockdev mapping. As we know that indirects are disk-adjacent to the data it is better to start I/O against the indirects at the same time as the data. The delalloc paths have code in ext2_writepage() which recognises when the target page->index was at an indirect boundary and does an explicit hunt-and-write against the neighbouring indirect block. Which is ideal. (Unless the file was dirtied seekily and the page which is next to the indirect was not dirtied). This patch does it the other way: when we start writeback against a mapping, also start writeback against any dirty buffers which are attached to mapping->private_list. Let the elevator take care of the rest. The patch makes a number of tuning changes to the writeback path in fs-writeback.c. This is very fiddly code: getting the throughput tuned, getting the data-integrity "sync" operations right, avoiding most of the livelock opportunities, getting the `kupdate' function working efficiently, keeping it all at least somewhat comprehensible. An important intent here is to ensure that metadata blocks for inodes are marked dirty before writeback starts working the blockdev mapping, so all the inode blocks are efficiently written back. The patch removes try_to_writeback_unused_inodes(), which became unreferenced in vm-writeback.patch. The patch has a tweak in ext2_put_inode() to prevent ext2 from incorrectly dropping its preallocation window in response to a random iput(). Generally, many-small-file writeout is a lot faster than 2.5.7 (which is linux-before-I-futzed-with-it). The workload which was optimised was tar xfz /nfs/mountpoint/linux-2.4.18.tar.gz ; sync on mem=128M and mem=2048M. With these patches, 2.5.15 is completing in about 2/3 of the time of 2.5.7. But it is only a shade faster than 2.4.19-pre7. Why is 2.5.7 so much slower than 2.4.19? 
Not sure yet. Heavy dbench loads (dbench 32 on mem=128M) are slightly faster than 2.5.7 and significantly slower than 2.4.19. It appears that the cause is poor read throughput at the later stages of the run, because there are background writeback threads operating at the same time. The 2.4.19-pre8 write scheduling manages to stop writeback during the latter stages of the dbench run in a way which I haven't been able to sanely emulate yet. It may not be desirable to do this anyway - it's optimising for the case where the files are about to be deleted. But it would be good to find a way of "pausing" the writeback for a few seconds to allow readers to get an interval of decent bandwidth. tiobench throughput is basically the same across all recent kernels. CPU load on writes is down maybe 30% in 2.5.15.
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/buffer_head.h12
-rw-r--r--include/linux/fs.h4
-rw-r--r--include/linux/writeback.h8
3 files changed, 13 insertions, 11 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 328af2a6c275..5560b6ee5878 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -29,6 +29,7 @@ enum bh_state_bits {
struct page;
struct kiobuf;
struct buffer_head;
+struct address_space;
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
/*
@@ -145,14 +146,19 @@ int try_to_free_buffers(struct page *);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+
+/* Things to do with buffers at mapping->private_list */
void buffer_insert_list(spinlock_t *lock,
struct buffer_head *, struct list_head *);
-int sync_mapping_buffers(struct address_space *mapping);
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
+int write_mapping_buffers(struct address_space *mapping);
+int inode_has_buffers(struct inode *);
+void invalidate_inode_buffers(struct inode *);
+int fsync_buffers_list(spinlock_t *lock, struct list_head *);
+int sync_mapping_buffers(struct address_space *mapping);
void mark_buffer_async_read(struct buffer_head *bh);
void mark_buffer_async_write(struct buffer_head *bh);
-void invalidate_inode_buffers(struct inode *);
void invalidate_bdev(struct block_device *, int);
void __invalidate_buffers(kdev_t dev, int);
int sync_blockdev(struct block_device *bdev);
@@ -163,8 +169,6 @@ int fsync_dev(kdev_t);
int fsync_bdev(struct block_device *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
-int fsync_buffers_list(spinlock_t *lock, struct list_head *);
-int inode_has_buffers(struct inode *);
struct buffer_head *__get_hash_table(struct block_device *, sector_t, int);
struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b936413f96f2..9b2bfa8cc3d6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -618,7 +618,6 @@ struct super_block {
kdev_t s_dev;
unsigned long s_blocksize;
unsigned long s_old_blocksize;
- unsigned short s_writeback_gen;/* To avoid writeback livelock */
unsigned char s_blocksize_bits;
unsigned char s_dirt;
unsigned long long s_maxbytes; /* Max file size */
@@ -632,9 +631,11 @@ struct super_block {
struct rw_semaphore s_umount;
struct semaphore s_lock;
int s_count;
+ int s_syncing;
atomic_t s_active;
struct list_head s_dirty; /* dirty inodes */
+ struct list_head s_io; /* parked for writeback */
struct list_head s_locked_inodes;/* inodes being synced */
struct list_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
@@ -1116,7 +1117,6 @@ extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
extern void invalidate_inode_pages2(struct address_space *);
extern void write_inode_now(struct inode *, int);
-extern void sync_inodes_sb(struct super_block *);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
extern void sync_supers(void);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e345205b6d86..9dc03210ee62 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -27,15 +27,13 @@ static inline int current_is_pdflush(void)
#define WB_SYNC_NONE 0 /* Don't wait on anything */
#define WB_SYNC_LAST 1 /* Wait on the last-written mapping */
#define WB_SYNC_ALL 2 /* Wait on every mapping */
+#define WB_SYNC_HOLD 3 /* Hold the inode on sb_dirty for sys_sync() */
-void try_to_writeback_unused_inodes(unsigned long pexclusive);
-void writeback_single_inode(struct inode *inode,
- int sync, int *nr_to_write);
void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
unsigned long *older_than_this);
-void writeback_inodes_sb(struct super_block *);
void __wait_on_inode(struct inode * inode);
-void sync_inodes(void);
+void sync_inodes_sb(struct super_block *, int wait);
+void sync_inodes(int wait);
static inline void wait_on_inode(struct inode *inode)
{