From 585d3bc06f4ca57f975a5a1f698f65a45ea66225 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 25 Feb 2009 10:44:19 +0100 Subject: fs: move bdev code out of buffer.c Move some block device related code out from buffer.c and put it in block_dev.c. I'm trying to move non-buffer_head code out of buffer.c Signed-off-by: Al Viro --- fs/buffer.c | 145 ------------------------------------------------------------ 1 file changed, 145 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 891e1c78e4f1..a2fd743d97cb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -165,151 +165,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = fsync_super(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. 
It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. - */ -struct super_block *freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; - sb = get_super(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - __fsync_super(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } - } - } - - sync_blockdev(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * @sb: associated superblock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
- */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); - } - drop_super(sb); - } - - up(&bdev->bd_mount_sem); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; -} -EXPORT_SYMBOL(thaw_bdev); - /* * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, -- cgit v1.2.3 From a64c8610bd3b753c6aff58f51c04cdf0ae478c18 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 27 Mar 2009 22:14:10 -0400 Subject: block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks When doing synchronous writes because wbc->sync_mode is set to WBC_SYNC_ALL, send the write request using WRITE_SYNC, so that we don't unduly block system calls such as fsync(). 
Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara --- fs/buffer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 891e1c78e4f1..e7ebd95e0c68 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1714,6 +1714,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); @@ -1805,7 +1806,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(write_op, bh); nr_underway++; } bh = next; @@ -1859,7 +1860,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(WRITE, bh); + submit_bh(write_op, bh); nr_underway++; } bh = next; -- cgit v1.2.3 From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Apr 2009 07:07:16 -0400 Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225 fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had been left behind Signed-off-by: Al Viro --- fs/block_dev.c | 1 + fs/buffer.c | 1 - include/linux/buffer_head.h | 12 ------------ include/linux/fs.h | 12 ++++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf3..f45dbc18dd17 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..b71e52925c83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3281,7 
+3281,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..fc91665d39d0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -332,22 +332,10 @@ extern int __set_page_dirty_buffers(struct page *page); static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } -static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bdev(struct block_device *bdev) {} - -static inline struct super_block *freeze_bdev(struct block_device *sb) -{ - return NULL; -} - -static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d7bd5447ca3..674134725597 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1886,6 +1886,18 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} +static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void invalidate_bdev(struct block_device *bdev) {} + +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} 
#endif extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; -- cgit v1.2.3 From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 9 +-------- include/linux/mm.h | 1 + mm/page-writeback.c | 22 +++++++++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..73abe6d8218c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b1ea37fc7a24..2223f8dfa568 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); +void account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 
40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } -- cgit v1.2.3 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. 
Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) (limited to 'fs/buffer.c') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- 
a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. 
*/ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void 
fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ 
out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git 
a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); 
/* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.3 From 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:23 -0700 Subject: fs: fix page_mkwrite error cases in core code and btrfs page_mkwrite is called with 
neither the page lock nor the ptl held. This means a page can be concurrently truncated or invalidated out from underneath it. Callers are supposed to prevent truncate races themselves, however previously the only thing they can do in case they hit one is to raise a SIGBUS. A sigbus is wrong for the case that the page has been invalidated or truncated within i_size (eg. hole punched). Callers may also have to perform memory allocations in this path, where again, SIGBUS would be wrong. The previous patch ("mm: page_mkwrite change prototype to match fault") made it possible to properly specify errors. Convert the generic buffer.c code and btrfs to return sane error values (in the case of page removed from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit without doing anything, and the fault will be retried properly). This fixes core code, and converts btrfs as a template/example. All other filesystems defining their own page_mkwrite should be fixed in a similar manner. Acked-by: Chris Mason Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/inode.c | 11 +++++++---- fs/buffer.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec5423790bbb..17e608c4dc70 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4307,10 +4307,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) + if (ret) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; goto out; + } - ret = -EINVAL; + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); size = i_size_read(inode); @@ -4363,8 +4368,6 @@ again: out_unlock: unlock_page(page); out: - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 
6d51a3da362c..0c14f8d52ee5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2320,7 +2320,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ lock_page(page); size = i_size_read(inode); @@ -2340,10 +2340,14 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (!ret) ret = block_commit_write(page, 0, end); -out_unlock: - if (ret) - ret = VM_FAULT_SIGBUS; + if (unlikely(ret)) { + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else /* -ENOSPC, -EIO, etc */ + ret = VM_FAULT_SIGBUS; + } +out_unlock: unlock_page(page); return ret; } -- cgit v1.2.3 From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Mar 2009 15:23:31 -0700 Subject: vmscan: fix it to take care of nodemask try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX pages when watermarks are low. The caller to alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to be used but this is not passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages that are unusable by the caller and in the worst case lead to allocation failure as progress was not made where it is needed. This patch passes the nodemask used for alloc_pages_nodemask() to try_to_free_pages(). 
Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- include/linux/swap.h | 2 +- mm/page_alloc.c | 3 ++- mm/vmscan.c | 13 +++++++++++-- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 0c14f8d52ee5..c77b848c3d43 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -290,7 +290,7 @@ static void free_more_memory(void) &zone); if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); + GFP_NOFS, NULL); } } diff --git a/include/linux/swap.h b/include/linux/swap.h index d30215578877..b8b0c4ce83e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ static inline void lru_cache_add_active_file(struct page *page) /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbd532161f68..0284e528748d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1582,7 +1582,8 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmscan.c b/mm/vmscan.c index f4619c6cd59e..06e72693b458 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,6 +78,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. 
+ */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -1538,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1683,7 +1690,7 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1694,6 +1701,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1714,6 +1722,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; -- cgit v1.2.3 From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. 
I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. [randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. 
I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void 
emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); -- cgit v1.2.3 From 97f76d3d197f201ac8a8a3ced5b8fef81568e50e Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 2 Apr 2009 16:56:46 -0700 Subject: vfs: check bh->b_blocknr only if BH_Mapped is set Check bh->b_blocknr only if BH_Mapped is set. akpm: I doubt if b_blocknr is ever uninitialised here, but it could conceivably cause a problem if we're doing a lookup for block zero. Signed-off-by: Nikanth Karthikesan Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index f5f8b15a6e40..2963858f0f31 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) head = page_buffers(page); bh = head; do { - if (bh->b_blocknr == block) { + if (!buffer_mapped(bh)) + all_mapped = 0; + else if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } - if (!buffer_mapped(bh)) - all_mapped = 0; bh = bh->b_this_page; } while (bh != head); -- cgit v1.2.3