From 4bb9c5c02153dfc89a6c73a6f32091413805ad7d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 12 Mar 2009 17:45:27 -0700 Subject: VM, x86, PAT: Change is_linear_pfn_mapping to not use vm_pgoff Impact: fix false positive PAT warnings - also fix VirtalBox hang Use of vma->vm_pgoff to identify the pfnmaps that are fully mapped at mmap time is broken. vm_pgoff is set by generic mmap code even for cases where drivers are setting up the mappings at the fault time. The problem was originally reported here: http://marc.info/?l=linux-kernel&m=123383810628583&w=2 Change is_linear_pfn_mapping logic to overload VM_INSERTPAGE flag along with VM_PFNMAP to mean full PFNMAP setup at mmap time. Problem also tracked at: http://bugzilla.kernel.org/show_bug.cgi?id=12800 Reported-by: Thomas Hellstrom Tested-by: Frans Pop Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha @intel.com> Cc: Nick Piggin Cc: "ebiederm@xmission.com" Cc: # only for 2.6.29.1, not .28 LKML-Reference: <20090313004527.GA7176@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/mm.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 065cdf8c09fb..3daa05feed9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,7 +98,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. Refer note in VM_PFNMAP_AT_MMAP below */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ @@ -126,6 +126,17 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +/* + * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). + * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. + * Note VM_INSERTPAGE flag is overloaded here. i.e, + * VM_INSERTPAGE && !VM_PFNMAP implies + * The vma has had "vm_insert_page()" done on it + * VM_INSERTPAGE && VM_PFNMAP implies + * The vma is PFNMAP with full mapping at mmap time + */ +#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -145,7 +156,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); + return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) -- cgit v1.2.3 From 895791dac6946d535991edd11341046f8e85ea77 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 13 Mar 2009 16:35:44 -0700 Subject: VM, x86, PAT: add a new vm flag to track full pfnmap at mmap Impact: cleanup Add a new vm flag VM_PFN_AT_MMAP to identify a PFNMAP that is fully mapped with remap_pfn_range. Patch removes the overloading of VM_INSERTPAGE from the earlier patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Nick Piggin LKML-Reference: <20090313233543.GA19909@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- include/linux/mm.h | 16 +++------------- mm/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3daa05feed9f..b1ea37fc7a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,12 +98,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ -#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. Refer note in VM_PFNMAP_AT_MMAP below */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -126,17 +127,6 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) -/* - * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). - * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. - * Note VM_INSERTPAGE flag is overloaded here. i.e, - * VM_INSERTPAGE && !VM_PFNMAP implies - * The vma has had "vm_insert_page()" done on it - * VM_INSERTPAGE && VM_PFNMAP implies - * The vma is PFNMAP with full mapping at mmap time - */ -#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) - /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -156,7 +146,7 @@ extern pgprot_t protection_map[16]; */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { - return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); + return (vma->vm_flags & VM_PFN_AT_MMAP); } static inline int is_pfn_mapping(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index d7df5babcba9..2032ad2fc34b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1667,7 +1667,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, */ if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFNMAP_AT_MMAP; + vma->vm_flags |= VM_PFN_AT_MMAP; } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; @@ -1680,7 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * needed from higher level routine calling unmap_vmas */ vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; + vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } -- cgit v1.2.3 From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Tue, 31 Mar 2009 15:19:39 -0700 Subject: vfs: add/use account_page_dirtied() Add a helper function account_page_dirtied(). Use that from two callsites. reiser4 adds a function which adds a third callsite. Signed-off-by: Edward Shishkin Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 9 +-------- include/linux/mm.h | 1 + mm/page-writeback.c | 22 +++++++++++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..73abe6d8218c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page, spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b1ea37fc7a24..2223f8dfa568 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); +void account_page_dirtied(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40ca7cdb653e..6aa92b03c747 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1197,6 +1197,20 @@ int __set_page_dirty_no_writeback(struct page *page) return 0; } +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -1226,13 +1240,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } -- cgit v1.2.3 From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 31 Mar 2009 15:23:21 -0700 Subject: mm: page_mkwrite change prototype to match fault Change the page_mkwrite prototype to take a struct vm_fault, and return VM_FAULT_xxx flags. There should be no functional change. This makes it possible to return much more detailed error information to the VM (and also can provide more information eg. virtual_address to the driver, which might be important in some special cases). This is required for a subsequent fix. And will also make it easier to merge page_mkwrite() with fault() in future. Signed-off-by: Nick Piggin Cc: Chris Mason Cc: Trond Myklebust Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Mark Fasheh Cc: Joel Becker Cc: Artem Bityutskiy Cc: Felix Blyakher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- drivers/video/fb_defio.c | 3 ++- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 5 ++++- fs/buffer.c | 6 +++++- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 ++++- fs/fuse/file.c | 3 ++- fs/gfs2/ops_file.c | 5 ++++- fs/nfs/file.c | 5 ++++- fs/ocfs2/mmap.c | 6 ++++-- fs/ubifs/file.c | 9 ++++++--- fs/xfs/linux-2.6/xfs_file.c | 4 ++-- include/linux/buffer_head.h | 2 +- include/linux/mm.h | 3 ++- mm/memory.c | 26 ++++++++++++++++++++++---- 16 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4e78ce677843..76efe5b71d7d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -505,7 +505,7 @@ prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 082026546aee..0a7a6679ee6e 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -85,8 +85,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { + struct page *page = vmf->page; struct fb_info *info = vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e1d4e30e9d8..7dd1b6d0bf32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2060,7 +2060,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..ec5423790bbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4292,8 +4292,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = fdentry(vma->vm_file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -4362,6 +4363,8 @@ again: out_unlock: unlock_page(page); out: + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 73abe6d8218c..6d51a3da362c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) * unlock the page. */ int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; @@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page, ret = block_commit_write(page, 0, end); out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; + unlock_page(page); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6083bb38057b..990c94000924 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1098,7 +1098,7 @@ extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71d3ecd5db79..dd82ff390067 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5146,8 +5146,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; loff_t size; unsigned long len; int ret = -EINVAL; @@ -5199,6 +5200,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out_unlock; ret = 0; out_unlock: + if (ret) + ret = VM_FAULT_SIGBUS; up_read(&inode->i_alloc_sem); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 821d10f719bd..4e340fedf768 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; /* * Don't use page->mapping as it may become NULL from a * concurrent truncate. diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3b9e8de3500b..70b9b8548945 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -412,6 +413,8 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 90f292b520d2..cec79392e4ba 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; @@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = pagelen; out_unlock: unlock_page(page); + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index eea1d24713ea..b606496b72ec 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -154,8 +154,9 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; sigset_t blocked, oldset; @@ -196,7 +197,8 @@ out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); if (ret2 < 0) mlog_errno(ret2); - + if (ret) + ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..0ff89fe71e51 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made * writable. UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); @@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); if (unlikely(c->ro_media)) - return -EROFS; + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (err == -ENOSPC) ubifs_warn("out of space for mmapped file " "(inode number %lu)", inode->i_ino); - return err; + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e14c4e3aea0c..f4e255441574 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -234,9 +234,9 @@ xfs_file_mmap( STATIC int xfs_vm_page_mkwrite( struct vm_area_struct *vma, - struct page *page) + struct vm_fault *vmf) { - return block_page_mkwrite(vma, page, xfs_get_blocks); + return block_page_mkwrite(vma, vmf, xfs_get_blocks); } const struct file_operations xfs_file_operations = { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..3d7bcde2e332 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -216,7 +216,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2223f8dfa568..aeabe953ba4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -135,6 +135,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -187,7 +188,7 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware diff --git a/mm/memory.c b/mm/memory.c index 5b4ad5e4f98d..cf6873e91c6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1945,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + struct vm_fault vmf; + int tmp; + + vmf.virtual_address = (void __user *)(address & + PAGE_MASK); + vmf.pgoff = old_page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = old_page; + /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait @@ -1956,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; goto unwritable_page; + } /* * Since we dropped the lock we need to revalidate @@ -2106,7 +2119,7 @@ oom: unwritable_page: page_cache_release(old_page); - return VM_FAULT_SIGBUS; + return ret; } /* @@ -2648,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, * to become writable */ if (vma->vm_ops->page_mkwrite) { + int tmp; + unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; + vmf.flags |= FAULT_FLAG_MKWRITE; + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + ret = tmp; anon = 1; /* no anon but release vmf.page */ goto out_unlocked; } -- cgit v1.2.3 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- fs/proc/task_nommu.c | 4 ++-- include/linux/mm.h | 2 +- kernel/fork.c | 1 + mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 6 files changed, 30 insertions(+), 34 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384a..74ea974f5ca6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..370be0a2c909 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4f..bff1f0d475c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..51d1aa21483b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1488,6 +1488,7 @@ void __init proc_caches_init(void) mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a686..4a3841186c11 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..72eda4aee2cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.3