From 8c88cd21b9eccf0b65591056531eba0998212a56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Dec 2002 22:52:04 +0100 Subject: share some code between get_sb_bdev and xfs log/rtdev handling --- include/linux/fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f39d21e5bcd9..b0db195fa19e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1096,15 +1096,20 @@ extern int bd_claim(struct block_device *, void *); extern void bd_release(struct block_device *); extern void blk_run_queues(void); -/* fs/devices.c */ +/* fs/char_dev.c */ extern int register_chrdev(unsigned int, const char *, struct file_operations *); extern int unregister_chrdev(unsigned int, const char *); extern int chrdev_open(struct inode *, struct file *); + +/* fs/block_dev.c */ extern const char *__bdevname(dev_t); extern inline const char *bdevname(struct block_device *bdev) { return __bdevname(bdev->bd_dev); } +extern struct block_device *open_bdev_excl(const char *, int, int, void *); +extern void close_bdev_excl(struct block_device *, int); + extern const char * cdevname(kdev_t); extern const char * kdevname(kdev_t); extern void init_special_inode(struct inode *, umode_t, dev_t); -- cgit v1.2.3 From f99a1a552f067b2352e2525ac72ddb499dd53ec4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:16:52 -0800 Subject: [PATCH] semtimedop - semop() with a timeout Patch from Mark Fasheh (plus a few cleanups and a speedup from yours truly) Adds the semtimedop() function - semop with a timeout. Solaris has this. It's apparently worth a couple of percent to Oracle throughput and given the simplicity, that is sufficient benefit for inclusion IMO. This patch hooks up semtimedop() only for ia64 and ia32. 
--- arch/i386/kernel/sys_i386.c | 6 +++++- arch/ia64/ia32/sys_ia32.c | 1 + arch/ia64/kernel/entry.S | 2 +- include/asm-i386/ipc.h | 1 + include/asm-ia64/unistd.h | 1 + include/linux/sem.h | 2 ++ ipc/sem.c | 29 +++++++++++++++++++++++++++-- ipc/util.c | 7 +++++++ 8 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index e876d4800065..b7271a4c6926 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -140,7 +140,11 @@ asmlinkage int sys_ipc (uint call, int first, int second, switch (call) { case SEMOP: - return sys_semop (first, (struct sembuf *)ptr, second); + return sys_semtimedop (first, (struct sembuf *)ptr, second, NULL); + case SEMTIMEDOP: + return sys_semtimedop(first, (struct sembuf *)ptr, second, + (const struct timespec *)fifth); + case SEMGET: return sys_semget (first, second, third); case SEMCTL: { diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 9090d3ea5d26..c82b6945f28d 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -2124,6 +2124,7 @@ struct ipc_kludge { #define SEMOP 1 #define SEMGET 2 #define SEMCTL 3 +#define SEMTIMEDOP 4 #define MSGSND 11 #define MSGRCV 12 #define MSGGET 13 diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 599ac9dd9a3e..d7318dd16392 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1254,7 +1254,7 @@ sys_call_table: data8 sys_epoll_create data8 sys_epoll_ctl data8 sys_epoll_wait // 1245 - data8 ia64_ni_syscall + data8 sys_semtimedop data8 ia64_ni_syscall data8 ia64_ni_syscall data8 ia64_ni_syscall diff --git a/include/asm-i386/ipc.h b/include/asm-i386/ipc.h index 36f43063adcd..88229f11796f 100644 --- a/include/asm-i386/ipc.h +++ b/include/asm-i386/ipc.h @@ -14,6 +14,7 @@ struct ipc_kludge { #define SEMOP 1 #define SEMGET 2 #define SEMCTL 3 +#define SEMTIMEDOP 4 #define MSGSND 11 #define MSGRCV 12 #define MSGGET 13 diff 
--git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index c1ed0630e97d..afad108b0d0a 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -235,6 +235,7 @@ #define __NR_epoll_create 1243 #define __NR_epoll_ctl 1244 #define __NR_epoll_wait 1245 +#define __NR_semtimedop 1246 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER) diff --git a/include/linux/sem.h b/include/linux/sem.h index 429e72a2eadd..1b869bf3aad9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -140,6 +140,8 @@ struct sysv_sem { asmlinkage long sys_semget (key_t key, int nsems, int semflg); asmlinkage long sys_semop (int semid, struct sembuf *sops, unsigned nsops); asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg); +asmlinkage long sys_semtimedop(int semid, struct sembuf *sops, + unsigned nsops, const struct timespec *timeout); #endif /* __KERNEL__ */ diff --git a/ipc/sem.c b/ipc/sem.c index 166f839d0995..c849b9375eeb 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -968,6 +969,12 @@ static int alloc_undo(struct sem_array *sma, struct sem_undo** unp, int semid, i } asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) +{ + return sys_semtimedop(semid, tsops, nsops, NULL); +} + +asmlinkage long sys_semtimedop(int semid, struct sembuf *tsops, + unsigned nsops, const struct timespec *timeout) { int error = -EINVAL; struct sem_array *sma; @@ -976,7 +983,7 @@ asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) struct sem_undo *un; int undos = 0, decrease = 0, alter = 0; struct sem_queue queue; - + unsigned long jiffies_left = 0; if (nsops < 1 || semid < 0) return -EINVAL; @@ -991,6 +998,19 @@ asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) error=-EFAULT; goto out_free; } + if (timeout) { + struct timespec _timeout; + if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { + error = 
-EFAULT; + goto out_free; + } + if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 || + _timeout.tv_nsec >= 1000000000L) { + error = -EINVAL; + goto out_free; + } + jiffies_left = timespec_to_jiffies(&_timeout); + } lock_semundo(); sma = sem_lock(semid); error=-EINVAL; @@ -1058,7 +1078,10 @@ asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) sem_unlock(sma); unlock_semundo(); - schedule(); + if (timeout) + jiffies_left = schedule_timeout(jiffies_left); + else + schedule(); lock_semundo(); sma = sem_lock(semid); @@ -1084,6 +1107,8 @@ asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) break; } else { error = queue.status; + if (error == -EINTR && timeout && jiffies_left == 0) + error = -EAGAIN; if (queue.prev) /* got Interrupt */ break; /* Everything done by update_queue */ diff --git a/ipc/util.c b/ipc/util.c index 688bd2af4ef6..a2f4c3b1c680 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -562,6 +562,13 @@ asmlinkage long sys_semop (int semid, struct sembuf *sops, unsigned nsops) return -ENOSYS; } +asmlinkage long sys_semtimedop(int semid, struct sembuf *sops, unsigned nsops, + const struct timespec *timeout) +{ + return -ENOSYS; +} + + asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg) { return -ENOSYS; -- cgit v1.2.3 From 3e9afe4cdf0329a82b288d03d420537d7942c9d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:03 -0800 Subject: [PATCH] Remove fail_writepage, redux fail_writepage() does not work. Its activate_page() call cannot activate the page because it is not on the LRU. So perform that function (more efficiently) in the VM. Remove fail_writepage() and, if the filesystem does not implement ->writepage() then activate the page from shrink_list(). A special case is tmpfs, which does have a writepage, but which sometimes wants to activate the pages anyway. 
The most important case is when there is no swap online and we don't want to keep all those pages on the inactive list. So just as a tmpfs special-case, allow writepage() to return WRITEPAGE_ACTIVATE, and handle that in the VM. Also, the whole idea of allowing ->writepage() to return -EAGAIN, and handling that in the caller has been reverted. If a writepage() implementation wants to back out and not write the page, it must redirty the page, unlock it and return zero. (This is Hugh's preferred way). And remove the now-unneeded shmem_writepages() - shmem inodes are marked as `memory backed' so it will not be called. And remove the test for non-null ->writepage() in generic_file_mmap(). Memory-backed files _are_ mmappable, and they do not have a writepage(). It just isn't called. So the locking rules for writepage() are unchanged. They are: - Called with the page locked - Returns with the page unlocked - Must redirty the page itself if it wasn't all written. But there is a new, special, hidden, undocumented, secret hack for tmpfs: writepage may return WRITEPAGE_ACTIVATE to tell the VM to move the page to the active list. The page must be kept locked in this one case. 
--- Documentation/filesystems/Locking | 39 ++++++++++++++++++++++++++++++--------- drivers/block/rd.c | 1 - fs/buffer.c | 7 ++----- fs/ext3/inode.c | 7 +++---- fs/hugetlbfs/inode.c | 1 - fs/mpage.c | 4 ---- fs/ramfs/inode.c | 1 - fs/sysfs/inode.c | 1 - fs/udf/inode.c | 3 +-- include/linux/mm.h | 1 - include/linux/writeback.h | 15 ++++++++++----- mm/filemap.c | 38 +++----------------------------------- mm/page-writeback.c | 4 ---- mm/shmem.c | 15 ++++++--------- mm/vmscan.c | 14 ++++++-------- 15 files changed, 61 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b69c40cc5748..e88e79968484 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -147,7 +147,7 @@ locking rules: All except set_page_dirty may block BKL PageLocked(page) -writepage: no yes, unlocks +writepage: no yes, unlocks (see below) readpage: no yes, unlocks readpages: no sync_page: no maybe @@ -165,16 +165,37 @@ may be called from the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. - ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. + ->readpages() populates the pagecache with the passed pages and starts +I/O against them. They come unlocked upon I/O completion. - ->writepage() unlocks the page synchronously, before returning to -the caller. If the page has write I/O underway against it, writepage() -should run SetPageWriteback() against the page prior to unlocking it. -The write I/O completion handler should run ClearPageWriteback against -the page. + ->writepage() is used for two purposes: for "memory cleansing" and for +"sync". These are quite different operations and the behaviour may differ +depending upon the mode. 
(Yes, there should be two a_ops for this, or +writepage should take a writeback_control*) - That is: after 2.5.12, pages which are under writeout are *not* -locked. +If writepage is called for sync (current->flags & PF_SYNC) then it *must* +write the page, even if that would involve blocking on in-progress I/O. + +If writepage is called for memory cleansing (!(current->flags & PF_SYNC)) +then its role is to get as much writeout underway as possible. So writepage +should try to avoid blocking against currently-in-progress I/O. + +If the filesystem is not called for "sync" and it determines that it +would need to block against in-progress I/O to be able to start new I/O +against the page the filesystem shoud redirty the page (usually with +__set_page_dirty_nobuffers()), then unlock the page and return zero. +This may also be done to avoid internal deadlocks, but rarely. + +If the filesytem is called for sync then it must wait on any +in-progress I/O and then start new I/O. + +The filesystem should unlock the page synchronously, before returning +to the caller. If the page has write I/O underway against it, +writepage() should run SetPageWriteback() against the page prior to +unlocking it. The write I/O completion handler should run +end_page_writeback() against the page. + +That is: after 2.5.12, pages which are under writeout are *not* locked. ->sync_page() locking rules are not well-defined - usually it is called with lock on page, but that is not guaranteed. 
Considering the currently diff --git a/drivers/block/rd.c b/drivers/block/rd.c index 801ac0a4df1b..198c8f13b7e6 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -132,7 +132,6 @@ static int ramdisk_commit_write(struct file *file, struct page *page, unsigned o static struct address_space_operations ramdisk_aops = { .readpage = ramdisk_readpage, - .writepage = fail_writepage, .prepare_write = ramdisk_prepare_write, .commit_write = ramdisk_commit_write, }; diff --git a/fs/buffer.c b/fs/buffer.c index 374076022fc4..6808e8802039 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1622,7 +1622,7 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * state inside lock_buffer(). * * If block_write_full_page() is called for regular writeback - * (called_for_sync() is false) then it will return -EAGAIN for a locked + * (called_for_sync() is false) then it will redirty a page which has a locked * buffer. This only can happen if someone has written the buffer directly, * with submit_bh(). At the address_space level PageWriteback prevents this * contention from occurring. @@ -1631,7 +1631,6 @@ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) { int err; - int ret = 0; unsigned long block; unsigned long last_block; struct buffer_head *bh, *head; @@ -1705,7 +1704,7 @@ static int __block_write_full_page(struct inode *inode, lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - ret = -EAGAIN; + __set_page_dirty_nobuffers(page); continue; } } @@ -1757,8 +1756,6 @@ done: SetPageUptodate(page); end_page_writeback(page); } - if (err == 0) - return ret; return err; recover: diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6bf8f48d7fe3..fe1c0cca19bb 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1371,11 +1371,10 @@ out_fail: /* * We have to fail this writepage to avoid cross-fs transactions. - * Return EAGAIN so the caller will the page back on - * mapping->dirty_pages. 
The page's buffers' dirty state will be left - * as-is. + * Put the page back on mapping->dirty_pages. The page's buffers' + * dirty state will be left as-is. */ - ret = -EAGAIN; + __set_page_dirty_nobuffers(page); unlock_page(page); return ret; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4db1d07df306..3ccc3b3661a4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -448,7 +448,6 @@ static int hugetlbfs_symlink(struct inode * dir, struct dentry *dentry, const ch static struct address_space_operations hugetlbfs_aops = { .readpage = hugetlbfs_readpage, - .writepage = fail_writepage, .prepare_write = hugetlbfs_prepare_write, .commit_write = hugetlbfs_commit_write }; diff --git a/fs/mpage.c b/fs/mpage.c index 8307f43f18b6..59cf471aae63 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -610,10 +610,6 @@ mpage_writepages(struct address_space *mapping, test_clear_page_dirty(page)) { if (writepage) { ret = (*writepage)(page); - if (ret == -EAGAIN) { - __set_page_dirty_nobuffers(page); - ret = 0; - } } else { bio = mpage_writepage(bio, page, get_block, &last_block_in_bio, &ret); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 64b5c100cb91..9f8ac09e12ce 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -135,7 +135,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * static struct address_space_operations ramfs_aops = { .readpage = simple_readpage, - .writepage = fail_writepage, .prepare_write = simple_prepare_write, .commit_write = simple_commit_write }; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 684038810f77..aafa0cd66743 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -382,7 +382,6 @@ static struct file_operations sysfs_file_operations = { static struct address_space_operations sysfs_aops = { .readpage = simple_readpage, - .writepage = fail_writepage, .prepare_write = simple_prepare_write, .commit_write = simple_commit_write }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 
d0d4ee152347..24a197a1eeee 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -206,8 +206,7 @@ void udf_expand_file_adinicb(struct inode * inode, int newsize, int * err) else UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_LONG; - if (inode->i_data.a_ops->writepage(page) == -EAGAIN) - __set_page_dirty_nobuffers(page); + inode->i_data.a_ops->writepage(page); page_cache_release(page); mark_inode_dirty(inode); diff --git a/include/linux/mm.h b/include/linux/mm.h index d91bd3e8ce14..f2c0f9645de5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -355,7 +355,6 @@ extern struct page *mem_map; extern void show_free_areas(void); -extern int fail_writepage(struct page *); struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused); struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); extern void shmem_lock(struct file * file, int lock); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 687091648a3a..42569bc0bb61 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -1,9 +1,5 @@ /* * include/linux/writeback.h. - * - * These declarations are private to fs/ and mm/. - * Declarations which are exported to filesystems do not - * get placed here. 
*/ #ifndef WRITEBACK_H #define WRITEBACK_H @@ -46,7 +42,16 @@ struct writeback_control { int nonblocking; /* Don't get stuck on request queues */ int encountered_congestion; /* An output: a queue is full */ }; - + +/* + * ->writepage() return values (make these much larger than a pagesize, in + * case some fs is returning number-of-bytes-written from writepage) + */ +#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */ + +/* + * fs/fs-writeback.c + */ void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); void __wait_on_inode(struct inode * inode); diff --git a/mm/filemap.c b/mm/filemap.c index 9060506b08f0..04b94af71ccf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -108,37 +108,6 @@ static inline int sync_page(struct page *page) return 0; } -/* - * In-memory filesystems have to fail their - * writepage function - and this has to be - * worked around in the VM layer.. - * - * We - * - mark the page dirty again (but do NOT - * add it back to the inode dirty list, as - * that would livelock in fdatasync) - * - activate the page so that the page stealer - * doesn't try to write it out over and over - * again. - * - * NOTE! The livelock in fdatasync went away, due to io_pages. - * So this function can now call set_page_dirty(). - */ -int fail_writepage(struct page *page) -{ - /* Only activate on memory-pressure, not fsync.. 
*/ - if (current->flags & PF_MEMALLOC) { - if (!PageActive(page)) - activate_page(page); - if (!PageReferenced(page)) - SetPageReferenced(page); - } - - unlock_page(page); - return -EAGAIN; /* It will be set dirty again */ -} -EXPORT_SYMBOL(fail_writepage); - /** * filemap_fdatawrite - start writeback against all of a mapping's dirty pages * @mapping: address space structure to write @@ -160,6 +129,9 @@ int filemap_fdatawrite(struct address_space *mapping) .nr_to_write = mapping->nrpages * 2, }; + if (mapping->backing_dev_info->memory_backed) + return 0; + current->flags |= PF_SYNC; ret = do_writepages(mapping, &wbc); current->flags &= ~PF_SYNC; @@ -1327,10 +1299,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - if (!mapping->a_ops->writepage) - return -EINVAL; - } if (!mapping->a_ops->readpage) return -ENOEXEC; UPDATE_ATIME(inode); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28826c712309..981367656157 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -424,10 +424,6 @@ int write_one_page(struct page *page, int wait) page_cache_get(page); write_unlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page); - if (ret == -EAGAIN) { - __set_page_dirty_nobuffers(page); - ret = 0; - } if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) diff --git a/mm/shmem.c b/mm/shmem.c index 215f7d3caba7..cb8bd154df8f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -686,10 +687,10 @@ static int shmem_writepage(struct page *page) inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) - return fail_writepage(page); + goto redirty; swap = get_swap_page(); if (!swap.val) - return fail_writepage(page); + goto redirty; spin_lock(&info->lock); 
shmem_recalc_inode(inode); @@ -709,12 +710,9 @@ static int shmem_writepage(struct page *page) shmem_swp_unmap(entry); spin_unlock(&info->lock); swap_free(swap); - return fail_writepage(page); -} - -static int shmem_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - return 0; +redirty: + set_page_dirty(page); + return WRITEPAGE_ACTIVATE; /* Return with the page locked */ } /* @@ -1802,7 +1800,6 @@ static void destroy_inodecache(void) static struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .writepages = shmem_writepages, .set_page_dirty = __set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS .readpage = shmem_readpage, diff --git a/mm/vmscan.c b/mm/vmscan.c index 8bae5e9e8012..1c79a9a637c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -309,7 +309,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, goto keep_locked; if (!mapping) goto keep_locked; - if (mapping->a_ops->writepage == fail_writepage) + if (mapping->a_ops->writepage == NULL) goto activate_locked; if (!may_enter_fs) goto keep_locked; @@ -327,14 +327,12 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, SetPageReclaim(page); res = mapping->a_ops->writepage(page); - if (res == -EAGAIN) { + if (res == WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); - __set_page_dirty_nobuffers(page); - } else if (!PageWriteback(page)) { - /* - * synchronous writeout or broken - * a_ops? - */ + goto activate_locked; + } + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } goto keep; -- cgit v1.2.3 From 75f19a4075dff845fbc1a2940c6aee9cd4891b2e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:26 -0800 Subject: [PATCH] Add a sync_fs super_block operation This is infrastructure for fixing the journalled-data ext3 unmount data loss problem. It was sent for comment to linux-fsdevel a week ago; there was none. 
Add a `sync_fs' superblock operation whose mandate is to perform filesystem-specific operations to ensure a successful sync. It is called in two places: 1: fsync_super() - for umount. 2: sys_sync() - for global sync. In the sys_sync() case we call all the ->write_super() methods first. write_super() is an async flushing operation. It should not block. After that, we call all the ->sync_fs functions. This is independent of the state of s_dirt! That was all confused up before, and in this patch ->write_super() and ->sync_fs() are quite separate. With ext3 as an example, the initial ->write_super() will start a transaction, but will not wait on it. (But only if s_dirt was set!) The first ->sync_fs() call will get the IO underway. The second ->sync_fs() call will wait on the IO. And we really do need to be this elaborate, because all the testing of s_dirt in there makes ->write_super() an unreliable way of detecting when the VFS is trying to sync the filesystem. --- Documentation/filesystems/Locking | 2 ++ fs/buffer.c | 11 +++++++--- fs/super.c | 46 +++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 3 +++ 4 files changed, 57 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index e88e79968484..fa86d6a18ad1 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,6 +92,7 @@ prototypes: void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); + void (*sync_fs) (struct super_block *sb, int wait); int (*statfs) (struct super_block *, struct statfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -108,6 +109,7 @@ delete_inode: no clear_inode: no put_super: yes yes maybe (see below) write_super: no yes maybe (see below) +sync_fs: no no maybe (see below) statfs: no no no remount_fs: yes yes maybe (see below) 
umount_begin: yes no maybe (see below) diff --git a/fs/buffer.c b/fs/buffer.c index 6808e8802039..3de883fc5009 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -221,6 +221,9 @@ int fsync_super(struct super_block *sb) lock_super(sb); if (sb->s_dirt && sb->s_op && sb->s_op->write_super) sb->s_op->write_super(sb); + if (sb->s_op && sb->s_op->sync_fs) { + sb->s_op->sync_fs(sb, 1); + } unlock_super(sb); sync_blockdev(sb->s_bdev); sync_inodes_sb(sb, 1); @@ -251,10 +254,12 @@ int fsync_bdev(struct block_device *bdev) asmlinkage long sys_sync(void) { wakeup_bdflush(0); - sync_inodes(0); /* All mappings and inodes, including block devices */ + sync_inodes(0); /* All mappings, inodes and their blockdevs */ DQUOT_SYNC(NULL); - sync_supers(); /* Write the superblocks */ - sync_inodes(1); /* All the mappings and inodes, again. */ + sync_supers(); /* Write the superblocks */ + sync_filesystems(0); /* Start syncing the filesystems */ + sync_filesystems(1); /* Waitingly sync the filesystems */ + sync_inodes(1); /* Mappings, inodes and blockdevs, again. */ return 0; } diff --git a/fs/super.c b/fs/super.c index e8f273943ce8..bc0e23a321a9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -189,6 +189,8 @@ void generic_shutdown_super(struct super_block *sb) if (sop) { if (sop->write_super && sb->s_dirt) sop->write_super(sb); + if (sop->sync_fs) + sop->sync_fs(sb, 1); if (sop->put_super) sop->put_super(sb); } @@ -266,8 +268,8 @@ void drop_super(struct super_block *sb) static inline void write_super(struct super_block *sb) { lock_super(sb); - if (sb->s_root && sb->s_dirt) - if (sb->s_op && sb->s_op->write_super) + if (sb->s_op && sb->s_root && sb->s_dirt) + if (sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); } @@ -296,6 +298,46 @@ restart: spin_unlock(&sb_lock); } +/* + * Call the ->sync_fs super_op against all filesytems which are r/w and + * which implement it. 
+ */ +void sync_filesystems(int wait) +{ + struct super_block * sb; + + spin_lock(&sb_lock); + for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); + sb = sb_entry(sb->s_list.next)) { + if (!sb->s_op) + continue; + if (!sb->s_op->sync_fs); + continue; + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync_fs = 1; + } + spin_unlock(&sb_lock); + +restart: + spin_lock(&sb_lock); + for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); + sb = sb_entry(sb->s_list.next)) { + if (!sb->s_need_sync_fs) + continue; + sb->s_need_sync_fs = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. Was remounted r/w meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + sb->s_op->sync_fs(sb, wait); + drop_super(sb); + goto restart; + } + spin_unlock(&sb_lock); +} + /** * get_super - get the superblock of a device * @dev: device to get the superblock for diff --git a/include/linux/fs.h b/include/linux/fs.h index f39d21e5bcd9..afbb9474f25b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ struct super_block { struct semaphore s_lock; int s_count; int s_syncing; + int s_need_sync_fs; atomic_t s_active; void *s_security; @@ -810,6 +811,7 @@ struct super_operations { void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); int (*statfs) (struct super_block *, struct statfs *); @@ -1143,6 +1145,7 @@ extern void write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_fdatawait(struct address_space *); extern void sync_supers(void); +extern void sync_filesystems(int wait); extern sector_t bmap(struct inode *, sector_t); extern int setattr_mask(unsigned int); extern int notify_change(struct dentry *, struct iattr *); -- cgit v1.2.3 From 
c720c50a1411bebdf590b4a86675acd355f0508b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:42 -0800 Subject: [PATCH] vm accounting fixes and addition - /proc/vmstat:pageoutrun and /proc/vmstat:allocstall are always identical. Rework this so that - "allocstall" is the number of times a page allocator ran diect reclaim - "pageoutrun" is the number of times kswapd ran page reclaim - Add a new stat: "pgrotated". The number of pages which were rotated to the tail of the LRU for immediate reclaim by rotate_reclaimable_page(). - Document things a bit. --- include/linux/page-flags.h | 47 +++++++++++++++++++++++----------------------- mm/page_alloc.c | 2 +- mm/swap.c | 1 + mm/vmscan.c | 4 +++- 4 files changed, 29 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7018961aea91..a50e09ff79ea 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -79,35 +79,36 @@ * allowed. */ struct page_state { - unsigned long nr_dirty; - unsigned long nr_writeback; - unsigned long nr_pagecache; - unsigned long nr_page_table_pages; - unsigned long nr_reverse_maps; - unsigned long nr_mapped; - unsigned long nr_slab; + unsigned long nr_dirty; /* Dirty writeable pages */ + unsigned long nr_writeback; /* Pages under writeback */ + unsigned long nr_pagecache; /* Pages in pagecache */ + unsigned long nr_page_table_pages;/* Pages used for pagetables */ + unsigned long nr_reverse_maps; /* includes PageDirect */ + unsigned long nr_mapped; /* mapped into pagetables */ + unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab /* * The below are zeroed by get_page_state(). Use get_full_page_state() * to add up all these. 
*/ - unsigned long pgpgin; - unsigned long pgpgout; - unsigned long pswpin; - unsigned long pswpout; - unsigned long pgalloc; - unsigned long pgfree; - unsigned long pgactivate; - unsigned long pgdeactivate; - unsigned long pgfault; - unsigned long pgmajfault; - unsigned long pgscan; - unsigned long pgrefill; - unsigned long pgsteal; - unsigned long kswapd_steal; - unsigned long pageoutrun; - unsigned long allocstall; + unsigned long pgpgin; /* Disk reads */ + unsigned long pgpgout; /* Disk writes */ + unsigned long pswpin; /* swap reads */ + unsigned long pswpout; /* swap writes */ + unsigned long pgalloc; /* page allocations */ + unsigned long pgfree; /* page freeings */ + unsigned long pgactivate; /* pages moved inactive->active */ + unsigned long pgdeactivate; /* pages moved active->inactive */ + unsigned long pgfault; /* faults (major+minor) */ + unsigned long pgmajfault; /* faults (major only) */ + unsigned long pgscan; /* pages scanned by page reclaim */ + unsigned long pgrefill; /* inspected in refill_inactive_zone */ + unsigned long pgsteal; /* total pages reclaimed */ + unsigned long kswapd_steal; /* pages reclaimed by kswapd */ + unsigned long pageoutrun; /* kswapd's calls to page reclaim */ + unsigned long allocstall; /* direct reclaim calls */ + unsigned long pgrotated; /* pages rotated to tail of the LRU */ } ____cacheline_aligned; DECLARE_PER_CPU(struct page_state, page_states); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a267456dc4c..9dd74ba4bdb3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -513,7 +513,6 @@ rebalance: if (!wait) goto nopage; - inc_page_state(allocstall); current->flags |= PF_MEMALLOC; try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~PF_MEMALLOC; @@ -1354,6 +1353,7 @@ static char *vmstat_text[] = { "kswapd_steal", "pageoutrun", "allocstall", + "pgrotated", }; static void *vmstat_start(struct seq_file *m, loff_t *pos) diff --git a/mm/swap.c b/mm/swap.c index 3f8d7aa5e9d2..246fa5b74664 100644 --- 
a/mm/swap.c +++ b/mm/swap.c @@ -61,6 +61,7 @@ int rotate_reclaimable_page(struct page *page) if (PageLRU(page) && !PageActive(page)) { list_del(&page->lru); list_add_tail(&page->lru, &zone->inactive_list); + inc_page_state(pgrotated); } if (!TestClearPageWriteback(page)) BUG(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c79a9a637c8..8243646bf5c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -797,7 +797,7 @@ try_to_free_pages(struct zone *classzone, const int nr_pages = SWAP_CLUSTER_MAX; int nr_reclaimed = 0; - inc_page_state(pageoutrun); + inc_page_state(allocstall); for (priority = DEF_PRIORITY; priority >= 0; priority--) { int total_scanned = 0; @@ -853,6 +853,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) int priority; int i; + inc_page_state(pageoutrun); + for (priority = DEF_PRIORITY; priority; priority--) { int all_zones_ok = 1; -- cgit v1.2.3 From 21c2baef2924136d97713f093820de9dc2bf16c6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:47 -0800 Subject: [PATCH] hugetlb fixes From Rohit 1) hugetlbfs_zero_setup returns ENOMEM in case the request size can not be easily handleed. 2) Preference is given to LOW_MEM while freeing the pages from hugetlbpage free list. 
--- arch/i386/mm/hugetlbpage.c | 71 +++++++++++++++++++++++++++++++++++++++------- fs/hugetlbfs/inode.c | 2 ++ include/linux/hugetlb.h | 2 ++ 3 files changed, 64 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 9159e6f3ac7f..1f1ecfa5a94d 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -20,6 +20,8 @@ #include #include +#include + static long htlbpagemem; int htlbpage_max; static long htlbzone_pages; @@ -555,6 +557,53 @@ int alloc_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot return alloc_shared_hugetlb_pages(key, addr, len, prot, flag); return alloc_private_hugetlb_pages(key, addr, len, prot, flag); } +void update_and_free_page(struct page *page) +{ + int j; + struct page *map; + + map = page; + htlbzone_pages--; + for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) { + map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | + 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1<< PG_writeback); + set_page_count(map, 0); + map++; + } + set_page_count(page, 1); + __free_pages(page, HUGETLB_PAGE_ORDER); +} + +int try_to_free_low(int count) +{ + struct list_head *p; + struct page *page, *map; + + map = NULL; + spin_lock(&htlbpage_lock); + list_for_each(p, &htlbpage_freelist) { + if (map) { + list_del(&map->list); + update_and_free_page(map); + htlbpagemem--; + map = NULL; + if (++count == 0) + break; + } + page = list_entry(p, struct page, list); + if ((page_zone(page))->name[0] != 'H') // Look for non-Highmem + map = page; + } + if (map) { + list_del(&map->list); + update_and_free_page(map); + htlbpagemem--; + count++; + } + spin_unlock(&htlbpage_lock); + return count; +} int set_hugetlb_mem_size(int count) { @@ -568,6 +617,8 @@ int set_hugetlb_mem_size(int count) else lcount = count - htlbzone_pages; + if (lcount == 0) + return (int)htlbzone_pages; if (lcount > 0) { /* Increase the mem size. 
*/ while (lcount--) { page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); @@ -587,23 +638,14 @@ int set_hugetlb_mem_size(int count) return (int) htlbzone_pages; } /* Shrink the memory size. */ + lcount = try_to_free_low(lcount); while (lcount++) { page = alloc_hugetlb_page(); if (page == NULL) break; spin_lock(&htlbpage_lock); - htlbzone_pages--; + update_and_free_page(page); spin_unlock(&htlbpage_lock); - map = page; - for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) { - map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); - set_page_count(map, 0); - map++; - } - set_page_count(page, 1); - __free_pages(page, HUGETLB_PAGE_ORDER); } return (int) htlbzone_pages; } @@ -659,6 +701,13 @@ int hugetlb_report_meminfo(char *buf) HPAGE_SIZE/1024); } +int is_hugepage_mem_enough(size_t size) +{ + if (size > (htlbpagemem << HPAGE_SHIFT)) + return 0; + return 1; +} + static struct page * hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused) { BUG(); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3ccc3b3661a4..2a8b3b7c8d58 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -528,6 +528,8 @@ struct file *hugetlb_zero_setup(size_t size) if (!capable(CAP_IPC_LOCK)) return ERR_PTR(-EPERM); + if (!is_hugepage_mem_enough(size)) + return ERR_PTR(-ENOMEM); n = atomic_read(&hugetlbfs_counter); atomic_inc(&hugetlbfs_counter); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 76d1e1e1fb6b..3bf94e2205bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,6 +20,7 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); void huge_page_release(struct page *); void hugetlb_release_key(struct hugetlb_key *); int hugetlb_report_meminfo(char *); +int is_hugepage_mem_enough(size_t); extern int htlbpage_max; @@ -35,6 +36,7 @@ static inline int is_vm_hugetlb_page(struct 
vm_area_struct *vma) #define zap_hugepage_range(vma, start, len) BUG() #define unmap_hugepage_range(vma, start, end) BUG() #define huge_page_release(page) BUG() +#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #endif /* !CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From 20b96b5225db64dbc4b1226a46dfdb9fd659deb7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:52 -0800 Subject: [PATCH] fs-writeback rework. I've revisited all the superblock->inode->page writeback paths. There were several silly things in there, and things were not as clear as they could be. scenario 1: create and dirty a MAP_SHARED segment over a sparse file, then exit. All the memory turns into dirty pagecache, but the kupdate function only writes it out at a trickle - 4 megabytes every thirty seconds. We should sync it all within 30 seconds. What's happening is that when writeback tries to write those pages, the filesystem needs to instantiate new blocks for them (they're over holes). The filesystem runs mark_inode_dirty() within the writeback function. This redirtying of the inode while we're writing it out triggers some livelock avoidance code in __sync_single_inode(). That function says "ah, someone redirtied the file while I was writing it. Let's move the file to the new end of the superblock dirty list and write it out later." Problem is, writeback dirtied the inode itself. (It is rather silly that mark_inode_dirty() sets I_DIRTY_PAGES when clearly no pages have been dirtied. Fixing that up would be a largish work, so work around it here). So this patch just removes the livelock avoidance from __sync_single_inode(). It is no longer needed anyway - writeback livelock is now avoided (in all writeback paths) by writing a finite number of pages. scenario 2: an application is continuously dirtying a 200 megabyte file, and your disk has a bandwidth of less than 40 megabytes/sec. 
What happens is that once 30 seconds passes, pdflush starts writing out the file. And because that writeout will take more than five seconds (a `kupdate' interval), pdflush just keeps writing it out forever - continuous I/O. What we _want_ to happen is that the 200 megabytes gets written, and then IO stops for thirty seconds (minus the writeout period). So the file is fully synced every thirty seconds. The patch solves this by using mapping->io_pages more intelligently. When the time comes to write the file out, move all the dirty pages onto io_pages. That is a "batch of pages for this kupdate round". When io_pages is empty, we know we're done. The address_space_operations.writepages() API is changed! It now only needs to write the pages which the caller placed on mapping->io_pages. This conceptually cleans things up a bit, by more clearly defining the role of ->io_pages, and the motion between the various mapping lists. The treatment of sb->s_dirty and sb->s_io is now conceptually identical to mapping->dirty_pages and mapping->io_pages: move the items-to-be written onto ->s_io/io_pages, alk walk that list. As inodes (or pages) are written, move them over to the clean/locked/dirty lists. Oh, scenario 3: start an app whcih continuously overwrites a 5 meg file. Wait five seconds, start another, wait 5 seconds, start another. What we _should_ see is three 5-meg writes, five seconds apart, every thirty seconds. That did all sorts of odd things. It now does the right thing. 
--- Documentation/filesystems/Locking | 3 ++ fs/fs-writeback.c | 83 ++++++++++++++++++++------------------- fs/mpage.c | 17 ++++---- include/linux/writeback.h | 1 + mm/filemap.c | 4 ++ mm/page-writeback.c | 3 +- 6 files changed, 60 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fa86d6a18ad1..66d727bb9ce7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -211,6 +211,9 @@ written. The address_space implementation may write more (or less) pages than *nr_to_write asks for, but it should try to be reasonably close. If nr_to_write is NULL, all dirty pages must be written. +writepages should _only_ write pages which are present on +mapping->io_pages. + ->set_page_dirty() is called from various places in the kernel when the target page is marked as needing writeback. It may be called under spinlock (it cannot block) and is sometimes called with the page diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 33a59010c5df..326c2963ca3d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -67,14 +67,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; struct address_space *mapping = inode->i_mapping; inode->i_state |= flags; - if (!was_dirty) - mapping->dirtied_when = jiffies; - /* * If the inode is locked, just update its dirty state. * The unlocker will place the inode on the appropriate @@ -84,18 +80,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) goto out; /* - * Only add valid (hashed) inode to the superblock's + * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode)) goto out; /* - * If the inode was already on s_dirty, don't reposition - * it (that would break s_dirty time-ordering). 
+ * If the inode was already on s_dirty or s_io, don't + * reposition it (that would break s_dirty time-ordering). */ - if (!was_dirty) + if (!mapping->dirtied_when) { + mapping->dirtied_when = jiffies|1; /* 0 is special */ list_move(&inode->i_list, &sb->s_dirty); + } } out: spin_unlock(&inode_lock); @@ -110,19 +108,17 @@ static void write_inode(struct inode *inode, int sync) /* * Write a single inode's dirty pages and inode data out to disk. - * If `sync' is set, wait on the writeout. - * Subtract the number of written pages from nr_to_write. + * If `wait' is set, wait on the writeout. * - * Normally it is not legal for a single process to lock more than one - * page at a time, due to ab/ba deadlock problems. But writepages() - * does want to lock a large number of pages, without immediately submitting - * I/O against them (starting I/O is a "deferred unlock_page"). + * The whole writeout design is quite complex and fragile. We want to avoid + * starvation of particular inodes when others are being redirtied, prevent + * livelocks, etc. * - * However it *is* legal to lock multiple pages, if this is only ever performed - * by a single process. We provide that exclusion via locking in the - * filesystem's ->writepages a_op. This ensures that only a single - * process is locking multiple pages against this inode. And as I/O is - * submitted against all those locked pages, there is no deadlock. + * So what we do is to move all pages which are to be written from dirty_pages + * onto io_pages. And keep on writing io_pages until it's empty. Refusing to + * move more pages onto io_pages until io_pages is empty. Once that point has + * been reached, we are ready to take another pass across the inode's dirty + * pages. * * Called under inode_lock. 
*/ @@ -131,7 +127,6 @@ __sync_single_inode(struct inode *inode, int wait, struct writeback_control *wbc) { unsigned dirty; - unsigned long orig_dirtied_when; struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -141,8 +136,11 @@ __sync_single_inode(struct inode *inode, int wait, dirty = inode->i_state & I_DIRTY; inode->i_state |= I_LOCK; inode->i_state &= ~I_DIRTY; - orig_dirtied_when = mapping->dirtied_when; - mapping->dirtied_when = 0; /* assume it's whole-file writeback */ + + write_lock(&mapping->page_lock); + if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) + list_splice_init(&mapping->dirty_pages, &mapping->io_pages); + write_unlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); @@ -155,24 +153,26 @@ __sync_single_inode(struct inode *inode, int wait, filemap_fdatawait(mapping); spin_lock(&inode_lock); - inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { - list_del(&inode->i_list); - if (inode->i_state & I_DIRTY) { /* Redirtied */ - list_add(&inode->i_list, &sb->s_dirty); + if (!list_empty(&mapping->io_pages)) { + /* Needs more writeback */ + inode->i_state |= I_DIRTY_PAGES; + } else if (!list_empty(&mapping->dirty_pages)) { + /* Redirtied */ + inode->i_state |= I_DIRTY_PAGES; + mapping->dirtied_when = jiffies|1; + list_move(&inode->i_list, &sb->s_dirty); + } else if (inode->i_state & I_DIRTY) { + /* Redirtied */ + mapping->dirtied_when = jiffies|1; + list_move(&inode->i_list, &sb->s_dirty); + } else if (atomic_read(&inode->i_count)) { + mapping->dirtied_when = 0; + list_move(&inode->i_list, &inode_in_use); } else { - if (!list_empty(&mapping->dirty_pages) || - !list_empty(&mapping->io_pages)) { - /* Not a whole-file writeback */ - mapping->dirtied_when = orig_dirtied_when; - inode->i_state |= I_DIRTY_PAGES; - list_add_tail(&inode->i_list, &sb->s_dirty); - } else if (atomic_read(&inode->i_count)) { - list_add(&inode->i_list, &inode_in_use); - } else { - 
list_add(&inode->i_list, &inode_unused); - } + mapping->dirtied_when = 0; + list_move(&inode->i_list, &inode_unused); } } wake_up_inode(inode); @@ -185,8 +185,10 @@ static void __writeback_single_inode(struct inode *inode, int sync, struct writeback_control *wbc) { - if (current_is_pdflush() && (inode->i_state & I_LOCK)) + if (current_is_pdflush() && (inode->i_state & I_LOCK)) { + list_move(&inode->i_list, &inode->i_sb->s_dirty); return; + } while (inode->i_state & I_LOCK) { __iget(inode); @@ -233,7 +235,9 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ - list_splice_init(&sb->s_dirty, &sb->s_io); + if (!wbc->for_kupdate || list_empty(&sb->s_io)) + list_splice_init(&sb->s_dirty, &sb->s_io); + while (!list_empty(&sb->s_io)) { struct inode *inode = list_entry(sb->s_io.prev, struct inode, i_list); @@ -275,7 +279,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) really_sync = (wbc->sync_mode == WB_SYNC_ALL); BUG_ON(inode->i_state & I_FREEING); __iget(inode); - list_move(&inode->i_list, &sb->s_dirty); __writeback_single_inode(inode, really_sync, wbc); if (wbc->sync_mode == WB_SYNC_HOLD) { mapping->dirtied_when = jiffies; diff --git a/fs/mpage.c b/fs/mpage.c index 59cf471aae63..7f3043c7ee90 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -525,12 +525,12 @@ out: * Pages can be moved from clean_pages or locked_pages onto dirty_pages * at any time - it's not possible to lock against that. So pages which * have already been added to a BIO may magically reappear on the dirty_pages - * list. And generic_writepages() will again try to lock those pages. + * list. And mpage_writepages() will again try to lock those pages. * But I/O has not yet been started against the page. Thus deadlock. * - * To avoid this, the entire contents of the dirty_pages list are moved - * onto io_pages up-front. 
We then walk io_pages, locking the - * pages and submitting them for I/O, moving them to locked_pages. + * To avoid this, mpage_writepages() will only write pages from io_pages. The + * caller must place them there. We walk io_pages, locking the pages and + * submitting them for I/O, moving them to locked_pages. * * This has the added benefit of preventing a livelock which would otherwise * occur if pages are being dirtied faster than we can write them out. @@ -539,8 +539,8 @@ out: * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarentee that all the data which was dirty at the time - * the call was made get new I/O started against them. The way to do this is - * to run filemap_fdatawait() before calling filemap_fdatawrite(). + * the call was made get new I/O started against them. So if called_for_sync() + * is true, we must wait for existing IO to complete. * * It's fairly rare for PageWriteback pages to be on ->dirty_pages. It * means that someone redirtied the page while it was under I/O. 
@@ -570,10 +570,7 @@ mpage_writepages(struct address_space *mapping, pagevec_init(&pvec, 0); write_lock(&mapping->page_lock); - - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - - while (!list_empty(&mapping->io_pages) && !done) { + while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); list_del(&page->list); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 42569bc0bb61..351e5851c041 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,6 +41,7 @@ struct writeback_control { this for each page written */ int nonblocking; /* Don't get stuck on request queues */ int encountered_congestion; /* An output: a queue is full */ + int for_kupdate; /* A kupdate writeback */ }; /* diff --git a/mm/filemap.c b/mm/filemap.c index 04b94af71ccf..50b05fe9a2e0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -62,6 +62,7 @@ * ->mapping->page_lock * ->inode_lock * ->sb_lock (fs/fs-writeback.c) + * ->mapping->page_lock (__sync_single_inode) * ->page_table_lock * ->swap_device_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) @@ -133,6 +134,9 @@ int filemap_fdatawrite(struct address_space *mapping) return 0; current->flags |= PF_SYNC; + write_lock(&mapping->page_lock); + list_splice_init(&mapping->dirty_pages, &mapping->io_pages); + write_unlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); current->flags &= ~PF_SYNC; return ret; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 981367656157..3880394c8562 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -286,6 +286,7 @@ static void wb_kupdate(unsigned long arg) .older_than_this = &oldest_jif, .nr_to_write = 0, .nonblocking = 1, + .for_kupdate = 1, }; sync_supers(); @@ -299,7 +300,7 @@ static void wb_kupdate(unsigned long arg) wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; writeback_inodes(&wbc); - if (wbc.nr_to_write == 
MAX_WRITEBACK_PAGES) { + if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion) blk_congestion_wait(WRITE, HZ); else -- cgit v1.2.3 From c18592135f19933d4509328d6c9923731b06ed69 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:17:58 -0800 Subject: [PATCH] Add /proc/sys/vm/lower_zone_protection This allows us to control the aggressiveness of the lower-zone defense algorithm. The `incremental min'. For workloads which are using a serious amount of mlocked memory, a few megabytes is not enough. So the `lower_zone_protection' tunable allows the administrator to increase the amount of protection which lower zones receive against allocations which _could_ use higher zones. The default value of lower_zone_protection is zero, giving unchanged behaviour. We should not normally make large amounts of memory unavailable for pagecache just in case someone mlocks many hundreds of megabytes. --- Documentation/filesystems/proc.txt | 64 ++++++++++++++++++++++++-------------- include/linux/sysctl.h | 1 + kernel/sysctl.c | 8 ++++- mm/page_alloc.c | 5 ++- 4 files changed, 52 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 8b1059c89f5a..9b0a31e5b5fd 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -989,42 +989,58 @@ for writeout by the pdflush daemons. It is expressed in 100'ths of a second. Data which has been dirty in-memory for longer than this interval will be written out next time a pdflush daemon wakes up. +lower_zone_protection +--------------------- -kswapd ------- +For some specialised workloads on highmem machines it is dangerous for +the kernel to allow process memory to be allocated from the "lowmem" +zone. This is because that memory could then be pinned via the mlock() +system call, or by unavailability of swapspace. -Kswapd is the kernel swap out daemon. 
That is, kswapd is that piece of the -kernel that frees memory when it gets fragmented or full. Since every system -is different, you'll probably want some control over this piece of the system. +And on large highmem machines this lack of reclaimable lowmem memory +can be fatal. -The file contains three numbers: +So the Linux page allocator has a mechanism which prevents allocations +which _could_ use highmem from using too much lowmem. This means that +a certain amount of lowmem is defended from the possibility of being +captured into pinned user memory. -tries_base ----------- +(The same argument applies to the old 16 megabyte ISA DMA region. This +mechanism will also defend that region from allocations which could use +highmem or lowmem). -The maximum number of pages kswapd tries to free in one round is calculated -from this number. Usually this number will be divided by 4 or 8 (see -mm/vmscan.c), so it isn't as big as it looks. +The `lower_zone_protection' tunable determines how aggressive the kernel is +in defending these lower zones. The default value is zero - no +protection at all. -When you need to increase the bandwidth to/from swap, you'll want to increase -this number. +If you have a machine which uses highmem or ISA DMA and your +applications are using mlock(), or if you are running with no swap then +you probably should increase the lower_zone_protection setting. -tries_min ---------- +The units of this tunable are fairly vague. It is approximately equal +to "megabytes". So setting lower_zone_protection=100 will protect around 100 +megabytes of the lowmem zone from user allocations. It will also make +those 100 megabytes unavaliable for use by applications and by +pagecache, so there is a cost. + +The effects of this tunable may be observed by monitoring +/proc/meminfo:LowFree. Write a single huge file and observe the point +at which LowFree ceases to fall. -This is the minimum number of times kswapd tries to free a page each time it -is called. 
Basically it's just there to make sure that kswapd frees some pages -even when it's being called with minimum priority. +A reasonable value for lower_zone_protection is 100. -swap_cluster +page-cluster ------------ -This is probably the greatest influence on system performance. +page-cluster controls the number of pages which are written to swap in +a single attempt. The swap I/O size. + +It is a logarithmic value - setting it to zero means "1 page", setting +it to 1 means "2 pages", setting it to 2 means "4 pages", etc. -swap_cluster is the number of pages kswapd writes in one turn. You'll want -this value to be large so that kswapd does its I/O in large chunks and the -disk doesn't have to seek as often, but you don't want it to be too large -since that would flood the request queue. +The default value is three (eight pages at a time). There may be some +small benefits in tuning this to a different value if your workload is +swap-intensive. overcommit_memory ----------------- diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e70fc2ef0856..0b905659ac4b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -154,6 +154,7 @@ enum VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ + VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c6eeba758371..0f2359578775 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -53,6 +53,7 @@ extern int core_uses_pid; extern char core_pattern[]; extern int cad_pid; extern int pid_max; +extern int sysctl_lower_zone_protection; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -310,8 +311,13 @@ static ctl_table vm_table[] = { 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, &one_hundred }, #ifdef CONFIG_HUGETLB_PAGE - 
{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL, &hugetlb_sysctl_handler}, + {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, + NULL, &hugetlb_sysctl_handler}, #endif + {VM_LOWER_ZONE_PROTECTION, "lower_zone_protection", + &sysctl_lower_zone_protection, sizeof(sysctl_lower_zone_protection), + 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, + NULL, }, {0} }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dd74ba4bdb3..2e73929c15e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -38,7 +38,7 @@ unsigned long totalram_pages; unsigned long totalhigh_pages; int nr_swap_pages; int numnodes = 1; - +int sysctl_lower_zone_protection = 0; /* * Used by page_zone() to look up the address of the struct zone whose @@ -470,6 +470,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, if (page) return page; } + min += z->pages_low * sysctl_lower_zone_protection; } /* we're somewhat low on memory, failed to find what we needed */ @@ -492,6 +493,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, if (page) return page; } + min += local_min * sysctl_lower_zone_protection; } /* here we're in the low on memory slow path */ @@ -529,6 +531,7 @@ rebalance: if (page) return page; } + min += z->pages_low * sysctl_lower_zone_protection; } /* -- cgit v1.2.3 From 577c516f30df0f8a3d91e3beb68be3374ad25a8c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:18:17 -0800 Subject: [PATCH] remove PF_SYNC current->flags:PF_SYNC was a hack I added because I didn't want to change all ->writepage implementations. It's foul. And it means that if someone happens to run direct page reclaim within the context of (say) sys_sync, the writepage invokations from the VM will be treated as "data integrity" operations, not "memory cleansing" operations, which would cause latency. So the patch removes PF_SYNC and adds an extra arg to a_ops->writepage. 
It is the `writeback_control' structure which contains the full context information about why writepage was called. The initial version of this patch just passed in a bare `int sync', but the XFS team need more info so they can perform writearound from within page reclaim. The patch also adds writeback_control.for_reclaim, so writepage implementations can inspect that to work out the call context rather than peeking at current->flags:PF_MEMALLOC. --- Documentation/filesystems/Locking | 19 ++++++++++--------- Documentation/filesystems/vfs.txt | 2 +- drivers/mtd/devices/blkmtd.c | 3 ++- fs/adfs/inode.c | 4 ++-- fs/affs/file.c | 4 ++-- fs/bfs/file.c | 4 ++-- fs/block_dev.c | 4 ++-- fs/buffer.c | 13 +++++++------ fs/cifs/file.c | 2 +- fs/ext2/inode.c | 4 ++-- fs/ext3/inode.c | 7 ++++--- fs/fat/inode.c | 4 ++-- fs/hfs/inode.c | 4 ++-- fs/hpfs/file.c | 4 ++-- fs/jfs/inode.c | 4 ++-- fs/minix/inode.c | 4 ++-- fs/mpage.c | 15 +++++++-------- fs/nfs/write.c | 2 +- fs/ntfs/aops.c | 2 +- fs/qnx4/inode.c | 4 ++-- fs/reiserfs/inode.c | 6 +++--- fs/smbfs/file.c | 2 +- fs/sysv/itree.c | 4 ++-- fs/udf/file.c | 2 +- fs/udf/inode.c | 11 ++++++++--- fs/ufs/inode.c | 4 ++-- fs/xfs/linux/xfs_aops.c | 3 ++- include/linux/buffer_head.h | 2 +- include/linux/fs.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/sched.h | 5 ++--- include/linux/swap.h | 3 ++- include/linux/writeback.h | 10 +--------- mm/filemap.c | 6 ------ mm/page-writeback.c | 5 ++++- mm/page_io.c | 8 ++++++-- mm/shmem.c | 2 +- mm/swapfile.c | 7 ++++++- mm/vmscan.c | 8 +++++++- 39 files changed, 107 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 66d727bb9ce7..fa7c6091b5d2 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -133,10 +133,10 @@ unlocks and drops the reference. 
--------------------------- address_space_operations -------------------------- prototypes: - int (*writepage)(struct page *); + int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*sync_page)(struct page *); - int (*writepages)(struct address_space *, int *nr_to_write); + int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); @@ -172,15 +172,16 @@ I/O against them. They come unlocked upon I/O completion. ->writepage() is used for two purposes: for "memory cleansing" and for "sync". These are quite different operations and the behaviour may differ -depending upon the mode. (Yes, there should be two a_ops for this, or -writepage should take a writeback_control*) +depending upon the mode. -If writepage is called for sync (current->flags & PF_SYNC) then it *must* -write the page, even if that would involve blocking on in-progress I/O. +If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then +it *must* start I/O against the page, even if that would involve +blocking on in-progress I/O. -If writepage is called for memory cleansing (!(current->flags & PF_SYNC)) -then its role is to get as much writeout underway as possible. So writepage -should try to avoid blocking against currently-in-progress I/O. +If writepage is called for memory cleansing (sync_mode == +WBC_SYNC_NONE) then its role is to get as much writeout underway as +possible. So writepage should try to avoid blocking against +currently-in-progress I/O. 
If the filesystem is not called for "sync" and it determines that it would need to block against in-progress I/O to be able to start new I/O diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 09c219cfb284..ab67bdd0f6dc 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -264,7 +264,7 @@ struct inode_operations { int (*readlink) (struct dentry *, char *,int); struct dentry * (*follow_link) (struct dentry *, struct dentry *); int (*readpage) (struct file *, struct page *); - int (*writepage) (struct file *, struct page *); + int (*writepage) (struct page *page, struct writeback_control *wbc); int (*bmap) (struct inode *,int); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); diff --git a/drivers/mtd/devices/blkmtd.c b/drivers/mtd/devices/blkmtd.c index 39f09c9c0039..4b9c704fd737 100644 --- a/drivers/mtd/devices/blkmtd.c +++ b/drivers/mtd/devices/blkmtd.c @@ -151,9 +151,10 @@ MODULE_PARM(wqs, "i"); /* Page cache stuff */ /* writepage() - should never be called - catch it anyway */ -static int blkmtd_writepage(struct page *page) +static int blkmtd_writepage(struct page *page, struct writeback_control *wbc) { printk("blkmtd: writepage called!!!\n"); + unlock_page(page); return -EIO; } diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index b139e7a1f8d0..c1a4bf8d7bb8 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -51,9 +51,9 @@ abort_toobig: return 0; } -static int adfs_writepage(struct page *page) +static int adfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, adfs_get_block); + return block_write_full_page(page, adfs_get_block, wbc); } static int adfs_readpage(struct file *file, struct page *page) diff --git a/fs/affs/file.c b/fs/affs/file.c index 8db6f186a42b..93c545e071ca 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -407,9 +407,9 @@ err_alloc: return -ENOSPC; } -static int affs_writepage(struct page 
*page) +static int affs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, affs_get_block); + return block_write_full_page(page, affs_get_block, wbc); } static int affs_readpage(struct file *file, struct page *page) { diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 8f8dbde4636f..747fd1ea55e0 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -130,9 +130,9 @@ out: return err; } -static int bfs_writepage(struct page *page) +static int bfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, bfs_get_block); + return block_write_full_page(page, bfs_get_block, wbc); } static int bfs_readpage(struct file *file, struct page *page) diff --git a/fs/block_dev.c b/fs/block_dev.c index 02ab29042036..8715fead5101 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -126,9 +126,9 @@ blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov, nr_segs, blkdev_get_blocks); } -static int blkdev_writepage(struct page * page) +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block); + return block_write_full_page(page, blkdev_get_block, wbc); } static int blkdev_readpage(struct file * file, struct page * page) diff --git a/fs/buffer.c b/fs/buffer.c index 3de883fc5009..f8018f4eef92 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1632,8 +1632,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * with submit_bh(). At the address_space level PageWriteback prevents this * contention from occurring. 
*/ -static int __block_write_full_page(struct inode *inode, - struct page *page, get_block_t *get_block) +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc) { int err; unsigned long block; @@ -1705,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, do { get_bh(bh); if (buffer_mapped(bh) && buffer_dirty(bh)) { - if (called_for_sync()) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { @@ -2485,7 +2485,8 @@ out: /* * The generic ->writepage function for buffer-backed address_spaces */ -int block_write_full_page(struct page *page, get_block_t *get_block) +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; @@ -2494,7 +2495,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block) /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block); + return __block_write_full_page(inode, page, get_block, wbc); /* Is the page fully outside i_size? 
(truncate in progress) */ offset = inode->i_size & (PAGE_CACHE_SIZE-1); @@ -2514,7 +2515,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block) memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); - return __block_write_full_page(inode, page, get_block); + return __block_write_full_page(inode, page, get_block, wbc); } sector_t generic_block_bmap(struct address_space *mapping, sector_t block, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0984cb20c826..0e6d0b9515d4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -415,7 +415,7 @@ cifs_partialpagewrite(struct page *page,unsigned from, unsigned to) } static int -cifs_writepage(struct page* page) +cifs_writepage(struct page* page, struct writeback_control *wbc) { int rc = -EFAULT; int xid; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 29cccde53b74..f04ec5f0b98e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -588,9 +588,9 @@ changed: goto reread; } -static int ext2_writepage(struct page *page) +static int ext2_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,ext2_get_block); + return block_write_full_page(page, ext2_get_block, wbc); } static int ext2_readpage(struct file *file, struct page *page) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index cf150fed6765..72555eb87b02 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "xattr.h" @@ -1287,7 +1288,7 @@ static int bget_one(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. 
*/ -static int ext3_writepage(struct page *page) +static int ext3_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct buffer_head *page_bufs; @@ -1308,7 +1309,7 @@ static int ext3_writepage(struct page *page) goto out_fail; needed = ext3_writepage_trans_blocks(inode); - if (current->flags & PF_MEMALLOC) + if (wbc->for_reclaim) handle = ext3_journal_try_start(inode, needed); else handle = ext3_journal_start(inode, needed); @@ -1339,7 +1340,7 @@ static int ext3_writepage(struct page *page) PAGE_CACHE_SIZE, NULL, bget_one); } - ret = block_write_full_page(page, ext3_get_block); + ret = block_write_full_page(page, ext3_get_block, wbc); /* * The page can become unlocked at any point now, and diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6ad055bba0b0..f0e05a3c04e0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1050,9 +1050,9 @@ static int is_exec(char *extension) return 0; } -static int fat_writepage(struct page *page) +static int fat_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,fat_get_block); + return block_write_full_page(page,fat_get_block, wbc); } static int fat_readpage(struct file *file, struct page *page) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b005f18026d9..21f14fcf8ccf 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -229,9 +229,9 @@ int hfs_notify_change_hdr(struct dentry *dentry, struct iattr * attr) return __hfs_notify_change(dentry, attr, HFS_HDR); } -static int hfs_writepage(struct page *page) +static int hfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,hfs_get_block); + return block_write_full_page(page,hfs_get_block, wbc); } static int hfs_readpage(struct file *file, struct page *page) { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 4891ce052407..5504d729c3dc 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -98,9 +98,9 @@ int hpfs_get_block(struct inode 
*inode, sector_t iblock, struct buffer_head *bh_ return 0; } -static int hpfs_writepage(struct page *page) +static int hpfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,hpfs_get_block); + return block_write_full_page(page,hpfs_get_block, wbc); } static int hpfs_readpage(struct file *file, struct page *page) { diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 41ffc39e9c54..454b27a20f58 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -277,9 +277,9 @@ static int jfs_get_block(struct inode *ip, sector_t lblock, return jfs_get_blocks(ip, lblock, 1, bh_result, create); } -static int jfs_writepage(struct page *page) +static int jfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, jfs_get_block); + return block_write_full_page(page, jfs_get_block, wbc); } static int jfs_writepages(struct address_space *mapping, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 18ea5aa41a34..c327d5f03443 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -316,9 +316,9 @@ static int minix_get_block(struct inode *inode, sector_t block, return V2_minix_get_block(inode, block, bh_result, create); } -static int minix_writepage(struct page *page) +static int minix_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,minix_get_block); + return block_write_full_page(page, minix_get_block, wbc); } static int minix_readpage(struct file *file, struct page *page) { diff --git a/fs/mpage.c b/fs/mpage.c index 7f3043c7ee90..c2e3a2d4e8c4 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -327,7 +327,7 @@ EXPORT_SYMBOL(mpage_readpage); */ static struct bio * mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, - sector_t *last_block_in_bio, int *ret) + sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; @@ -501,7 +501,7 @@ 
alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page); + *ret = page->mapping->a_ops->writepage(page, wbc); out: return bio; } @@ -554,9 +554,8 @@ mpage_writepages(struct address_space *mapping, sector_t last_block_in_bio = 0; int ret = 0; int done = 0; - int sync = called_for_sync(); struct pagevec pvec; - int (*writepage)(struct page *); + int (*writepage)(struct page *page, struct writeback_control *wbc); if (wbc->nonblocking && bdi_write_congested(bdi)) { blk_run_queues(); @@ -574,7 +573,7 @@ mpage_writepages(struct address_space *mapping, struct page *page = list_entry(mapping->io_pages.prev, struct page, list); list_del(&page->list); - if (PageWriteback(page) && !sync) { + if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) { if (PageDirty(page)) { list_add(&page->list, &mapping->dirty_pages); continue; @@ -600,16 +599,16 @@ mpage_writepages(struct address_space *mapping, lock_page(page); - if (sync) + if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); if (page->mapping == mapping && !PageWriteback(page) && test_clear_page_dirty(page)) { if (writepage) { - ret = (*writepage)(page); + ret = (*writepage)(page, wbc); } else { bio = mpage_writepage(bio, page, get_block, - &last_block_in_bio, &ret); + &last_block_in_bio, &ret, wbc); } if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4849ed704570..21d30b3a36e6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -240,7 +240,7 @@ nfs_writepage_async(struct file *file, struct inode *inode, struct page *page, * Write an mmapped page to the server. 
*/ int -nfs_writepage(struct page *page) +nfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; unsigned long end_index; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1c8703416db6..8e80a2be182a 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -792,7 +792,7 @@ lock_retry_remap: * * Return 0 on success and -errno on error. */ -static int ntfs_writepage(struct page *page) +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) { s64 attr_pos; struct inode *vi; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 37ae3c466e7a..616e97433632 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -424,9 +424,9 @@ static void qnx4_put_super(struct super_block *sb) return; } -static int qnx4_writepage(struct page *page) +static int qnx4_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,qnx4_get_block); + return block_write_full_page(page,qnx4_get_block, wbc); } static int qnx4_readpage(struct file *file, struct page *page) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 795330017bab..817c5c465d19 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1917,7 +1917,7 @@ static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) { } } -static int reiserfs_write_full_page(struct page *page) { +static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; unsigned last_offset = PAGE_CACHE_SIZE; @@ -2018,11 +2018,11 @@ static int reiserfs_readpage (struct file *f, struct page * page) } -static int reiserfs_writepage (struct page * page) +static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host ; reiserfs_wait_on_write_block(inode->i_sb) ; - return reiserfs_write_full_page(page) ; + return 
reiserfs_write_full_page(page, wbc) ; } diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 9ded6c19c5f1..a174775b2d13 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -170,7 +170,7 @@ smb_writepage_sync(struct inode *inode, struct page *page, * We are called with the page locked and we unlock it when done. */ static int -smb_writepage(struct page *page) +smb_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 55e7ff30ed70..a1c0b6361351 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -449,9 +449,9 @@ int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat return 0; } -static int sysv_writepage(struct page *page) +static int sysv_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,get_block); + return block_write_full_page(page,get_block,wbc); } static int sysv_readpage(struct file *file, struct page *page) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 4c519798fc1b..9fd46aff63ae 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -62,7 +62,7 @@ static int udf_adinicb_readpage(struct file *file, struct page * page) return 0; } -static int udf_adinicb_writepage(struct page *page) +static int udf_adinicb_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; char *kaddr; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 24a197a1eeee..19a6e06cd46e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include "udf_i.h" @@ -137,9 +138,9 @@ void udf_discard_prealloc(struct inode * inode) } } -static int udf_writepage(struct page *page) +static int udf_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page, udf_get_block); + return block_write_full_page(page, udf_get_block, wbc); } static int 
udf_readpage(struct file *file, struct page *page) @@ -170,6 +171,10 @@ void udf_expand_file_adinicb(struct inode * inode, int newsize, int * err) { struct page *page; char *kaddr; + struct writeback_control udf_wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + }; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; @@ -206,7 +211,7 @@ void udf_expand_file_adinicb(struct inode * inode, int newsize, int * err) else UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_LONG; - inode->i_data.a_ops->writepage(page); + inode->i_data.a_ops->writepage(page, &udf_wbc); page_cache_release(page); mark_inode_dirty(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e9d42f0c2b3f..615f61a0b88d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -445,9 +445,9 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment, return NULL; } -static int ufs_writepage(struct page *page) +static int ufs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,ufs_getfrag_block); + return block_write_full_page(page,ufs_getfrag_block,wbc); } static int ufs_readpage(struct file *file, struct page *page) { diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c index 3e2bd1679bb8..15b035f67c8f 100644 --- a/fs/xfs/linux/xfs_aops.c +++ b/fs/xfs/linux/xfs_aops.c @@ -691,7 +691,8 @@ count_page_state( STATIC int linvfs_writepage( - struct page *page) + struct page *page, + struct writeback_control *wbc) { int error; int need_trans = 1; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 4e7a9bbf99dd..e9d6251fa168 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -178,7 +178,7 @@ extern int buffer_heads_over_limit; */ int try_to_release_page(struct page * page, int gfp_mask); int block_invalidatepage(struct page *page, unsigned long offset); -int block_write_full_page(struct page*, get_block_t*); +int block_write_full_page(struct page *page, 
get_block_t *get_block, struct writeback_control *wbc); int block_read_full_page(struct page*, get_block_t*); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, diff --git a/include/linux/fs.h b/include/linux/fs.h index afbb9474f25b..bd133801c3e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -278,7 +278,7 @@ struct address_space; struct writeback_control; struct address_space_operations { - int (*writepage)(struct page *); + int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); int (*sync_page)(struct page *); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8d15e17c0b94..2673e32cc4ba 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -291,7 +291,7 @@ extern void nfs_complete_unlink(struct dentry *); /* * linux/fs/nfs/write.c */ -extern int nfs_writepage(struct page *); +extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); diff --git a/include/linux/sched.h b/include/linux/sched.h index 80a9836df919..d0726cb87145 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -424,9 +424,8 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_FREEZE 0x00008000 /* this task should be frozen for suspend */ #define PF_IOTHREAD 0x00010000 /* this thread is needed for doing I/O to swap */ #define PF_FROZEN 0x00020000 /* frozen for system suspend */ -#define PF_SYNC 0x00040000 /* performing fsync(), etc */ -#define PF_FSTRANS 0x00080000 /* inside a filesystem transaction */ -#define PF_KSWAPD 0x00100000 /* I am kswapd */ +#define PF_FSTRANS 0x00040000 /* inside a filesystem 
transaction */ +#define PF_KSWAPD 0x00080000 /* I am kswapd */ /* * Ptrace flags diff --git a/include/linux/swap.h b/include/linux/swap.h index f6b1421f86b0..c635f392d6c1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -65,6 +65,7 @@ typedef struct { struct sysinfo; struct address_space; struct zone; +struct writeback_control; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of @@ -180,7 +181,7 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); -extern int swap_writepage(struct page *); +extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern int rw_swap_page_sync(int, swp_entry_t, struct page *); /* linux/mm/swap_state.c */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 351e5851c041..620f18f5ceeb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -42,6 +42,7 @@ struct writeback_control { int nonblocking; /* Don't get stuck on request queues */ int encountered_congestion; /* An output: a queue is full */ int for_kupdate; /* A kupdate writeback */ + int for_reclaim; /* Invoked from the page allocator */ }; /* @@ -88,13 +89,4 @@ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. */ -/* - * Tell the writeback paths that they are being called for a "data integrity" - * operation such as fsync(). - */ -static inline int called_for_sync(void) -{ - return current->flags & PF_SYNC; -} - #endif /* WRITEBACK_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 50b05fe9a2e0..cacecf650277 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,10 +117,6 @@ static inline int sync_page(struct page *page) * cleansing writeback. The difference between these two operations is that * if a dirty page/buffer is encountered, it must be waited upon, and not just * skipped over. 
- * - * The PF_SYNC flag is set across this operation and the various functions - * which care about this distinction must use called_for_sync() to find out - * which behaviour they should implement. */ int filemap_fdatawrite(struct address_space *mapping) { @@ -133,12 +129,10 @@ int filemap_fdatawrite(struct address_space *mapping) if (mapping->backing_dev_info->memory_backed) return 0; - current->flags |= PF_SYNC; write_lock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); write_unlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); - current->flags &= ~PF_SYNC; return ret; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3880394c8562..1111f37feb4b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -412,6 +412,9 @@ int write_one_page(struct page *page, int wait) { struct address_space *mapping = page->mapping; int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + }; BUG_ON(!PageLocked(page)); @@ -424,7 +427,7 @@ int write_one_page(struct page *page, int wait) list_add(&page->list, &mapping->locked_pages); page_cache_get(page); write_unlock(&mapping->page_lock); - ret = mapping->a_ops->writepage(page); + ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) diff --git a/mm/page_io.c b/mm/page_io.c index 5d3bfdce334f..faf3e211a33a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include #include /* for block_sync_page() */ #include +#include #include static struct bio * @@ -86,7 +87,7 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. 
*/ -int swap_writepage(struct page *page) +int swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret = 0; @@ -143,6 +144,9 @@ struct address_space_operations swap_aops = { int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) { int ret; + struct writeback_control swap_wbc = { + .sync_mode = WB_SYNC_ALL, + }; lock_page(page); @@ -154,7 +158,7 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) ret = swap_readpage(NULL, page); wait_on_page_locked(page); } else { - ret = swap_writepage(page); + ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } page->mapping = NULL; diff --git a/mm/shmem.c b/mm/shmem.c index cb8bd154df8f..987203cb2a41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -671,7 +671,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) /* * Move the page from the page cache to the swap cache. */ -static int shmem_writepage(struct page *page) +static int shmem_writepage(struct page *page, struct writeback_control *wbc) { struct shmem_inode_info *info; swp_entry_t *entry, swap; diff --git a/mm/swapfile.c b/mm/swapfile.c index 0ad388eaf542..067c3095225a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -696,7 +697,11 @@ static int try_to_unuse(unsigned int type) * and now we must reincrement count to try again later. 
*/ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { - swap_writepage(page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + swap_writepage(page, &wbc); lock_page(page); wait_on_page_writeback(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8243646bf5c5..d2add0ef819d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -319,13 +319,19 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, goto keep_locked; if (test_clear_page_dirty(page)) { int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; write_lock(&mapping->page_lock); list_move(&page->list, &mapping->locked_pages); write_unlock(&mapping->page_lock); SetPageReclaim(page); - res = mapping->a_ops->writepage(page); + res = mapping->a_ops->writepage(page, &wbc); if (res == WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); -- cgit v1.2.3 From 654107b93fcc5a4697e696f6bdb32f5d138d8d47 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:18:58 -0800 Subject: [PATCH] madvise_willneed() maximum readahead checking madvise_willneed() currently has a very strange check on how much readahead it is prepared to do. It is based on the user's rss limit. But this is usually enormous, and the user isn't necessarily going to map all that memory at the same time anyway. And the logic is wrong - it is comparing rss (which is in bytes) with `end - start', which is in pages. And it returns -EIO on error, which is not mentioned in the Open Group spec and doesn't make sense. This patch takes it all out and applies the same upper limit as is used in sys_readahead() - half the inactive list. 
--- include/linux/mm.h | 1 + mm/filemap.c | 12 +----------- mm/madvise.c | 23 +++++------------------ mm/readahead.c | 13 +++++++++++++ 4 files changed, 20 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f2c0f9645de5..7f92f6775eb2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -516,6 +516,7 @@ void page_cache_readaround(struct address_space *mapping, unsigned long offset); void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra); +unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ extern int expand_stack(struct vm_area_struct * vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 2b993e3f8423..1595d52c9bb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -897,20 +897,10 @@ static ssize_t do_readahead(struct address_space *mapping, struct file *filp, unsigned long index, unsigned long nr) { - unsigned long max; - unsigned long active; - unsigned long inactive; - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - /* Limit it to a sane percentage of the inactive list.. */ - get_zone_counts(&active, &inactive); - max = inactive / 2; - if (nr > max) - nr = max; - - do_page_cache_readahead(mapping, filp, index, nr); + do_page_cache_readahead(mapping, filp, index, max_sane_readahead(nr)); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index ac845fe3553a..5ff452899f41 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -51,36 +51,23 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, } /* - * Schedule all required I/O operations, then run the disk queue - * to make sure they are started. Do not wait for completion. + * Schedule all required I/O operations. Do not wait for completion. 
*/ static long madvise_willneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { - long error = -EBADF; - struct file * file; - unsigned long size, rlim_rss; + struct file *file = vma->vm_file; - /* Doesn't work if there's no mapped file. */ if (!vma->vm_file) - return error; - file = vma->vm_file; - size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + return -EBADF; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - /* Make sure this doesn't exceed the process's max rss. */ - error = -EIO; - rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : - LONG_MAX; /* default: see resource.h */ - if ((vma->vm_mm->rss + (end - start)) > rlim_rss) - return error; - - do_page_cache_readahead(file->f_dentry->d_inode->i_mapping, file, start, end - start); + do_page_cache_readahead(file->f_dentry->d_inode->i_mapping, + file, start, max_sane_readahead(end - start)); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index e2f2cd5941a7..460ca8a0d149 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -467,3 +467,16 @@ void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra) ra->next_size = min; } } + +/* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. + */ +unsigned long max_sane_readahead(unsigned long nr) +{ + unsigned long active; + unsigned long inactive; + + get_zone_counts(&active, &inactive); + return min(nr, inactive / 2); +} -- cgit v1.2.3 From 55478b6c1b34f7096e3a3e09d2055350625d72df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 14 Dec 2002 03:19:46 -0800 Subject: [PATCH] remove vm_area_struct.vm_raend Remove the unused vm_area_struct.vm_raend. If someone wants to tune per-VMA readaround then they can alter vma->vm_file->f_ra.ra_pages. 
--- arch/ia64/kernel/perfmon.c | 1 - include/linux/mm.h | 1 - mm/madvise.c | 1 - mm/mmap.c | 3 --- mm/mremap.c | 1 - 5 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f31a02c27654..7ec39ed57d52 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -783,7 +783,6 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo vma->vm_ops = &pfm_vm_ops; /* necesarry to get the close() callback */ vma->vm_pgoff = 0; vma->vm_file = NULL; - vma->vm_raend = 0; vma->vm_private_data = psb; /* information needed by the pfm_vm_close() function */ /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f92f6775eb2..df49cb472866 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -70,7 +70,6 @@ struct vm_area_struct { unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ - unsigned long vm_raend; /* XXX: put full readahead info here. 
*/ void * vm_private_data; /* was vm_pte (shared mem) */ }; diff --git a/mm/madvise.c b/mm/madvise.c index 5ff452899f41..83a31a6ae113 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -32,7 +32,6 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, } spin_lock(&mm->page_table_lock); - vma->vm_raend = 0; VM_ClearReadHint(vma); switch (behavior) { diff --git a/mm/mmap.c b/mm/mmap.c index 76800c94aaad..e10ec8a3a158 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -559,7 +559,6 @@ munmap_back: vma->vm_pgoff = pgoff; vma->vm_file = NULL; vma->vm_private_data = NULL; - vma->vm_raend = 0; INIT_LIST_HEAD(&vma->shared); if (file) { @@ -1089,8 +1088,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - new->vm_raend = 0; - if (new->vm_file) get_file(new->vm_file); diff --git a/mm/mremap.c b/mm/mremap.c index 51e788b5b798..0cd568fd8645 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -233,7 +233,6 @@ static unsigned long move_vma(struct vm_area_struct * vma, new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - new_vma->vm_raend = 0; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) -- cgit v1.2.3 From f3ce0064cbb6bbada20942729aa439cfd33da301 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 14 Dec 2002 19:42:33 -0800 Subject: [PATCH] move console_loglevel scalars to array (resend) Moves console_loglevel & friends to an array, as sysctl expects. 
--- Documentation/sysctl/kernel.txt | 5 ----- arch/i386/mm/fault.c | 2 -- drivers/char/sysrq.c | 1 + include/linux/kernel.h | 9 +++++++-- init/main.c | 1 + kernel/printk.c | 12 +++++++----- kernel/suspend.c | 1 - kernel/sysctl.c | 1 + 8 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index d75397d5a262..ac873ad4ce6e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -237,11 +237,6 @@ the different loglevels. console_loglevel can be set - default_console_loglevel: default value for console_loglevel -Note: a quick look in linux/kernel/printk.c will reveal that -these variables aren't put inside a structure, so their order -in-core isn't formally guaranteed and garbage values _might_ -occur when the compiler changes. (???) - ============================================================== reboot-cmd: (Sparc only) diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 232aac460eb3..56827d817963 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -28,8 +28,6 @@ extern void die(const char *,struct pt_regs *,long); -extern int console_loglevel; - #ifndef CONFIG_X86_WP_WORKS_OK /* * Ugly, ugly, but the goto's result in better assembly.. 
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 7800f28a98ae..90f41e8d6cf0 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80a3e97ab59b..602617f9addf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -38,6 +38,13 @@ #define KERN_INFO "<6>" /* informational */ #define KERN_DEBUG "<7>" /* debug-level messages */ +extern int console_printk[]; + +#define console_loglevel (console_printk[0]) +#define default_message_loglevel (console_printk[1]) +#define minimum_console_loglevel (console_printk[2]) +#define default_console_loglevel (console_printk[3]) + struct completion; #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -81,8 +88,6 @@ extern int session_of_pgrp(int pgrp); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); -extern int console_loglevel; - static inline void console_silent(void) { console_loglevel = 0; diff --git a/init/main.c b/init/main.c index 1d477ebc16a5..ef444b704fbf 100644 --- a/init/main.c +++ b/init/main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/printk.c b/kernel/printk.c index 403eea7da400..cbe027aaf151 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -16,6 +16,7 @@ * 01Mar01 Andrew Morton */ +#include #include #include #include @@ -54,11 +55,12 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); -/* Keep together for sysctl support */ -int console_loglevel = DEFAULT_CONSOLE_LOGLEVEL; -int default_message_loglevel = DEFAULT_MESSAGE_LOGLEVEL; -int minimum_console_loglevel = MINIMUM_CONSOLE_LOGLEVEL; -int default_console_loglevel = DEFAULT_CONSOLE_LOGLEVEL; +int console_printk[4] = { + DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ + DEFAULT_CONSOLE_LOGLEVEL, /* 
default_console_loglevel */ +}; int oops_in_progress; diff --git a/kernel/suspend.c b/kernel/suspend.c index 0dcb6d6b8e79..3e4b1729c5b1 100644 --- a/kernel/suspend.c +++ b/kernel/suspend.c @@ -88,7 +88,6 @@ unsigned char software_suspend_enabled = 0; extern char _text, _etext, _edata, __bss_start, _end; extern char __nosave_begin, __nosave_end; -extern int console_loglevel; extern int is_head_of_free_region(struct page *); /* Locks */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f2359578775..c3c96cd208d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From b9daa0066d792983da59154af3ae486eff9b9aa1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Dec 2002 19:44:20 -0800 Subject: [PATCH] threaded coredumps, tcore-fixes-2.5.51-A0 This fixes one more threaded-coredumps detail reported by the glibc people: all threads taken down by the coredump code should report the proper exit code. We can do this rather easily via the group_exit mechanism. 'Other' threads used to report SIGKILL, which was highly confusing as the shell often displayed the 'Killed' message instead of a 'Segmentation fault' message. Another missing bit was the 0x80 bit set in the exit status for all threads, if the coredump was successful. (it's safe to set this bit in ->sig->group_exit_code in an unlocked way because all threads are artificially descheduled by the coredump code.) 
--- fs/exec.c | 5 ++++- include/linux/binfmts.h | 2 +- kernel/signal.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 4c0937c43144..1699f5bdf92e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1268,7 +1268,7 @@ static void coredump_wait(struct mm_struct *mm) BUG_ON(mm->core_waiters); } -int do_coredump(long signr, struct pt_regs * regs) +int do_coredump(long signr, int exit_code, struct pt_regs * regs) { char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; @@ -1288,6 +1288,8 @@ int do_coredump(long signr, struct pt_regs * regs) } mm->dumpable = 0; init_completion(&mm->core_done); + current->sig->group_exit = 1; + current->sig->group_exit_code = exit_code; coredump_wait(mm); if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) @@ -1314,6 +1316,7 @@ int do_coredump(long signr, struct pt_regs * regs) retval = binfmt->core_dump(signr, regs, file); + current->sig->group_exit_code |= 0x80; close_fail: filp_close(file, NULL); fail_unlock: diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index dfea0f47ed3e..ae1b454395b5 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -57,7 +57,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm); extern int copy_strings(int argc,char ** argv,struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void compute_creds(struct linux_binprm *binprm); -extern int do_coredump(long signr, struct pt_regs * regs); +extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern void set_binfmt(struct linux_binfmt *new); diff --git a/kernel/signal.c b/kernel/signal.c index c3e602f2822a..d617fddfb60a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1313,7 +1313,7 @@ int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs) case SIGQUIT: case SIGILL: case SIGTRAP: case SIGABRT: case SIGFPE: case SIGSEGV: case SIGBUS: 
case SIGSYS: case SIGXCPU: case SIGXFSZ: - if (do_coredump(signr, regs)) + if (do_coredump(signr, exit_code, regs)) exit_code |= 0x80; /* FALLTHRU */ -- cgit v1.2.3 From 8d13373b408f1e9df4238d0df17c40d7a77e5b02 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 14 Dec 2002 19:50:32 -0800 Subject: [PATCH] cpufreq: clean up CPU information This patch moves some basic per-CPU static information (minimum frequency, maximum frequency and maximum transition latency) into a struct cpufreq_cpuinfo. This offers a much cleaner struct cpufreq_driver and struct cpufreq_policy. --- arch/arm/mach-integrator/cpu.c | 14 ++++++---- arch/arm/mach-sa1100/cpu-sa1100.c | 5 ++-- arch/arm/mach-sa1100/cpu-sa1110.c | 5 ++-- arch/arm/mach-sa1100/generic.c | 4 +-- arch/i386/kernel/cpu/cpufreq/elanfreq.c | 5 ++-- arch/i386/kernel/cpu/cpufreq/longhaul.c | 5 ++-- arch/i386/kernel/cpu/cpufreq/longrun.c | 10 ++++--- arch/i386/kernel/cpu/cpufreq/p4-clockmod.c | 6 ++-- arch/i386/kernel/cpu/cpufreq/powernow-k6.c | 5 ++-- arch/i386/kernel/cpu/cpufreq/speedstep.c | 6 ++-- drivers/acpi/processor.c | 11 ++++++-- include/linux/cpufreq.h | 20 +++++++++---- kernel/cpufreq.c | 45 +++++++++++++++++++----------- 13 files changed, 91 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-integrator/cpu.c b/arch/arm/mach-integrator/cpu.c index 23a8788ef76e..db5ab22d1d42 100644 --- a/arch/arm/mach-integrator/cpu.c +++ b/arch/arm/mach-integrator/cpu.c @@ -77,8 +77,8 @@ static int integrator_verify_speed(struct cpufreq_policy *policy) { struct vco vco; - if (policy->max > policy->max_cpu_freq) - policy->max = policy->max_cpu_freq; + if (policy->max > policy->cpuinfo.max_freq) + policy->max = policy->cpuinfo.max_freq; if (policy->max < 12000) policy->max = 12000; @@ -148,7 +148,9 @@ static int integrator_set_policy(struct cpufreq_policy *policy) static struct cpufreq_policy integrator_policy = { .cpu = 0, .policy = CPUFREQ_POLICY_POWERSAVE, - .max_cpu_freq = 160000, + 
.cpuinfo.max_cpu_freq = 160000, + .cpuinfo.min_cpu_freq = 12000, + .cpuinfo.transition_latency = CPUFREQ_ETERNAL, }; static struct cpufreq_driver integrator_driver = { @@ -197,7 +199,9 @@ static int __init integrator_cpu_init(void) policies[cpu].cpu = cpu; policies[cpu].policy = CPUFREQ_POLICY_POWERSAVE, - policies[cpu].max_cpu_freq = 160000; + policies[cpu].cpuinfo.max_freq = 160000; + policies[cpu].cpuinfo.min_freq = 12000; + policies[cpu].cpuinfo.transition_latency = CPUFREQ_ETERNAL; policies[cpu].min = policies[cpu].max = vco_to_freq(vco, 1); } @@ -205,8 +209,6 @@ static int __init integrator_cpu_init(void) set_cpus_allowed(current, cpus_allowed); #ifdef CONFIG_CPU_FREQ - for (cpu=0; cpumax > policy->max_cpu_freq) - policy->max = policy->max_cpu_freq; + if (policy->max > policy->cpuinfo.max_freq) + policy->max = policy->cpuinfo.max_freq; policy->max = cclk_frequency_100khz[sa11x0_freq_to_ppcr(policy->max)] * 100; policy->min = policy->max; diff --git a/arch/i386/kernel/cpu/cpufreq/elanfreq.c b/arch/i386/kernel/cpu/cpufreq/elanfreq.c index bc71cc376350..2c751897ce36 100644 --- a/arch/i386/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/i386/kernel/cpu/cpufreq/elanfreq.c @@ -296,7 +296,6 @@ static int __init elanfreq_init(void) max_freq = elanfreq_get_cpu_frequency(); #ifdef CONFIG_CPU_FREQ_24_API - driver->cpu_min_freq[0] = 1000; driver->cpu_cur_freq[0] = elanfreq_get_cpu_frequency(); #endif @@ -307,7 +306,9 @@ static int __init elanfreq_init(void) driver->policy[0].min = 1000; driver->policy[0].max = max_freq; driver->policy[0].policy = CPUFREQ_POLICY_PERFORMANCE; - driver->policy[0].max_cpu_freq = max_freq; + driver->policy[0].cpuinfo.max_freq = max_freq; + driver->policy[0].cpuinfo.min_freq = min_freq; + driver->policy[0].cpuinfo.transition_latency = CPUFREQ_ETERNAL; elanfreq_driver = driver; diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index 511d52051512..72395b13eb00 100644 --- 
a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -779,7 +779,6 @@ static int __init longhaul_init (void) driver->policy = (struct cpufreq_policy *) (driver + 1); #ifdef CONFIG_CPU_FREQ_24_API - driver->cpu_min_freq[0] = (unsigned int) lowest_speed; driver->cpu_cur_freq[0] = currentspeed; #endif @@ -790,7 +789,9 @@ static int __init longhaul_init (void) driver->policy[0].min = (unsigned int) lowest_speed; driver->policy[0].max = (unsigned int) highest_speed; driver->policy[0].policy = CPUFREQ_POLICY_PERFORMANCE; - driver->policy[0].max_cpu_freq = (unsigned int) highest_speed; + driver->policy[0].cpuinfo.min_freq = (unsigned int) lowest_speed; + driver->policy[0].cpuinfo.max_freq = (unsigned int) highest_speed; + driver->policy[0].cpuinfo.transition_latency = CPUFREQ_ETERNAL; longhaul_driver = driver; diff --git a/arch/i386/kernel/cpu/cpufreq/longrun.c b/arch/i386/kernel/cpu/cpufreq/longrun.c index 982314536c79..69b4fdb42751 100644 --- a/arch/i386/kernel/cpu/cpufreq/longrun.c +++ b/arch/i386/kernel/cpu/cpufreq/longrun.c @@ -121,8 +121,9 @@ static int longrun_verify_policy(struct cpufreq_policy *policy) return -EINVAL; policy->cpu = 0; - cpufreq_verify_within_limits(policy, 0, - longrun_driver->policy[0].max_cpu_freq); + cpufreq_verify_within_limits(policy, + longrun_driver->policy[0].cpuinfo.min_freq, + longrun_driver->policy[0].cpuinfo.max_freq); return 0; } @@ -247,12 +248,13 @@ static int __init longrun_init(void) kfree(driver); return -EIO; } - driver->policy[0].max_cpu_freq = longrun_high_freq; + driver->policy[0].cpuinfo.min_freq = longrun_low_freq; + driver->policy[0].cpuinfo.max_freq = longrun_high_freq; + driver->policy[0].cpuinfo.transition_latency = CPUFREQ_ETERNAL; longrun_get_policy(&driver->policy[0]); #ifdef CONFIG_CPU_FREQ_24_API - driver->cpu_min_freq[0] = longrun_low_freq; driver->cpu_cur_freq[0] = longrun_high_freq; /* dummy value */ #endif diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 
b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c index a86f3cc32f7a..52cb1a4ab188 100644 --- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c @@ -257,7 +257,6 @@ int __init cpufreq_p4_init(void) #ifdef CONFIG_CPU_FREQ_24_API for (i=0;icpu_min_freq[i] = stock_freq / 8; driver->cpu_cur_freq[i] = stock_freq; } #endif @@ -272,7 +271,10 @@ int __init cpufreq_p4_init(void) driver->policy[i].min = stock_freq / 8; driver->policy[i].max = stock_freq; driver->policy[i].policy = CPUFREQ_POLICY_PERFORMANCE; - driver->policy[i].max_cpu_freq = stock_freq; + driver->policy[i].cpuinfo.min_freq = driver->policy[i].min; + driver->policy[i].cpuinfo.max_freq = stock_freq; + driver->policy[i].cpuinfo.transition_latency = CPUFREQ_ETERNAL; + driver->policy[i].cpu = i; } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k6.c b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c index c43c9a6ddac2..8021e8a21590 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c @@ -242,7 +242,6 @@ static int __init powernow_k6_init(void) driver->policy = (struct cpufreq_policy *) (driver + 1); #ifdef CONFIG_CPU_FREQ_24_API - driver->cpu_min_freq[0] = busfreq * 20; driver->cpu_cur_freq[0] = busfreq * max_multiplier; #endif @@ -253,7 +252,9 @@ static int __init powernow_k6_init(void) driver->policy[0].min = busfreq * 20; driver->policy[0].max = busfreq * max_multiplier; driver->policy[0].policy = CPUFREQ_POLICY_PERFORMANCE; - driver->policy[0].max_cpu_freq = busfreq * max_multiplier; + driver->policy[0].cpuinfo.max_freq = busfreq * max_multiplier; + driver->policy[0].cpuinfo.min_freq = busfreq * 20; + driver->policy[0].cpuinfo.transition_latency = CPUFREQ_ETERNAL; powernow_driver = driver; diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep.c b/arch/i386/kernel/cpu/cpufreq/speedstep.c index 2e7e50fca818..227e726aa4bd 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep.c @@ 
-693,7 +693,6 @@ static int __init speedstep_init(void) driver->policy = (struct cpufreq_policy *) (driver + 1); #ifdef CONFIG_CPU_FREQ_24_API - driver->cpu_min_freq[0] = speedstep_low_freq; driver->cpu_cur_freq[0] = speed; #endif @@ -703,7 +702,10 @@ static int __init speedstep_init(void) driver->policy[0].cpu = 0; driver->policy[0].min = speedstep_low_freq; driver->policy[0].max = speedstep_high_freq; - driver->policy[0].max_cpu_freq = speedstep_high_freq; + driver->policy[0].cpuinfo.min_freq = speedstep_low_freq; + driver->policy[0].cpuinfo.max_freq = speedstep_high_freq; + driver->policy[0].cpuinfo.transition_latency = CPUFREQ_ETERNAL; + driver->policy[0].policy = (speed == speedstep_low_freq) ? CPUFREQ_POLICY_POWERSAVE : CPUFREQ_POLICY_PERFORMANCE; diff --git a/drivers/acpi/processor.c b/drivers/acpi/processor.c index 22eddbfdb73a..8167bf258edf 100644 --- a/drivers/acpi/processor.c +++ b/drivers/acpi/processor.c @@ -1849,10 +1849,15 @@ acpi_cpufreq_init ( #ifdef CONFIG_CPU_FREQ_24_API for (i=0;icpu_cur_freq[0] = pr->performance.states[current_state].core_frequency * 1000; - driver->cpu_min_freq[0] = pr->performance.states[pr->performance.state_count - 1].core_frequency * 1000; } #endif + /* detect highest transition latency */ + for (i=0;iperformance.state_count;i++) { + if (pr->performance.states[i].transition_latency > driver->policy[0].cpuinfo.transition_latency) + driver->policy[0].cpuinfo.transition_latency = pr->performance.states[i].transition_latency; + } + driver->verify = &acpi_cpufreq_verify; driver->setpolicy = &acpi_cpufreq_setpolicy; @@ -1860,7 +1865,9 @@ acpi_cpufreq_init ( driver->policy[i].cpu = pr->id; driver->policy[i].min = pr->performance.states[pr->performance.state_count - 1].core_frequency * 1000; driver->policy[i].max = pr->performance.states[pr->limit.state.px].core_frequency * 1000; - driver->policy[i].max_cpu_freq = pr->performance.states[0].core_frequency * 1000; + driver->policy[i].cpuinfo.max_freq = 
pr->performance.states[0].core_frequency * 1000; + driver->policy[i].cpuinfo.min_freq = pr->performance.states[pr->performance.state_count - 1].core_frequency * 1000; + driver->policy[i].cpuinfo.transition_latency = driver->policy[0].cpuinfo.transition_latency; driver->policy[i].policy = ( pr->performance.states[current_state].core_frequency * 1000 == driver->policy[i].max) ? CPUFREQ_POLICY_PERFORMANCE : CPUFREQ_POLICY_POWERSAVE; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a8f8d2a31936..e93501c7a0b8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -37,15 +37,26 @@ int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list); #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* values here are CPU kHz so that hardware which doesn't run with some - * frequencies can complain without having to guess what per cent / per - * mille means. */ +/* Frequency values here are CPU kHz so that hardware which doesn't run + * with some frequencies can complain without having to guess what per + * cent / per mille means. + * Maximum transition latency is in nanoseconds - if it's unknown, + * CPUFREQ_ETERNAL shall be used. + */ + +#define CPUFREQ_ETERNAL (-1) +struct cpufreq_cpuinfo { + unsigned int max_freq; + unsigned int min_freq; + unsigned int transition_latency; +}; + struct cpufreq_policy { unsigned int cpu; /* cpu nr or CPUFREQ_ALL_CPUS */ unsigned int min; /* in kHz */ unsigned int max; /* in kHz */ unsigned int policy; /* see above */ - unsigned int max_cpu_freq; /* for information */ + struct cpufreq_cpuinfo cpuinfo; /* see above */ }; #define CPUFREQ_ADJUST (0) @@ -116,7 +127,6 @@ struct cpufreq_driver { #endif /* 2.4. 
compatible API */ #ifdef CONFIG_CPU_FREQ_24_API - unsigned int cpu_min_freq[NR_CPUS]; unsigned int cpu_cur_freq[NR_CPUS]; #endif }; diff --git a/kernel/cpufreq.c b/kernel/cpufreq.c index 1cfc2489a08a..c8208fb2191a 100644 --- a/kernel/cpufreq.c +++ b/kernel/cpufreq.c @@ -119,8 +119,8 @@ static int cpufreq_parse_policy(char input_string[42], struct cpufreq_policy *po if (sscanf(input_string, "%d%%%d%%%d%%%s", &cpu, &min, &max, policy_string) == 4) { if (!cpufreq_get_policy(¤t_policy, cpu)) { - policy->min = (min * current_policy.max_cpu_freq) / 100; - policy->max = (max * current_policy.max_cpu_freq) / 100; + policy->min = (min * current_policy.cpuinfo.max_freq) / 100; + policy->max = (max * current_policy.cpuinfo.max_freq) / 100; policy->cpu = cpu; result = 0; goto scan_policy; @@ -138,8 +138,8 @@ static int cpufreq_parse_policy(char input_string[42], struct cpufreq_policy *po if (sscanf(input_string, "%d%%%d%%%s", &min, &max, policy_string) == 3) { if (!cpufreq_get_policy(¤t_policy, cpu)) { - policy->min = (min * current_policy.max_cpu_freq) / 100; - policy->max = (max * current_policy.max_cpu_freq) / 100; + policy->min = (min * current_policy.cpuinfo.max_freq) / 100; + policy->max = (max * current_policy.cpuinfo.max_freq) / 100; result = 0; goto scan_policy; } @@ -229,11 +229,11 @@ static int cpufreq_proc_read ( cpufreq_get_policy(&policy, i); - if (!policy.max_cpu_freq) + if (!policy.cpuinfo.max_freq) continue; - min_pctg = (policy.min * 100) / policy.max_cpu_freq; - max_pctg = (policy.max * 100) / policy.max_cpu_freq; + min_pctg = (policy.min * 100) / policy.cpuinfo.max_freq; + max_pctg = (policy.max * 100) / policy.cpuinfo.max_freq; p += sprintf(p, "CPU%3d %9d kHz (%3d %%) - %9d kHz (%3d %%) - ", i , policy.min, min_pctg, policy.max, max_pctg); @@ -279,6 +279,7 @@ static int cpufreq_proc_write ( int result = 0; char proc_string[42] = {'\0'}; struct cpufreq_policy policy; + unsigned int i = 0; if ((count > sizeof(proc_string) - 1)) @@ -293,7 +294,17 @@ static 
int cpufreq_proc_write ( if (result) return -EFAULT; - cpufreq_set_policy(&policy); + if (policy.cpu == CPUFREQ_ALL_CPUS) + { + for (i=0; imin = cpufreq_driver->policy[cpu].min; policy->max = cpufreq_driver->policy[cpu].max; policy->policy = cpufreq_driver->policy[cpu].policy; - policy->max_cpu_freq = cpufreq_driver->policy[cpu].max_cpu_freq; + policy->cpuinfo.max_freq = cpufreq_driver->policy[cpu].cpuinfo.max_freq; + policy->cpuinfo.min_freq = cpufreq_driver->policy[cpu].cpuinfo.min_freq; + policy->cpuinfo.transition_latency = cpufreq_driver->policy[cpu].cpuinfo.transition_latency; policy->cpu = cpu; up(&cpufreq_driver_sem); @@ -835,16 +848,14 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) down(&cpufreq_driver_sem); if (!cpufreq_driver || !cpufreq_driver->verify || !cpufreq_driver->setpolicy || !policy || - (policy->cpu > NR_CPUS)) { + (policy->cpu >= NR_CPUS) || (!cpu_online(policy->cpu))) { up(&cpufreq_driver_sem); return -EINVAL; } - if (policy->cpu == CPUFREQ_ALL_CPUS) - policy->max_cpu_freq = cpufreq_driver->policy[0].max_cpu_freq; - else - policy->max_cpu_freq = cpufreq_driver->policy[policy->cpu].max_cpu_freq; - + policy->cpuinfo.max_freq = cpufreq_driver->policy[policy->cpu].cpuinfo.max_freq; + policy->cpuinfo.min_freq = cpufreq_driver->policy[policy->cpu].cpuinfo.min_freq; + policy->cpuinfo.transition_latency = cpufreq_driver->policy[policy->cpu].cpuinfo.transition_latency; /* verify the cpu speed can be set within this limit */ ret = cpufreq_driver->verify(policy); @@ -1039,8 +1050,8 @@ int cpufreq_register(struct cpufreq_driver *driver_data) down(&cpufreq_driver_sem); for (i=0; icpu_min_freq[i]; - cpu_max_freq[i] = driver_data->policy[i].max_cpu_freq; + cpu_min_freq[i] = driver_data->policy[i].cpuinfo.min_freq; + cpu_max_freq[i] = driver_data->policy[i].cpuinfo.max_freq; cpu_cur_freq[i] = driver_data->cpu_cur_freq[i]; } up(&cpufreq_driver_sem); -- cgit v1.2.3 From 326e7842d30d5cfc1089b85a7aa63e5c9f3c0a74 Mon Sep 17 00:00:00 2001 From: Rusty 
Russell Date: Sat, 14 Dec 2002 20:13:11 -0800 Subject: [PATCH] Module Parameter Core Patch This patch is a rewrite of the insmod and boot parameter handling, to unify them. The new format is fairly simple: built on top of __module_param_call there are several helpers, eg "module_param(foo, int, 000)". The final argument is the permissions bits, for exposing parameters in sysfs (if non-zero) at a later stage. --- arch/i386/vmlinux.lds.S | 3 + arch/ia64/vmlinux.lds.S | 4 + arch/ppc/vmlinux.lds.S | 3 + arch/ppc64/vmlinux.lds.S | 3 + arch/sparc64/vmlinux.lds.S | 3 + include/linux/init.h | 20 ++- include/linux/kernel.h | 2 +- include/linux/moduleparam.h | 127 +++++++++++++++++ init/main.c | 104 ++++++-------- kernel/Makefile | 4 +- kernel/params.c | 338 ++++++++++++++++++++++++++++++++++++++++++++ lib/cmdline.c | 6 +- 12 files changed, 540 insertions(+), 77 deletions(-) create mode 100644 include/linux/moduleparam.h create mode 100644 kernel/params.c (limited to 'include/linux') diff --git a/arch/i386/vmlinux.lds.S b/arch/i386/vmlinux.lds.S index 8be909d2b94f..71860a99a417 100644 --- a/arch/i386/vmlinux.lds.S +++ b/arch/i386/vmlinux.lds.S @@ -67,6 +67,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __start___param = .; + __param : { *(__param) } + __stop___param = .; __initcall_start = .; .initcall.init : { *(.initcall1.init) diff --git a/arch/ia64/vmlinux.lds.S b/arch/ia64/vmlinux.lds.S index bd2c57acff83..042b080b6a87 100644 --- a/arch/ia64/vmlinux.lds.S +++ b/arch/ia64/vmlinux.lds.S @@ -102,6 +102,10 @@ SECTIONS .init.setup : AT(ADDR(.init.setup) - PAGE_OFFSET) { *(.init.setup) } __setup_end = .; + __start___param = .; + __param : AT(ADDR(__param) - PAGE_OFFSET) + { *(__param) } + __stop___param = .; __initcall_start = .; .initcall.init : AT(ADDR(.initcall.init) - PAGE_OFFSET) { diff --git a/arch/ppc/vmlinux.lds.S b/arch/ppc/vmlinux.lds.S index d98c934dddac..4522197a7556 100644 --- a/arch/ppc/vmlinux.lds.S +++ 
b/arch/ppc/vmlinux.lds.S @@ -103,6 +103,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __start___param = .; + __param : { *(__param) } + __stop___param = .; __initcall_start = .; .initcall.init : { *(.initcall1.init) diff --git a/arch/ppc64/vmlinux.lds.S b/arch/ppc64/vmlinux.lds.S index a1fd28af3a9d..00911a7ec795 100644 --- a/arch/ppc64/vmlinux.lds.S +++ b/arch/ppc64/vmlinux.lds.S @@ -91,6 +91,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __start___param = .; + __param : { *(__param) } + __stop___param = .; __initcall_start = .; .initcall.init : { *(.initcall1.init) diff --git a/arch/sparc64/vmlinux.lds.S b/arch/sparc64/vmlinux.lds.S index 93947b611615..68b3827b6997 100644 --- a/arch/sparc64/vmlinux.lds.S +++ b/arch/sparc64/vmlinux.lds.S @@ -51,6 +51,9 @@ SECTIONS __setup_start = .; .init.setup : { *(.init.setup) } __setup_end = .; + __start___param = .; + __param : { *(__param) } + __stop___param = .; __initcall_start = .; .initcall.init : { *(.initcall1.init) diff --git a/include/linux/init.h b/include/linux/init.h index 26a518d9cfc1..46b1ef190c52 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -91,19 +91,15 @@ typedef void (*exitcall_t)(void); #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -/* - * Used for kernel command line parameter setup - */ -struct kernel_param { +struct obs_kernel_param { const char *str; int (*setup_func)(char *); }; -extern struct kernel_param __setup_start, __setup_end; - +/* OBSOLETE: see moduleparam.h for the right way. 
*/ #define __setup(str, fn) \ static char __setup_str_##fn[] __initdata = str; \ - static struct kernel_param __setup_##fn \ + static struct obs_kernel_param __setup_##fn \ __attribute__((unused,__section__ (".init.setup"))) \ = { __setup_str_##fn, fn } @@ -166,6 +162,16 @@ extern struct kernel_param __setup_start, __setup_end; /* Data marked not to be saved by software_suspend() */ #define __nosavedata __attribute__ ((__section__ (".data.nosave"))) +/* This means "can be init if no module support, otherwise module load + may call it." */ +#ifdef CONFIG_MODULES +#define __init_or_module +#define __initdata_or_module +#else +#define __init_or_module __init +#define __initdata_or_module __initdata +#endif /*CONFIG_MODULES*/ + #ifdef CONFIG_HOTPLUG #define __devinit #define __devinitdata diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 602617f9addf..f3cadd9416ad 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -78,7 +78,7 @@ extern int sscanf(const char *, const char *, ...) extern int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); -extern char *get_options(char *str, int nints, int *ints); +extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(char *ptr, char **retptr); extern void dev_probe_lock(void); extern void dev_probe_unlock(void); diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h new file mode 100644 index 000000000000..92a1bc154e8e --- /dev/null +++ b/include/linux/moduleparam.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_MODULE_PARAMS_H +#define _LINUX_MODULE_PARAMS_H +/* (C) Copyright 2001, 2002 Rusty Russell IBM Corporation */ +#include +#include + +/* You can override this manually, but generally this should match the + module name. */ +#ifdef MODULE +#define MODULE_PARAM_PREFIX /* empty */ +#else +#define MODULE_PARAM_PREFIX __stringify(KBUILD_MODNAME) "." +#endif + +struct kernel_param; + +/* Returns 0, or -errno. 
arg is in kp->arg. */ +typedef int (*param_set_fn)(const char *val, struct kernel_param *kp); +/* Returns length written or -errno. Buffer is 4k (ie. be short!) */ +typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp); + +struct kernel_param { + const char *name; + unsigned int perm; + param_set_fn set; + param_get_fn get; + void *arg; +}; + +/* Special one for strings we want to copy into */ +struct kparam_string { + unsigned int maxlen; + char *string; +}; + +/* This is the fundamental function for registering boot/module + parameters. perm sets the visibility in driverfs: 000 means it's + not there, read bits mean it's readable, write bits mean it's + writable. */ +#define __module_param_call(prefix, name, set, get, arg, perm) \ + static char __param_str_##name[] __initdata = prefix #name; \ + static struct kernel_param __param_##name \ + __attribute__ ((unused,__section__ ("__param"))) \ + = { __param_str_##name, perm, set, get, arg } + +#define module_param_call(name, set, get, arg, perm) \ + __module_param_call(MODULE_PARAM_PREFIX, name, set, get, arg, perm) + +/* Helper functions: type is byte, short, ushort, int, uint, long, + ulong, charp, bool or invbool, or XXX if you define param_get_XXX, + param_set_XXX and param_check_XXX. */ +#define module_param_named(name, value, type, perm) \ + param_check_##type(name, &(value)); \ + module_param_call(name, param_set_##type, param_get_##type, &value, perm) + +#define module_param(name, type, perm) \ + module_param_named(name, name, type, perm) + +/* Actually copy string: maxlen param is usually sizeof(string). 
*/ +#define module_param_string(name, string, len, perm) \ + static struct kparam_string __param_string_##name __initdata \ + = { len, string }; \ + module_param_call(name, param_set_copystring, param_get_charp, \ + &__param_string_##name, perm) + +/* Called on module insert or kernel boot */ +extern int parse_args(const char *name, + char *args, + struct kernel_param *params, + unsigned num, + int (*unknown)(char *param, char *val)); + +/* All the helper functions */ +/* The macros to do compile-time type checking stolen from Jakub + Jelinek, who IIRC came up with this idea for the 2.4 module init code. */ +#define __param_check(name, p, type) \ + static inline type *__check_##name(void) { return(p); } + +extern int param_set_short(const char *val, struct kernel_param *kp); +extern int param_get_short(char *buffer, struct kernel_param *kp); +#define param_check_short(name, p) __param_check(name, p, short) + +extern int param_set_ushort(const char *val, struct kernel_param *kp); +extern int param_get_ushort(char *buffer, struct kernel_param *kp); +#define param_check_ushort(name, p) __param_check(name, p, unsigned short) + +extern int param_set_int(const char *val, struct kernel_param *kp); +extern int param_get_int(char *buffer, struct kernel_param *kp); +#define param_check_int(name, p) __param_check(name, p, int) + +extern int param_set_uint(const char *val, struct kernel_param *kp); +extern int param_get_uint(char *buffer, struct kernel_param *kp); +#define param_check_uint(name, p) __param_check(name, p, unsigned int) + +extern int param_set_long(const char *val, struct kernel_param *kp); +extern int param_get_long(char *buffer, struct kernel_param *kp); +#define param_check_long(name, p) __param_check(name, p, long) + +extern int param_set_ulong(const char *val, struct kernel_param *kp); +extern int param_get_ulong(char *buffer, struct kernel_param *kp); +#define param_check_ulong(name, p) __param_check(name, p, unsigned long) + +extern int 
param_set_charp(const char *val, struct kernel_param *kp); +extern int param_get_charp(char *buffer, struct kernel_param *kp); +#define param_check_charp(name, p) __param_check(name, p, char *) + +extern int param_set_bool(const char *val, struct kernel_param *kp); +extern int param_get_bool(char *buffer, struct kernel_param *kp); +#define param_check_bool(name, p) __param_check(name, p, int) + +extern int param_set_invbool(const char *val, struct kernel_param *kp); +extern int param_get_invbool(char *buffer, struct kernel_param *kp); +#define param_check_invbool(name, p) __param_check(name, p, int) + +/* First two elements are the max and min array length (which don't change) */ +extern int param_set_intarray(const char *val, struct kernel_param *kp); +extern int param_get_intarray(char *buffer, struct kernel_param *kp); +#define param_check_intarray(name, p) __param_check(name, p, int *) + +extern int param_set_copystring(const char *val, struct kernel_param *kp); + +int param_array(const char *name, + const char *val, + unsigned int min, unsigned int max, + void *elem, int elemsize, + int (*set)(const char *, struct kernel_param *kp)); +#endif /* _LINUX_MODULE_PARAM_TYPES_H */ diff --git a/init/main.c b/init/main.c index ef444b704fbf..b6d4d4e103d3 100644 --- a/init/main.c +++ b/init/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -133,9 +134,10 @@ char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; __setup("profile=", profile_setup); -static int __init checksetup(char *line) +static int __init obsolete_checksetup(char *line) { - struct kernel_param *p; + struct obs_kernel_param *p; + extern struct obs_kernel_param __setup_start, __setup_end; p = &__setup_start; do { @@ -218,74 +220,45 @@ static int __init quiet_kernel(char *str) __setup("debug", debug_kernel); __setup("quiet", quiet_kernel); -/* - * This is a simple kernel command line parsing function: it parses - * the command line, and fills in the 
arguments/environment to init - * as appropriate. Any cmd-line option is taken to be an environment - * variable if it contains the character '='. - * - * This routine also checks for options meant for the kernel. - * These options are not given to init - they are for internal kernel use only. - */ -static void __init parse_options(char *line) +/* Unknown boot options get handed to init, unless they look like + failed parameters */ +static int __init unknown_bootoption(char *param, char *val) { - char *next,*quote; - int args, envs; + /* Change NUL term back to "=", to make "param" the whole string. */ + if (val) + val[-1] = '='; - if (!*line) - return; - args = 0; - envs = 1; /* TERM is set to 'linux' by default */ - next = line; - while ((line = next) != NULL) { - quote = strchr(line,'"'); - next = strchr(line, ' '); - while (next != NULL && quote != NULL && quote < next) { - /* we found a left quote before the next blank - * now we have to find the matching right quote - */ - next = strchr(quote+1, '"'); - if (next != NULL) { - quote = strchr(next+1, '"'); - next = strchr(next+1, ' '); - } - } - if (next != NULL) - *next++ = 0; - if (!strncmp(line,"init=",5)) { - line += 5; - execute_command = line; - /* In case LILO is going to boot us with default command line, - * it prepends "auto" before the whole cmdline which makes - * the shell think it should execute a script with such name. - * So we ignore all arguments entered _before_ init=... [MJ] - */ - args = 0; - continue; + /* Handle obsolete-style parameters */ + if (obsolete_checksetup(param)) + return 0; + + /* Preemptive maintenance for "why didn't my mispelled command + line work?" 
*/ + if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { + printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); + return 0; + } + + if (val) { + /* Environment option */ + unsigned int i; + for (i = 0; envp_init[i]; i++) { + if (i == MAX_INIT_ENVS) + panic("Too many boot env vars at `%s'", param); } - if (checksetup(line)) - continue; - - /* - * Then check if it's an environment variable or - * an option. - */ - if (strchr(line,'=')) { - if (envs >= MAX_INIT_ENVS) - break; - envp_init[++envs] = line; - } else { - if (args >= MAX_INIT_ARGS) - break; - if (*line) - argv_init[++args] = line; + envp_init[i] = param; + } else { + /* Command line option */ + unsigned int i; + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) + panic("Too many boot init vars at `%s'",param); } + argv_init[i] = param; } - argv_init[args+1] = NULL; - envp_init[envs+1] = NULL; + return 0; } - extern void setup_arch(char **); extern void cpu_idle(void); @@ -379,6 +352,7 @@ asmlinkage void __init start_kernel(void) { char * command_line; extern char saved_command_line[]; + extern struct kernel_param __start___param, __stop___param; /* * Interrupts are still disabled. 
Do necessary setups, then * enable them @@ -390,7 +364,9 @@ asmlinkage void __init start_kernel(void) build_all_zonelists(); page_alloc_init(); printk("Kernel command line: %s\n", saved_command_line); - parse_options(command_line); + parse_args("Booting kernel", command_line, &__start___param, + &__stop___param - &__start___param, + &unknown_bootoption); trap_init(); extable_init(); rcu_init(); diff --git a/kernel/Makefile b/kernel/Makefile index 80b26c206791..8c8fc229092b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,13 +4,13 @@ export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \ printk.o platform.o suspend.o dma.o module.o cpufreq.o \ - profile.o rcupdate.o intermodule.o + profile.o rcupdate.o intermodule.o params.o obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o futex.o platform.o pid.o \ - rcupdate.o intermodule.o extable.o + rcupdate.o intermodule.o extable.o params.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o diff --git a/kernel/params.c b/kernel/params.c new file mode 100644 index 000000000000..ffc65eb38748 --- /dev/null +++ b/kernel/params.c @@ -0,0 +1,338 @@ +/* Helpers for initial module or kernel cmdline parsing + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +static int parse_one(char *param, + char *val, + struct kernel_param *params, + unsigned num_params, + int (*handle_unknown)(char *param, char *val)) +{ + unsigned int i; + + /* Find parameter */ + for (i = 0; i < num_params; i++) { + if (strcmp(param, params[i].name) == 0) { + DEBUGP("They are equal! Calling %p\n", + params[i].set); + return params[i].set(val, ¶ms[i]); + } + } + + if (handle_unknown) { + DEBUGP("Unknown argument: calling %p\n", handle_unknown); + return handle_unknown(param, val); + } + + DEBUGP("Unknown argument `%s'\n", param); + return -ENOENT; +} + +/* You can use " around spaces, but can't escape ". */ +/* Hyphens and underscores equivalent in parameter names. */ +static char *next_arg(char *args, char **param, char **val) +{ + unsigned int i, equals = 0; + int in_quote = 0; + + /* Chew any extra spaces */ + while (*args == ' ') args++; + + for (i = 0; args[i]; i++) { + if (args[i] == ' ' && !in_quote) + break; + if (equals == 0) { + if (args[i] == '=') + equals = i; + else if (args[i] == '-') + args[i] = '_'; + } + if (args[i] == '"') + in_quote = !in_quote; + } + + *param = args; + if (!equals) + *val = NULL; + else { + args[equals] = '\0'; + *val = args + equals + 1; + } + + if (args[i]) { + args[i] = '\0'; + return args + i + 1; + } else + return args + i; +} + +/* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ +int parse_args(const char *name, + char *args, + struct kernel_param *params, + unsigned num, + int (*unknown)(char *param, char *val)) +{ + char *param, *val; + + DEBUGP("Parsing ARGS: %s\n", args); + + while (*args) { + int ret; + + args = next_arg(args, ¶m, &val); + ret = parse_one(param, val, params, num, unknown); + switch (ret) { + case -ENOENT: + printk(KERN_ERR "%s: Unknown parameter `%s'\n", + name, param); + return ret; + case -ENOSPC: + printk(KERN_ERR + "%s: `%s' too large for parameter `%s'\n", + name, val ?: "", param); + return ret; + case 0: + break; + default: + printk(KERN_ERR + "%s: `%s' invalid for parameter `%s'\n", + name, val ?: "", param); + return ret; + } + } + + /* All parsed OK. */ + return 0; +} + +/* Lazy bastard, eh? */ +#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ + int param_set_##name(const char *val, struct kernel_param *kp) \ + { \ + char *endp; \ + tmptype l; \ + \ + if (!val) return -EINVAL; \ + l = strtolfn(val, &endp, 0); \ + if (endp == val || *endp || ((type)l != l)) \ + return -EINVAL; \ + *((type *)kp->arg) = l; \ + return 0; \ + } \ + int param_get_##name(char *buffer, struct kernel_param *kp) \ + { \ + return sprintf(buffer, format, *((type *)kp->arg)); \ + } + +STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", long, simple_strtol); +STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", long, simple_strtol); +STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); + +int param_set_charp(const char *val, struct kernel_param *kp) +{ + if (!val) { + printk(KERN_ERR "%s: string parameter expected\n", + kp->name); + return -EINVAL; + } + + if (strlen(val) > 1024) { + printk(KERN_ERR "%s: string parameter too long\n", + kp->name); + return -ENOSPC; + } + + *(char **)kp->arg = (char *)val; + return 0; +} + 
+int param_get_charp(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%s", *((char **)kp->arg)); +} + +int param_set_bool(const char *val, struct kernel_param *kp) +{ + /* No equals means "set"... */ + if (!val) val = "1"; + + /* One of =[yYnN01] */ + switch (val[0]) { + case 'y': case 'Y': case '1': + *(int *)kp->arg = 1; + return 0; + case 'n': case 'N': case '0': + *(int *)kp->arg = 0; + return 0; + } + return -EINVAL; +} + +int param_get_bool(char *buffer, struct kernel_param *kp) +{ + /* Y and N chosen as being relatively non-coder friendly */ + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); +} + +int param_set_invbool(const char *val, struct kernel_param *kp) +{ + int boolval, ret; + struct kernel_param dummy = { .arg = &boolval }; + + ret = param_set_bool(val, &dummy); + if (ret == 0) + *(int *)kp->arg = !boolval; + return ret; +} + +int param_get_invbool(char *buffer, struct kernel_param *kp) +{ + int val; + struct kernel_param dummy = { .arg = &val }; + + val = !*(int *)kp->arg; + return param_get_bool(buffer, &dummy); +} + +/* We cheat here and temporarily mangle the string. */ +int param_array(const char *name, + const char *val, + unsigned int min, unsigned int max, + void *elem, int elemsize, + int (*set)(const char *, struct kernel_param *kp)) +{ + int ret; + unsigned int count = 0; + struct kernel_param kp; + + /* Get the name right for errors. */ + kp.name = name; + kp.arg = elem; + + /* No equals sign? */ + if (!val) { + printk(KERN_ERR "%s: expects arguments\n", name); + return -EINVAL; + } + + /* We expect a comma-separated list of values. 
*/ + do { + int len; + char save; + + if (count > max) { + printk(KERN_ERR "%s: can only take %i arguments\n", + name, max); + return -EINVAL; + } + len = strcspn(val, ","); + + /* Temporarily nul-terminate and parse */ + save = val[len]; + ((char *)val)[len] = '\0'; + ret = set(val, &kp); + ((char *)val)[len] = save; + + if (ret != 0) + return ret; + kp.arg += elemsize; + val += len+1; + count++; + } while (val[-1] == ','); + + if (count < min) { + printk(KERN_ERR "%s: needs at least %i arguments\n", + name, min); + return -EINVAL; + } + return 0; +} + +/* First two elements are the max and min array length (which don't change) */ +int param_set_intarray(const char *val, struct kernel_param *kp) +{ + int *array; + + /* Grab min and max as first two elements */ + array = kp->arg; + return param_array(kp->name, val, array[0], array[1], &array[2], + sizeof(int), param_set_int); +} + +int param_get_intarray(char *buffer, struct kernel_param *kp) +{ + int max; + int *array; + unsigned int i; + + array = kp->arg; + max = array[1]; + + for (i = 2; i < max + 2; i++) + sprintf(buffer, "%s%i", i > 2 ? 
"," : "", array[i]); + return strlen(buffer); +} + +int param_set_copystring(const char *val, struct kernel_param *kp) +{ + struct kparam_string *kps = kp->arg; + + if (strlen(val)+1 > kps->maxlen) { + printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", + kp->name, kps->maxlen-1); + return -ENOSPC; + } + strcpy(kps->string, val); + return 0; +} + +EXPORT_SYMBOL(param_set_short); +EXPORT_SYMBOL(param_get_short); +EXPORT_SYMBOL(param_set_ushort); +EXPORT_SYMBOL(param_get_ushort); +EXPORT_SYMBOL(param_set_int); +EXPORT_SYMBOL(param_get_int); +EXPORT_SYMBOL(param_set_uint); +EXPORT_SYMBOL(param_get_uint); +EXPORT_SYMBOL(param_set_long); +EXPORT_SYMBOL(param_get_long); +EXPORT_SYMBOL(param_set_ulong); +EXPORT_SYMBOL(param_get_ulong); +EXPORT_SYMBOL(param_set_charp); +EXPORT_SYMBOL(param_get_charp); +EXPORT_SYMBOL(param_set_bool); +EXPORT_SYMBOL(param_get_bool); +EXPORT_SYMBOL(param_set_invbool); +EXPORT_SYMBOL(param_get_invbool); +EXPORT_SYMBOL(param_set_intarray); +EXPORT_SYMBOL(param_get_intarray); +EXPORT_SYMBOL(param_set_copystring); diff --git a/lib/cmdline.c b/lib/cmdline.c index a9e9589d3d57..0331ed825ea7 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -64,12 +64,12 @@ int get_option (char **str, int *pint) * completely parseable). */ -char *get_options (char *str, int nints, int *ints) +char *get_options(const char *str, int nints, int *ints) { int res, i = 1; while (i < nints) { - res = get_option (&str, ints + i); + res = get_option ((char **)&str, ints + i); if (res == 0) break; i++; @@ -77,7 +77,7 @@ char *get_options (char *str, int nints, int *ints) break; } ints[0] = i - 1; - return (str); + return (char *)str; } /** -- cgit v1.2.3 From 136839a1b4597a6b99a4286586d04a16b5d30295 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 14 Dec 2002 20:13:23 -0800 Subject: [PATCH] MODULE_PARM support for older modules This is the backwards compatibility code for MODULE_PARM, and moves __MODULE_STRING() down to the graveyard at the bottom of module.h. 
It's complicated by the fact that many modules place MODULE_PARM() before the declaration (some do MODULE_PARM() for non-existent variables, too). To avoid breaking them, we have to do the name lookups at load time, rather than just storing a pointer 8( CONFIG_OBSOLETE_MODPARM is set to y without prompting: it's a useful marker for deprecating in 2.7. --- include/linux/module.h | 21 +++++-- init/Kconfig | 9 +++ kernel/module.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 177 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 90810238f424..2392edcc3307 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -20,10 +20,6 @@ #include #include /* For struct exception_table_entry */ -/* Indirect stringification */ -#define __MODULE_STRING_1(x) #x -#define __MODULE_STRING(x) __MODULE_STRING_1(x) - /* Not Yet Implemented */ #define MODULE_LICENSE(name) #define MODULE_AUTHOR(name) @@ -305,6 +301,21 @@ extern spinlock_t modlist_lock; #define __MOD_DEC_USE_COUNT(mod) module_put(mod) #define SET_MODULE_OWNER(dev) ((dev)->owner = THIS_MODULE) +struct obsolete_modparm { + char name[64]; + char type[64-sizeof(void *)]; + void *addr; +}; +#ifdef MODULE +/* DEPRECATED: Do not use. */ +#define MODULE_PARM(var,type) \ +struct obsolete_modparm __parm_##var __attribute__((section("__obsparm"))) = \ +{ __stringify(var), type }; + +#else +#define MODULE_PARM(var,type) +#endif + /* People do this inside their init routines, when the module isn't "live" yet. They should no longer be doing that, but meanwhile... 
*/ @@ -317,11 +328,11 @@ extern spinlock_t modlist_lock; #endif #define MOD_DEC_USE_COUNT module_put(THIS_MODULE) #define try_inc_mod_count(mod) try_module_get(mod) -#define MODULE_PARM(parm,string) #define EXPORT_NO_SYMBOLS extern int module_dummy_usage; #define GET_USE_COUNT(module) (module_dummy_usage) #define MOD_IN_USE 0 +#define __MODULE_STRING(x) __stringify(x) #define __mod_between(a_start, a_len, b_start, b_len) \ (((a_start) >= (b_start) && (a_start) <= (b_start)+(b_len)) \ || ((a_start)+(a_len) >= (b_start) \ diff --git a/init/Kconfig b/init/Kconfig index f0f984c1b26c..1597113085a8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -135,6 +135,15 @@ config MODULE_FORCE_UNLOAD rmmod). This is mainly for kernel developers and desparate users. If unsure, say N. +config OBSOLETE_MODPARM + bool + default y + depends on MODULES + help + You need this option to use module parameters on modules which + have not been converted to the new module parameter system yet. + If unsure, say Y. + config KMOD bool "Kernel module loader" depends on MODULES diff --git a/kernel/module.c b/kernel/module.c index ac0c424fce0b..c7dfc2ea7a3d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -541,6 +541,134 @@ sys_delete_module(const char *name_user, unsigned int flags) #endif /* CONFIG_MODULE_UNLOAD */ +#ifdef CONFIG_OBSOLETE_MODPARM +static int param_set_byte(const char *val, struct kernel_param *kp) +{ + char *endp; + long l; + + if (!val) return -EINVAL; + l = simple_strtol(val, &endp, 0); + if (endp == val || *endp || ((char)l != l)) + return -EINVAL; + *((char *)kp->arg) = l; + return 0; +} + +static int param_string(const char *name, const char *val, + unsigned int min, unsigned int max, + char *dest) +{ + if (strlen(val) < min || strlen(val) > max) { + printk(KERN_ERR + "Parameter %s length must be %u-%u characters\n", + name, min, max); + return -EINVAL; + } + strcpy(dest, val); + return 0; +} + +extern int set_obsolete(const char *val, struct kernel_param *kp) +{ + 
unsigned int min, max; + char *p, *endp; + struct obsolete_modparm *obsparm = kp->arg; + + if (!val) { + printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); + return -EINVAL; + } + + /* type is: [min[-max]]{b,h,i,l,s} */ + p = obsparm->type; + min = simple_strtol(p, &endp, 10); + if (endp == obsparm->type) + min = max = 1; + else if (*endp == '-') { + p = endp+1; + max = simple_strtol(p, &endp, 10); + } else + max = min; + switch (*endp) { + case 'b': + return param_array(kp->name, val, min, max, obsparm->addr, + 1, param_set_byte); + case 'h': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(short), param_set_short); + case 'i': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(int), param_set_int); + case 'l': + return param_array(kp->name, val, min, max, obsparm->addr, + sizeof(long), param_set_long); + case 's': + return param_string(kp->name, val, min, max, obsparm->addr); + } + printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); + return -EINVAL; +} + +static int obsolete_params(const char *name, + char *args, + struct obsolete_modparm obsparm[], + unsigned int num, + Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab) +{ + struct kernel_param *kp; + unsigned int i; + int ret; + + kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); + if (!kp) + return -ENOMEM; + + DEBUGP("Module %s has %u obsolete params\n", name, num); + for (i = 0; i < num; i++) + DEBUGP("Param %i: %s type %s\n", + num, obsparm[i].name, obsparm[i].type); + + for (i = 0; i < num; i++) { + kp[i].name = obsparm[i].name; + kp[i].perm = 000; + kp[i].set = set_obsolete; + kp[i].get = NULL; + obsparm[i].addr + = (void *)find_local_symbol(sechdrs, symindex, strtab, + obsparm[i].name); + if (!obsparm[i].addr) { + printk("%s: falsely claims to have parameter %s\n", + name, obsparm[i].name); + ret = -EINVAL; + goto out; + } + kp[i].arg = &obsparm[i]; + } + + ret = parse_args(name, args, kp, num, NULL); + out: + kfree(kp); 
+ return ret; +} +#else +static int obsolete_params(const char *name, + char *args, + struct obsolete_modparm obsparm[], + unsigned int num, + Elf_Shdr *sechdrs, + unsigned int symindex, + const char *strtab) +{ + if (num != 0) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + name); + return 0; +} +#endif /* CONFIG_OBSOLETE_MODPARM */ + /* Find an symbol for this module (ie. resolve internals first). It we find one, record usage. Must be holding module_mutex. */ unsigned long find_symbol_internal(Elf_Shdr *sechdrs, @@ -837,7 +965,7 @@ static struct module *load_module(void *umod, Elf_Shdr *sechdrs; char *secstrings; unsigned int i, symindex, exportindex, strindex, setupindex, exindex, - modnameindex; + modnameindex, obsparmindex; long arglen; unsigned long common_length; struct sizes sizes, used; @@ -875,7 +1003,7 @@ static struct module *load_module(void *umod, /* May not export symbols, or have setup params, so these may not exist */ - exportindex = setupindex = 0; + exportindex = setupindex = obsparmindex = 0; /* And these should exist, but gcc whinges if we don't init them */ symindex = strindex = exindex = modnameindex = 0; @@ -911,6 +1039,11 @@ static struct module *load_module(void *umod, /* Exception table */ DEBUGP("Exception table found in section %u\n", i); exindex = i; + } else if (strcmp(secstrings+sechdrs[i].sh_name, "__obsparm") + == 0) { + /* Obsolete MODULE_PARM() table */ + DEBUGP("Obsolete param found in section %u\n", i); + obsparmindex = i; } #ifdef CONFIG_KALLSYMS /* symbol and string tables for decoding later. 
*/ @@ -1049,13 +1182,23 @@ static struct module *load_module(void *umod, if (err < 0) goto cleanup; - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_offset, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); + if (obsparmindex) { + err = obsolete_params(mod->name, mod->args, + (struct obsolete_modparm *) + sechdrs[obsparmindex].sh_offset, + sechdrs[obsparmindex].sh_size + / sizeof(struct obsolete_modparm), + sechdrs, symindex, + (char *)sechdrs[strindex].sh_offset); + } else { + /* Size of section 0 is 0, so this works well if no params */ + err = parse_args(mod->name, mod->args, + (struct kernel_param *) + sechdrs[setupindex].sh_offset, + sechdrs[setupindex].sh_size + / sizeof(struct kernel_param), + NULL); + } if (err < 0) goto cleanup; -- cgit v1.2.3 From a9ff25c88198b789ac3b1f294dd878408629ef88 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 14 Dec 2002 20:16:06 -0800 Subject: [PATCH] consolidate sys32_times - architecture independent This patch creates compat_sys_times and a few more compatibility types. --- include/linux/compat.h | 13 +++++++++++-- kernel/compat.c | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 5aa29316f9d8..228bd4b13365 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -1,8 +1,8 @@ #ifndef _LINUX_COMPAT_H #define _LINUX_COMPAT_H /* - * These are the type definitions for the arhitecure sepcific - * compatibility layer. + * These are the type definitions for the architecture specific + * syscall compatibility layer. 
*/ #include @@ -10,6 +10,8 @@ #include +#define compat_jiffies_to_clock_t(x) ((x) / (HZ / COMPAT_USER_HZ)) + struct compat_utimbuf { compat_time_t actime; compat_time_t modtime; @@ -20,5 +22,12 @@ struct compat_itimerval { struct compat_timeval it_value; }; +struct compat_tms { + compat_clock_t tms_utime; + compat_clock_t tms_stime; + compat_clock_t tms_cutime; + compat_clock_t tms_cstime; +}; + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/kernel/compat.c b/kernel/compat.c index f4da56c960f8..701e39120b22 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -156,3 +156,23 @@ asmlinkage long compat_sys_setitimer(int which, struct compat_itimerval *in, return -EFAULT; return 0; } + +asmlinkage long compat_sys_times(struct compat_tms *tbuf) +{ + /* + * In the SMP world we might just be unlucky and have one of + * the times increment as we use it. Since the value is an + * atomically safe type this is just fine. Conceptually its + * as if the syscall took an instant longer to occur. + */ + if (tbuf) { + struct compat_tms tmp; + tmp.tms_utime = compat_jiffies_to_clock_t(current->utime); + tmp.tms_stime = compat_jiffies_to_clock_t(current->stime); + tmp.tms_cutime = compat_jiffies_to_clock_t(current->cutime); + tmp.tms_cstime = compat_jiffies_to_clock_t(current->cstime); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + return compat_jiffies_to_clock_t(jiffies); +} -- cgit v1.2.3 From bda0e9568615f5af3bd927420f6deb2d1d7773ac Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sat, 14 Dec 2002 20:16:19 -0800 Subject: [PATCH] consolidate sys32_new[lf]stat - architecture independent This renames more types and moves them into asm/compat.h and also consolidates sys32_new{stat,fstat,lstat}. 
--- fs/Makefile | 2 ++ fs/compat.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/compat.h | 3 +++ kernel/compat.c | 19 ------------- 4 files changed, 77 insertions(+), 19 deletions(-) create mode 100644 fs/compat.c (limited to 'include/linux') diff --git a/fs/Makefile b/fs/Makefile index 45a501684111..e2b76bcd9f87 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,6 +15,8 @@ obj-y := open.o read_write.o file_table.o buffer.o \ filesystems.o namespace.o seq_file.o xattr.o libfs.o \ fs-writeback.o mpage.o direct-io.o aio.o eventpoll.o +obj-$(CONFIG_COMPAT) += compat.o + ifneq ($(CONFIG_NFSD),n) ifneq ($(CONFIG_NFSD),) obj-y += nfsctl.o diff --git a/fs/compat.c b/fs/compat.c new file mode 100644 index 000000000000..91dc5981fa38 --- /dev/null +++ b/fs/compat.c @@ -0,0 +1,72 @@ +/* + * linux/fs/compat.c + * + * Kernel compatibililty routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include + +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +asmlinkage long compat_sys_utime(char *filename, struct compat_utimbuf *t) +{ + struct timeval tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_usec = 0; + tv[1].tv_usec = 0; + } + return do_utimes(filename, t ? 
tv : NULL); +} + + +asmlinkage long compat_sys_newstat(char * filename, + struct compat_stat *statbuf) +{ + struct kstat stat; + int error = vfs_stat(filename, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} + +asmlinkage long compat_sys_newlstat(char * filename, + struct compat_stat *statbuf) +{ + struct kstat stat; + int error = vfs_lstat(filename, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} + +asmlinkage long compat_sys_newfstat(unsigned int fd, + struct compat_stat * statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} diff --git a/include/linux/compat.h b/include/linux/compat.h index 228bd4b13365..62daefb17672 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -8,6 +8,7 @@ #ifdef CONFIG_COMPAT +#include #include #define compat_jiffies_to_clock_t(x) ((x) / (HZ / COMPAT_USER_HZ)) @@ -29,5 +30,7 @@ struct compat_tms { compat_clock_t tms_cstime; }; +extern int cp_compat_stat(struct kstat *, struct compat_stat *); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/kernel/compat.c b/kernel/compat.c index 701e39120b22..db0f7fd64a8a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -83,25 +83,6 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec *rqtp, return -ERESTART_RESTARTBLOCK; } -/* - * Not all architectures have sys_utime, so implement this in terms - * of sys_utimes. - */ -asmlinkage long compat_sys_utime(char *filename, struct compat_utimbuf *t) -{ - struct timeval tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t->actime) || - get_user(tv[1].tv_sec, &t->modtime)) - return -EFAULT; - tv[0].tv_usec = 0; - tv[1].tv_usec = 0; - } - return do_utimes(filename, t ? 
tv : NULL); -} - - static inline long get_compat_itimerval(struct itimerval *o, struct compat_itimerval *i) { -- cgit v1.2.3 From 4de40457252dd617c7c826e1135da30da77e802f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 15 Dec 2002 00:11:18 -0800 Subject: [PATCH] NFSv4 cleanups - Move the encoding/decoding of the actual COMPOUND XDR header out of encode_compound()/decode_compound(). - Make each NFSv4 operation 'decode_' routine also take care of decoding its own header, and checking it for correctness. Also allows us to get rid of the 'nfserr' parameter... --- fs/nfs/nfs4proc.c | 12 +- fs/nfs/nfs4xdr.c | 703 +++++++++++++++++++++++++++++------------------- include/linux/nfs4.h | 10 + include/linux/nfs_xdr.h | 1 - 4 files changed, 440 insertions(+), 286 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e563bdc7e315..2ac8115aa8e7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -648,7 +648,7 @@ nfs4_call_compound(struct nfs4_compound *cp, struct rpc_cred *cred, int flags) { int status; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND], + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND], .rpc_argp = cp, .rpc_resp = cp, .rpc_cred = cred, @@ -1112,7 +1112,7 @@ nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr nfs4_setup_remove(cp, name, &up->cinfo); nfs4_setup_getattr(cp, &up->attrs, bmres); - msg->rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND]; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND]; msg->rpc_argp = cp; msg->rpc_resp = cp; return 0; @@ -1373,7 +1373,7 @@ nfs4_proc_read_setup(struct nfs_read_data *data, unsigned int count) struct rpc_task *task = &data->task; struct nfs4_compound *cp = &data->u.v4.compound; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND], + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND], .rpc_argp = cp, .rpc_resp = cp, .rpc_cred = data->cred, @@ -1417,7 +1417,7 @@ 
nfs4_proc_write_setup(struct nfs_write_data *data, unsigned int count, int how) struct rpc_task *task = &data->task; struct nfs4_compound *cp = &data->u.v4.compound; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND], + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND], .rpc_argp = cp, .rpc_resp = cp, .rpc_cred = data->cred, @@ -1468,7 +1468,7 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, u64 start, u32 len, int how) struct rpc_task *task = &data->task; struct nfs4_compound *cp = &data->u.v4.compound; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND], + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND], .rpc_argp = cp, .rpc_resp = cp, .rpc_cred = data->cred, @@ -1523,7 +1523,7 @@ nfs4_proc_renew(struct nfs_server *server) struct rpc_task *task; struct nfs4_compound *cp; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_COMPOUND], + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMPOUND], }; rp = (struct renew_desc *) kmalloc(sizeof(*rp), GFP_KERNEL); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index baf674309b86..c2b7c3fb5300 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -83,6 +83,13 @@ static struct { { 0, NFNON }, }; +struct compound_hdr { + int32_t status; + uint32_t nops; + uint32_t taglen; + char * tag; +}; + /* * START OF "GENERIC" ENCODE ROUTINES. * These may look a little ugly since they are imported from a "generic" @@ -118,6 +125,20 @@ uint32_t *xdr_writemem(uint32_t *p, const void *ptr, int nbytes) return p + tmp; } +static int +encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + uint32_t *p; + + dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); + RESERVE_SPACE(12+XDR_QUADLEN(hdr->taglen)); + WRITE32(hdr->taglen); + WRITEMEM(hdr->tag, hdr->taglen); + WRITE32(NFS4_MINOR_VERSION); + WRITE32(hdr->nops); + return 0; +} + /* * FIXME: The following dummy entries will be replaced once the userland * upcall gets in... 
@@ -696,16 +717,14 @@ encode_write(struct xdr_stream *xdr, struct nfs4_write *write, struct rpc_rqst * static int encode_compound(struct xdr_stream *xdr, struct nfs4_compound *cp, struct rpc_rqst *req) { + struct compound_hdr hdr = { + .taglen = cp->taglen, + .tag = cp->tag, + .nops = cp->req_nops, + }; int i, status = 0; - uint32_t *p; - dprintk("encode_compound: tag=%.*s\n", (int)cp->taglen, cp->tag); - - RESERVE_SPACE(12 + cp->taglen); - WRITE32(cp->taglen); - WRITEMEM(cp->tag, cp->taglen); - WRITE32(NFS4_MINOR_VERSION); - WRITE32(cp->req_nops); + encode_compound_hdr(xdr, &hdr); for (i = 0; i < cp->req_nops; i++) { switch (cp->ops[i].opnum) { @@ -849,8 +868,8 @@ xdr_error: \ p = xdr_inline_decode(xdr, nbytes); \ if (!p) { \ printk(KERN_WARNING "%s: reply buffer overflowed in line %d.", \ - __FUNCTION__, __LINE__); \ - return -EIO; \ + __FUNCTION__, __LINE__); \ + return -EIO; \ } \ } while (0) @@ -876,6 +895,44 @@ decode_gid(char *p, uint32_t len, gid_t *gid) return 0; } +static int +decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + uint32_t *p; + + READ_BUF(8); + READ32(hdr->status); + READ32(hdr->taglen); + + READ_BUF(hdr->taglen + 4); + hdr->tag = (char *)p; + p += XDR_QUADLEN(hdr->taglen); + READ32(hdr->nops); + return 0; +} + +static int +decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + uint32_t *p; + uint32_t opnum; + int32_t nfserr; + + READ_BUF(8); + READ32(opnum); + if (opnum != expected) { + printk(KERN_NOTICE + "nfs4_decode_op_hdr: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + return -EIO; + } + READ32(nfserr); + if (nfserr != NFS_OK) + return -nfs_stat_to_errno(nfserr); + return 0; +} + static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) { @@ -889,68 +946,71 @@ decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) } static int -decode_access(struct xdr_stream *xdr, int nfserr, struct nfs4_access *access) 
+decode_access(struct xdr_stream *xdr, struct nfs4_access *access) { uint32_t *p; uint32_t supp, acc; + int status; - if (!nfserr) { - READ_BUF(8); - READ32(supp); - READ32(acc); - - if ((supp & ~access->ac_req_access) || (acc & ~supp)) { - printk(KERN_NOTICE "NFS: server returned bad bits in access call!\n"); - return -EIO; - } - *access->ac_resp_supported = supp; - *access->ac_resp_access = acc; + status = decode_op_hdr(xdr, OP_ACCESS); + if (status) + return status; + READ_BUF(8); + READ32(supp); + READ32(acc); + if ((supp & ~access->ac_req_access) || (acc & ~supp)) { + printk(KERN_NOTICE "NFS: server returned bad bits in access call!\n"); + return -EIO; } + *access->ac_resp_supported = supp; + *access->ac_resp_access = acc; return 0; } static int -decode_close(struct xdr_stream *xdr, int nfserr, struct nfs4_close *close) +decode_close(struct xdr_stream *xdr, struct nfs4_close *close) { uint32_t *p; + int status; - if (!nfserr) { - READ_BUF(sizeof(nfs4_stateid)); - COPYMEM(close->cl_stateid, sizeof(nfs4_stateid)); - } + status = decode_op_hdr(xdr, OP_CLOSE); + if (status) + return status; + READ_BUF(sizeof(nfs4_stateid)); + COPYMEM(close->cl_stateid, sizeof(nfs4_stateid)); return 0; } static int -decode_commit(struct xdr_stream *xdr, int nfserr, struct nfs4_commit *commit) +decode_commit(struct xdr_stream *xdr, struct nfs4_commit *commit) { uint32_t *p; + int status; - if (!nfserr) { - READ_BUF(8); - COPYMEM(commit->co_verifier->verifier, 8); - } + status = decode_op_hdr(xdr, OP_COMMIT); + if (status) + return status; + READ_BUF(8); + COPYMEM(commit->co_verifier->verifier, 8); return 0; } static int -decode_create(struct xdr_stream *xdr, int nfserr, struct nfs4_create *create) +decode_create(struct xdr_stream *xdr, struct nfs4_create *create) { uint32_t *p; uint32_t bmlen; int status; - if (!nfserr) { - if ((status = decode_change_info(xdr, create->cr_cinfo))) - goto out; - READ_BUF(4); - READ32(bmlen); - if (bmlen > 2) - goto xdr_error; - READ_BUF(bmlen << 2); 
- } - - DECODE_TAIL; + status = decode_op_hdr(xdr, OP_CREATE); + if (status) + return status; + if ((status = decode_change_info(xdr, create->cr_cinfo))) + return status; + READ_BUF(4); + READ32(bmlen); + READ_BUF(bmlen << 2); + return 0; } extern uint32_t nfs4_fattr_bitmap[2]; @@ -959,25 +1019,24 @@ extern uint32_t nfs4_fsstat_bitmap[2]; extern uint32_t nfs4_pathconf_bitmap[2]; static int -decode_getattr(struct xdr_stream *xdr, int nfserr, struct nfs4_getattr *getattr) +decode_getattr(struct xdr_stream *xdr, struct nfs4_getattr *getattr) { - struct nfs_fattr *nfp = getattr->gt_attrs; + struct nfs_fattr *nfp = getattr->gt_attrs; struct nfs_fsstat *fsstat = getattr->gt_fsstat; struct nfs_fsinfo *fsinfo = getattr->gt_fsinfo; struct nfs_pathconf *pathconf = getattr->gt_pathconf; + uint32_t attrlen, dummy32, bmlen, + bmval0 = 0, + bmval1 = 0, + len = 0; uint32_t *p; - uint32_t bmlen; - uint32_t bmval0 = 0; - uint32_t bmval1 = 0; - uint32_t attrlen; - uint32_t dummy32; - uint32_t len = 0; unsigned int type; int fmode = 0; int status; - if (nfserr) - goto success; + status = decode_op_hdr(xdr, OP_GETATTR); + if (status) + return status; READ_BUF(4); READ32(bmlen); @@ -1208,265 +1267,361 @@ decode_getattr(struct xdr_stream *xdr, int nfserr, struct nfs4_getattr *getattr) if (len != attrlen) goto xdr_error; -success: DECODE_TAIL; } static int -decode_getfh(struct xdr_stream *xdr, int nfserr, struct nfs4_getfh *getfh) +decode_getfh(struct xdr_stream *xdr, struct nfs4_getfh *getfh) { struct nfs_fh *fh = getfh->gf_fhandle; uint32_t *p; uint32_t len; int status; + status = decode_op_hdr(xdr, OP_GETFH); + if (status) + return status; /* Zero handle first to allow comparisons */ memset(fh, 0, sizeof(*fh)); - - if (!nfserr) { - READ_BUF(4); - READ32(len); - if (len > NFS_MAXFHSIZE) - goto xdr_error; - fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); - } - DECODE_TAIL; + READ_BUF(4); + READ32(len); + if (len > NFS_MAXFHSIZE) + return -EIO; + fh->size = len; + 
READ_BUF(len); + COPYMEM(fh->data, len); + return 0; } static int -decode_link(struct xdr_stream *xdr, int nfserr, struct nfs4_link *link) +decode_link(struct xdr_stream *xdr, struct nfs4_link *link) { - int status = 0; + int status; - if (!nfserr) - status = decode_change_info(xdr, link->ln_cinfo); - return status; + status = decode_op_hdr(xdr, OP_LINK); + if (status) + return status; + return decode_change_info(xdr, link->ln_cinfo); +} + +static int +decode_lookup(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LOOKUP); } static int -decode_open(struct xdr_stream *xdr, int nfserr, struct nfs4_open *open) +decode_open(struct xdr_stream *xdr, struct nfs4_open *open) { uint32_t *p; uint32_t bmlen, delegation_type; int status; - if (!nfserr) { - READ_BUF(sizeof(nfs4_stateid)); - COPYMEM(open->op_stateid, sizeof(nfs4_stateid)); + status = decode_op_hdr(xdr, OP_OPEN); + if (status) + return status; + READ_BUF(sizeof(nfs4_stateid)); + COPYMEM(open->op_stateid, sizeof(nfs4_stateid)); - decode_change_info(xdr, open->op_cinfo); + decode_change_info(xdr, open->op_cinfo); - READ_BUF(8); - READ32(*open->op_rflags); - READ32(bmlen); - if (bmlen > 10) - goto xdr_error; + READ_BUF(8); + READ32(*open->op_rflags); + READ32(bmlen); + if (bmlen > 10) + goto xdr_error; - READ_BUF((bmlen << 2) + 4); - p += bmlen; - READ32(delegation_type); - if (delegation_type != NFS4_OPEN_DELEGATE_NONE) - goto xdr_error; - } + READ_BUF((bmlen << 2) + 4); + p += bmlen; + READ32(delegation_type); + if (delegation_type != NFS4_OPEN_DELEGATE_NONE) + goto xdr_error; DECODE_TAIL; } static int -decode_open_confirm(struct xdr_stream *xdr, int nfserr, struct nfs4_open_confirm *open_confirm) +decode_open_confirm(struct xdr_stream *xdr, struct nfs4_open_confirm *open_confirm) { uint32_t *p; + int status; - if (!nfserr) { - READ_BUF(sizeof(nfs4_stateid)); - COPYMEM(open_confirm->oc_stateid, sizeof(nfs4_stateid)); - } + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status) + return status; + 
READ_BUF(sizeof(nfs4_stateid)); + COPYMEM(open_confirm->oc_stateid, sizeof(nfs4_stateid)); return 0; } static int -decode_read(struct xdr_stream *xdr, int nfserr, struct nfs4_read *read) +decode_putfh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_PUTFH); +} + +static int +decode_putrootfh(struct xdr_stream *xdr) { - uint32_t throwaway; + return decode_op_hdr(xdr, OP_PUTROOTFH); +} + +static int +decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_read *read) +{ + struct iovec *iov = req->rq_rvec; uint32_t *p; + uint32_t count, eof, recvd, hdrlen; int status; - if (!nfserr) { - READ_BUF(8); - if (read->rd_eof) - READ32(*read->rd_eof); - else - READ32(throwaway); - READ32(*read->rd_bytes_read); - if (*read->rd_bytes_read > read->rd_length) - goto xdr_error; + status = decode_op_hdr(xdr, OP_READ); + if (status) + return status; + READ_BUF(8); + READ32(eof); + READ32(count); + hdrlen = (u8 *) p - (u8 *) iov->iov_base; + if (iov->iov_len < hdrlen) { + printk(KERN_WARNING "NFS: READ reply header overflowed:" + "length %u > %Zu\n", hdrlen, iov->iov_len); + return -errno_NFSERR_IO; + } else if (iov->iov_len != hdrlen) { + dprintk("NFS: READ header is short. 
iovec will be shifted.\n"); + xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); } - - DECODE_TAIL; + recvd = req->rq_received - hdrlen; + if (count > recvd) { + printk(KERN_WARNING "NFS: server cheating in read reply: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + } + if (read->rd_eof) + *read->rd_eof = eof; + *read->rd_bytes_read = count; + return 0; } static int -decode_readdir(struct xdr_stream *xdr, int nfserr, struct rpc_rqst *req, struct nfs4_readdir *readdir) +decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir *readdir) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; struct page *page = *rcvbuf->pages; - unsigned int pglen = rcvbuf->page_len; - uint32_t *end, *entry, *p; - uint32_t len, attrlen, word; - int i; - - if (!nfserr) { - READ_BUF(8); - COPYMEM(readdir->rd_resp_verifier, 8); - - BUG_ON(pglen > PAGE_CACHE_SIZE); - p = (uint32_t *) kmap(page); - end = (uint32_t *) ((char *)p + pglen + readdir->rd_pgbase); - - while (*p++) { - entry = p - 1; - if (p + 3 > end) - goto short_pkt; - p += 2; /* cookie */ - len = ntohl(*p++); /* filename length */ - if (len > NFS4_MAXNAMLEN) { - printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); - goto err_unmap; - } + struct iovec *iov = rcvbuf->head; + unsigned int nr, pglen = rcvbuf->page_len; + uint32_t *end, *entry, *p; + uint32_t len, attrlen, word; + int i, hdrlen, recvd, status; + + status = decode_op_hdr(xdr, OP_READDIR); + if (status) + return status; + READ_BUF(8); + COPYMEM(readdir->rd_resp_verifier, 8); + + hdrlen = (char *) p - (char *) iov->iov_base; + if (iov->iov_len < hdrlen) { + printk(KERN_WARNING "NFS: READDIR reply header overflowed:" + "length %d > %Zu\n", hdrlen, iov->iov_len); + return -EIO; + } else if (iov->iov_len != hdrlen) { + dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); + xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); + } + recvd = req->rq_received - hdrlen; + if (pglen > recvd) + pglen = recvd; + + BUG_ON(pglen + readdir->rd_pgbase > PAGE_CACHE_SIZE); + p = (uint32_t *) kmap(page); + end = (uint32_t *) ((char *)p + pglen + readdir->rd_pgbase); + entry = p; + for (nr = 0; *p++; nr++) { + if (p + 3 > end) + goto short_pkt; + p += 2; /* cookie */ + len = ntohl(*p++); /* filename length */ + if (len > NFS4_MAXNAMLEN) { + printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); + goto err_unmap; + } - p += XDR_QUADLEN(len); - if (p + 1 > end) - goto short_pkt; - len = ntohl(*p++); /* bitmap length */ - if (len > 10) { - printk(KERN_WARNING "NFS: giant bitmap in readdir (len 0x%x)\n", len); - goto err_unmap; - } - if (p + len + 1 > end) - goto short_pkt; - attrlen = 0; - for (i = 0; i < len; i++) { - word = ntohl(*p++); - if (!word) - continue; - else if (i == 0 && word == FATTR4_WORD0_FILEID) { - attrlen = 8; - continue; - } - printk(KERN_WARNING "NFS: unexpected bitmap word in readdir (0x%x)\n", word); - goto err_unmap; - } - if (ntohl(*p++) != attrlen) { - printk(KERN_WARNING "NFS: unexpected attrlen in readdir\n"); - goto err_unmap; + p += XDR_QUADLEN(len); + if (p + 1 > end) + goto short_pkt; + len = ntohl(*p++); /* bitmap length */ + if (len > 10) { + printk(KERN_WARNING "NFS: giant bitmap in readdir (len 0x%x)\n", len); + goto err_unmap; + } + if (p + len + 1 > end) + goto short_pkt; + attrlen = 0; + for (i = 0; i < len; i++) { + word = ntohl(*p++); + if (!word) + continue; + else if (i == 0 && word == FATTR4_WORD0_FILEID) { + attrlen = 8; + continue; } - p += XDR_QUADLEN(attrlen); - if (p + 1 > end) - goto short_pkt; + printk(KERN_WARNING "NFS: unexpected bitmap word in readdir (0x%x)\n", word); + goto err_unmap; + } + if (ntohl(*p++) != attrlen) { + printk(KERN_WARNING "NFS: unexpected attrlen in readdir\n"); + goto err_unmap; } - kunmap(page); + p += XDR_QUADLEN(attrlen); + if (p + 1 
> end) + goto short_pkt; } - + if (!nr && (entry[0] != 0 || entry[1] == 0)) + goto short_pkt; +out: + kunmap(page); return 0; short_pkt: - printk(KERN_NOTICE "NFS: short packet in readdir reply!\n"); - /* truncate listing */ - kunmap(page); entry[0] = entry[1] = 0; - return 0; + /* truncate listing ? */ + if (!nr) { + printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); + entry[1] = 1; + } + goto out; err_unmap: kunmap(page); return -errno_NFSERR_IO; } static int -decode_readlink(struct xdr_stream *xdr, int nfserr, struct rpc_rqst *req, struct nfs4_readlink *readlink) +decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readlink *readlink) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct iovec *iov = rcvbuf->head; uint32_t *strlen; - uint32_t len; + unsigned int hdrlen, len; char *string; + int status; - if (!nfserr) { - /* - * The XDR encode routine has set things up so that - * the link text will be copied directly into the - * buffer. We just have to do overflow-checking, - * and and null-terminate the text (the VFS expects - * null-termination). - */ - strlen = (uint32_t *) kmap(rcvbuf->pages[0]); - len = ntohl(*strlen); - if (len > PAGE_CACHE_SIZE - 5) { - printk(KERN_WARNING "nfs: server returned giant symlink!\n"); - kunmap(rcvbuf->pages[0]); - return -EIO; - } - *strlen = len; - - string = (char *)(strlen + 1); - string[len] = '\0'; + status = decode_op_hdr(xdr, OP_READLINK); + if (status) + return status; + + hdrlen = (char *) xdr->p - (char *) iov->iov_base; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); + xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); + + } + /* + * The XDR encode routine has set things up so that + * the link text will be copied directly into the + * buffer. We just have to do overflow-checking, + * and and null-terminate the text (the VFS expects + * null-termination). 
+ */ + strlen = (uint32_t *) kmap(rcvbuf->pages[0]); + len = ntohl(*strlen); + if (len > PAGE_CACHE_SIZE - 5) { + printk(KERN_WARNING "nfs: server returned giant symlink!\n"); kunmap(rcvbuf->pages[0]); + return -EIO; } + *strlen = len; + + string = (char *)(strlen + 1); + string[len] = '\0'; + kunmap(rcvbuf->pages[0]); return 0; } static int -decode_remove(struct xdr_stream *xdr, int nfserr, struct nfs4_remove *remove) +decode_restorefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RESTOREFH); +} + +static int +decode_remove(struct xdr_stream *xdr, struct nfs4_remove *remove) { int status; - status = 0; - if (!nfserr) - status = decode_change_info(xdr, remove->rm_cinfo); + status = decode_op_hdr(xdr, OP_REMOVE); + if (status) + goto out; + status = decode_change_info(xdr, remove->rm_cinfo); +out: return status; } static int -decode_rename(struct xdr_stream *xdr, int nfserr, struct nfs4_rename *rename) +decode_rename(struct xdr_stream *xdr, struct nfs4_rename *rename) { - int status = 0; + int status; - if (!nfserr) { - if ((status = decode_change_info(xdr, rename->rn_src_cinfo))) - goto out; - if ((status = decode_change_info(xdr, rename->rn_dst_cinfo))) - goto out; - } + status = decode_op_hdr(xdr, OP_RENAME); + if (status) + goto out; + if ((status = decode_change_info(xdr, rename->rn_src_cinfo))) + goto out; + if ((status = decode_change_info(xdr, rename->rn_dst_cinfo))) + goto out; out: return status; } +static int +decode_renew(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RENEW); +} + +static int +decode_savefh(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SAVEFH); +} + static int decode_setattr(struct xdr_stream *xdr) { uint32_t *p; - uint32_t bmlen; + uint32_t bmlen; int status; - - READ_BUF(4); - READ32(bmlen); - if (bmlen > 10) - goto xdr_error; - READ_BUF(bmlen << 2); - DECODE_TAIL; + + status = decode_op_hdr(xdr, OP_SETATTR); + if (status) + return status; + READ_BUF(4); + READ32(bmlen); + READ_BUF(bmlen << 2); + 
return 0; } static int -decode_setclientid(struct xdr_stream *xdr, int nfserr, struct nfs4_setclientid *setclientid) +decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid *setclientid) { uint32_t *p; + uint32_t opnum; + int32_t nfserr; - if (!nfserr) { + READ_BUF(8); + READ32(opnum); + if (opnum != OP_SETCLIENTID) { + printk(KERN_NOTICE + "nfs4_decode_setclientid: Server returned operation" + " %d\n", opnum); + return -EIO; + } + READ32(nfserr); + if (nfserr == NFS_OK) { READ_BUF(8 + sizeof(nfs4_verifier)); READ64(setclientid->sc_state->cl_clientid); COPYMEM(setclientid->sc_state->cl_confirm, sizeof(nfs4_verifier)); - } - else if (nfserr == NFSERR_CLID_INUSE) { + } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ @@ -1478,156 +1633,146 @@ decode_setclientid(struct xdr_stream *xdr, int nfserr, struct nfs4_setclientid * READ_BUF(4); READ32(len); READ_BUF(len); - } + return -EEXIST; + } else + return -nfs_stat_to_errno(nfserr); return 0; } static int -decode_write(struct xdr_stream *xdr, int nfserr, struct nfs4_write *write) +decode_setclientid_confirm(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); +} + +static int +decode_write(struct xdr_stream *xdr, struct nfs4_write *write) { uint32_t *p; int status; - if (!nfserr) { - READ_BUF(16); - READ32(*write->wr_bytes_written); - if (*write->wr_bytes_written > write->wr_len) - goto xdr_error; - READ32(write->wr_verf->committed); - COPYMEM(write->wr_verf->verifier, 8); - } + status = decode_op_hdr(xdr, OP_WRITE); + if (status) + return status; - DECODE_TAIL; + READ_BUF(16); + READ32(*write->wr_bytes_written); + if (*write->wr_bytes_written > write->wr_len) + return -EIO; + READ32(write->wr_verf->committed); + COPYMEM(write->wr_verf->verifier, 8); + return 0; } /* FIXME: this sucks */ static int decode_compound(struct xdr_stream *xdr, struct nfs4_compound *cp, struct rpc_rqst *req) { - uint32_t *p; - uint32_t taglen; - uint32_t opnum, nfserr; + struct 
compound_hdr hdr; + struct nfs4_op *op; int status; - READ_BUF(8); - READ32(cp->toplevel_status); - READ32(taglen); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + + cp->toplevel_status = hdr.status; /* * We need this if our zero-copy I/O is going to work. Rumor has * it that the spec will soon mandate it... */ - if (taglen != cp->taglen) + if (hdr.taglen != cp->taglen) dprintk("nfs4: non-conforming server returns tag length mismatch!\n"); - READ_BUF(taglen + 4); - p += XDR_QUADLEN(taglen); - READ32(cp->resp_nops); - if (cp->resp_nops > cp->req_nops) { + cp->resp_nops = hdr.nops; + if (hdr.nops > cp->req_nops) { dprintk("nfs4: resp_nops > req_nops!\n"); goto xdr_error; } - for (cp->nops = 0; cp->nops < cp->resp_nops; cp->nops++) { - READ_BUF(8); - READ32(opnum); - if (opnum != cp->ops[cp->nops].opnum) { - dprintk("nfs4: operation mismatch!\n"); - goto xdr_error; - } - READ32(nfserr); - if (cp->nops == cp->resp_nops - 1) { - if (nfserr != cp->toplevel_status) { - dprintk("nfs4: status mismatch!\n"); - goto xdr_error; - } - } - else if (nfserr) { - dprintk("nfs4: intermediate status nonzero!\n"); - goto xdr_error; - } - cp->ops[cp->nops].nfserr = nfserr; - - switch (opnum) { + op = &cp->ops[0]; + for (cp->nops = 0; cp->nops < cp->resp_nops; cp->nops++, op++) { + switch (op->opnum) { case OP_ACCESS: - status = decode_access(xdr, nfserr, &cp->ops[cp->nops].u.access); + status = decode_access(xdr, &op->u.access); break; case OP_CLOSE: - status = decode_close(xdr, nfserr, &cp->ops[cp->nops].u.close); + status = decode_close(xdr, &op->u.close); break; case OP_COMMIT: - status = decode_commit(xdr, nfserr, &cp->ops[cp->nops].u.commit); + status = decode_commit(xdr, &op->u.commit); break; case OP_CREATE: - status = decode_create(xdr, nfserr, &cp->ops[cp->nops].u.create); + status = decode_create(xdr, &op->u.create); break; case OP_GETATTR: - status = decode_getattr(xdr, nfserr, &cp->ops[cp->nops].u.getattr); + status = decode_getattr(xdr, 
&op->u.getattr); break; case OP_GETFH: - status = decode_getfh(xdr, nfserr, &cp->ops[cp->nops].u.getfh); + status = decode_getfh(xdr, &op->u.getfh); break; case OP_LINK: - status = decode_link(xdr, nfserr, &cp->ops[cp->nops].u.link); + status = decode_link(xdr, &op->u.link); break; case OP_LOOKUP: - status = 0; + status = decode_lookup(xdr); break; case OP_OPEN: - status = decode_open(xdr, nfserr, &cp->ops[cp->nops].u.open); + status = decode_open(xdr, &op->u.open); break; case OP_OPEN_CONFIRM: - status = decode_open_confirm(xdr, nfserr, &cp->ops[cp->nops].u.open_confirm); + status = decode_open_confirm(xdr, &op->u.open_confirm); break; case OP_PUTFH: - status = 0; + status = decode_putfh(xdr); break; case OP_PUTROOTFH: - status = 0; + status = decode_putrootfh(xdr); break; case OP_READ: - status = decode_read(xdr, nfserr, &cp->ops[cp->nops].u.read); + status = decode_read(xdr, req, &op->u.read); break; case OP_READDIR: - status = decode_readdir(xdr, nfserr, req, &cp->ops[cp->nops].u.readdir); + status = decode_readdir(xdr, req, &op->u.readdir); break; case OP_READLINK: - status = decode_readlink(xdr, nfserr, req, &cp->ops[cp->nops].u.readlink); + status = decode_readlink(xdr, req, &op->u.readlink); break; case OP_RESTOREFH: - status = 0; + status = decode_restorefh(xdr); break; case OP_REMOVE: - status = decode_remove(xdr, nfserr, &cp->ops[cp->nops].u.remove); + status = decode_remove(xdr, &op->u.remove); break; case OP_RENAME: - status = decode_rename(xdr, nfserr, &cp->ops[cp->nops].u.rename); + status = decode_rename(xdr, &op->u.rename); break; case OP_RENEW: - status = 0; + status = decode_renew(xdr); break; case OP_SAVEFH: - status = 0; + status = decode_savefh(xdr); break; case OP_SETATTR: status = decode_setattr(xdr); break; case OP_SETCLIENTID: - status = decode_setclientid(xdr, nfserr, &cp->ops[cp->nops].u.setclientid); + status = decode_setclientid(xdr, &op->u.setclientid); break; case OP_SETCLIENTID_CONFIRM: - status = 0; + status = 
decode_setclientid_confirm(xdr); break; case OP_WRITE: - status = decode_write(xdr, nfserr, &cp->ops[cp->nops].u.write); + status = decode_write(xdr, &op->u.write); break; default: BUG(); return -EIO; } if (status) - goto xdr_error; + break; } DECODE_TAIL; @@ -1700,15 +1845,15 @@ nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) #endif #define PROC(proc, argtype, restype) \ -[NFSPROC4_##proc] = { \ - .p_proc = NFSPROC4_##proc, \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_COMPOUND, \ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ } struct rpc_procinfo nfs4_procedures[] = { - PROC(COMPOUND, enc_compound, dec_compound) + PROC(COMPOUND, enc_compound, dec_compound), }; struct rpc_version nfs_version4 = { diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index d320dd969b07..36e71c8ed51e 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -196,6 +196,16 @@ enum open_delegation_type4 { #define NFS4_MINOR_VERSION 0 #define NFS4_DEBUG 1 +#ifdef __KERNEL__ + +/* Index of predefined Linux client operations */ + +enum { + NFSPROC4_CLNT_NULL = 0, /* Unused */ + NFSPROC4_CLNT_COMPOUND, /* Soon to be unused */ +}; + +#endif #endif /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 970ffa785f78..af914e160fc6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -488,7 +488,6 @@ struct nfs4_write { struct nfs4_op { u32 opnum; - u32 nfserr; union { struct nfs4_access access; struct nfs4_close close; -- cgit v1.2.3 From deb86db18ed25acb4b58b3989445998649e486f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 15 Dec 2002 00:11:24 -0800 Subject: [PATCH] Add helper routines for fixing up page alignment on xdr_buf In order to speed up NFS reads, we attempt to copy directly from skbuffs into the pagecache pages. 
As we cannot do XDR decoding in the soft interrupts, we attempt to estimate the size of the RPC header (+ attributes,...) that will precede the actual data that goes in the pagecache. If we get the estimate wrong, the XDR decode routines perform a realignment of the data into the pagecache. In the existing code, we do a multi-page kmap() from the xdr_buf into an iovec array, in order to do the shift. The following patch adds tools for doing the realignment without going through the iovec array (and without having to do the deadlock-prone multi-page kmap()). It also adds the 2 helper routines xdr_read_pages()/xdr_write_pages() which will be needed for NFSv4 reads/writes in order to add pre/post operation GETATTR calls. --- include/linux/sunrpc/xdr.h | 4 + net/sunrpc/sunrpc_syms.c | 2 + net/sunrpc/xdr.c | 308 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 307 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 74c5260b2343..fd871e87f7f4 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -211,6 +211,10 @@ xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) return p; } +extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); +extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); + /* * Initialize an xdr_stream for decoding data. 
*/ diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f3c5407546cd..2c06ada571be 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -120,6 +120,8 @@ EXPORT_SYMBOL(xdr_encode_netobj); EXPORT_SYMBOL(xdr_encode_pages); EXPORT_SYMBOL(xdr_inline_pages); EXPORT_SYMBOL(xdr_shift_buf); +EXPORT_SYMBOL(xdr_write_pages); +EXPORT_SYMBOL(xdr_read_pages); /* Debugging symbols */ #ifdef RPC_DEBUG diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e5f05adbb7c9..b7ba4f10b96c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -141,7 +141,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, xdr->len += len; } - /* * Realign the iovec if the server missed out some reply elements * (such as post-op attributes,...) @@ -318,13 +317,308 @@ copy_tail: copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len); } + +/* + * Helper routines for doing 'memmove' like operations on a struct xdr_buf + * + * _shift_data_right_pages + * @pages: vector of pages containing both the source and dest memory area. + * @pgto_base: page vector address of destination + * @pgfrom_base: page vector address of source + * @len: number of bytes to copy + * + * Note: the addresses pgto_base and pgfrom_base are both calculated in + * the same way: + * if a memory area starts at byte 'base' in page 'pages[i]', + * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * Also note: pgfrom_base must be < pgto_base, but the memory areas + * they point to may overlap. 
+ */ +static void +_shift_data_right_pages(struct page **pages, size_t pgto_base, + size_t pgfrom_base, size_t len) +{ + struct page **pgfrom, **pgto; + char *vfrom, *vto; + size_t copy; + + BUG_ON(pgto_base <= pgfrom_base); + + pgto_base += len; + pgfrom_base += len; + + pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT); + + pgto_base &= ~PAGE_CACHE_MASK; + pgfrom_base &= ~PAGE_CACHE_MASK; + + do { + /* Are any pointers crossing a page boundary? */ + if (pgto_base == 0) { + pgto_base = PAGE_CACHE_SIZE; + pgto--; + } + if (pgfrom_base == 0) { + pgfrom_base = PAGE_CACHE_SIZE; + pgfrom--; + } + + copy = len; + if (copy > pgto_base) + copy = pgto_base; + if (copy > pgfrom_base) + copy = pgfrom_base; + pgto_base -= copy; + pgfrom_base -= copy; + + vto = kmap_atomic(*pgto, KM_USER0); + vfrom = kmap_atomic(*pgfrom, KM_USER1); + memmove(vto + pgto_base, vfrom + pgfrom_base, copy); + kunmap_atomic(vfrom, KM_USER1); + kunmap_atomic(vto, KM_USER0); + + } while ((len -= copy) != 0); +} + +/* + * _copy_to_pages + * @pages: array of pages + * @pgbase: page vector address of destination + * @p: pointer to source data + * @len: length + * + * Copies data from an arbitrary memory location into an array of pages + * The copy is assumed to be non-overlapping. 
+ */ +static void +_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) +{ + struct page **pgto; + char *vto; + size_t copy; + + pgto = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vto = kmap_atomic(*pgto, KM_USER0); + memcpy(vto + pgbase, p, copy); + kunmap_atomic(vto, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + pgbase = 0; + pgto++; + } + p += copy; + + } while ((len -= copy) != 0); +} + +/* + * _copy_from_pages + * @p: pointer to destination + * @pages: array of pages + * @pgbase: offset of source data + * @len: length + * + * Copies data into an arbitrary memory location from an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) +{ + struct page **pgfrom; + char *vfrom; + size_t copy; + + pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vfrom = kmap_atomic(*pgfrom, KM_USER0); + memcpy(p, vfrom + pgbase, copy); + kunmap_atomic(vfrom, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + pgbase = 0; + pgfrom++; + } + p += copy; + + } while ((len -= copy) != 0); +} + +/* + * xdr_shrink_bufhead + * @buf: xdr_buf + * @len: bytes to remove from buf->head[0] + * + * Shrinks XDR buffer's header iovec buf->head[0] by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. 
+ */ void -xdr_shift_buf(struct xdr_buf *xdr, size_t len) +xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) { - struct iovec iov[MAX_IOVEC]; - unsigned int nr; + struct iovec *head, *tail; + size_t copy, offs; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + head = buf->head; + BUG_ON (len > head->iov_len); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove((char *)tail->iov_base + len, + tail->iov_base, copy); + } + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > pglen) + copy = pglen; + offs = len - copy; + if (offs >= tail->iov_len) + copy = 0; + else if (copy > tail->iov_len - offs) + copy = tail->iov_len - offs; + if (copy != 0) + _copy_from_pages((char *)tail->iov_base + offs, + buf->pages, + buf->page_base + pglen + offs - len, + copy); + /* Do we also need to copy data from the head into the tail ? */ + if (len > pglen) { + offs = copy = len - pglen; + if (copy > tail->iov_len) + copy = tail->iov_len; + memcpy(tail->iov_base, + (char *)head->iov_base + + head->iov_len - offs, + copy); + } + } + /* Now handle pages */ + if (pglen != 0) { + if (pglen > len) + _shift_data_right_pages(buf->pages, + buf->page_base + len, + buf->page_base, + pglen - len); + copy = len; + if (len > pglen) + copy = pglen; + _copy_to_pages(buf->pages, buf->page_base, + (char *)head->iov_base + head->iov_len - len, + copy); + } + head->iov_len -= len; + buf->len -= len; +} + +/* + * xdr_shrink_pagelen + * @buf: xdr_buf + * @len: bytes to remove from buf->pages + * + * Shrinks XDR buffer's page array buf->pages by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the tail. 
+ */ +void +xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +{ + struct iovec *tail; + size_t copy; + char *p; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + BUG_ON (len > pglen); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + p = (char *)tail->iov_base + len; + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove(p, tail->iov_base, copy); + } else + buf->len -= len; + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > tail->iov_len) + copy = tail->iov_len; + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); + } + buf->page_len -= len; + buf->len -= len; +} - nr = xdr_kmap(iov, xdr, 0); - xdr_shift_iovec(iov, nr, len); - xdr_kunmap(xdr, 0); +void +xdr_shift_buf(struct xdr_buf *buf, size_t len) +{ + xdr_shrink_bufhead(buf, len); +} + +void +xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct iovec *iov = buf->tail; + buf->pages = pages; + buf->page_base = base; + buf->page_len = len; + + iov->iov_base = (char *)xdr->p; + iov->iov_len = 0; + xdr->iov = iov; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + BUG_ON(xdr->p >= xdr->end); + iov->iov_base = (char *)xdr->p + (len & 3); + iov->iov_len += pad; + len += pad; + *xdr->p++ = 0; + } + buf->len += len; +} + +void +xdr_read_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct iovec *iov; + ssize_t shift; + + /* Realign pages to current pointer position */ + iov = buf->head; + shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p; + if (shift > 0) + xdr_shrink_bufhead(buf, shift); + + /* Truncate page data and move it into the tail */ + len = XDR_QUADLEN(len) << 2; + if (buf->page_len > len) + xdr_shrink_pagelen(buf, buf->page_len - len); + xdr->iov = iov = buf->tail; + xdr->p = (uint32_t *)iov->iov_base; + xdr->end = (uint32_t *)((char *)iov->iov_base + 
iov->iov_len); } -- cgit v1.2.3 From 8967788d14e6429357223ec1ac13249ca967449c Mon Sep 17 00:00:00 2001 From: Stelian Pop Date: Sun, 15 Dec 2002 00:59:25 -0800 Subject: [PATCH] sonypi driver update This little patch changes the way button release events are reported by the sonypi driver to the application: previously, separate release events were detected for each button. However, many buttons (example: the jogdial, the capture button, the back button etc) share the same release event. The attached patch propagates a single 'ANYBUTTON_RELEASED' event to the userspace, leaving all state machine intelligence to the application. Kunihiko IMAI should be credited for his ideas and tests. --- drivers/char/sonypi.c | 4 ++-- drivers/char/sonypi.h | 13 ++++++++++--- include/linux/sonypi.h | 6 +++--- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 76c5a42e2e19..7057a4d0ff14 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -714,11 +714,11 @@ static int __devinit sonypi_probe(struct pci_dev *pcidev) { SONYPI_DRIVER_MAJORVERSION, SONYPI_DRIVER_MINORVERSION); printk(KERN_INFO "sonypi: detected %s model, " - "verbose = %s, fnkeyinit = %s, camera = %s, " + "verbose = %d, fnkeyinit = %s, camera = %s, " "compat = %s, mask = 0x%08lx\n", (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE1) ? "type1" : "type2", - verbose ? "on" : "off", + verbose, fnkeyinit ? "on" : "off", camera ? "on" : "off", compat ? 
"on" : "off", diff --git a/drivers/char/sonypi.h b/drivers/char/sonypi.h index 9f1d285372cf..c0553b22c0bc 100644 --- a/drivers/char/sonypi.h +++ b/drivers/char/sonypi.h @@ -37,7 +37,7 @@ #ifdef __KERNEL__ #define SONYPI_DRIVER_MAJORVERSION 1 -#define SONYPI_DRIVER_MINORVERSION 16 +#define SONYPI_DRIVER_MINORVERSION 17 #define SONYPI_DEVICE_MODEL_TYPE1 1 #define SONYPI_DEVICE_MODEL_TYPE2 2 @@ -171,6 +171,13 @@ struct sonypi_event { u8 data; u8 event; }; + +/* The set of possible button release events */ +static struct sonypi_event sonypi_releaseev[] = { + { 0x00, SONYPI_EVENT_ANYBUTTON_RELEASED }, + { 0, 0 } +}; + /* The set of possible jogger events */ static struct sonypi_event sonypi_joggerev[] = { { 0x1f, SONYPI_EVENT_JOGDIAL_UP }, @@ -186,7 +193,6 @@ static struct sonypi_event sonypi_joggerev[] = { { 0x5d, SONYPI_EVENT_JOGDIAL_VFAST_UP_PRESSED }, { 0x43, SONYPI_EVENT_JOGDIAL_VFAST_DOWN_PRESSED }, { 0x40, SONYPI_EVENT_JOGDIAL_PRESSED }, - { 0x00, SONYPI_EVENT_JOGDIAL_RELEASED }, { 0, 0 } }; @@ -195,7 +201,6 @@ static struct sonypi_event sonypi_captureev[] = { { 0x05, SONYPI_EVENT_CAPTURE_PARTIALPRESSED }, { 0x07, SONYPI_EVENT_CAPTURE_PRESSED }, { 0x01, SONYPI_EVENT_CAPTURE_PARTIALRELEASED }, - { 0x00, SONYPI_EVENT_CAPTURE_RELEASED }, { 0, 0 } }; @@ -293,6 +298,7 @@ struct sonypi_eventtypes { unsigned long mask; struct sonypi_event * events; } sonypi_eventtypes[] = { + { SONYPI_DEVICE_MODEL_TYPE1, 0, 0xffffffff, sonypi_releaseev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x70, SONYPI_MEYE_MASK, sonypi_meyeev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x30, SONYPI_LID_MASK, sonypi_lidev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x60, SONYPI_CAPTURE_MASK, sonypi_captureev }, @@ -301,6 +307,7 @@ struct sonypi_eventtypes { { SONYPI_DEVICE_MODEL_TYPE1, 0x30, SONYPI_BLUETOOTH_MASK, sonypi_blueev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x40, SONYPI_PKEY_MASK, sonypi_pkeyev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0, 0xffffffff, sonypi_releaseev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x38, SONYPI_LID_MASK, sonypi_lidev }, { 
SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_JOGGER_MASK, sonypi_joggerev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_CAPTURE_MASK, sonypi_captureev }, diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 89ead755abfe..e10e33a99574 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -43,9 +43,9 @@ #define SONYPI_EVENT_JOGDIAL_DOWN_PRESSED 3 #define SONYPI_EVENT_JOGDIAL_UP_PRESSED 4 #define SONYPI_EVENT_JOGDIAL_PRESSED 5 -#define SONYPI_EVENT_JOGDIAL_RELEASED 6 +#define SONYPI_EVENT_JOGDIAL_RELEASED 6 /* obsolete */ #define SONYPI_EVENT_CAPTURE_PRESSED 7 -#define SONYPI_EVENT_CAPTURE_RELEASED 8 +#define SONYPI_EVENT_CAPTURE_RELEASED 8 /* obsolete */ #define SONYPI_EVENT_CAPTURE_PARTIALPRESSED 9 #define SONYPI_EVENT_CAPTURE_PARTIALRELEASED 10 #define SONYPI_EVENT_FNKEY_ESC 11 @@ -93,7 +93,7 @@ #define SONYPI_EVENT_MEYE_OPPOSITE 53 #define SONYPI_EVENT_MEMORYSTICK_INSERT 54 #define SONYPI_EVENT_MEMORYSTICK_EJECT 55 - +#define SONYPI_EVENT_ANYBUTTON_RELEASED 56 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 91ec8aa9f5416d3474a34af9b8f197a344d0d523 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 15 Dec 2002 04:39:06 -0800 Subject: [PATCH] Fix filesystems that cannot do mmap writeback The writepage-removal patch broke filesystems which do not want to support writeable mappings. Fix that up by making those filesystems point their mmap vector at the new generic_file_readonly_mmap(). 
--- fs/afs/file.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/jffs/inode-v23.c | 2 +- fs/jffs2/file.c | 2 +- fs/read_write.c | 2 +- include/linux/fs.h | 1 + kernel/ksyms.c | 1 + mm/filemap.c | 8 ++++++++ 8 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/afs/file.c b/fs/afs/file.c index a63b05ca27d0..c0344ae61ef2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -37,7 +37,7 @@ struct inode_operations afs_file_inode_operations = { struct file_operations afs_file_file_operations = { .read = generic_file_read, .write = afs_file_write, - .mmap = generic_file_mmap, + .mmap = generic_file_readonly_mmap, #if 0 .open = afs_file_open, .release = afs_file_release, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4b180c8b9ff9..9ef5130f12f6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -74,7 +74,7 @@ struct inode_operations befs_dir_inode_operations = { struct file_operations befs_file_operations = { .llseek = default_llseek, .read = generic_file_read, - .mmap = generic_file_mmap, + .mmap = generic_file_readonly_mmap, }; struct address_space_operations befs_aops = { diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 27098dc0e895..325afb2d489e 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -1641,7 +1641,7 @@ static struct file_operations jffs_file_operations = .read = generic_file_read, .write = generic_file_write, .ioctl = jffs_ioctl, - .mmap = generic_file_mmap, + .mmap = generic_file_readonly_mmap, .fsync = jffs_fsync, .sendfile = generic_file_sendfile, }; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 418b1b00cb7d..f7fadfa10640 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -58,7 +58,7 @@ struct file_operations jffs2_file_operations = .read = generic_file_read, .write = generic_file_write, .ioctl = jffs2_ioctl, - .mmap = generic_file_mmap, + .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,29) .sendfile = 
generic_file_sendfile diff --git a/fs/read_write.c b/fs/read_write.c index 8947becabdad..d91e4ea4ec95 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -19,7 +19,7 @@ struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = generic_file_read, - .mmap = generic_file_mmap, + .mmap = generic_file_readonly_mmap, .sendfile = generic_file_sendfile, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 18557957520f..500cb3ac421e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1233,6 +1233,7 @@ extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); +extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index cbe3fec93bd1..9fa73fdf4741 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -228,6 +228,7 @@ EXPORT_SYMBOL(file_ra_state_init); EXPORT_SYMBOL(generic_file_write); EXPORT_SYMBOL(generic_file_write_nolock); EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); EXPORT_SYMBOL(generic_ro_fops); EXPORT_SYMBOL(file_lock_list); EXPORT_SYMBOL(locks_init_lock); diff --git a/mm/filemap.c b/mm/filemap.c index 1595d52c9bb7..c771d48b9cde 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1293,6 +1293,14 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &generic_file_vm_ops; return 0; } + +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EINVAL; + vma->vm_flags &= 
~VM_MAYWRITE; + return generic_file_mmap(file, vma); +} #else int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { -- cgit v1.2.3 From e36a44915d7140d680cd7e952381dec21798c068 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Sun, 15 Dec 2002 17:15:43 -0800 Subject: [PATCH] dm: move ioctl numbers to a sane place Four constants: DM_DIR, DM_MAX_TYPE_NAME, DM_NAME_LEN, DM_UUID_LEN Were being declared in device-mapper.h, these are all specific to the ioctl interface, so they've been moved to dm-ioctl.h. Nobody in userland should ever include so remove ifdef __KERNEL guards. --- include/linux/device-mapper.h | 9 --------- include/linux/dm-ioctl.h | 6 +++++- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index bf9b15f5f70b..7dc8a14b8396 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -7,13 +7,6 @@ #ifndef _LINUX_DEVICE_MAPPER_H #define _LINUX_DEVICE_MAPPER_H -#define DM_DIR "mapper" /* Slashes not supported */ -#define DM_MAX_TYPE_NAME 16 -#define DM_NAME_LEN 128 -#define DM_UUID_LEN 129 - -#ifdef __KERNEL__ - struct dm_target; struct dm_table; struct dm_dev; @@ -101,6 +94,4 @@ struct dm_target { int dm_register_target(struct target_type *t); int dm_unregister_target(struct target_type *t); -#endif /* __KERNEL__ */ - #endif /* _LINUX_DEVICE_MAPPER_H */ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index c5ae8cd3921c..72edd5e19e62 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -7,9 +7,13 @@ #ifndef _LINUX_DM_IOCTL_H #define _LINUX_DM_IOCTL_H -#include #include +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + /* * Implements a traditional ioctl interface to the device mapper. */ -- cgit v1.2.3