From 8e6b97f5741482c860d062e656ea2bf95be56ca5 Mon Sep 17 00:00:00 2001 From: Kai Germaschewski Date: Thu, 9 May 2002 03:51:22 -0500 Subject: Don't implicitly export all symbols In the old days, we used to export all symbols from a module by default. We still do so, unless o either exported symbols are explicitly listed in EXPORT_SYMBOL() o or EXPORT_NO_SYMBOLS is given. This patch changes the default of 'export all symbols' to 'export no symbols' for all files which are not listed in $(export-objs) in the relevant Makefile. --- include/linux/module.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index f3a8370db10a..1021d58d1742 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -370,6 +370,8 @@ extern struct module *module_list; #define EXPORT_SYMBOL_NOVERS(var) error this_object_must_be_defined_as_export_objs_in_the_Makefile #define EXPORT_SYMBOL_GPL(var) error this_object_must_be_defined_as_export_objs_in_the_Makefile +__asm__(".section __ksymtab,\"a\"\n.previous"); + #else #define __EXPORT_SYMBOL(sym, str) \ -- cgit v1.2.3 From 43152186ec28f3d4adf2a79ff8becacdfca9c82d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:20:58 -0700 Subject: [PATCH] i_dirty_buffers locking fix This fixes a race between try_to_free_buffers' call to __remove_inode_queue() and other users of b_inode_buffers (fsync_inode_buffers and mark_buffer_dirty_inode()). They are presently taking different locks. The patch relocates and redefines and clarifies(?) the role of inode.i_dirty_buffers. The 2.4 definition of i_dirty_buffers is "a list of random buffers which is protected by a kernel-wide lock". This definition needs to be narrowed in the 2.5 context. It is now "a list of buffers from a different mapping, protected by a lock within that mapping". This list of buffers is specifically for fsync(). 
As this is a "data plane" operation, all the structures have been moved out of the inode and into the address_space. So address_space now has: list_head private_list; A list, available to the address_space for any purpose. If that address_space chooses to use the helper functions mark_buffer_dirty_inode and sync_mapping_buffers() then this list will contain buffer_heads, attached via buffer_head.b_assoc_buffers. If the address_space does not call those helper functions then the list is free for other usage. The only requirement is that the list be list_empty() at destroy_inode() time. At least, this is the objective. At present, generic_file_write() will call generic_osync_inode(), which expects that list to contain buffer_heads. So private_list isn't useful for anything else yet. spinlock_t private_lock; A spinlock, available to the address_space. If the address_space is using try_to_free_buffers(), mark_inode_dirty_buffers() and fsync_inode_buffers() then this lock is used to protect the private_list of *other* mappings which have listed buffers from *this* mapping onto themselves. That is: for buffer_heads, mapping_A->private_lock does not protect mapping_A->private_list! It protects the b_assoc_buffers list from buffers which are backed by mapping_A and it protects mapping_B->private_list, mapping_C->private_list, ... So what we have here is a cross-mapping association. S_ISREG mappings maintain a list of buffers from the blockdev's address_space which they need to know about for a successful fsync(). The locking follows the buffers: the lock is in the blockdev's mapping, not in the S_ISREG file's mapping. For address_spaces which use try_to_free_buffers, private_lock is also (and quite unrelatedly) used for protection of the buffer ring at page->private. Exclusion between try_to_free_buffers(), __get_hash_table() and __set_page_dirty_buffers(). This is in fact its major use. address_space *assoc_mapping Sigh. 
This is the address of the mapping which backs the buffers which are attached to private_list. It's here so that generic_osync_inode() can locate the lock which protects this mapping's private_list. Will probably go away. A consequence of all the above is that: a) All the buffers at a mapping_A's ->private_list must come from the same mapping, mapping_B. There is no requirement that mapping_B be a blockdev mapping, but that's how it's used. There is a BUG() check in mark_buffer_dirty_inode() for this. b) blockdev mappings never have any buffers on ->private_list. It just never happens, and doesn't make a lot of sense. reiserfs is using b_inode_buffers for attaching dependent buffers to its journal and that caused a few problems. Fixed in reiserfs_releasepage.patch --- fs/buffer.c | 229 +++++++++++++++++++++++++++++++------------- fs/ext2/fsync.c | 2 +- fs/ext2/inode.c | 5 +- fs/ext3/fsync.c | 6 +- fs/ext3/inode.c | 10 +- fs/fs-writeback.c | 47 ++++----- fs/inode.c | 5 +- fs/minix/file.c | 2 +- fs/ntfs/super.c | 7 ++ fs/reiserfs/file.c | 2 +- fs/sysv/file.c | 2 +- fs/udf/fsync.c | 2 +- include/linux/buffer_head.h | 17 +--- include/linux/fs.h | 9 +- mm/filemap.c | 2 +- mm/page-writeback.c | 11 +-- mm/swap_state.c | 7 +- 17 files changed, 222 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 5dedcd3c9c7c..760540c26caa 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -35,7 +35,7 @@ #include #include -#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) /* This is used by some architectures to estimate available memory. */ atomic_t buffermem_pages = ATOMIC_INIT(0); @@ -392,30 +392,31 @@ out: /* * Various filesystems appear to want __get_hash_table to be non-blocking. * But it's the page lock which protects the buffers. 
To get around this, - * we get exclusion from try_to_free_buffers with the inode's - * i_bufferlist_lock. + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. * * Hack idea: for the blockdev mapping, i_bufferlist_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take i_bufferlist_lock. (But if - * i_bufferlist_lock is contended then so is mapping->page_lock). + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->page_lock). */ struct buffer_head * __get_hash_table(struct block_device *bdev, sector_t block, int unused) { - struct inode * const inode = bdev->bd_inode; + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; struct buffer_head *ret = NULL; unsigned long index; struct buffer_head *bh; struct buffer_head *head; struct page *page; - index = block >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - page = find_get_page(inode->i_mapping, index); + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page(bd_mapping, index); if (!page) goto out; - spin_lock(&inode->i_bufferlist_lock); + spin_lock(&bd_mapping->private_lock); if (!page_has_buffers(page)) goto out_unlock; head = page_buffers(page); @@ -430,40 +431,12 @@ __get_hash_table(struct block_device *bdev, sector_t block, int unused) } while (bh != head); buffer_error(); out_unlock: - spin_unlock(&inode->i_bufferlist_lock); + spin_unlock(&bd_mapping->private_lock); page_cache_release(page); out: return ret; } -void buffer_insert_list(spinlock_t *lock, - struct buffer_head *bh, struct list_head *list) -{ - spin_lock(lock); - list_del(&bh->b_inode_buffers); - list_add(&bh->b_inode_buffers, list); - spin_unlock(lock); -} - -/* - * i_bufferlist_lock must be held - */ -static inline void __remove_inode_queue(struct buffer_head *bh) -{ - list_del_init(&bh->b_inode_buffers); -} - -int 
inode_has_buffers(struct inode *inode) -{ - int ret; - - spin_lock(&inode->i_bufferlist_lock); - ret = !list_empty(&inode->i_dirty_buffers); - spin_unlock(&inode->i_bufferlist_lock); - - return ret; -} - /* If invalidate_buffers() will trash dirty buffers, it means some kind of fs corruption is going on. Trashing dirty data always imply losing information that was supposed to be just stored on the physical layer @@ -674,6 +647,78 @@ inline void mark_buffer_async_write(struct buffer_head *bh) } EXPORT_SYMBOL(mark_buffer_async_write); + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. 
+ * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. + */ + +void buffer_insert_list(spinlock_t *lock, + struct buffer_head *bh, struct list_head *list) +{ + spin_lock(lock); + list_del(&bh->b_assoc_buffers); + list_add(&bh->b_assoc_buffers, list); + spin_unlock(lock); +} + +/* + * The buffer's backing address_space's private_lock must be held + */ +static inline void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_mapping->private_list); +} + /* * osync is designed to support O_SYNC io. 
It waits synchronously for * all already-submitted IO to complete, but does not queue any new @@ -709,8 +754,50 @@ repeat: return err; } +/** + * sync_mapping_buffers - write out and wait upon a mapping's "associated" + * buffers + * @buffer_mapping - the mapping which backs the buffers' data + * @mapping - the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). @buffer_mapping is + * the blockdev which "owns" the buffers and @mapping is a file or directory + * which needs those buffers to be written for a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + if (mapping->assoc_mapping != buffer_mapping) + BUG(); + } + buffer_insert_list(&buffer_mapping->private_lock, + bh, &mapping->private_list); +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + /* - * Synchronise all the inode's dirty buffers to the disk. + * Write out and wait upon a list of buffers. 
* * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently @@ -739,9 +826,9 @@ int fsync_buffers_list(spinlock_t *lock, struct list_head *list) spin_lock(lock); while (!list_empty(list)) { bh = BH_ENTRY(list->next); - list_del_init(&bh->b_inode_buffers); + list_del_init(&bh->b_assoc_buffers); if (buffer_dirty(bh) || buffer_locked(bh)) { - list_add(&bh->b_inode_buffers, &tmp); + list_add(&bh->b_assoc_buffers, &tmp); if (buffer_dirty(bh)) { get_bh(bh); spin_unlock(lock); @@ -754,7 +841,7 @@ int fsync_buffers_list(spinlock_t *lock, struct list_head *list) while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); - __remove_inode_queue(bh); + __remove_assoc_queue(bh); get_bh(bh); spin_unlock(lock); wait_on_buffer(bh); @@ -776,16 +863,23 @@ int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * Invalidate any and all dirty buffers on a given inode. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. 
*/ void invalidate_inode_buffers(struct inode *inode) { - struct list_head * entry; - - spin_lock(&inode->i_bufferlist_lock); - while ((entry = inode->i_dirty_buffers.next) != - &inode->i_dirty_buffers) - __remove_inode_queue(BH_ENTRY(entry)); - spin_unlock(&inode->i_bufferlist_lock); + if (inode_has_buffers(inode)) { + struct address_space *mapping = inode->i_mapping; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } } /* @@ -939,10 +1033,10 @@ grow_dev_page(struct block_device *bdev, unsigned long block, * lock to be atomic wrt __get_hash_table(), which does not * run under the page lock. */ - spin_lock(&inode->i_bufferlist_lock); + spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); init_page_buffers(page, bdev, block, size); - spin_unlock(&inode->i_bufferlist_lock); + spin_unlock(&inode->i_mapping->private_lock); return page; failed: @@ -1051,7 +1145,7 @@ __getblk(struct block_device *bdev, sector_t block, int size) * address_space's dirty_pages list and then attach the address_space's * inode to its superblock's dirty inode list. * - * mark_buffer_dirty() is atomic. It takes inode->i_bufferlist_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, * mapping->page_lock and the global inode_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1237,7 +1331,7 @@ EXPORT_SYMBOL(block_flushpage); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via i_bufferlist_lock. try_to_free_buffers + * __set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. 
*/ void create_empty_buffers(struct page *page, @@ -1255,7 +1349,7 @@ void create_empty_buffers(struct page *page, } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->host->i_bufferlist_lock); + spin_lock(&page->mapping->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { @@ -1267,7 +1361,7 @@ void create_empty_buffers(struct page *page, } while (bh != head); } __set_page_buffers(page, head); - spin_unlock(&page->mapping->host->i_bufferlist_lock); + spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -1281,6 +1375,11 @@ EXPORT_SYMBOL(create_empty_buffers); * unmap_buffer() for such invalidation, but that was wrong. We definitely * don't want to mark the alias unmapped, for example - it would confuse * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. */ static void unmap_underlying_metadata(struct buffer_head *bh) { @@ -2209,7 +2308,7 @@ static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the page or by holding its inode's i_bufferlist_lock. + * locking the page or by holding its mapping's private_lock. * * If the page is dirty but all the buffers are clean then we need to * be sure to mark the page clean as well. This is because the page @@ -2220,7 +2319,7 @@ static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. To do that, we require * total exclusion from __set_page_dirty_buffers(). That is obtained with - * i_bufferlist_lock. + * private_lock. 
* * try_to_free_buffers() is non-blocking. */ @@ -2252,7 +2351,8 @@ static /*inline*/ int drop_buffers(struct page *page) do { struct buffer_head *next = bh->b_this_page; - __remove_inode_queue(bh); + if (!list_empty(&bh->b_assoc_buffers)) + __remove_assoc_queue(bh); free_buffer_head(bh); bh = next; } while (bh != head); @@ -2264,18 +2364,17 @@ failed: int try_to_free_buffers(struct page *page) { - struct inode *inode; + struct address_space * const mapping = page->mapping; int ret = 0; BUG_ON(!PageLocked(page)); if (PageWriteback(page)) return 0; - if (page->mapping == NULL) /* swapped-in anon page */ + if (mapping == NULL) /* swapped-in anon page */ return drop_buffers(page); - inode = page->mapping->host; - spin_lock(&inode->i_bufferlist_lock); + spin_lock(&mapping->private_lock); ret = drop_buffers(page); if (ret && !PageSwapCache(page)) { /* @@ -2288,7 +2387,7 @@ int try_to_free_buffers(struct page *page) */ ClearPageDirty(page); } - spin_unlock(&inode->i_bufferlist_lock); + spin_unlock(&mapping->private_lock); return ret; } EXPORT_SYMBOL(try_to_free_buffers); @@ -2331,7 +2430,7 @@ EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct buffer_head *bh) { - BUG_ON(!list_empty(&bh->b_inode_buffers)); + BUG_ON(!list_empty(&bh->b_assoc_buffers)); mempool_free(bh, bh_mempool); } EXPORT_SYMBOL(free_buffer_head); @@ -2344,7 +2443,7 @@ static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long fla memset(bh, 0, sizeof(*bh)); bh->b_blocknr = -1; - INIT_LIST_HEAD(&bh->b_inode_buffers); + INIT_LIST_HEAD(&bh->b_assoc_buffers); } } diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c index 71ac1701a3a4..5ba02176b331 100644 --- a/fs/ext2/fsync.c +++ b/fs/ext2/fsync.c @@ -37,7 +37,7 @@ int ext2_sync_file(struct file * file, struct dentry *dentry, int datasync) struct inode *inode = dentry->d_inode; int err; - err = fsync_inode_buffers(inode); + err = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) return err; if (datasync && 
!(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7200da15a9bf..b29af3b55ca1 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,7 +41,8 @@ static int ext2_update_inode(struct inode * inode, int do_sync); */ void ext2_put_inode (struct inode * inode) { - ext2_discard_prealloc (inode); + if (atomic_read(&inode->i_count) < 2) + ext2_discard_prealloc (inode); } /* @@ -860,7 +861,7 @@ do_indirects: } inode->i_mtime = inode->i_ctime = CURRENT_TIME; if (IS_SYNC(inode)) { - fsync_inode_buffers(inode); + sync_mapping_buffers(inode->i_mapping); ext2_sync_inode (inode); } else { mark_inode_dirty(inode); diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 8266f2408664..463f2981437e 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -55,13 +55,13 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) J_ASSERT(ext3_journal_current_handle() == 0); /* - * fsync_inode_buffers() just walks i_dirty_buffers and waits + * fsync_inode_buffers() just walks private_list and waits * on them. It's a no-op for full data journalling because - * i_dirty_buffers will be ampty. + * private_list will be empty. * Really, we only need to start I/O on the dirty buffers - * we'll end up waiting on them in commit. */ - ret = fsync_inode_buffers(inode); + ret = sync_mapping_buffers(inode->i_mapping); ext3_force_commit(inode->i_sb); return ret; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 076f527e7b23..6b52c880aa86 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1078,14 +1078,8 @@ static int commit_write_fn(handle_t *handle, struct buffer_head *bh) * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from block_symlink(). 
* - * ext3 inode->i_dirty_buffers policy: If we're journalling data we - * definitely don't want them to appear on the inode at all - instead - * we need to manage them at the JBD layer and we need to intercept - * the relevant sync operations and translate them into journal operations. - * - * If we're not journalling data then we can just leave the buffers - * on ->i_dirty_buffers. If someone writes them out for us then thanks. - * Otherwise we'll do it in commit, if we're using ordered data. + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. */ static int ext3_commit_write(struct file *file, struct page *page, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index eadb01f85bd2..b561f2cdde4c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -467,43 +467,34 @@ void write_inode_now(struct inode *inode, int sync) /** * generic_osync_inode - flush all dirty data for a given inode to disk * @inode: inode to write - * @datasync: if set, don't bother flushing timestamps + * @what: what to write and wait upon * * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. + * O_SYNC flag set, to flush dirty writes to disk. + * + * @what is a bitmask, specifying which part of the inode's data should be + * written and waited upon: + * + * OSYNC_DATA: i_mapping's dirty data + * OSYNC_METADATA: the buffers at i_mapping->private_list + * OSYNC_INODE: the inode itself */ int generic_osync_inode(struct inode *inode, int what) { - int err = 0, err2 = 0, need_write_inode_now = 0; - - /* - * WARNING - * - * Currently, the filesystem write path does not pass the - * filp down to the low-level write functions. Therefore it - * is impossible for (say) __block_commit_write to know if - * the operation is O_SYNC or not. 
- * - * Ideally, O_SYNC writes would have the filesystem call - * ll_rw_block as it went to kick-start the writes, and we - * could call osync_inode_buffers() here to wait only for - * those IOs which have already been submitted to the device - * driver layer. As it stands, if we did this we'd not write - * anything to disk since our writes have not been queued by - * this point: they are still on the dirty LRU. - * - * So, currently we will call fsync_inode_buffers() instead, - * to flush _all_ dirty buffers for this inode to disk on - * every O_SYNC write, not just the synchronous I/Os. --sct - */ + int err = 0; + int need_write_inode_now = 0; + int err2; if (what & OSYNC_DATA) - writeback_single_inode(inode, 0, NULL); - if (what & (OSYNC_METADATA|OSYNC_DATA)) - err = fsync_inode_buffers(inode); + err = filemap_fdatawrite(inode->i_mapping); + if (what & (OSYNC_METADATA|OSYNC_DATA)) { + err2 = sync_mapping_buffers(inode->i_mapping); + if (!err) + err = err2; + } if (what & OSYNC_DATA) { - err2 = filemap_fdatawrite(inode->i_mapping); + err2 = filemap_fdatawait(inode->i_mapping); if (!err) err = err2; } diff --git a/fs/inode.c b/fs/inode.c index 61e3f6678737..fc748da51c0c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -106,6 +106,7 @@ static struct inode *alloc_inode(struct super_block *sb) inode->i_data.dirtied_when = 0; inode->i_mapping = &inode->i_data; inode->i_data.ra_pages = &default_ra_pages; + inode->i_data.assoc_mapping = NULL; if (sb->s_bdev) inode->i_data.ra_pages = sb->s_bdev->bd_inode->i_mapping->ra_pages; memset(&inode->u, 0, sizeof(inode->u)); @@ -139,13 +140,13 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_data.locked_pages); INIT_LIST_HEAD(&inode->i_data.io_pages); INIT_LIST_HEAD(&inode->i_dentry); - INIT_LIST_HEAD(&inode->i_dirty_buffers); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.page_lock); 
spin_lock_init(&inode->i_data.i_shared_lock); - spin_lock_init(&inode->i_bufferlist_lock); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); INIT_LIST_HEAD(&inode->i_data.i_mmap); INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); } diff --git a/fs/minix/file.c b/fs/minix/file.c index 870b602d0c39..c9ac58fe9872 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -31,7 +31,7 @@ int minix_sync_file(struct file * file, struct dentry *dentry, int datasync) struct inode *inode = dentry->d_inode; int err; - err = fsync_inode_buffers(inode); + err = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) return err; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 049babc922de..b51def4b641c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1510,6 +1510,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) INIT_LIST_HEAD(&vol->mftbmp_mapping.i_mmap); INIT_LIST_HEAD(&vol->mftbmp_mapping.i_mmap_shared); spin_lock_init(&vol->mftbmp_mapping.i_shared_lock); + /* + * private_lock and private_list are unused by ntfs. But they + * are available. + */ + spin_lock_init(&vol->mftbmp_mapping.private_lock); + INIT_LIST_HEAD(&vol->mftbmp_mapping.private_list); + vol->mftbmp_mapping.assoc_mapping = NULL; vol->mftbmp_mapping.dirtied_when = 0; vol->mftbmp_mapping.gfp_mask = GFP_HIGHUSER; vol->mftbmp_mapping.ra_pages = diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index f6aa41b82360..286e63281e51 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -85,7 +85,7 @@ static int reiserfs_sync_file( if (!S_ISREG(p_s_inode->i_mode)) BUG (); - n_err = fsync_inode_buffers(p_s_inode) ; + n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; reiserfs_commit_for_inode(p_s_inode) ; unlock_kernel() ; return ( n_err < 0 ) ? 
-EIO : 0; diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 255230c20988..9dce95103718 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -36,7 +36,7 @@ int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync) struct inode *inode = dentry->d_inode; int err; - err = fsync_inode_buffers(inode); + err = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) return err; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c index c48ddf0d0408..c36daeee6d10 100644 --- a/fs/udf/fsync.c +++ b/fs/udf/fsync.c @@ -44,7 +44,7 @@ int udf_fsync_inode(struct inode *inode, int datasync) { int err; - err = fsync_inode_buffers(inode); + err = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) return err; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 500a082b7bcc..b2c54106b8c8 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -50,7 +50,7 @@ struct buffer_head { struct block_device *b_bdev; bh_end_io_t *b_end_io; /* I/O completion */ void *b_private; /* reserved for b_end_io */ - struct list_head b_inode_buffers; /* list of inode dirty buffers */ + struct list_head b_assoc_buffers; /* associated with another mapping */ }; @@ -147,6 +147,8 @@ void create_empty_buffers(struct page *, unsigned long, void end_buffer_io_sync(struct buffer_head *bh, int uptodate); void buffer_insert_list(spinlock_t *lock, struct buffer_head *, struct list_head *); +int sync_mapping_buffers(struct address_space *mapping); +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); void mark_buffer_async_read(struct buffer_head *bh); void mark_buffer_async_write(struct buffer_head *bh); @@ -217,14 +219,6 @@ static inline void put_bh(struct buffer_head *bh) atomic_dec(&bh->b_count); } -static inline void -mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) -{ - 
mark_buffer_dirty(bh); - buffer_insert_list(&inode->i_bufferlist_lock, - bh, &inode->i_dirty_buffers); -} - /* * If an error happens during the make_request, this function * has to be recalled. It marks the buffer as clean and not @@ -243,11 +237,6 @@ static inline void buffer_IO_error(struct buffer_head * bh) bh->b_end_io(bh, buffer_uptodate(bh)); } -static inline int fsync_inode_buffers(struct inode *inode) -{ - return fsync_buffers_list(&inode->i_bufferlist_lock, - &inode->i_dirty_buffers); -} static inline void brelse(struct buffer_head *buf) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b858f90c6fe..25578c7a5e62 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -306,6 +306,7 @@ struct address_space_operations { }; struct address_space { + struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ rwlock_t page_lock; /* and rwlock protecting it */ struct list_head clean_pages; /* list of clean pages */ @@ -314,13 +315,15 @@ struct address_space { struct list_head io_pages; /* being prepared for I/O */ unsigned long nrpages; /* number of total pages */ struct address_space_operations *a_ops; /* methods */ - struct inode *host; /* owner: inode, block_device */ list_t i_mmap; /* list of private mappings */ list_t i_mmap_shared; /* list of private mappings */ spinlock_t i_shared_lock; /* and spinlock protecting it */ unsigned long dirtied_when; /* jiffies of first page dirtying */ int gfp_mask; /* how to allocate the pages */ unsigned long *ra_pages; /* device readahead */ + spinlock_t private_lock; /* for use by the address_space */ + struct list_head private_list; /* ditto */ + struct address_space *assoc_mapping; /* ditto */ }; struct char_device { @@ -350,10 +353,6 @@ struct inode { struct list_head i_hash; struct list_head i_list; struct list_head i_dentry; - - struct list_head i_dirty_buffers; /* uses i_bufferlist_lock */ - spinlock_t i_bufferlist_lock; - unsigned long i_ino; 
atomic_t i_count; kdev_t i_dev; diff --git a/mm/filemap.c b/mm/filemap.c index 681d02d62685..af964d8602c8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,7 +42,7 @@ * * pagemap_lru_lock * ->i_shared_lock (vmtruncate) - * ->i_bufferlist_lock (__free_pte->__set_page_dirty_buffers) + * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->mapping->page_lock * ->inode_lock (__mark_inode_dirty) * ->sb_lock (fs/fs-writeback.c) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 40e80dff9369..590cf2e53abf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -450,7 +450,7 @@ EXPORT_SYMBOL(write_one_page); * It's better to have clean pages accidentally attached to dirty_pages than to * leave dirty pages attached to clean_pages. * - * We use i_bufferlist_lock to lock against try_to_free_buffers while using the + * We use private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. 
* @@ -462,18 +462,15 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_buffers(struct page *page) { + struct address_space * const mapping = page->mapping; int ret = 0; - struct address_space *mapping = page->mapping; - struct inode *inode; if (mapping == NULL) { SetPageDirty(page); goto out; } - inode = mapping->host; - - spin_lock(&inode->i_bufferlist_lock); + spin_lock(&mapping->private_lock); if (page_has_buffers(page) && !PageSwapCache(page)) { struct buffer_head *head = page_buffers(page); @@ -496,7 +493,7 @@ int __set_page_dirty_buffers(struct page *page) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - spin_unlock(&inode->i_bufferlist_lock); + spin_unlock(&mapping->private_lock); out: return ret; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 85002f16a6fa..8ada8b5ada79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,11 +37,10 @@ static struct address_space_operations swap_aops = { }; /* - * swapper_inode is needed only for for i_bufferlist_lock. This - * avoid special-casing in other parts of the kernel. + * swapper_inode doesn't do anything much. It is really only here to + * avoid some special-casing in other parts of the kernel. */ static struct inode swapper_inode = { - i_bufferlist_lock: SPIN_LOCK_UNLOCKED, i_mapping: &swapper_space, }; @@ -55,6 +54,8 @@ struct address_space swapper_space = { host: &swapper_inode, a_ops: &swap_aops, i_shared_lock: SPIN_LOCK_UNLOCKED, + private_lock: SPIN_LOCK_UNLOCKED, + private_list: LIST_HEAD_INIT(swapper_space.private_list), }; #ifdef SWAP_CACHE_INFO -- cgit v1.2.3 From 2d8f24d09522389b8c382f46cd0a69d32ac959fb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:21:35 -0700 Subject: [PATCH] larger b_size, and misc fixlets Miscellany. - make the printk in buffer_io_error() sector_t-aware. - Some buffer.c cleanups from AntonA: remove a couple of !uptodate checks, and set a new buffer's b_blocknr to -1 in a more sensible place. 
- Make buffer_head.b_size a 32-bit quantity. Needed for 64k pagesize on ia64. Does not increase sizeof(struct buffer_head). --- fs/buffer.c | 22 +++++++++------------- include/linux/buffer_head.h | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 760540c26caa..123d8dcb5ea4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -179,8 +179,8 @@ __clear_page_buffers(struct page *page) static void buffer_io_error(struct buffer_head *bh) { - printk(KERN_ERR "Buffer I/O error on device %s, logical block %ld\n", - bdevname(bh->b_bdev), bh->b_blocknr); + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Ld\n", + bdevname(bh->b_bdev), (u64)bh->b_blocknr); } /* @@ -189,12 +189,12 @@ static void buffer_io_error(struct buffer_head *bh) */ void end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - if (!uptodate) - buffer_io_error(bh); - if (uptodate) + if (uptodate) { set_buffer_uptodate(bh); - else + } else { + buffer_io_error(bh); clear_buffer_uptodate(bh); + } unlock_buffer(bh); put_bh(bh); } @@ -519,14 +519,12 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) BUG_ON(!buffer_async_read(bh)); - if (!uptodate) - buffer_io_error(bh); - page = bh->b_page; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); + buffer_io_error(bh); SetPageError(page); } @@ -579,13 +577,11 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) BUG_ON(!buffer_async_write(bh)); - if (!uptodate) - buffer_io_error(bh); - page = bh->b_page; if (uptodate) { set_buffer_uptodate(bh); } else { + buffer_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); } @@ -907,6 +903,7 @@ try_again: bh->b_bdev = NULL; bh->b_this_page = head; + bh->b_blocknr = -1; head = bh; bh->b_state = 0; @@ -2442,7 +2439,6 @@ static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long fla struct buffer_head * bh = (struct buffer_head *)data; memset(bh, 0, 
sizeof(*bh)); - bh->b_blocknr = -1; INIT_LIST_HEAD(&bh->b_assoc_buffers); } } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b2c54106b8c8..328af2a6c275 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -44,7 +44,7 @@ struct buffer_head { struct page *b_page; /* the page this bh is mapped to */ sector_t b_blocknr; /* block number */ - unsigned short b_size; /* block size */ + u32 b_size; /* block size */ char *b_data; /* pointer to data block */ struct block_device *b_bdev; -- cgit v1.2.3 From 1f6acea0de867d7f5e5a43ba43cf3be744da412c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:22:01 -0700 Subject: [PATCH] pdflush exclusion infrastructure Collision avoidance for pdflush threads. Turns the request_queue-based `unsigned long ra_pages' into a structure which contains ra_pages as well as a longword. That longword is used to record the fact that a pdflush thread is currently writing something back against this request_queue. Avoids the situation where several pdflush threads are sleeping on the same request_queue. This patch provides only the infrastructure for the pdflush exclusion. 
This infrastructure gets used in pdflush-single.patch --- drivers/block/blkpg.c | 15 ++++++++------- drivers/block/ll_rw_blk.c | 14 ++++++++------ fs/block_dev.c | 17 +++++++++-------- fs/fs-writeback.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/inode.c | 20 ++++++++++++-------- fs/ntfs/super.c | 5 +++-- fs/open.c | 3 ++- include/linux/backing-dev.h | 30 ++++++++++++++++++++++++++++++ include/linux/blkdev.h | 9 +++------ include/linux/fs.h | 3 ++- include/linux/mm.h | 1 - mm/page-writeback.c | 8 ++------ mm/pdflush.c | 4 +++- mm/readahead.c | 6 +++++- 14 files changed, 125 insertions(+), 48 deletions(-) create mode 100644 include/linux/backing-dev.h (limited to 'include') diff --git a/drivers/block/blkpg.c b/drivers/block/blkpg.c index e8059084b8f0..595fa49af3ef 100644 --- a/drivers/block/blkpg.c +++ b/drivers/block/blkpg.c @@ -35,6 +35,7 @@ #include #include #include /* for EXPORT_SYMBOL */ +#include #include @@ -219,7 +220,7 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) unsigned short usval; kdev_t dev = to_kdev_t(bdev->bd_dev); int holder; - unsigned long *ra_pages; + struct backing_dev_info *bdi; intval = block_ioctl(bdev, cmd, arg); if (intval != -ENOTTY) @@ -241,20 +242,20 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - ra_pages = blk_get_ra_pages(bdev); - if (ra_pages == NULL) + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) return -ENOTTY; - *ra_pages = (arg * 512) / PAGE_CACHE_SIZE; + bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKRAGET: case BLKFRAGET: if (!arg) return -EINVAL; - ra_pages = blk_get_ra_pages(bdev); - if (ra_pages == NULL) + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) return -ENOTTY; - return put_user((*ra_pages * PAGE_CACHE_SIZE) / 512, + return put_user((bdi->ra_pages * PAGE_CACHE_SIZE) / 512, (long *)arg); case BLKSECTGET: diff --git a/drivers/block/ll_rw_blk.c 
b/drivers/block/ll_rw_blk.c index 5430dea71325..51fd5be00995 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -100,21 +101,21 @@ inline request_queue_t *blk_get_queue(kdev_t dev) } /** - * blk_get_ra_pages - get the address of a queue's readahead tunable + * blk_get_backing_dev_info - get the address of a queue's backing_dev_info * @dev: device * * Locates the passed device's request queue and returns the address of its - * readahead setting. + * backing_dev_info * * Will return NULL if the request queue cannot be located. */ -unsigned long *blk_get_ra_pages(struct block_device *bdev) +struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - unsigned long *ret = NULL; + struct backing_dev_info *ret = NULL; request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev)); if (q) - ret = &q->ra_pages; + ret = &q->backing_dev_info; return ret; } @@ -153,7 +154,8 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) q->max_phys_segments = MAX_PHYS_SEGMENTS; q->max_hw_segments = MAX_HW_SEGMENTS; q->make_request_fn = mfn; - q->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + q->backing_dev_info.state = 0; blk_queue_max_sectors(q, MAX_SECTORS); blk_queue_hardsect_size(q, 512); diff --git a/fs/block_dev.c b/fs/block_dev.c index f9326d65a756..76c5e5cf0555 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,7 +331,7 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = new_bdev; inode->i_data.a_ops = &def_blk_aops; inode->i_data.gfp_mask = GFP_USER; - inode->i_data.ra_pages = &default_ra_pages; + inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); bdev = bdfind(dev, head); if (!bdev) { @@ -594,11 +594,12 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file * } } } - if 
(bdev->bd_inode->i_data.ra_pages == &default_ra_pages) { - unsigned long *ra_pages = blk_get_ra_pages(bdev); - if (ra_pages == NULL) - ra_pages = &default_ra_pages; - inode->i_data.ra_pages = ra_pages; + if (bdev->bd_inode->i_data.backing_dev_info == + &default_backing_dev_info) { + struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + inode->i_data.backing_dev_info = bdi; } if (bdev->bd_op->open) { ret = bdev->bd_op->open(inode, file); @@ -624,7 +625,7 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file * out2: if (!bdev->bd_openers) { bdev->bd_op = NULL; - bdev->bd_inode->i_data.ra_pages = &default_ra_pages; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) { blkdev_put(bdev->bd_contains, BDEV_RAW); bdev->bd_contains = NULL; @@ -698,7 +699,7 @@ int blkdev_put(struct block_device *bdev, int kind) __MOD_DEC_USE_COUNT(bdev->bd_op->owner); if (!bdev->bd_openers) { bdev->bd_op = NULL; - bdev->bd_inode->i_data.ra_pages = &default_ra_pages; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) { blkdev_put(bdev->bd_contains, BDEV_RAW); bdev->bd_contains = NULL; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d1880be27437..139283a310a6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,6 +19,7 @@ #include #include #include +#include /** * __mark_inode_dirty - internal function @@ -508,3 +509,40 @@ int generic_osync_inode(struct inode *inode, int what) return err; } + +/** + * writeback_acquire: attempt to get exclusive writeback access to a device + * @bdi: the device's backing_dev_info structure + * + * It is a waste of resources to have more than one pdflush thread blocked on + * a single request queue. Exclusion at the request_queue level is obtained + * via a flag in the request_queue's backing_dev_info.state. 
+ * + * Non-request_queue-backed address_spaces will share default_backing_dev_info, + * unless they implement their own. Which is somewhat inefficient, as this + * may prevent concurrent writeback against multiple devices. + */ +int writeback_acquire(struct backing_dev_info *bdi) +{ + return !test_and_set_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_in_progress: determine whether there is writeback in progress + * against a backing device. + * @bdi: the device's backing_dev_info structure. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_release: relinquish exclusive writeback access against a device. + * @bdi: the device's backing_dev_info structure + */ +void writeback_release(struct backing_dev_info *bdi) +{ + BUG_ON(!writeback_in_progress(bdi)); + clear_bit(BDI_pdflush, &bdi->state); +} diff --git a/fs/inode.c b/fs/inode.c index fc748da51c0c..1c1256a5f799 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * New inode.c implementation. 
@@ -83,6 +84,8 @@ static struct inode *alloc_inode(struct super_block *sb) inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); if (inode) { + struct address_space * const mapping = &inode->i_data; + inode->i_sb = sb; inode->i_dev = sb->s_dev; inode->i_blkbits = sb->s_blocksize_bits; @@ -100,16 +103,17 @@ static struct inode *alloc_inode(struct super_block *sb) inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; - inode->i_data.a_ops = &empty_aops; - inode->i_data.host = inode; - inode->i_data.gfp_mask = GFP_HIGHUSER; - inode->i_data.dirtied_when = 0; - inode->i_mapping = &inode->i_data; - inode->i_data.ra_pages = &default_ra_pages; - inode->i_data.assoc_mapping = NULL; + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; if (sb->s_bdev) - inode->i_data.ra_pages = sb->s_bdev->bd_inode->i_mapping->ra_pages; + inode->i_data.backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; } return inode; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index b51def4b641c..546eb46bb51a 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -26,6 +26,7 @@ #include #include #include /* For bdev_hardsect_size(). 
*/ +#include #include "ntfs.h" #include "sysctl.h" @@ -1519,8 +1520,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) vol->mftbmp_mapping.assoc_mapping = NULL; vol->mftbmp_mapping.dirtied_when = 0; vol->mftbmp_mapping.gfp_mask = GFP_HIGHUSER; - vol->mftbmp_mapping.ra_pages = - sb->s_bdev->bd_inode->i_mapping->ra_pages; + vol->mftbmp_mapping.backing_dev_info = + sb->s_bdev->bd_inode->i_mapping->backing_dev_info; /* * Default is group and other don't have any access to files or diff --git a/fs/open.c b/fs/open.c index e0231b191336..2ef917feadb7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -632,7 +633,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) goto cleanup_file; } - f->f_ra.ra_pages = *inode->i_mapping->ra_pages; + f->f_ra.ra_pages = inode->i_mapping->backing_dev_info->ra_pages; f->f_dentry = dentry; f->f_vfsmnt = mnt; f->f_pos = 0; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h new file mode 100644 index 000000000000..075cacc389e1 --- /dev/null +++ b/include/linux/backing-dev.h @@ -0,0 +1,30 @@ +/* + * include/linux/backing-dev.h + * + * low-level device information and state which is propagated up through + * to high-level code. 
+ */ + +#ifndef _LINUX_BACKING_DEV_H +#define _LINUX_BACKING_DEV_H + +/* + * Bits in backing_dev_info.state + */ +enum bdi_state { + BDI_pdflush, /* A pdflush thread is working this device */ + BDI_unused, /* Available bits start here */ +}; + +struct backing_dev_info { + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned long state; /* Always use atomic bitops on this */ +}; + +extern struct backing_dev_info default_backing_dev_info; + +int writeback_acquire(struct backing_dev_info *bdi); +int writeback_in_progress(struct backing_dev_info *bdi); +void writeback_release(struct backing_dev_info *bdi); + +#endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d8175ccc104c..ac373e6a2454 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -162,11 +163,7 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; - /* - * The VM-level readahead tunable for this device. In - * units of PAGE_CACHE_SIZE pages. - */ - unsigned long ra_pages; + struct backing_dev_info backing_dev_info; /* * The queue owner gets to use this for whatever they like. 
@@ -328,7 +325,7 @@ extern void blk_queue_hardsect_size(request_queue_t *q, unsigned short); extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long); extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *); extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn); -extern unsigned long *blk_get_ra_pages(struct block_device *bdev); +extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 25578c7a5e62..374045884cb8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -305,6 +305,7 @@ struct address_space_operations { int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); }; +struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ @@ -320,7 +321,7 @@ struct address_space { spinlock_t i_shared_lock; /* and spinlock protecting it */ unsigned long dirtied_when; /* jiffies of first page dirtying */ int gfp_mask; /* how to allocate the pages */ - unsigned long *ra_pages; /* device readahead */ + struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f0b56f0183b..451cdff1ec16 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -454,7 +454,6 @@ void do_page_cache_readahead(struct file *file, void page_cache_readahead(struct file *file, unsigned long offset); void page_cache_readaround(struct file *file, unsigned long offset); void handle_ra_thrashing(struct file *file); -extern unsigned long default_ra_pages; /* vma is the first one with 
address < vma->vm_end, * and even address < vma->vm_start. Have to extend vma. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 661f1860880c..e2c65e1057df 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -166,6 +166,7 @@ int pdflush_flush(unsigned long nr_pages) * to perform their I/O against a large file. */ static int wb_writeback_jifs = 5 * HZ; +static struct timer_list wb_timer; /* * Periodic writeback of "old" data. @@ -206,16 +207,11 @@ static void wb_kupdate(unsigned long arg) yield(); } run_task_queue(&tq_disk); + mod_timer(&wb_timer, jiffies + wb_writeback_jifs); } -/* - * The writeback timer, for kupdate-style functionality - */ -static struct timer_list wb_timer; - static void wb_timer_fn(unsigned long unused) { - mod_timer(&wb_timer, jiffies + wb_writeback_jifs); pdflush_operation(wb_kupdate, 0); } diff --git a/mm/pdflush.c b/mm/pdflush.c index 07ceb439e9ae..5e7d0125c39d 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -103,6 +103,7 @@ static int __pdflush(struct pdflush_work *my_work) preempt_disable(); spin_lock_irq(&pdflush_lock); nr_pdflush_threads++; +// printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid); for ( ; ; ) { struct pdflush_work *pdf; @@ -124,7 +125,7 @@ static int __pdflush(struct pdflush_work *my_work) if (jiffies - last_empty_jifs > 1 * HZ) { /* unlocked list_empty() test is OK here */ if (list_empty(&pdflush_list)) { - /* unlocked nr_pdflush_threads test is OK here */ + /* unlocked test is OK here */ if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) start_one_pdflush_thread(); } @@ -147,6 +148,7 @@ static int __pdflush(struct pdflush_work *my_work) } } nr_pdflush_threads--; +// printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid); spin_unlock_irq(&pdflush_lock); preempt_enable(); return 0; diff --git a/mm/readahead.c b/mm/readahead.c index b59f8f4c57bc..03fd19c23bbb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -11,8 +11,12 @@ #include #include #include +#include -unsigned 
long default_ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; +struct backing_dev_info default_backing_dev_info = { + ra_pages: (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + state: 0, +}; /* * Return max readahead size for this inode in number-of-pages. -- cgit v1.2.3 From 17a74e8800eb0f00a74b9c1d269483e4f9f22bc8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:22:12 -0700 Subject: [PATCH] pdflush exclusion Use the pdflush exclusion infrastructure to ensure that only one pdflush thread is ever performing writeback against a particular request_queue. This works rather well. It requires a lot of activity against a lot of disks to cause more pdflush threads to start up. Possibly the thread-creation logic is a little weak: it starts more threads when a pdflush thread goes back to sleep. It may be better to start new threads within pdflush_operation(). All non-request_queue-backed address_spaces share the global default_backing_dev_info structure. So at present only a single pdflush instance will be available for background writeback of *all* NFS filesystems (for example). If there is benefit in concurrent background writeback for multiple NFS mounts then NFS would need to create per-mount backing_dev_info structures and install those into new inodes' address_spaces in some manner. 
--- fs/fs-writeback.c | 56 +++++++++++++++++++++++++++++------------------ fs/inode.c | 19 +++++----------- include/linux/fs.h | 2 -- include/linux/writeback.h | 9 ++++++++ mm/page-writeback.c | 6 ++--- 5 files changed, 52 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 139283a310a6..b2d84f68c3da 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -187,6 +187,9 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) static void __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) { + if (current_is_pdflush() && (inode->i_state & I_LOCK)) + return; + while (inode->i_state & I_LOCK) { __iget(inode); spin_unlock(&inode_lock); @@ -213,6 +216,9 @@ void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) * had their first dirtying at a time earlier than *older_than_this. * * Called under inode_lock. + * + * If we're a pdlfush thread, then implement pdlfush collision avoidance + * against the entire list. */ static void __sync_list(struct list_head *head, int sync_mode, int *nr_to_write, unsigned long *older_than_this) @@ -223,6 +229,8 @@ static void __sync_list(struct list_head *head, int sync_mode, while ((tmp = head->prev) != head) { struct inode *inode = list_entry(tmp, struct inode, i_list); struct address_space *mapping = inode->i_mapping; + struct backing_dev_info *bdi; + int really_sync; /* Was this inode dirtied after __sync_list was called? 
*/ @@ -233,10 +241,18 @@ static void __sync_list(struct list_head *head, int sync_mode, time_after(mapping->dirtied_when, *older_than_this)) break; + bdi = mapping->backing_dev_info; + if (current_is_pdflush() && !writeback_acquire(bdi)) + break; + really_sync = (sync_mode == WB_SYNC_ALL); if ((sync_mode == WB_SYNC_LAST) && (head->prev == head)) really_sync = 1; __writeback_single_inode(inode, really_sync, nr_to_write); + + if (current_is_pdflush()) + writeback_release(bdi); + if (nr_to_write && *nr_to_write == 0) break; } @@ -255,6 +271,8 @@ static void __sync_list(struct list_head *head, int sync_mode, * * If `older_than_this' is non-zero then only flush inodes which have a * flushtime older than *older_than_this. + * + * This is a "memory cleansing" operation, not a "data integrity" operation. */ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, unsigned long *older_than_this) @@ -276,29 +294,12 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, if (sb->s_writeback_gen == writeback_gen) continue; sb->s_writeback_gen = writeback_gen; - - if (current->flags & PF_FLUSHER) { - if (sb->s_flags & MS_FLUSHING) { - /* - * There's no point in two pdflush threads - * flushing the same device. But for other - * callers, we want to perform the flush - * because the fdatasync is how we implement - * writer throttling. - */ - continue; - } - sb->s_flags |= MS_FLUSHING; - } - if (!list_empty(&sb->s_dirty)) { spin_unlock(&sb_lock); __sync_list(&sb->s_dirty, sync_mode, nr_to_write, older_than_this); spin_lock(&sb_lock); } - if (current->flags & PF_FLUSHER) - sb->s_flags &= ~MS_FLUSHING; if (nr_to_write && *nr_to_write == 0) break; } @@ -307,7 +308,7 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, } /* - * Called under inode_lock + * Called under inode_lock. 
*/ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes) { @@ -318,7 +319,17 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes) inode = list_entry(tmp, struct inode, i_list); if (!atomic_read(&inode->i_count)) { + struct backing_dev_info *bdi; + + bdi = inode->i_mapping->backing_dev_info; + if (current_is_pdflush() && !writeback_acquire(bdi)) + goto out; + __sync_single_inode(inode, 0, NULL); + + if (current_is_pdflush()) + writeback_release(bdi); + nr_inodes--; /* @@ -328,7 +339,7 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes) tmp = head; } } - +out: return nr_inodes; } @@ -421,7 +432,11 @@ void sync_inodes(void) } } -void try_to_writeback_unused_inodes(unsigned long pexclusive) +/* + * FIXME: the try_to_writeback_unused functions look dreadfully similar to + * writeback_unlocked_inodes... + */ +void try_to_writeback_unused_inodes(unsigned long unused) { struct super_block * sb; int nr_inodes = inodes_stat.nr_unused; @@ -440,7 +455,6 @@ void try_to_writeback_unused_inodes(unsigned long pexclusive) } spin_unlock(&sb_lock); spin_unlock(&inode_lock); - clear_bit(0, (unsigned long *)pexclusive); } /** diff --git a/fs/inode.c b/fs/inode.c index 1c1256a5f799..68c1ee161252 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -404,21 +404,14 @@ void prune_icache(int goal) dispose_list(freeable); /* - * If we didn't freed enough clean inodes schedule - * a sync of the dirty inodes, we cannot do it - * from here or we're either synchronously dogslow - * or we deadlock with oom. + * If we didn't free enough clean inodes then schedule writeback of + * the dirty inodes. We cannot do it from here or we're either + * synchronously dogslow or we deadlock with oom. 
*/ - if (goal) { - static unsigned long exclusive; - - if (!test_and_set_bit(0, &exclusive)) { - if (pdflush_operation(try_to_writeback_unused_inodes, - (unsigned long)&exclusive)) - clear_bit(0, &exclusive); - } - } + if (goal) + pdflush_operation(try_to_writeback_unused_inodes, 0); } + /* * This is called from kswapd when we think we need some * more memory, but aren't really sure how much. So we diff --git a/include/linux/fs.h b/include/linux/fs.h index 374045884cb8..b936413f96f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -112,7 +112,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time; #define MS_MOVE 8192 #define MS_REC 16384 #define MS_VERBOSE 32768 -#define MS_FLUSHING (1<<16) /* inodes are currently under writeout */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -156,7 +155,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time; #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC)) #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) -#define IS_FLUSHING(inode) __IS_FLG(inode, MS_FLUSHING) #define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1978e06d1131..a089dd009fc1 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -12,6 +12,15 @@ extern spinlock_t inode_lock; extern struct list_head inode_in_use; extern struct list_head inode_unused; +/* + * Yes, writeback.h requires sched.h + * No, sched.h is not included from here. 
+ */ +static inline int current_is_pdflush(void) +{ + return current->flags & PF_FLUSHER; +} + /* * fs/fs-writeback.c */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e2c65e1057df..defc6988a305 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Memory thresholds, in percentages @@ -86,10 +87,7 @@ void balance_dirty_pages(struct address_space *mapping) wake_pdflush = 1; } - if (wake_pdflush && !IS_FLUSHING(mapping->host)) { - /* - * There is no flush thread against this device. Start one now. - */ + if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) { if (dirty_and_writeback > async_thresh) { pdflush_flush(dirty_and_writeback - async_thresh); yield(); -- cgit v1.2.3 From acb5f6f9bb66a409205a3a9fa6dffa98e8520d00 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:22:24 -0700 Subject: [PATCH] writeback tuning Tune up the VM-based writeback a bit. - Always use the multipage clustered-writeback function from within shrink_cache(), even if the page's mapping has a NULL ->vm_writeback(). So clustered writeback is turned on for all address_spaces, not just ext2. Subtle effect of this change: it is now the case that *all* writeback proceeds along the mapping->dirty_pages list. The orderedness of the page LRUs no longer has an impact on disk scheduling. So we only have one list to keep well-sorted rather than two, and churning pages around on the LRU will no longer damage write bandwidth - it's all up to the filesystem. - Decrease the clustered writeback from 1024 pages(!) to 32 pages. (1024 was a leftover from when this code was always dispatching writeback to a pdflush thread). - Fix wakeup_bdflush() so that it actually does write something (duh). do_wp_page() needs to call balance_dirty_pages_ratelimited(), so we throttle mmap page-dirtiers in the same way as write(2) page-dirtiers. This may make wakeup_bdflush() obsolete, but it doesn't hurt. 
- Converts generic_vm_writeback() to directly call ->writeback_mapping(), rather than going through writeback_single_inode(). This prevents memory allocators from blocking on the inode's I_LOCK. But it does mean that two processes can be writing pages from the same mapping at the same time. If filesystems care about this (for layout reasons) then they should serialise in their ->writeback_mapping a_op. This means that memory-allocators will write back only pages, not pages and inodes. There are no locks in that writeback path (except for request queue exhaustion). Reduces memory allocation latency. - Implement new background_writeback function, which when kicked off will perform writeback until dirty memory falls below the background threshold. - Put written-back pages onto the remote end of the page LRU. It does this in the slow-and-stupid way at present. pagemap_lru_lock stress-relief is planned... - Remove the funny writeback_unused_inodes() stuff from prune_icache(). Writeback from wakeup_bdflush() and the `kupdate' function now just naturally cleanses the oldest inodes so we don't need to do anything there. - Dirty memory balancing is still using magic numbers: "after you dirtied your 1,000th page, go write 1,500". Obviously, this needs more work. 
--- fs/buffer.c | 5 - fs/inode.c | 8 -- include/linux/writeback.h | 10 +- mm/filemap.c | 4 +- mm/page-writeback.c | 229 ++++++++++++++++++++++++++-------------------- mm/swap_state.c | 18 ++++ mm/vmscan.c | 41 +++------ 7 files changed, 165 insertions(+), 150 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 123d8dcb5ea4..d590735164df 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2408,11 +2408,6 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } -void wakeup_bdflush(void) -{ - pdflush_flush(0); -} - /* * Buffer-head allocation */ diff --git a/fs/inode.c b/fs/inode.c index 68c1ee161252..b750b108555b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -402,14 +402,6 @@ void prune_icache(int goal) spin_unlock(&inode_lock); dispose_list(freeable); - - /* - * If we didn't free enough clean inodes then schedule writeback of - * the dirty inodes. We cannot do it from here or we're either - * synchronously dogslow or we deadlock with oom. - */ - if (goal) - pdflush_operation(try_to_writeback_unused_inodes, 0); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a089dd009fc1..e345205b6d86 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,17 +46,9 @@ static inline void wait_on_inode(struct inode *inode) /* * mm/page-writeback.c */ -/* - * How much data to write out at a time in various places. This isn't - * really very important - it's just here to prevent any thread from - * locking an inode for too long and blocking other threads which wish - * to write the same file for allocation throttling purposes. 
- */ -#define WRITEOUT_PAGES ((4096 * 1024) / PAGE_CACHE_SIZE) - void balance_dirty_pages(struct address_space *mapping); void balance_dirty_pages_ratelimited(struct address_space *mapping); -int pdflush_flush(unsigned long nr_pages); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); +int writeback_mapping(struct address_space *mapping, int *nr_to_write); #endif /* WRITEBACK_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 752547b2d3cb..769a1080c9df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -453,9 +453,7 @@ EXPORT_SYMBOL(fail_writepage); */ int filemap_fdatawrite(struct address_space *mapping) { - if (mapping->a_ops->writeback_mapping) - return mapping->a_ops->writeback_mapping(mapping, NULL); - return generic_writeback_mapping(mapping, NULL); + return writeback_mapping(mapping, NULL); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index defc6988a305..748dbbb7e789 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -22,6 +22,14 @@ #include #include +/* + * The maximum number of pages to writeout in a single bdflush/kupdate + * operation. We do this so we don't hold I_LOCK against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. + */ +#define MAX_WRITEBACK_PAGES 1024 + /* * Memory thresholds, in percentages * FIXME: expose these via /proc or whatever. @@ -42,6 +50,8 @@ static int dirty_async_ratio = 50; */ static int dirty_sync_ratio = 60; +static void background_writeout(unsigned long unused); + /* * balance_dirty_pages() must be called by processes which are * generating dirty data. It looks at the number of dirty pages @@ -54,15 +64,16 @@ static int dirty_sync_ratio = 60; * - Does nothing at all. * * balance_dirty_pages() can sleep. + * + * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty + * inode on the superblock list. It should wait when nr_to_write is + * exhausted. Doesn't seem to matter. 
*/ void balance_dirty_pages(struct address_space *mapping) { const int tot = nr_free_pagecache_pages(); struct page_state ps; - int background_thresh; - int async_thresh; - int sync_thresh; - int wake_pdflush = 0; + int background_thresh, async_thresh, sync_thresh; unsigned long dirty_and_writeback; get_page_state(&ps); @@ -77,27 +88,27 @@ void balance_dirty_pages(struct address_space *mapping) writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL); get_page_state(&ps); - dirty_and_writeback = ps.nr_dirty + ps.nr_writeback; - wake_pdflush = 1; } else if (dirty_and_writeback > async_thresh) { int nr_to_write = 1500; writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); - } else if (dirty_and_writeback > background_thresh) { - wake_pdflush = 1; + get_page_state(&ps); } - if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) { - if (dirty_and_writeback > async_thresh) { - pdflush_flush(dirty_and_writeback - async_thresh); - yield(); - } - } + if (!writeback_in_progress(mapping->backing_dev_info) && + ps.nr_dirty > background_thresh) + pdflush_operation(background_writeout, 0); } -/* - * Front-end to balance_dirty_pages - just to make sure it's not called - * too often. +/** + * balance_dirty_pages_ratelimited - balance dirty memory state + * @mapping - address_space which was dirtied + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * balance_dirty_pages_ratelimited() may sleep. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { @@ -118,39 +129,38 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) } /* - * Here are some applications of the pdflush thread pool + * writeback at least _min_pages, and keep writing until the amount of dirty + * memory is less than the background threshold, or until we're all clean. 
*/ - -/* - * Start heavy writeback of everything. This is the analogue of the old - * wakeup_bdflush(). Returns zero if a thread was successfully launched. - * - * Is passed in the number of pages to write. - * - * We yield, to allow page allocators to perform their I/O against large files. - */ - -static void pdflush_bdflush(unsigned long arg) +static void background_writeout(unsigned long _min_pages) { - int nr_pages = arg; - - CHECK_EMERGENCY_SYNC + const int tot = nr_free_pagecache_pages(); + const int background_thresh = (dirty_background_ratio * tot) / 100; + long min_pages = _min_pages; + int nr_to_write; - while (nr_pages) { - int nr_to_write = WRITEOUT_PAGES; + do { + struct page_state ps; - if (nr_to_write > nr_pages) - nr_to_write = nr_pages; - nr_pages -= nr_to_write; + get_page_state(&ps); + if (ps.nr_dirty < background_thresh && min_pages <= 0) + break; + nr_to_write = MAX_WRITEBACK_PAGES; writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); - yield(); - } + min_pages -= MAX_WRITEBACK_PAGES - nr_to_write; + } while (nr_to_write <= 0); run_task_queue(&tq_disk); } -int pdflush_flush(unsigned long nr_pages) +/* + * Start heavy writeback of everything. + */ +void wakeup_bdflush(void) { - return pdflush_operation(pdflush_bdflush, nr_pages); + struct page_state ps; + + get_page_state(&ps); + pdflush_operation(background_writeout, ps.nr_dirty); } /* @@ -174,43 +184,41 @@ static struct timer_list wb_timer; * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * We also limit the number of pages which are written out, to avoid writing - * huge amounts of data against a single file, which would cause memory - * allocators to block for too long. + * Try to run once per wb_writeback_jifs jiffies. But if a writeback event + * takes longer than a wb_writeback_jifs interval, then leave a one-second + * gap. + * + * older_than_this takes precedence over nr_to_write. 
So we'll only write back + * all dirty pages if they are all attached to "old" mappings. */ static void wb_kupdate(unsigned long arg) { - unsigned long oldest_jif = jiffies - 30*HZ; + unsigned long oldest_jif; + unsigned long start_jif; + unsigned long next_jif; struct page_state ps; - int total_to_write; int nr_to_write; sync_supers(); - get_page_state(&ps); - total_to_write = ps.nr_dirty / 6; - if (total_to_write < 16384) { - total_to_write = 16384; - if (total_to_write > ps.nr_dirty) - total_to_write = ps.nr_dirty; - } - while (total_to_write > 0) { - nr_to_write = total_to_write; - if (nr_to_write > WRITEOUT_PAGES) - nr_to_write = WRITEOUT_PAGES; - total_to_write -= nr_to_write; - writeback_unlocked_inodes(&nr_to_write, - WB_SYNC_NONE, &oldest_jif); - yield(); - } + oldest_jif = jiffies - 30*HZ; + start_jif = jiffies; + next_jif = start_jif + wb_writeback_jifs; + nr_to_write = ps.nr_dirty; + writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif); run_task_queue(&tq_disk); - mod_timer(&wb_timer, jiffies + wb_writeback_jifs); + yield(); + + if (time_before(next_jif, jiffies + HZ)) + next_jif = jiffies + HZ; + mod_timer(&wb_timer, next_jif); } static void wb_timer_fn(unsigned long unused) { - pdflush_operation(wb_kupdate, 0); + if (pdflush_operation(wb_kupdate, 0) < 0) + mod_timer(&wb_timer, jiffies + HZ); } static int __init wb_timer_init(void) @@ -225,23 +233,42 @@ static int __init wb_timer_init(void) module_init(wb_timer_init); /* - * FIXME: PG_launder gets cleared by accident. + * A library function, which implements the vm_writeback a_op. It's fairly + * lame at this time. The idea is: the VM wants to liberate this page, + * so we pass the page to the address_space and give the fs the opportunity + * to write out lots of pages around this one. It allows extent-based + * filesytems to do intelligent things. It lets delayed-allocate filesystems + * perform better file layout. 
It lets the address_space opportunistically + * write back disk-contiguous pages which are in other zones. + * + * FIXME: the VM wants to start I/O against *this* page. Because its zone + * is under pressure. But this function may start writeout against a + * totally different set of pages. Unlikely to be a huge problem, but if it + * is, we could just writepage the page if it is still (PageDirty && + * !PageWriteback) (See below). + * + * Another option is to just reposition page->mapping->dirty_pages so we + * *know* that the page will be written. That will work fine, but seems + * unpleasant. (If the page is not for-sure on ->dirty_pages we're dead). + * Plus it assumes that the address_space is performing writeback in + * ->dirty_pages order. + * + * So. The proper fix is to leave the page locked-and-dirty and to pass + * it all the way down. */ -static int writeback_mapping(struct page *page, int *nr_to_write) +int generic_vm_writeback(struct page *page, int *nr_to_write) { struct inode *inode = page->mapping->host; - SetPageDirty(page); - /* - * We don't own this inode, so we don't want the address_space - * vanishing while writeback is walking the list + * We don't own this inode, and we don't want the address_space + * vanishing while writeback is walking its pages. */ inode = igrab(inode); unlock_page(page); if (inode) { - writeback_single_inode(inode, 0, nr_to_write); + writeback_mapping(inode->i_mapping, nr_to_write); /* * This iput() will internally call ext2_discard_prealloc(), @@ -251,23 +278,18 @@ static int writeback_mapping(struct page *page, int *nr_to_write) * Just a waste of cycles. */ iput(inode); +#if 0 + if (!PageWriteback(page) && PageDirty(page)) { + lock_page(page); + if (!PageWriteback(page) && TestClearPageDirty(page)) + page->mapping->a_ops->writepage(page); + else + unlock_page(page); + } +#endif } return 0; } - -/* - * A library function, which implements the vm_writeback a_op. It's fairly - * lame at this time. 
The idea is: the VM wants to liberate this page, - * so we pass the page to the address_space and give the fs the opportunity - * to write out lots of pages around this one. It allows extent-based - * filesytems to do intelligent things. It lets delayed-allocate filesystems - * perform better file layout. It lets the address_space opportunistically - * write back disk-contiguous pages which are in other zones. - */ -int generic_vm_writeback(struct page *page, int *nr_to_write) -{ - return writeback_mapping(page, nr_to_write); -} EXPORT_SYMBOL(generic_vm_writeback); /** @@ -278,8 +300,7 @@ EXPORT_SYMBOL(generic_vm_writeback); * @nr_to_write: subtract the number of written pages from *@nr_to_write * * This is a library function, which implements the writeback_mapping() - * address_space_operation for filesystems which are using multipage BIO - * writeback. + * address_space_operation. * * (The next two paragraphs refer to code which isn't here yet, but they * explain the presence of address_space.io_pages) @@ -309,10 +330,10 @@ EXPORT_SYMBOL(generic_vm_writeback); */ int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write) { + int (*writepage)(struct page *) = mapping->a_ops->writepage; int ret = 0; int done = 0; int err; - int (*writepage)(struct page *) = mapping->a_ops->writepage; write_lock(&mapping->page_lock); @@ -336,23 +357,29 @@ int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write) continue; } list_add(&page->list, &mapping->locked_pages); - page_cache_get(page); write_unlock(&mapping->page_lock); - lock_page(page); - if (TestClearPageDirty(page)) { + /* It may have been removed from swapcache: check ->mapping */ + if (page->mapping && TestClearPageDirty(page) && + !PageWriteback(page)) { + /* FIXME: batch this up */ + if (!PageActive(page) && PageLRU(page)) { + spin_lock(&pagemap_lru_lock); + if (!PageActive(page) && PageLRU(page)) { + list_del(&page->lru); + list_add(&page->lru, &inactive_list); + } + 
spin_unlock(&pagemap_lru_lock); + } if (current->flags & PF_MEMALLOC) SetPageLaunder(page); err = writepage(page); if (!ret) ret = err; - if (nr_to_write) { - --(*nr_to_write); - if (*nr_to_write <= 0) - done = 1; - } + if (nr_to_write && --(*nr_to_write) <= 0) + done = 1; } else { unlock_page(page); } @@ -372,14 +399,20 @@ int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write) } EXPORT_SYMBOL(generic_writeback_mapping); +int writeback_mapping(struct address_space *mapping, int *nr_to_write) +{ + if (mapping->a_ops->writeback_mapping) + return mapping->a_ops->writeback_mapping(mapping, nr_to_write); + return generic_writeback_mapping(mapping, nr_to_write); +} + /** * write_one_page - write out a single page and optionally wait on I/O * * @page - the page to write * @wait - if true, wait on writeout * - * The page must be locked by the caller and will come unlocked when I/O - * completes. + * The page must be locked by the caller and will be unlocked upon return. * * write_one_page() returns a negative error code if I/O failed. */ diff --git a/mm/swap_state.c b/mm/swap_state.c index acdabaeccb4a..e802fb3aa707 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -31,7 +31,25 @@ static int swap_writepage(struct page *page) return 0; } +/* + * swapper_space doesn't have a real inode, so it gets a special vm_writeback() + * so we don't need swap special cases in generic_vm_writeback(). + * + * FIXME: swap pages are locked, but not PageWriteback while under writeout. + * This will confuse throttling in shrink_cache(). It may be advantageous to + * set PG_writeback against swap pages while they're also locked. Either that, + * or special-case swap pages in shrink_cache(). 
+ */ +static int swap_vm_writeback(struct page *page, int *nr_to_write) +{ + struct address_space *mapping = page->mapping; + + unlock_page(page); + return generic_writeback_mapping(mapping, nr_to_write); +} + static struct address_space_operations swap_aops = { + vm_writeback: swap_vm_writeback, writepage: swap_writepage, sync_page: block_sync_page, }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 42b8ba093e9d..73341c18f3f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -458,35 +458,20 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. */ - struct address_space_operations *a_ops; int (*writeback)(struct page *, int *); - int (*writepage)(struct page *); + const int nr_pages = SWAP_CLUSTER_MAX; + int nr_to_write = nr_pages; - /* - * There's no guarantee that writeback() will actually - * start I/O against *this* page. Which is broken if we're - * trying to free memory in a particular zone. FIXME. 
- */ - a_ops = mapping->a_ops; - writeback = a_ops->vm_writeback; - writepage = a_ops->writepage; - if (writeback || writepage) { - SetPageLaunder(page); - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - ClearPageDirty(page); - - if (writeback) { - int nr_to_write = WRITEOUT_PAGES; - writeback(page, &nr_to_write); - } else { - writepage(page); - } - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); - continue; - } + writeback = mapping->a_ops->vm_writeback; + if (writeback == NULL) + writeback = generic_vm_writeback; + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + (*writeback)(page, &nr_to_write); + max_scan -= (nr_pages - nr_to_write); + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + continue; } /* @@ -648,6 +633,8 @@ static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask if (nr_pages <= 0) return 0; + wakeup_bdflush(); + shrink_dcache_memory(priority, gfp_mask); /* After shrinking the dcache, get rid of unused inodes too .. */ -- cgit v1.2.3 From 799391cc6d6ff6b37192eb49d5ea3e3aa1137e31 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:22:50 -0700 Subject: [PATCH] improved I/O scheduling for indirect blocks Fixes a performance problem with many-small-file writeout. At present, files are written out via their mapping and their indirect blocks are written out via the blockdev mapping. As we know that indirects are disk-adjacent to the data it is better to start I/O against the indirects at the same time as the data. The delalloc pathes have code in ext2_writepage() which recognises when the target page->index was at an indirect boundary and does an explicit hunt-and-write against the neighbouring indirect block. Which is ideal. (Unless the file was dirtied seekily and the page which is next to the indirect was not dirtied). 
This patch does it the other way: when we start writeback against a mapping, also start writeback against any dirty buffers which are attached to mapping->private_list. Let the elevator take care of the rest. The patch makes a number of tuning changes to the writeback path in fs-writeback.c. This is very fiddly code: getting the throughput tuned, getting the data-integrity "sync" operations right, avoiding most of the livelock opportunities, getting the `kupdate' function working efficiently, keeping it all at least somewhat comprehensible. An important intent here is to ensure that metadata blocks for inodes are marked dirty before writeback starts working the blockdev mapping, so all the inode blocks are efficiently written back. The patch removes try_to_writeback_unused_inodes(), which became unreferenced in vm-writeback.patch. The patch has a tweak in ext2_put_inode() to prevent ext2 from incorrectly dropping its preallocation window in response to a random iput(). Generally, many-small-file writeout is a lot faster than 2.5.7 (which is linux-before-I-futzed-with-it). The workload which was optimised was tar xfz /nfs/mountpoint/linux-2.4.18.tar.gz ; sync on mem=128M and mem=2048M. With these patches, 2.5.15 is completing in about 2/3 of the time of 2.5.7. But it is only a shade faster than 2.4.19-pre7. Why is 2.5.7 so much slower than 2.4.19? Not sure yet. Heavy dbench loads (dbench 32 on mem=128M) are slightly faster than 2.5.7 and significantly slower than 2.4.19. It appears that the cause is poor read throughput at the later stages of the run, because there are background writeback threads operating at the same time. The 2.4.19-pre8 write scheduling manages to stop writeback during the latter stages of the dbench run in a way which I haven't been able to sanely emulate yet. It may not be desirable to do this anyway - it's optimising for the case where the files are about to be deleted. 
But it would be good to find a way of "pausing" the writeback for a few seconds to allow readers to get an interval of decent bandwidth. tiobench throughput is basically the same across all recent kernels. CPU load on writes is down maybe 30% in 2.5.15. --- fs/buffer.c | 87 +++++++++++++-- fs/ext2/inode.c | 18 +++- fs/fs-writeback.c | 257 ++++++++++++++++++-------------------------- fs/inode.c | 6 ++ fs/super.c | 4 + include/linux/buffer_head.h | 12 ++- include/linux/fs.h | 4 +- include/linux/writeback.h | 8 +- 8 files changed, 225 insertions(+), 171 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index d590735164df..f9923e470bb3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -210,10 +210,7 @@ int sync_blockdev(struct block_device *bdev) if (bdev) { int err; - ret = filemap_fdatawait(bdev->bd_inode->i_mapping); - err = filemap_fdatawrite(bdev->bd_inode->i_mapping); - if (!ret) - ret = err; + ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); err = filemap_fdatawait(bdev->bd_inode->i_mapping); if (!ret) ret = err; @@ -229,12 +226,14 @@ EXPORT_SYMBOL(sync_blockdev); */ int fsync_super(struct super_block *sb) { - sync_inodes_sb(sb); /* All the inodes */ + sync_inodes_sb(sb, 0); DQUOT_SYNC(sb); lock_super(sb); if (sb->s_dirt && sb->s_op && sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); return sync_blockdev(sb->s_bdev); } @@ -276,10 +275,10 @@ int fsync_dev(kdev_t dev) */ asmlinkage long sys_sync(void) { - sync_inodes(); /* All mappings and inodes, including block devices */ + sync_inodes(0); /* All mappings and inodes, including block devices */ DQUOT_SYNC(NULL); sync_supers(); /* Write the superblocks */ - sync_inodes(); /* All the mappings and inodes, again. */ + sync_inodes(1); /* All the mappings and inodes, again. 
*/ return 0; } @@ -775,6 +774,80 @@ int sync_mapping_buffers(struct address_space *mapping) } EXPORT_SYMBOL(sync_mapping_buffers); +/** + * write_mapping_buffers - Start writeout of a mapping's "associated" buffers. + * @mapping - the mapping which wants those buffers written. + * + * Starts I/O against dirty buffers which are on @mapping->private_list. + * Those buffers must be backed by @mapping->assoc_mapping. + * + * The private_list buffers generally contain filesystem indirect blocks. + * The idea is that the filesystem can start I/O against the indirects at + * the same time as running generic_writeback_mapping(), so the indirect's + * I/O will be merged with the data. + * + * We sneakliy write the buffers in probable tail-to-head order. This is + * because generic_writeback_mapping writes in probable head-to-tail + * order. If the file is so huge that the data or the indirects overflow + * the request queue we will at least get some merging this way. + * + * Any clean+unlocked buffers are de-listed. clean/locked buffers must be + * left on the list for an fsync() to wait on. + * + * Couldn't think of a smart way of avoiding livelock, so chose the dumb + * way instead. + * + * FIXME: duplicates fsync_inode_buffers() functionality a bit. 
+ */ +int write_mapping_buffers(struct address_space *mapping) +{ + spinlock_t *lock; + struct address_space *buffer_mapping; + unsigned nr_to_write; /* livelock avoidance */ + struct list_head *lh; + int ret = 0; + + if (list_empty(&mapping->private_list)) + goto out; + + buffer_mapping = mapping->assoc_mapping; + lock = &buffer_mapping->private_lock; + spin_lock(lock); + nr_to_write = 0; + lh = mapping->private_list.next; + while (lh != &mapping->private_list) { + lh = lh->next; + nr_to_write++; + } + nr_to_write *= 2; /* Allow for some late additions */ + + while (nr_to_write-- && !list_empty(&mapping->private_list)) { + struct buffer_head *bh; + + bh = BH_ENTRY(mapping->private_list.prev); + list_del_init(&bh->b_assoc_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + continue; + /* Stick it on the far end of the list. Order is preserved. */ + list_add(&bh->b_assoc_buffers, &mapping->private_list); + if (test_set_buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(lock); + if (test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_io_sync; + submit_bh(WRITE, bh); + } else { + unlock_buffer(bh); + put_bh(bh); + } + spin_lock(lock); + } + spin_unlock(lock); +out: + return ret; +} + void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b29af3b55ca1..55592347a48c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,7 +41,7 @@ static int ext2_update_inode(struct inode * inode, int do_sync); */ void ext2_put_inode (struct inode * inode) { - if (atomic_read(&inode->i_count) < 2) + if (atomic_read(&inode->i_count) < 2) /* final iput? 
*/ ext2_discard_prealloc (inode); } @@ -584,6 +584,20 @@ static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, u { return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); } + +static int +ext2_writeback_mapping(struct address_space *mapping, int *nr_to_write) +{ + int ret; + int err; + + ret = write_mapping_buffers(mapping); + err = generic_writeback_mapping(mapping, nr_to_write); + if (!ret) + ret = err; + return ret; +} + struct address_space_operations ext2_aops = { readpage: ext2_readpage, writepage: ext2_writepage, @@ -592,7 +606,7 @@ struct address_space_operations ext2_aops = { commit_write: generic_commit_write, bmap: ext2_bmap, direct_IO: ext2_direct_IO, - writeback_mapping: generic_writeback_mapping, + writeback_mapping: ext2_writeback_mapping, vm_writeback: generic_vm_writeback, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b2d84f68c3da..5ad90478a547 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -77,14 +77,14 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_LOCK) - goto same_list; + goto out; /* * Only add valid (hashed) inode to the superblock's * dirty list. Add blockdev inodes as well. 
*/ if (list_empty(&inode->i_hash) && !S_ISBLK(inode->i_mode)) - goto same_list; + goto out; /* * If the inode was already on s_dirty, don't reposition @@ -95,11 +95,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) list_add(&inode->i_list, &sb->s_dirty); } } -same_list: +out: spin_unlock(&inode_lock); } -static inline void write_inode(struct inode *inode, int sync) +static void write_inode(struct inode *inode, int sync) { if (inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) @@ -130,9 +130,10 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) unsigned dirty; unsigned long orig_dirtied_when; struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; list_del(&inode->i_list); - list_add(&inode->i_list, &inode->i_sb->s_locked_inodes); + list_add(&inode->i_list, &sb->s_locked_inodes); BUG_ON(inode->i_state & I_LOCK); @@ -144,13 +145,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) mapping->dirtied_when = 0; /* assume it's whole-file writeback */ spin_unlock(&inode_lock); - if (wait) - filemap_fdatawait(mapping); - - if (mapping->a_ops->writeback_mapping) - mapping->a_ops->writeback_mapping(mapping, nr_to_write); - else - generic_writeback_mapping(mapping, NULL); + writeback_mapping(mapping, nr_to_write); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) @@ -164,17 +159,20 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { list_del(&inode->i_list); - if (!list_empty(&mapping->dirty_pages)) { - /* Not a whole-file writeback */ - mapping->dirtied_when = orig_dirtied_when; - inode->i_state |= I_DIRTY_PAGES; - list_add_tail(&inode->i_list, &inode->i_sb->s_dirty); - } else if (inode->i_state & I_DIRTY) { - list_add(&inode->i_list, &inode->i_sb->s_dirty); - } else if 
(atomic_read(&inode->i_count)) { - list_add(&inode->i_list, &inode_in_use); + if (inode->i_state & I_DIRTY) { /* Redirtied */ + list_add(&inode->i_list, &sb->s_dirty); } else { - list_add(&inode->i_list, &inode_unused); + if (!list_empty(&mapping->dirty_pages)) { + /* Not a whole-file writeback */ + mapping->dirtied_when = orig_dirtied_when; + inode->i_state |= I_DIRTY_PAGES; + list_add_tail(&inode->i_list, + &sb->s_dirty); + } else if (atomic_read(&inode->i_count)) { + list_add(&inode->i_list, &inode_in_use); + } else { + list_add(&inode->i_list, &inode_unused); + } } } if (waitqueue_active(&inode->i_wait)) @@ -200,37 +198,35 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) __sync_single_inode(inode, sync, nr_to_write); } -void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) -{ - spin_lock(&inode_lock); - __writeback_single_inode(inode, sync, nr_to_write); - spin_unlock(&inode_lock); -} - /* - * Write out a list of dirty inodes. - * - * If `sync' is true, wait on writeout of the last mapping which we write. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. * * If older_than_this is non-NULL, then only write out mappings which * had their first dirtying at a time earlier than *older_than_this. * - * Called under inode_lock. - * * If we're a pdlfush thread, then implement pdlfush collision avoidance * against the entire list. + * + * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so + * that it can be located for waiting on in __writeback_single_inode(). + * + * Called under inode_lock. 
*/ -static void __sync_list(struct list_head *head, int sync_mode, +static void sync_sb_inodes(struct super_block *sb, int sync_mode, int *nr_to_write, unsigned long *older_than_this) { struct list_head *tmp; + struct list_head *head; const unsigned long start = jiffies; /* livelock avoidance */ + list_splice(&sb->s_dirty, &sb->s_io); + INIT_LIST_HEAD(&sb->s_dirty); + head = &sb->s_io; while ((tmp = head->prev) != head) { struct inode *inode = list_entry(tmp, struct inode, i_list); struct address_space *mapping = inode->i_mapping; struct backing_dev_info *bdi; - int really_sync; /* Was this inode dirtied after __sync_list was called? */ @@ -239,7 +235,7 @@ static void __sync_list(struct list_head *head, int sync_mode, if (older_than_this && time_after(mapping->dirtied_when, *older_than_this)) - break; + goto out; bdi = mapping->backing_dev_info; if (current_is_pdflush() && !writeback_acquire(bdi)) @@ -248,14 +244,29 @@ static void __sync_list(struct list_head *head, int sync_mode, really_sync = (sync_mode == WB_SYNC_ALL); if ((sync_mode == WB_SYNC_LAST) && (head->prev == head)) really_sync = 1; + __writeback_single_inode(inode, really_sync, nr_to_write); + if (sync_mode == WB_SYNC_HOLD) { + mapping->dirtied_when = jiffies; + list_del(&inode->i_list); + list_add(&inode->i_list, &inode->i_sb->s_dirty); + } + if (current_is_pdflush()) writeback_release(bdi); if (nr_to_write && *nr_to_write == 0) break; } +out: + if (!list_empty(&sb->s_io)) { + /* + * Put the rest back, in the correct order. 
+ */ + list_splice(&sb->s_io, sb->s_dirty.prev); + INIT_LIST_HEAD(&sb->s_io); + } return; } @@ -277,27 +288,16 @@ static void __sync_list(struct list_head *head, int sync_mode, void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, unsigned long *older_than_this) { - struct super_block * sb; - static unsigned short writeback_gen; + struct super_block *sb; spin_lock(&inode_lock); spin_lock(&sb_lock); - - /* - * We could get into livelock here if someone is dirtying - * inodes fast enough. writeback_gen is used to avoid that. - */ - writeback_gen++; - sb = sb_entry(super_blocks.prev); for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { - if (sb->s_writeback_gen == writeback_gen) - continue; - sb->s_writeback_gen = writeback_gen; if (!list_empty(&sb->s_dirty)) { spin_unlock(&sb_lock); - __sync_list(&sb->s_dirty, sync_mode, - nr_to_write, older_than_this); + sync_sb_inodes(sb, sync_mode, nr_to_write, + older_than_this); spin_lock(&sb_lock); } if (nr_to_write && *nr_to_write == 0) @@ -307,42 +307,6 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, spin_unlock(&inode_lock); } -/* - * Called under inode_lock. - */ -static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes) -{ - struct list_head *tmp = head; - struct inode *inode; - - while (nr_inodes && (tmp = tmp->prev) != head) { - inode = list_entry(tmp, struct inode, i_list); - - if (!atomic_read(&inode->i_count)) { - struct backing_dev_info *bdi; - - bdi = inode->i_mapping->backing_dev_info; - if (current_is_pdflush() && !writeback_acquire(bdi)) - goto out; - - __sync_single_inode(inode, 0, NULL); - - if (current_is_pdflush()) - writeback_release(bdi); - - nr_inodes--; - - /* - * __sync_single_inode moved the inode to another list, - * so we have to start looking from the list head. 
- */ - tmp = head; - } - } -out: - return nr_inodes; -} - static void __wait_on_locked(struct list_head *head) { struct list_head * tmp; @@ -357,104 +321,95 @@ static void __wait_on_locked(struct list_head *head) } /* - * writeback and wait upon the filesystem's dirty inodes. - * We do it in two passes - one to write, and one to wait. + * writeback and wait upon the filesystem's dirty inodes. The caller will + * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is + * used to park the written inodes on sb->s_dirty for the wait pass. */ -void sync_inodes_sb(struct super_block *sb) +void sync_inodes_sb(struct super_block *sb, int wait) { spin_lock(&inode_lock); - while (!list_empty(&sb->s_dirty)||!list_empty(&sb->s_locked_inodes)) { - __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL); - __sync_list(&sb->s_dirty, WB_SYNC_ALL, NULL, NULL); + sync_sb_inodes(sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD, NULL, NULL); + if (wait) __wait_on_locked(&sb->s_locked_inodes); - } spin_unlock(&inode_lock); } /* - * writeback the dirty inodes for this filesystem + * Rather lame livelock avoidance. 
*/ -void writeback_inodes_sb(struct super_block *sb) +static void set_sb_syncing(int val) { - spin_lock(&inode_lock); - while (!list_empty(&sb->s_dirty)) - __sync_list(&sb->s_dirty, WB_SYNC_NONE, NULL, NULL); - spin_unlock(&inode_lock); + struct super_block *sb; + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.prev); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { + sb->s_syncing = val; + } + spin_unlock(&sb_lock); } /* * Find a superblock with inodes that need to be synced */ - static struct super_block *get_super_to_sync(void) { - struct list_head *p; + struct super_block *sb; restart: - spin_lock(&inode_lock); spin_lock(&sb_lock); - list_for_each(p, &super_blocks) { - struct super_block *s = list_entry(p,struct super_block,s_list); - if (list_empty(&s->s_dirty) && list_empty(&s->s_locked_inodes)) + sb = sb_entry(super_blocks.prev); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { + if (sb->s_syncing) continue; - s->s_count++; + sb->s_syncing = 1; + sb->s_count++; spin_unlock(&sb_lock); - spin_unlock(&inode_lock); - down_read(&s->s_umount); - if (!s->s_root) { - drop_super(s); + down_read(&sb->s_umount); + if (!sb->s_root) { + drop_super(sb); goto restart; } - return s; + return sb; } spin_unlock(&sb_lock); - spin_unlock(&inode_lock); return NULL; } /** - * sync_inodes - * @dev: device to sync the inodes from. + * sync_inodes * - * sync_inodes goes through the super block's dirty list, - * writes them out, waits on the writeout and puts the inodes - * back on the normal list. - */ - -void sync_inodes(void) -{ - struct super_block * s; - /* - * Search the super_blocks array for the device(s) to sync. - */ - while ((s = get_super_to_sync()) != NULL) { - sync_inodes_sb(s); - drop_super(s); - } -} - -/* - * FIXME: the try_to_writeback_unused functions look dreadfully similar to - * writeback_unlocked_inodes... 
+ * sync_inodes() goes through each super block's dirty inode list, writes the + * inodes out, waits on the writeout and puts the inodes back on the normal + * list. + * + * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle + * part of the sync functions is that the blockdev "superblock" is processed + * last. This is because the write_inode() function of a typical fs will + * perform no I/O, but will mark buffers in the blockdev mapping as dirty. + * What we want to do is to perform all that dirtying first, and then write + * back all those inode blocks via the blockdev mapping in one sweep. So the + * additional (somewhat redundant) sync_blockdev() calls here are to make + * sure that really happens. Because if we call sync_inodes_sb(wait=1) with + * outstanding dirty inodes, the writeback goes block-at-a-time within the + * filesystem's write_inode(). This is extremely slow. */ -void try_to_writeback_unused_inodes(unsigned long unused) +void sync_inodes(int wait) { - struct super_block * sb; - int nr_inodes = inodes_stat.nr_unused; + struct super_block *sb; - spin_lock(&inode_lock); - spin_lock(&sb_lock); - sb = sb_entry(super_blocks.next); - for (; nr_inodes && sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { - if (list_empty(&sb->s_dirty)) - continue; - spin_unlock(&sb_lock); - nr_inodes = __try_to_writeback_unused_list(&sb->s_dirty, - nr_inodes); - spin_lock(&sb_lock); + set_sb_syncing(0); + while ((sb = get_super_to_sync()) != NULL) { + sync_inodes_sb(sb, 0); + sync_blockdev(sb->s_bdev); + drop_super(sb); + } + if (wait) { + set_sb_syncing(0); + while ((sb = get_super_to_sync()) != NULL) { + sync_inodes_sb(sb, 1); + sync_blockdev(sb->s_bdev); + drop_super(sb); + } } - spin_unlock(&sb_lock); - spin_unlock(&inode_lock); } /** diff --git a/fs/inode.c b/fs/inode.c index b750b108555b..503e500b6584 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -311,6 +311,7 @@ int invalidate_inodes(struct super_block * sb) busy = 
invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); @@ -896,6 +897,11 @@ void remove_dquot_ref(struct super_block *sb, short type) if (IS_QUOTAINIT(inode)) remove_inode_dquot_ref(inode, type, &tofree_head); } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } list_for_each(act_head, &sb->s_locked_inodes) { inode = list_entry(act_head, struct inode, i_list); if (IS_QUOTAINIT(inode)) diff --git a/fs/super.c b/fs/super.c index 9a1be36c2012..52854d399227 100644 --- a/fs/super.c +++ b/fs/super.c @@ -48,6 +48,7 @@ static struct super_block *alloc_super(void) if (s) { memset(s, 0, sizeof(struct super_block)); INIT_LIST_HEAD(&s->s_dirty); + INIT_LIST_HEAD(&s->s_io); INIT_LIST_HEAD(&s->s_locked_inodes); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); @@ -154,6 +155,9 @@ static int grab_super(struct super_block *s) * * Associates superblock with fs type and puts it on per-type and global * superblocks' lists. Should be called with sb_lock held; drops it. + * + * NOTE: the super_blocks ordering here is important: writeback wants + * the blockdev superblock to be at super_blocks.next. 
*/ static void insert_super(struct super_block *s, struct file_system_type *type) { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 328af2a6c275..5560b6ee5878 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -29,6 +29,7 @@ enum bh_state_bits { struct page; struct kiobuf; struct buffer_head; +struct address_space; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); /* @@ -145,14 +146,19 @@ int try_to_free_buffers(struct page *); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_io_sync(struct buffer_head *bh, int uptodate); + +/* Things to do with buffers at mapping->private_list */ void buffer_insert_list(spinlock_t *lock, struct buffer_head *, struct list_head *); -int sync_mapping_buffers(struct address_space *mapping); void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); +int write_mapping_buffers(struct address_space *mapping); +int inode_has_buffers(struct inode *); +void invalidate_inode_buffers(struct inode *); +int fsync_buffers_list(spinlock_t *lock, struct list_head *); +int sync_mapping_buffers(struct address_space *mapping); void mark_buffer_async_read(struct buffer_head *bh); void mark_buffer_async_write(struct buffer_head *bh); -void invalidate_inode_buffers(struct inode *); void invalidate_bdev(struct block_device *, int); void __invalidate_buffers(kdev_t dev, int); int sync_blockdev(struct block_device *bdev); @@ -163,8 +169,6 @@ int fsync_dev(kdev_t); int fsync_bdev(struct block_device *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); -int fsync_buffers_list(spinlock_t *lock, struct list_head *); -int inode_has_buffers(struct inode *); struct buffer_head *__get_hash_table(struct block_device *, sector_t, int); struct buffer_head * __getblk(struct block_device *, sector_t, int); void __brelse(struct buffer_head *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 
b936413f96f2..9b2bfa8cc3d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -618,7 +618,6 @@ struct super_block { kdev_t s_dev; unsigned long s_blocksize; unsigned long s_old_blocksize; - unsigned short s_writeback_gen;/* To avoid writeback livelock */ unsigned char s_blocksize_bits; unsigned char s_dirt; unsigned long long s_maxbytes; /* Max file size */ @@ -632,9 +631,11 @@ struct super_block { struct rw_semaphore s_umount; struct semaphore s_lock; int s_count; + int s_syncing; atomic_t s_active; struct list_head s_dirty; /* dirty inodes */ + struct list_head s_io; /* parked for writeback */ struct list_head s_locked_inodes;/* inodes being synced */ struct list_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; @@ -1116,7 +1117,6 @@ extern int invalidate_device(kdev_t, int); extern void invalidate_inode_pages(struct inode *); extern void invalidate_inode_pages2(struct address_space *); extern void write_inode_now(struct inode *, int); -extern void sync_inodes_sb(struct super_block *); extern int filemap_fdatawrite(struct address_space *); extern int filemap_fdatawait(struct address_space *); extern void sync_supers(void); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e345205b6d86..9dc03210ee62 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -27,15 +27,13 @@ static inline int current_is_pdflush(void) #define WB_SYNC_NONE 0 /* Don't wait on anything */ #define WB_SYNC_LAST 1 /* Wait on the last-written mapping */ #define WB_SYNC_ALL 2 /* Wait on every mapping */ +#define WB_SYNC_HOLD 3 /* Hold the inode on sb_dirty for sys_sync() */ -void try_to_writeback_unused_inodes(unsigned long pexclusive); -void writeback_single_inode(struct inode *inode, - int sync, int *nr_to_write); void writeback_unlocked_inodes(int *nr_to_write, int sync_mode, unsigned long *older_than_this); -void writeback_inodes_sb(struct super_block *); void __wait_on_inode(struct inode * inode); -void 
sync_inodes(void); +void sync_inodes_sb(struct super_block *, int wait); +void sync_inodes(int wait); static inline void wait_on_inode(struct inode *inode) { -- cgit v1.2.3 From 5409c2b52ffea911ba1e47b5cbf8d911efb5d0c6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:23:14 -0700 Subject: [PATCH] fix ext3 race with writeback The ext3-no-steal patch has exposed a long-standing race in ext3. It has been there all the time in 2.4, but never triggered until some timing change in the ext3-no-steal patch exposed it. The race was not present in 2.2 because 2.2's bdflush runs inside lock_kernel(). The problem is that when ext3 is shuffling a buffer between journalling lists there is a small window where the buffer is marked BH_dirty. Another CPU can grab it, mark it clean and write it out. Then ext3 puts the buffer onto a list of buffers which are expected to be dirty, and gets confused later on when the buffer turns out to be clean. The patch from Stephen records the expected dirtiness of the buffer in a local variable, so BH_dirty is not transiently set while ext3 shuffles. 
--- fs/jbd/transaction.c | 31 ++++++++++++++++++++----------- include/linux/jbd.h | 1 + 2 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 4f91132fc23d..2245e396ebd9 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1941,6 +1941,8 @@ void __journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { struct journal_head **list = 0; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); assert_spin_locked(&journal_datalist_lock); @@ -1951,13 +1953,24 @@ void __journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == 0); - if (jh->b_transaction) { - if (jh->b_jlist == jlist) - return; + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + /* The following list of buffer states needs to be consistent + * with __jbd_unexpected_dirty_buffer()'s handling of dirty + * state. */ + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) __journal_unfile_buffer(jh); - } else { + else jh->b_transaction = transaction; - } switch (jlist) { case BJ_None: @@ -1994,12 +2007,8 @@ void __journal_file_buffer(struct journal_head *jh, __blist_add_buffer(list, jh); jh->b_jlist = jlist; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - if (test_clear_buffer_dirty(jh2bh(jh))) { - set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); - } - } + if (was_dirty) + set_buffer_jbddirty(bh); } void journal_file_buffer(struct journal_head *jh, diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 2752f3a7375d..4a96c4ac35b9 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -235,6 +235,7 @@ enum jbd_state_bits { BUFFER_FNS(JBD, jbd) BUFFER_FNS(JBDDirty, jbddirty) 
+TAS_BUFFER_FNS(JBDDirty, jbddirty) static inline struct buffer_head *jh2bh(struct journal_head *jh) { -- cgit v1.2.3 From a25364526006361b7e7e011ce488cb46e89dd3ef Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 19 May 2002 02:23:27 -0700 Subject: [PATCH] remove PG_launder Removal of PG_launder. It's not obvious (to me) why this ever existed. If it's to prevent deadlocks then I'd like to know who was performing __GFP_FS allocations while holding a page lock? But in 2.5, the only memory allocations which are performed when the caller holds PG_writeback against an unsubmitted page are those which occur inside submit_bh(). There will be no __GFP_FS allocations in that call chain. Removing PG_launder means that memory allocators can block on any PageWriteback() page at all, which reduces the risk of very long list walks inside pagemap_lru_lock in shrink_cache(). --- include/linux/page-flags.h | 9 ++------- mm/filemap.c | 3 +-- mm/page-writeback.c | 2 -- mm/shmem.c | 3 ++- mm/vmscan.c | 5 ++--- 5 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f56db65ebef3..52b7117c4f64 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -62,9 +62,8 @@ #define PG_arch_1 10 #define PG_reserved 11 -#define PG_launder 12 /* written out by VM pressure.. */ -#define PG_private 13 /* Has something at ->private */ -#define PG_writeback 14 /* Page is under writeback */ +#define PG_private 12 /* Has something at ->private */ +#define PG_writeback 13 /* Page is under writeback */ /* * Global page accounting. One instance per CPU. 
@@ -172,10 +171,6 @@ extern void get_page_state(struct page_state *ret); #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) -#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) -#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) -#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) - #define SetPagePrivate(page) set_bit(PG_private, &(page)->flags) #define ClearPagePrivate(page) clear_bit(PG_private, &(page)->flags) #define PagePrivate(page) test_bit(PG_private, &(page)->flags) diff --git a/mm/filemap.c b/mm/filemap.c index 769a1080c9df..67a7cf78292d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -432,7 +432,7 @@ void invalidate_inode_pages2(struct address_space * mapping) int fail_writepage(struct page *page) { /* Only activate on memory-pressure, not fsync.. */ - if (PageLaunder(page)) { + if (current->flags & PF_MEMALLOC) { activate_page(page); SetPageReferenced(page); } @@ -652,7 +652,6 @@ void unlock_page(struct page *page) void end_page_writeback(struct page *page) { wait_queue_head_t *waitqueue = page_waitqueue(page); - ClearPageLaunder(page); smp_mb__before_clear_bit(); if (!TestClearPageWriteback(page)) BUG(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 748dbbb7e789..725a4bdb60e1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -373,8 +373,6 @@ int generic_writeback_mapping(struct address_space *mapping, int *nr_to_write) } spin_unlock(&pagemap_lru_lock); } - if (current->flags & PF_MEMALLOC) - SetPageLaunder(page); err = writepage(page); if (!ret) ret = err; diff --git a/mm/shmem.c b/mm/shmem.c index 64330ed216f4..615b0051bbcf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -438,7 +438,8 @@ static int shmem_writepage(struct page * page) if (!PageLocked(page)) BUG(); - if (!PageLaunder(page)) + + if (!(current->flags & PF_MEMALLOC)) return fail_writepage(page); mapping = page->mapping; diff --git 
a/mm/vmscan.c b/mm/vmscan.c index 73341c18f3f2..c4ef073b682e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -424,11 +424,10 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, goto page_mapped; /* - * The page is locked. IO in progress? - * Move it to the back of the list. + * IO in progress? Leave it at the back of the list. */ if (unlikely(PageWriteback(page))) { - if (PageLaunder(page) && (gfp_mask & __GFP_FS)) { + if (gfp_mask & __GFP_FS) { page_cache_get(page); spin_unlock(&pagemap_lru_lock); wait_on_page_writeback(page); -- cgit v1.2.3 From 7a24f1a6d522cce1e319f434a202f7d6944924bc Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:24:57 -0700 Subject: [PATCH] iget_locked [1/6] Fix a race in iget4. The fs specific data that is used to find an inode should be initialized while still holding the inode lock. It adds a 'set' callback function that should be a non-blocking FS provided function which initializes the private parts of the inode so that the 'test' callback function can correctly match new inodes. Touches all filesystems that use iget4 (Coda/NFS/ReiserFS). 
--- fs/coda/cnode.c | 76 +++++++++++++++++++-------------------------- fs/inode.c | 27 ++++++++++------ fs/nfs/inode.c | 18 ++++++++--- fs/reiserfs/inode.c | 13 +++++--- fs/reiserfs/super.c | 2 +- include/linux/fs.h | 5 ++- include/linux/reiserfs_fs.h | 2 ++ 7 files changed, 78 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 6d260b30d551..a75444e66eef 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -25,11 +25,6 @@ inline int coda_isnullfid(ViceFid *fid) return 1; } -static int coda_inocmp(struct inode *inode, unsigned long ino, void *opaque) -{ - return (coda_fideq((ViceFid *)opaque, &(ITOC(inode)->c_fid))); -} - static struct inode_operations coda_symlink_inode_operations = { readlink: page_readlink, follow_link: page_follow_link, @@ -55,27 +50,35 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) init_special_inode(inode, inode->i_mode, attr->va_rdev); } +static int coda_test_inode(struct inode *inode, void *data) +{ + ViceFid *fid = (ViceFid *)data; + return coda_fideq(&(ITOC(inode)->c_fid), fid); +} + +static int coda_set_inode(struct inode *inode, void *data) +{ + ViceFid *fid = (ViceFid *)data; + ITOC(inode)->c_fid = *fid; + return 0; +} + +static int coda_fail_inode(struct inode *inode, void *data) +{ + return -1; +} + struct inode * coda_iget(struct super_block * sb, ViceFid * fid, struct coda_vattr * attr) { struct inode *inode; - struct coda_inode_info *cii; ino_t ino = coda_f2i(fid); - inode = iget4(sb, ino, coda_inocmp, fid); + inode = iget4(sb, ino, coda_test_inode, coda_set_inode, fid); if (!inode) return ERR_PTR(-ENOMEM); - /* check if the inode is already initialized */ - cii = ITOC(inode); - if (coda_isnullfid(&cii->c_fid)) - /* new, empty inode found... 
initializing */ - cii->c_fid = *fid; - - /* we shouldnt see inode collisions anymore */ - if (!coda_fideq(fid, &cii->c_fid)) BUG(); - /* always replace the attributes, type might have changed */ coda_fill_inode(inode, attr); return inode; @@ -131,7 +134,6 @@ struct inode *coda_fid_to_inode(ViceFid *fid, struct super_block *sb) { ino_t nr; struct inode *inode; - struct coda_inode_info *cii; if ( !sb ) { printk("coda_fid_to_inode: no sb!\n"); @@ -139,43 +141,29 @@ struct inode *coda_fid_to_inode(ViceFid *fid, struct super_block *sb) } nr = coda_f2i(fid); - inode = iget4(sb, nr, coda_inocmp, fid); + inode = iget4(sb, nr, coda_test_inode, coda_fail_inode, fid); if ( !inode ) { printk("coda_fid_to_inode: null from iget, sb %p, nr %ld.\n", sb, (long)nr); return NULL; } - cii = ITOC(inode); - - /* The inode could already be purged due to memory pressure */ - if (coda_isnullfid(&cii->c_fid)) { - inode->i_nlink = 0; - iput(inode); - return NULL; - } - - /* we shouldn't see inode collisions anymore */ - if ( !coda_fideq(fid, &cii->c_fid) ) BUG(); - - return inode; + return inode; } /* the CONTROL inode is made without asking attributes from Venus */ int coda_cnode_makectl(struct inode **inode, struct super_block *sb) { - int error = 0; - - *inode = iget(sb, CTL_INO); - if ( *inode ) { - (*inode)->i_op = &coda_ioctl_inode_operations; - (*inode)->i_fop = &coda_ioctl_operations; - (*inode)->i_mode = 0444; - error = 0; - } else { - error = -ENOMEM; - } - - return error; + int error = -ENOMEM; + + *inode = iget(sb, CTL_INO); + if ( *inode ) { + (*inode)->i_op = &coda_ioctl_inode_operations; + (*inode)->i_fop = &coda_ioctl_operations; + (*inode)->i_mode = 0444; + error = 0; + } + + return error; } diff --git a/fs/inode.c b/fs/inode.c index 61e3f6678737..9dd13bf3ca83 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -452,7 +452,7 @@ int shrink_icache_memory(int priority, int gfp_mask) * by hand after calling find_inode now! 
This simplifies iunique and won't * add any additional branch in the common code. */ -static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque) +static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head, int (*test)(struct inode *, void *), void *data) { struct list_head *tmp; struct inode * inode; @@ -468,7 +468,7 @@ static struct inode * find_inode(struct super_block * sb, unsigned long ino, str continue; if (inode->i_sb != sb) continue; - if (find_actor && !find_actor(inode, ino, opaque)) + if (test && !test(inode, data)) continue; break; } @@ -507,9 +507,10 @@ struct inode *new_inode(struct super_block *sb) * We no longer cache the sb_flags in i_flags - see fs.h * -- rmk@arm.uk.linux.org */ -static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head, find_inode_t find_actor, void *opaque) +static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode * inode; + int err = 0; inode = alloc_inode(sb); if (inode) { @@ -517,12 +518,15 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s spin_lock(&inode_lock); /* We released the lock, so.. 
*/ - old = find_inode(sb, ino, head, find_actor, opaque); + old = find_inode(sb, ino, head, test, data); if (!old) { + inode->i_ino = ino; + if (set && set(inode, data)) + goto set_failed; + inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_hash, head); - inode->i_ino = ino; inode->i_state = I_LOCK; spin_unlock(&inode_lock); @@ -532,7 +536,7 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s ** -- mason@suse.com */ if (sb->s_op->read_inode2) { - sb->s_op->read_inode2(inode, opaque) ; + sb->s_op->read_inode2(inode, data) ; } else { sb->s_op->read_inode(inode); } @@ -563,6 +567,11 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s wait_on_inode(inode); } return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; } static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) @@ -628,13 +637,13 @@ struct inode *igrab(struct inode *inode) } -struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque) +struct inode *iget4(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct list_head * head = inode_hashtable + hash(sb,ino); struct inode * inode; spin_lock(&inode_lock); - inode = find_inode(sb, ino, head, find_actor, opaque); + inode = find_inode(sb, ino, head, test, data); if (inode) { __iget(inode); spin_unlock(&inode_lock); @@ -647,7 +656,7 @@ struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find * get_new_inode() will do the right thing, re-trying the search * in case it had to block at any point. 
*/ - return get_new_inode(sb, ino, head, find_actor, opaque); + return get_new_inode(sb, ino, head, test, set, data); } /** diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5a105fc344eb..030c570e30f5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -592,7 +592,7 @@ struct nfs_find_desc { * i_ino. */ static int -nfs_find_actor(struct inode *inode, unsigned long ino, void *opaque) +nfs_find_actor(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; struct nfs_fh *fh = desc->fh; @@ -610,6 +610,18 @@ nfs_find_actor(struct inode *inode, unsigned long ino, void *opaque) return 1; } +static int +nfs_init_locked(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fh *fh = desc->fh; + struct nfs_fattr *fattr = desc->fattr; + + NFS_FILEID(inode) = fattr->fileid; + memcpy(NFS_FH(inode), fh, sizeof(struct nfs_fh)); + return 0; +} + /* * This is our own version of iget that looks up inodes by file handle * instead of inode number. We use this technique instead of using @@ -652,7 +664,7 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) ino = nfs_fattr_to_ino_t(fattr); - if (!(inode = iget4(sb, ino, nfs_find_actor, &desc))) + if (!(inode = iget4(sb, ino, nfs_find_actor, nfs_init_locked, &desc))) goto out_no_inode; if (NFS_NEW(inode)) { @@ -662,8 +674,6 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support UPDATE_ATIME(), since the server will reset it */ NFS_FLAGS(inode) &= ~NFS_INO_NEW; - NFS_FILEID(inode) = fattr->fileid; - memcpy(NFS_FH(inode), fh, sizeof(struct nfs_fh)); inode->i_flags |= S_NOATIME; inode->i_mode = fattr->mode; /* Why so? 
Because we want revalidate for devices/FIFOs, and diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b2b26e901a3..8a3ce238e01c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1138,6 +1138,13 @@ void reiserfs_read_inode(struct inode *inode) { // evolved as the prototype did // +int reiserfs_init_locked_inode (struct inode * inode, void *p) +{ + struct reiserfs_iget4_args *args = (struct reiserfs_iget4_args *)p ; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->objectid); + return 0; +} + /* looks for stat data in the tree, and fills up the fields of in-core inode stat data fields */ void reiserfs_read_inode2 (struct inode * inode, void *p) @@ -1213,7 +1220,6 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) * reiserfs_find_actor() - "find actor" reiserfs supplies to iget4(). * * @inode: inode from hash table to check - * @inode_no: inode number we are looking for * @opaque: "cookie" passed to iget4(). This is &reiserfs_iget4_args. * * This function is called by iget4() to distinguish reiserfs inodes @@ -1222,8 +1228,7 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) * inode numbers (objectids) are distinguished by parent directory ids. 
* */ -static int reiserfs_find_actor( struct inode *inode, - unsigned long inode_no, void *opaque ) +int reiserfs_find_actor( struct inode *inode, void *opaque ) { struct reiserfs_iget4_args *args; @@ -1239,7 +1244,7 @@ struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key args.objectid = key->on_disk_key.k_dir_id ; inode = iget4 (s, key->on_disk_key.k_objectid, - reiserfs_find_actor, (void *)(&args)); + reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!inode) return ERR_PTR(-ENOMEM) ; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 46d63a4defbf..4bb9cbd49bce 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1070,7 +1070,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) s->s_flags |= MS_RDONLY ; } args.objectid = REISERFS_ROOT_PARENT_OBJECTID ; - root_inode = iget4 (s, REISERFS_ROOT_OBJECTID, 0, (void *)(&args)); + root_inode = iget4 (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { printk ("reiserfs_fill_super: get root inode failed\n"); goto error; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5534da65ff74..4b551cb0326a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1207,11 +1207,10 @@ extern void force_delete(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); -typedef int (*find_inode_t)(struct inode *, unsigned long, void *); -extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *); +extern struct inode * iget4(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); static inline struct inode *iget(struct super_block *sb, unsigned long ino) { - return iget4(sb, ino, NULL, NULL); + return iget4(sb, ino, NULL, NULL, NULL); } extern void __iget(struct inode * inode); diff --git a/include/linux/reiserfs_fs.h 
b/include/linux/reiserfs_fs.h index a64b5bc5e7de..22c5547e86bc 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1820,6 +1820,8 @@ void padd_item (char * item, int total_length, int length); void reiserfs_read_inode (struct inode * inode) ; void reiserfs_read_inode2(struct inode * inode, void *p) ; +int reiserfs_find_actor(struct inode * inode, void *p) ; +int reiserfs_init_locked_inode(struct inode * inode, void *p) ; void reiserfs_delete_inode (struct inode * inode); void reiserfs_write_inode (struct inode * inode, int) ; struct dentry *reiserfs_get_dentry(struct super_block *, void *) ; -- cgit v1.2.3 From 85b640c51ed9e8ba0ab15151d89a98e1a670d347 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:25:02 -0700 Subject: [PATCH] iget_locked [2/6] Now we introduce iget_locked and iget5_locked. These are similar to iget, but return a locked inode and read_inode has not been called. So the FS has to call read_inode to initialize the inode and then unlock it with unlock_new_inode(). This patch is based on the icreate patch from the XFS group, i.e. it is pretty much identical except for function naming. --- fs/Makefile | 2 +- fs/inode.c | 79 +++++++++++++++++++++++++++++++++++++----------------- include/linux/fs.h | 5 ++++ 3 files changed, 61 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index 2449b05e367a..1d40929ad7bd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,7 +7,7 @@ O_TARGET := fs.o -export-objs := filesystems.o open.o dcache.o buffer.o bio.o +export-objs := filesystems.o open.o dcache.o buffer.o bio.o inode.o mod-subdirs := nls obj-y := open.o read_write.o devices.o file_table.o buffer.o \ diff --git a/fs/inode.c b/fs/inode.c index 9dd13bf3ca83..c56a53a35c9b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * New inode.c implementation. 
@@ -501,6 +502,21 @@ struct inode *new_inode(struct super_block *sb) return inode; } +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up(&inode->i_wait); +} + + /* * This is called without the inode lock held.. Be careful. * @@ -527,31 +543,12 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_hash, head); - inode->i_state = I_LOCK; + inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); - /* reiserfs specific hack right here. We don't - ** want this to last, and are looking for VFS changes - ** that will allow us to get rid of it. - ** -- mason@suse.com - */ - if (sb->s_op->read_inode2) { - sb->s_op->read_inode2(inode, data) ; - } else { - sb->s_op->read_inode(inode); - } - - /* - * This is special! We do not need the spinlock - * when clearing I_LOCK, because we're guaranteed - * that nobody else tries to do anything about the - * state of the inode when it is locked, as we - * just created it (so there can be no old holders - * that haven't tested I_LOCK). 
+ /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents */ - inode->i_state &= ~I_LOCK; - wake_up(&inode->i_wait); - return inode; } @@ -636,8 +633,12 @@ struct inode *igrab(struct inode *inode) return inode; } - -struct inode *iget4(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +/* + * This is iget without the read_inode portion of get_new_inode + * the filesystem gets back a new locked and hashed inode and gets + * to fill it in before unlocking it via unlock_new_inode(). + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct list_head * head = inode_hashtable + hash(sb,ino); struct inode * inode; @@ -659,6 +660,36 @@ struct inode *iget4(struct super_block *sb, unsigned long ino, int (*test)(struc return get_new_inode(sb, ino, head, test, set, data); } +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + return iget5_locked(sb, ino, NULL, NULL, NULL); +} + +EXPORT_SYMBOL(iget5_locked); +EXPORT_SYMBOL(iget_locked); +EXPORT_SYMBOL(unlock_new_inode); + +struct inode *iget4(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode *inode = iget5_locked(sb, ino, test, set, data); + + if (inode && (inode->i_state & I_NEW)) { + /* reiserfs specific hack right here. We don't + ** want this to last, and are looking for VFS changes + ** that will allow us to get rid of it. 
+ ** -- mason@suse.com + */ + if (sb->s_op->read_inode2) { + sb->s_op->read_inode2(inode, data); + } else { + sb->s_op->read_inode(inode); + } + unlock_new_inode(inode); + } + + return inode; +} + /** * insert_inode_hash - hash an inode * @inode: unhashed inode diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b551cb0326a..50138553bb15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -799,6 +799,7 @@ struct super_operations { #define I_LOCK 8 #define I_FREEING 16 #define I_CLEAR 32 +#define I_NEW 64 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1207,6 +1208,10 @@ extern void force_delete(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); +extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +extern struct inode * iget_locked(struct super_block *, unsigned long); +extern void unlock_new_inode(struct inode *); + extern struct inode * iget4(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); static inline struct inode *iget(struct super_block *sb, unsigned long ino) { -- cgit v1.2.3 From 77d1ac9bf5beff0aab610c0cffb1fd59cea7664b Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:25:07 -0700 Subject: [PATCH] iget_locked [3/6] Convert existing filesystems (Coda/NFS/ReiserFS) that currently use iget4 to iget5_locked. 
--- fs/coda/cnode.c | 26 ++++++++++++++++++-------- fs/coda/inode.c | 14 -------------- fs/nfs/inode.c | 20 +++++--------------- fs/reiserfs/inode.c | 42 ++++++++++++++++++------------------------ fs/reiserfs/super.c | 11 +++++++---- include/linux/fs.h | 9 ++++++++- include/linux/nfs_fs.h | 2 -- include/linux/reiserfs_fs.h | 5 ++--- 8 files changed, 58 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index a75444e66eef..f541d14c47f1 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -72,13 +72,21 @@ struct inode * coda_iget(struct super_block * sb, ViceFid * fid, struct coda_vattr * attr) { struct inode *inode; + struct coda_inode_info *cii; + struct coda_sb_info *sbi = coda_sbp(sb); ino_t ino = coda_f2i(fid); - inode = iget4(sb, ino, coda_test_inode, coda_set_inode, fid); + inode = iget5_locked(sb, ino, coda_test_inode, coda_set_inode, fid); if (!inode) return ERR_PTR(-ENOMEM); + if (inode->i_state & I_NEW) { + cii = ITOC(inode); + list_add(&cii->c_cilist, &sbi->sbi_cihead); + unlock_new_inode(inode); + } + /* always replace the attributes, type might have changed */ coda_fill_inode(inode, attr); return inode; @@ -141,12 +149,13 @@ struct inode *coda_fid_to_inode(ViceFid *fid, struct super_block *sb) } nr = coda_f2i(fid); - inode = iget4(sb, nr, coda_test_inode, coda_fail_inode, fid); - if ( !inode ) { - printk("coda_fid_to_inode: null from iget, sb %p, nr %ld.\n", - sb, (long)nr); + inode = iget5_locked(sb, nr, coda_test_inode, coda_fail_inode, fid); + if ( !inode ) return NULL; - } + + /* we should never see newly created inodes because we intentionally + * fail in the initialization callback */ + BUG_ON(inode->i_state & I_NEW); return inode; } @@ -156,8 +165,9 @@ int coda_cnode_makectl(struct inode **inode, struct super_block *sb) { int error = -ENOMEM; - *inode = iget(sb, CTL_INO); - if ( *inode ) { + *inode = new_inode(sb); + if (*inode) { + (*inode)->i_ino = CTL_INO; (*inode)->i_op = 
&coda_ioctl_inode_operations; (*inode)->i_fop = &coda_ioctl_operations; (*inode)->i_mode = 0444; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 8a68f2a13461..621074e23410 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -33,7 +33,6 @@ #include /* VFS super_block ops */ -static void coda_read_inode(struct inode *); static void coda_clear_inode(struct inode *); static void coda_put_super(struct super_block *); static int coda_statfs(struct super_block *sb, struct statfs *buf); @@ -92,7 +91,6 @@ struct super_operations coda_super_operations = { alloc_inode: coda_alloc_inode, destroy_inode: coda_destroy_inode, - read_inode: coda_read_inode, clear_inode: coda_clear_inode, put_super: coda_put_super, statfs: coda_statfs, @@ -229,18 +227,6 @@ static void coda_put_super(struct super_block *sb) kfree(sbi); } -/* all filling in of inodes postponed until lookup */ -static void coda_read_inode(struct inode *inode) -{ - struct coda_sb_info *sbi = coda_sbp(inode->i_sb); - struct coda_inode_info *cii; - - if (!sbi) BUG(); - - cii = ITOC(inode); - list_add(&cii->c_cilist, &sbi->sbi_cihead); -} - static void coda_clear_inode(struct inode *inode) { struct coda_inode_info *cii = ITOC(inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 030c570e30f5..63710b5552ec 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -47,7 +47,6 @@ static void nfs_invalidate_inode(struct inode *); static struct inode *nfs_alloc_inode(struct super_block *sb); static void nfs_destroy_inode(struct inode *); -static void nfs_read_inode(struct inode *); static void nfs_write_inode(struct inode *,int); static void nfs_delete_inode(struct inode *); static void nfs_put_super(struct super_block *); @@ -59,7 +58,6 @@ static int nfs_show_options(struct seq_file *, struct vfsmount *); static struct super_operations nfs_sops = { alloc_inode: nfs_alloc_inode, destroy_inode: nfs_destroy_inode, - read_inode: nfs_read_inode, write_inode: nfs_write_inode, delete_inode: nfs_delete_inode, put_super: 
nfs_put_super, @@ -98,15 +96,6 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/* - * The "read_inode" function doesn't actually do anything: - * the real data is filled in later in nfs_fhget. - */ -static void -nfs_read_inode(struct inode * inode) -{ -} - static void nfs_write_inode(struct inode *inode, int sync) { @@ -664,16 +653,15 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) ino = nfs_fattr_to_ino_t(fattr); - if (!(inode = iget4(sb, ino, nfs_find_actor, nfs_init_locked, &desc))) + if (!(inode = iget5_locked(sb, ino, nfs_find_actor, nfs_init_locked, &desc))) goto out_no_inode; - if (NFS_NEW(inode)) { + if (inode->i_state & I_NEW) { __u64 new_size, new_mtime; loff_t new_isize; time_t new_atime; /* We can't support UPDATE_ATIME(), since the server will reset it */ - NFS_FLAGS(inode) &= ~NFS_INO_NEW; inode->i_flags |= S_NOATIME; inode->i_mode = fattr->mode; /* Why so? Because we want revalidate for devices/FIFOs, and @@ -721,6 +709,8 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); dprintk("NFS: __nfs_fhget(%s/%Ld ct=%d)\n", @@ -1241,7 +1231,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb) nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); if (!nfsi) return NULL; - nfsi->flags = NFS_INO_NEW; + nfsi->flags = 0; nfsi->mm_cred = NULL; return &nfsi->vfs_inode; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 8a3ce238e01c..f6984b0f8d2c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -33,7 +33,7 @@ void reiserfs_delete_inode (struct inode * inode) lock_kernel() ; /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. 
*/ - if (INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ down (&inode->i_sem); journal_begin(&th, inode->i_sb, jbegin_count) ; @@ -886,7 +886,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block, // item version directly // -// called by read_inode +// called by read_locked_inode static void init_inode (struct inode * inode, struct path * path) { struct buffer_head * bh; @@ -1117,7 +1117,7 @@ void reiserfs_update_sd (struct reiserfs_transaction_handle *th, return; } -/* reiserfs_read_inode2 is called to read the inode off disk, and it +/* reiserfs_read_locked_inode is called to read the inode off disk, and it ** does a make_bad_inode when things go wrong. But, we need to make sure ** and clear the key in the private portion of the inode, otherwise a ** corresponding iput might try to delete whatever object the inode last @@ -1128,11 +1128,6 @@ static void reiserfs_make_bad_inode(struct inode *inode) { make_bad_inode(inode); } -void reiserfs_read_inode(struct inode *inode) { - reiserfs_make_bad_inode(inode) ; -} - - // // initially this function was derived from minix or ext2's analog and // evolved as the prototype did @@ -1140,26 +1135,20 @@ void reiserfs_read_inode(struct inode *inode) { int reiserfs_init_locked_inode (struct inode * inode, void *p) { - struct reiserfs_iget4_args *args = (struct reiserfs_iget4_args *)p ; + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ; INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->objectid); return 0; } /* looks for stat data in the tree, and fills up the fields of in-core inode stat data fields */ -void reiserfs_read_inode2 (struct inode * inode, void *p) +void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args) { INITIALIZE_PATH (path_to_sd); struct cpu_key key; - struct reiserfs_iget4_args *args = (struct reiserfs_iget4_args *)p ; 
unsigned long dirino; int retval; - if (!p) { - reiserfs_make_bad_inode(inode) ; - return; - } - dirino = args->objectid ; /* set version 1, version 2 could be used too, because stat data @@ -1173,7 +1162,7 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) /* look for the object's stat data */ retval = search_item (inode->i_sb, &key, &path_to_sd); if (retval == IO_ERROR) { - reiserfs_warning ("vs-13070: reiserfs_read_inode2: " + reiserfs_warning ("vs-13070: reiserfs_read_locked_inode: " "i/o failure occurred trying to find stat data of %K\n", &key); reiserfs_make_bad_inode(inode) ; @@ -1205,7 +1194,7 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) during mount (fs/reiserfs/super.c:finish_unfinished()). */ if( ( inode -> i_nlink == 0 ) && ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) { - reiserfs_warning( "vs-13075: reiserfs_read_inode2: " + reiserfs_warning( "vs-13075: reiserfs_read_locked_inode: " "dead inode read from disk %K. " "This is likely to be race with knfsd. Ignore\n", &key ); @@ -1217,12 +1206,12 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) } /** - * reiserfs_find_actor() - "find actor" reiserfs supplies to iget4(). + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). * * @inode: inode from hash table to check - * @opaque: "cookie" passed to iget4(). This is &reiserfs_iget4_args. + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. * - * This function is called by iget4() to distinguish reiserfs inodes + * This function is called by iget5_locked() to distinguish reiserfs inodes * having the same inode numbers. Such inodes can only exist due to some * error condition. One of them should be bad. Inodes with identical * inode numbers (objectids) are distinguished by parent directory ids. 
@@ -1230,7 +1219,7 @@ void reiserfs_read_inode2 (struct inode * inode, void *p) */ int reiserfs_find_actor( struct inode *inode, void *opaque ) { - struct reiserfs_iget4_args *args; + struct reiserfs_iget_args *args; args = opaque; /* args is already in CPU order */ @@ -1240,14 +1229,19 @@ int reiserfs_find_actor( struct inode *inode, void *opaque ) struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key) { struct inode * inode; - struct reiserfs_iget4_args args ; + struct reiserfs_iget_args args ; args.objectid = key->on_disk_key.k_dir_id ; - inode = iget4 (s, key->on_disk_key.k_objectid, + inode = iget5_locked (s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!inode) return ERR_PTR(-ENOMEM) ; + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) { /* either due to i/o error or a stale NFS handle */ iput (inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4bb9cbd49bce..83a01771ed84 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -484,8 +484,6 @@ struct super_operations reiserfs_sops = { alloc_inode: reiserfs_alloc_inode, destroy_inode: reiserfs_destroy_inode, - read_inode: reiserfs_read_inode, - read_inode2: reiserfs_read_inode2, write_inode: reiserfs_write_inode, dirty_inode: reiserfs_dirty_inode, delete_inode: reiserfs_delete_inode, @@ -1007,7 +1005,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) int old_format = 0; unsigned long blocks; int jinit_done = 0 ; - struct reiserfs_iget4_args args ; + struct reiserfs_iget_args args ; struct reiserfs_super_block * rs; char *jdev_name; struct reiserfs_sb_info *sbi; @@ -1070,12 +1068,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) s->s_flags |= MS_RDONLY ; } args.objectid = REISERFS_ROOT_PARENT_OBJECTID ; - root_inode 
= iget4 (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); + root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { printk ("reiserfs_fill_super: get root inode failed\n"); goto error; } + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + s->s_root = d_alloc_root(root_inode); if (!s->s_root) { iput(root_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 50138553bb15..2f82322fb04b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1215,7 +1215,14 @@ extern void unlock_new_inode(struct inode *); extern struct inode * iget4(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); static inline struct inode *iget(struct super_block *sb, unsigned long ino) { - return iget4(sb, ino, NULL, NULL, NULL); + struct inode *inode = iget_locked(sb, ino); + + if (inode && (inode->i_state & I_NEW)) { + sb->s_op->read_inode(inode); + unlock_new_inode(inode); + } + + return inode; } extern void __iget(struct inode * inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 499b246788f4..a8a2259a8343 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -170,7 +170,6 @@ struct nfs_inode { #define NFS_INO_REVALIDATING 0x0004 /* revalidating attrs */ #define NFS_IS_SNAPSHOT 0x0010 /* a snapshot file */ #define NFS_INO_FLUSH 0x0020 /* inode is due for flushing */ -#define NFS_INO_NEW 0x0040 /* hadn't been filled yet */ static inline struct nfs_inode *NFS_I(struct inode *inode) { @@ -208,7 +207,6 @@ do { \ #define NFS_FLAGS(inode) (NFS_I(inode)->flags) #define NFS_REVALIDATING(inode) (NFS_FLAGS(inode) & NFS_INO_REVALIDATING) #define NFS_STALE(inode) (NFS_FLAGS(inode) & NFS_INO_STALE) -#define NFS_NEW(inode) (NFS_FLAGS(inode) & NFS_INO_NEW) #define NFS_FILEID(inode) 
(NFS_I(inode)->fileid) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 22c5547e86bc..c2bfc3fd4ed5 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1564,7 +1564,7 @@ extern struct item_operations * item_ops [TYPE_ANY + 1]; #define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos))) #define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0) -struct reiserfs_iget4_args { +struct reiserfs_iget_args { __u32 objectid ; } ; @@ -1818,8 +1818,7 @@ void padd_item (char * item, int total_length, int length); /* inode.c */ -void reiserfs_read_inode (struct inode * inode) ; -void reiserfs_read_inode2(struct inode * inode, void *p) ; +void reiserfs_read_locked_inode(struct inode * inode, struct reiserfs_iget_args *args) ; int reiserfs_find_actor(struct inode * inode, void *p) ; int reiserfs_init_locked_inode(struct inode * inode, void *p) ; void reiserfs_delete_inode (struct inode * inode); -- cgit v1.2.3 From 16fb4ea349c270306e745c4288bd139fba6dbd18 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:25:12 -0700 Subject: [PATCH] iget_locked [4/6] Now that we have no more users of iget4 we can kill the function and the associated read_inode2 callback (i.e. the 'reiserfs specific hack'). Document iget5_locked as the replacement for iget4 in filesystems/porting. 
--- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/porting | 33 +++++++++++++++++++++++++++++++++ fs/inode.c | 21 --------------------- include/linux/fs.h | 8 -------- kernel/ksyms.c | 1 - 5 files changed, 34 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1acc415a99a3..72288b4d8f9e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -115,7 +115,7 @@ statfs: yes no no remount_fs: yes yes maybe (see below) umount_begin: yes no maybe (see below) -->read_inode() is not a method - it's a callback used in iget()/iget4(). +->read_inode() is not a method - it's a callback used in iget(). rules for mount_sem are not too nice - it is going to die and be replaced by better scheme anyway. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index df06a180b650..ce31f689bdc2 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -152,3 +152,36 @@ settles down a bit. s_export_op is now required for exporting a filesystem. isofs, ext2, ext3, resierfs, fat can be used as examples of very different filesystems. + +--- +[mandatory] + +iget4() and the read_inode2 callback have been superseded by iget5_locked() +which has the following prototype, + + struct inode *iget5_locked(struct super_block *sb, unsigned long ino, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), + void *data); + +'test' is an additional function that can be used when the inode +number is not sufficient to identify the actual file object. 'set' +should be a non-blocking function that initializes those parts of a +newly created inode to allow the test function to succeed. 'data' is +passed as an opaque value to both test and set functions. + +When the inode has been created by iget5_locked(), it will be returned with +the I_NEW flag set and will still be locked. 
read_inode has not been +called so the file system still has to finalize the initialization. Once +the inode is initialized it must be unlocked by calling unlock_new_inode(). + +There is also a simpler iget_locked function that just takes the +superblock and inode number as arguments. + +e.g. + inode = iget_locked(sb, ino); + if (inode && (inode->i_state & I_NEW)) { + read_inode_from_disk(inode); + unlock_new_inode(inode); + } + diff --git a/fs/inode.c b/fs/inode.c index c56a53a35c9b..58e41be7ee76 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -669,27 +669,6 @@ EXPORT_SYMBOL(iget5_locked); EXPORT_SYMBOL(iget_locked); EXPORT_SYMBOL(unlock_new_inode); -struct inode *iget4(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) -{ - struct inode *inode = iget5_locked(sb, ino, test, set, data); - - if (inode && (inode->i_state & I_NEW)) { - /* reiserfs specific hack right here. We don't - ** want this to last, and are looking for VFS changes - ** that will allow us to get rid of it. - ** -- mason@suse.com - */ - if (sb->s_op->read_inode2) { - sb->s_op->read_inode2(inode, data); - } else { - sb->s_op->read_inode(inode); - } - unlock_new_inode(inode); - } - - return inode; -} - /** * insert_inode_hash - hash an inode * @inode: unhashed inode diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f82322fb04b..4ec6c2fe76c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -769,13 +769,6 @@ struct super_operations { void (*read_inode) (struct inode *); - /* reiserfs kludge. reiserfs needs 64 bits of information to - ** find an inode. We are using the read_inode2 call to get - ** that information. We don't like this, and are waiting on some - ** VFS changes for the real solution. 
- ** iget4 calls read_inode2, iff it is defined - */ - void (*read_inode2) (struct inode *, void *) ; void (*dirty_inode) (struct inode *); void (*write_inode) (struct inode *, int); void (*put_inode) (struct inode *); @@ -1212,7 +1205,6 @@ extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*te extern struct inode * iget_locked(struct super_block *, unsigned long); extern void unlock_new_inode(struct inode *); -extern struct inode * iget4(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); static inline struct inode *iget(struct super_block *sb, unsigned long ino) { struct inode *inode = iget_locked(sb, ino); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 9001c829f4ef..a3a721cbd426 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -137,7 +137,6 @@ EXPORT_SYMBOL(fput); EXPORT_SYMBOL(fget); EXPORT_SYMBOL(igrab); EXPORT_SYMBOL(iunique); -EXPORT_SYMBOL(iget4); EXPORT_SYMBOL(iput); EXPORT_SYMBOL(inode_init_once); EXPORT_SYMBOL(force_delete); -- cgit v1.2.3 From aa624c8d24de1cee65483886b3f4ffc2fbc72980 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:25:16 -0700 Subject: [PATCH] iget_locked [5/6] This patch starts taking i_ino dependencies out of the VFS. The FS provided test and set callbacks become responsible for testing and setting inode->i_ino. Because most filesystems are based on 32-bit unique inode numbers several functions are duplicated to keep iget_locked as a fast path. We can avoid unnecessary pointer dereferences and function calls for this specific case. 
--- Documentation/filesystems/porting | 6 +- fs/coda/cnode.c | 1 + fs/inode.c | 113 ++++++++++++++++++++++++++++++++++---- fs/nfs/inode.c | 2 + fs/reiserfs/inode.c | 11 ++-- fs/reiserfs/super.c | 3 +- include/linux/reiserfs_fs.h | 1 + 7 files changed, 118 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index ce31f689bdc2..5e1e47711009 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -175,8 +175,10 @@ the I_NEW flag set and will still be locked. read_inode has not been called so the file system still has to finalize the initialization. Once the inode is initialized it must be unlocked by calling unlock_new_inode(). -There is also a simpler iget_locked function that just takes the -superblock and inode number as arguments. +The filesystem is responsible for setting (and possibly testing) i_ino +when appropriate. There is also a simpler iget_locked function that +just takes the superblock and inode number as arguments and does the +test and set for you. e.g. inode = iget_locked(sb, ino); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index f541d14c47f1..090a16fb6abf 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -83,6 +83,7 @@ struct inode * coda_iget(struct super_block * sb, ViceFid * fid, if (inode->i_state & I_NEW) { cii = ITOC(inode); + inode->i_ino = ino; list_add(&cii->c_cilist, &sbi->sbi_cihead); unlock_new_inode(inode); } diff --git a/fs/inode.c b/fs/inode.c index 58e41be7ee76..9d6db0e68210 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -453,7 +453,32 @@ int shrink_icache_memory(int priority, int gfp_mask) * by hand after calling find_inode now! This simplifies iunique and won't * add any additional branch in the common code. 
*/ -static struct inode * find_inode(struct super_block * sb, unsigned long ino, struct list_head *head, int (*test)(struct inode *, void *), void *data) +static struct inode * find_inode(struct super_block * sb, struct list_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct list_head *tmp; + struct inode * inode; + + tmp = head; + for (;;) { + tmp = tmp->next; + inode = NULL; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + break; + } + return inode; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct list_head *head, unsigned long ino) { struct list_head *tmp; struct inode * inode; @@ -469,8 +494,6 @@ static struct inode * find_inode(struct super_block * sb, unsigned long ino, str continue; if (inode->i_sb != sb) continue; - if (test && !test(inode, data)) - continue; break; } return inode; @@ -523,10 +546,9 @@ void unlock_new_inode(struct inode *inode) * We no longer cache the sb_flags in i_flags - see fs.h * -- rmk@arm.uk.linux.org */ -static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, struct list_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +static struct inode * get_new_inode(struct super_block *sb, struct list_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode * inode; - int err = 0; inode = alloc_inode(sb); if (inode) { @@ -534,10 +556,9 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s spin_lock(&inode_lock); /* We released the lock, so.. 
*/ - old = find_inode(sb, ino, head, test, data); + old = find_inode(sb, head, test, data); if (!old) { - inode->i_ino = ino; - if (set && set(inode, data)) + if (set(inode, data)) goto set_failed; inodes_stat.nr_inodes++; @@ -571,6 +592,49 @@ set_failed: return NULL; } +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct list_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. 
+ */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) { unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES); @@ -605,7 +669,8 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) retry: if (counter > max_reserved) { head = inode_hashtable + hash(sb,counter); - inode = find_inode(sb, res = counter++, head, NULL, NULL); + res = counter++; + inode = find_inode_fast(sb, head, res); if (!inode) { spin_unlock(&inode_lock); return res; @@ -644,7 +709,7 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long ino, int (*test struct inode * inode; spin_lock(&inode_lock); - inode = find_inode(sb, ino, head, test, data); + inode = find_inode(sb, head, test, data); if (inode) { __iget(inode); spin_unlock(&inode_lock); @@ -657,12 +722,36 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long ino, int (*test * get_new_inode() will do the right thing, re-trying the search * in case it had to block at any point. */ - return get_new_inode(sb, ino, head, test, set, data); + return get_new_inode(sb, head, test, set, data); } +/* + * Because most filesystems are based on 32-bit unique inode numbers some + * functions are duplicated to keep iget_locked as a fast path. We can avoid + * unnecessary pointer dereferences and function calls for this specific + * case. The duplicated functions (find_inode_fast and get_new_inode_fast) + * have the same pre- and post-conditions as their original counterparts. 
+ */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { - return iget5_locked(sb, ino, NULL, NULL, NULL); + struct list_head * head = inode_hashtable + hash(sb, ino); + struct inode * inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); } EXPORT_SYMBOL(iget5_locked); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 63710b5552ec..0011043d51cb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,6 +661,8 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) loff_t new_isize; time_t new_atime; + inode->i_ino = ino; + /* We can't support UPDATE_ATIME(), since the server will reset it */ inode->i_flags |= S_NOATIME; inode->i_mode = fattr->mode; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f6984b0f8d2c..24dad23cbbf4 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1136,7 +1136,8 @@ static void reiserfs_make_bad_inode(struct inode *inode) { int reiserfs_init_locked_inode (struct inode * inode, void *p) { struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ; - INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->objectid); + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); return 0; } @@ -1149,7 +1150,7 @@ void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args unsigned long dirino; int retval; - dirino = args->objectid ; + dirino = args->dirid ; /* set version 1, version 2 could be used too, because stat data key is the same in both versions */ @@ -1223,7 +1224,8 @@ int reiserfs_find_actor( struct inode *inode, void *opaque ) args = opaque; /* args is already in CPU order */ - return 
le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args -> objectid; + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); } struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key) @@ -1231,7 +1233,8 @@ struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key struct inode * inode; struct reiserfs_iget_args args ; - args.objectid = key->on_disk_key.k_dir_id ; + args.objectid = key->on_disk_key.k_objectid ; + args.dirid = key->on_disk_key.k_dir_id ; inode = iget5_locked (s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!inode) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 83a01771ed84..b52e704d6c7f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1067,7 +1067,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("clm-7000: Detected readonly device, marking FS readonly\n") ; s->s_flags |= MS_RDONLY ; } - args.objectid = REISERFS_ROOT_PARENT_OBJECTID ; + args.objectid = REISERFS_ROOT_OBJECTID ; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID ; root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { printk ("reiserfs_fill_super: get root inode failed\n"); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index c2bfc3fd4ed5..a3172f03b2f4 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1566,6 +1566,7 @@ extern struct item_operations * item_ops [TYPE_ANY + 1]; struct reiserfs_iget_args { __u32 objectid ; + __u32 dirid ; } ; /***************************************************************************/ -- cgit v1.2.3 From 9b406173ebec1ced8c3071d6df1f025aa9a1d488 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Sun, 19 May 2002 19:25:21 -0700 Subject: [PATCH] iget_locked [6/6] As of the last patch the inode_hashtable doesn't really need 
to be indexed by i_ino anymore, the only reason we still have to keep the hashvalue and i_ino identical is because of insert_inode_hash. If at some point a FS specific getattr method is implemented it will be possible to completely remove any use of i_ino by the VFS. --- fs/coda/cnode.c | 17 +++++++++-------- fs/inode.c | 16 +++++++++------- fs/nfs/inode.c | 10 ++++++---- include/linux/fs.h | 7 ++++++- kernel/ksyms.c | 2 +- 5 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 090a16fb6abf..60ee649aacc5 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -74,16 +74,17 @@ struct inode * coda_iget(struct super_block * sb, ViceFid * fid, struct inode *inode; struct coda_inode_info *cii; struct coda_sb_info *sbi = coda_sbp(sb); - ino_t ino = coda_f2i(fid); + unsigned long hash = coda_f2i(fid); - inode = iget5_locked(sb, ino, coda_test_inode, coda_set_inode, fid); + inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { cii = ITOC(inode); - inode->i_ino = ino; + /* we still need to set i_ino for things like stat(2) */ + inode->i_ino = hash; list_add(&cii->c_cilist, &sbi->sbi_cihead); unlock_new_inode(inode); } @@ -124,6 +125,7 @@ void coda_replace_fid(struct inode *inode, struct ViceFid *oldfid, struct ViceFid *newfid) { struct coda_inode_info *cii; + unsigned long hash = coda_f2i(newfid); cii = ITOC(inode); @@ -134,23 +136,22 @@ void coda_replace_fid(struct inode *inode, struct ViceFid *oldfid, /* XXX we probably need to hold some lock here! */ remove_inode_hash(inode); cii->c_fid = *newfid; - inode->i_ino = coda_f2i(newfid); - insert_inode_hash(inode); + inode->i_ino = hash; + __insert_inode_hash(inode, hash); } /* convert a fid to an inode. 
*/ struct inode *coda_fid_to_inode(ViceFid *fid, struct super_block *sb) { - ino_t nr; struct inode *inode; + unsigned long hash = coda_f2i(fid); if ( !sb ) { printk("coda_fid_to_inode: no sb!\n"); return NULL; } - nr = coda_f2i(fid); - inode = iget5_locked(sb, nr, coda_test_inode, coda_fail_inode, fid); + inode = iget5_locked(sb, hash, coda_test_inode, coda_fail_inode, fid); if ( !inode ) return NULL; diff --git a/fs/inode.c b/fs/inode.c index 9d6db0e68210..8389f550e30c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -635,9 +635,9 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct list_hea return inode; } -static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) { - unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES); + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); tmp = tmp + (tmp >> I_HASHBITS); return tmp & I_HASHMASK; } @@ -703,9 +703,9 @@ struct inode *igrab(struct inode *inode) * the filesystem gets back a new locked and hashed inode and gets * to fill it in before unlocking it via unlock_new_inode(). */ -struct inode *iget5_locked(struct super_block *sb, unsigned long ino, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct list_head * head = inode_hashtable + hash(sb,ino); + struct list_head * head = inode_hashtable + hash(sb, hashval); struct inode * inode; spin_lock(&inode_lock); @@ -759,18 +759,20 @@ EXPORT_SYMBOL(iget_locked); EXPORT_SYMBOL(unlock_new_inode); /** - * insert_inode_hash - hash an inode + * __insert_inode_hash - hash an inode * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. 
* * Add an inode to the inode hash for this superblock. If the inode * has no superblock it is added to a separate anonymous chain. */ -void insert_inode_hash(struct inode *inode) +void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct list_head *head = &anon_hash_chain; if (inode->i_sb) - head = inode_hashtable + hash(inode->i_sb, inode->i_ino); + head = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_lock); list_add(&inode->i_hash, head); spin_unlock(&inode_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0011043d51cb..ea03c0e8a850 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -641,7 +641,7 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) fattr: fattr }; struct inode *inode = NULL; - unsigned long ino; + unsigned long hash; if ((fattr->valid & NFS_ATTR_FATTR) == 0) goto out_no_inode; @@ -651,9 +651,9 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - ino = nfs_fattr_to_ino_t(fattr); + hash = nfs_fattr_to_ino_t(fattr); - if (!(inode = iget5_locked(sb, ino, nfs_find_actor, nfs_init_locked, &desc))) + if (!(inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc))) goto out_no_inode; if (inode->i_state & I_NEW) { @@ -661,7 +661,9 @@ __nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) loff_t new_isize; time_t new_atime; - inode->i_ino = ino; + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ + inode->i_ino = hash; /* We can't support UPDATE_ATIME(), since the server will reset it */ inode->i_flags |= S_NOATIME; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4ec6c2fe76c6..f3e56d1f70f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1221,8 +1221,13 @@ extern void __iget(struct inode * inode); extern void clear_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern void remove_suid(struct dentry *); 
-extern void insert_inode_hash(struct inode *); + +extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); +static inline void insert_inode_hash(struct inode *inode) { + __insert_inode_hash(inode, inode->i_ino); +} + extern struct file * get_empty_filp(void); extern void file_move(struct file *f, struct list_head *list); extern void ll_rw_block(int, int, struct buffer_head * bh[]); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index a3a721cbd426..51bf57dfda33 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -537,7 +537,7 @@ EXPORT_SYMBOL(clear_inode); EXPORT_SYMBOL(init_special_inode); EXPORT_SYMBOL(__get_hash_table); EXPORT_SYMBOL(new_inode); -EXPORT_SYMBOL(insert_inode_hash); +EXPORT_SYMBOL(__insert_inode_hash); EXPORT_SYMBOL(remove_inode_hash); EXPORT_SYMBOL(buffer_insert_list); EXPORT_SYMBOL(make_bad_inode); -- cgit v1.2.3 From 61d681d67f15e4bdbc576e2495b0dc2fc76dbf42 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:33:51 -0700 Subject: [PATCH] [1/13] quota-1-newlocks This patch adds dq_dup_ref to struct dquot. Functions that only alter quota usage take just this duplicated reference, while inodes and quotactl() helpers take a real dq_count reference. dqput() blocks if there are outstanding duplicated references and the reference being put is the last 'real' one. This ensures that quota IO is never done from functions altering quota usage (the quota structure is written on the last dqput()).
--- fs/dquot.c | 100 ++++++++++++++++++++++++++++++++++++++++++-------- include/linux/quota.h | 6 ++- 2 files changed, 89 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index 78043b2a618b..ce707f3d7afa 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -35,7 +35,7 @@ * Jan Kara, , sponsored by SuSE CR, 10-11/99 * * Used struct list_head instead of own list struct - * Invalidation of dquots with dq_count > 0 no longer possible + * Invalidation of referenced dquots is no longer possible * Improved free_dquots list management * Quota and i_blocks are now updated in one place to avoid races * Warnings are now delayed so we won't block in critical section @@ -137,6 +137,26 @@ static inline char sb_has_quota_enabled(struct super_block *sb, short type) return is_enabled(sb_dqopt(sb), type); } +static inline void get_dquot_ref(struct dquot *dquot) +{ + dquot->dq_count++; +} + +static inline void put_dquot_ref(struct dquot *dquot) +{ + dquot->dq_count--; +} + +static inline void get_dquot_dup_ref(struct dquot *dquot) +{ + dquot->dq_dup_ref++; +} + +static inline void put_dquot_dup_ref(struct dquot *dquot) +{ + dquot->dq_dup_ref--; +} + static inline int const hashfn(struct super_block *sb, unsigned int id, short type) { return((HASHDEV(sb->s_dev) ^ id) * (MAXQUOTAS - type)) % NR_DQHASH; @@ -244,6 +264,7 @@ static inline void unlock_dquot(struct dquot *dquot) wake_up(&dquot->dq_wait_lock); } +/* Wait for dquot to be unused */ static void __wait_dquot_unused(struct dquot *dquot) { DECLARE_WAITQUEUE(wait, current); @@ -259,6 +280,22 @@ repeat: current->state = TASK_RUNNING; } +/* Wait for all duplicated dquot references to be dropped */ +static void __wait_dup_drop(struct dquot *dquot) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&dquot->dq_wait_free, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (dquot->dq_dup_ref) { + schedule(); + goto repeat; + } + remove_wait_queue(&dquot->dq_wait_free, &wait); + 
current->state = TASK_RUNNING; +} + /* * We don't have to be afraid of deadlocks as we never have quotas on quota files... */ @@ -377,8 +414,10 @@ restart: continue; if (!(dquot->dq_flags & (DQ_MOD | DQ_LOCKED))) continue; - /* Raise use count so quota won't be invalidated. We can't use dqduplicate() as it does too many tests */ - dquot->dq_count++; + /* Get reference to quota so it won't be invalidated. get_dquot_ref() + * is enough since if dquot is locked/modified it can't be + * on the free list */ + get_dquot_ref(dquot); if (dquot->dq_flags & DQ_LOCKED) wait_on_dquot(dquot); if (dquot->dq_flags & DQ_MOD) @@ -433,11 +472,15 @@ int shrink_dqcache_memory(int priority, unsigned int gfp_mask) return 0; } -/* NOTE: If you change this function please check whether dqput_blocks() works right... */ +/* + * Put reference to dquot + * NOTE: If you change this function please check whether dqput_blocks() works right... + */ static void dqput(struct dquot *dquot) { if (!dquot) return; +#ifdef __DQUOT_PARANOIA if (!dquot->dq_count) { printk("VFS: dqput: trying to free free dquot\n"); printk("VFS: device %s, dquot of %s %d\n", @@ -446,12 +489,17 @@ static void dqput(struct dquot *dquot) dquot->dq_id); return; } +#endif dqstats.drops++; we_slept: + if (dquot->dq_dup_ref && dquot->dq_count - dquot->dq_dup_ref <= 1) { /* Last unduplicated reference? */ + __wait_dup_drop(dquot); + goto we_slept; + } if (dquot->dq_count > 1) { /* We have more than one user... We can simply decrement use count */ - dquot->dq_count--; + put_dquot_ref(dquot); return; } if (dquot->dq_flags & DQ_MOD) { @@ -462,10 +510,10 @@ we_slept: /* sanity check */ if (!list_empty(&dquot->dq_free)) { printk(KERN_ERR "dqput: dquot already on free list??\n"); - dquot->dq_count--; /* J.K. Just decrementing use count seems safer... 
*/ + put_dquot_ref(dquot); return; } - dquot->dq_count--; + put_dquot_ref(dquot); /* If dquot is going to be invalidated invalidate_dquots() is going to free it so */ if (!(dquot->dq_flags & DQ_INVAL)) put_dquot_last(dquot); /* Place at end of LRU free queue */ @@ -520,8 +568,9 @@ we_slept: insert_dquot_hash(dquot); read_dquot(dquot); } else { - if (!dquot->dq_count++) + if (!dquot->dq_count) remove_free_dquot(dquot); + get_dquot_ref(dquot); dqstats.cache_hits++; wait_on_dquot(dquot); if (empty) @@ -539,23 +588,39 @@ we_slept: return dquot; } +/* Duplicate reference to dquot got from inode */ static struct dquot *dqduplicate(struct dquot *dquot) { if (dquot == NODQUOT) return NODQUOT; - dquot->dq_count++; + get_dquot_ref(dquot); if (!dquot->dq_sb) { printk(KERN_ERR "VFS: dqduplicate(): Invalidated quota to be duplicated!\n"); - dquot->dq_count--; + put_dquot_ref(dquot); return NODQUOT; } if (dquot->dq_flags & DQ_LOCKED) printk(KERN_ERR "VFS: dqduplicate(): Locked quota to be duplicated!\n"); + get_dquot_dup_ref(dquot); dquot->dq_referenced++; dqstats.lookups++; return dquot; } +/* Put duplicated reference */ +static void dqputduplicate(struct dquot *dquot) +{ + if (!dquot->dq_dup_ref) { + printk(KERN_ERR "VFS: dqputduplicate(): Duplicated dquot put without duplicate reference.\n"); + return; + } + put_dquot_dup_ref(dquot); + if (!dquot->dq_dup_ref) + wake_up(&dquot->dq_wait_free); + put_dquot_ref(dquot); + dqstats.drops++; +} + static int dqinit_needed(struct inode *inode, short type) { int cnt; @@ -599,7 +664,9 @@ restart: /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */ static inline int dqput_blocks(struct dquot *dquot) { - if (dquot->dq_count == 1) + if (dquot->dq_dup_ref && dquot->dq_count - dquot->dq_dup_ref <= 1) + return 1; + if (dquot->dq_count <= 1 && dquot->dq_flags & DQ_MOD) return 1; return 0; } @@ -1065,7 +1132,7 @@ warn_put_all: flush_warnings(dquot, warntype); for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (dquot[cnt] 
!= NODQUOT) - dqput(dquot[cnt]); + dqputduplicate(dquot[cnt]); unlock_kernel(); return ret; } @@ -1104,7 +1171,7 @@ warn_put_all: flush_warnings(dquot, warntype); for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (dquot[cnt] != NODQUOT) - dqput(dquot[cnt]); + dqputduplicate(dquot[cnt]); unlock_kernel(); return ret; } @@ -1124,7 +1191,7 @@ void dquot_free_block(struct inode *inode, unsigned long number) if (dquot == NODQUOT) continue; dquot_decr_blocks(dquot, number); - dqput(dquot); + dqputduplicate(dquot); } inode->i_blocks -= number << (BLOCK_SIZE_BITS - 9); unlock_kernel(); @@ -1146,7 +1213,7 @@ void dquot_free_inode(const struct inode *inode, unsigned long number) if (dquot == NODQUOT) continue; dquot_decr_inodes(dquot, number); - dqput(dquot); + dqputduplicate(dquot); } unlock_kernel(); /* NOBLOCK End */ @@ -1233,8 +1300,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) warn_put_all: flush_warnings(transfer_to, warntype); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + /* First we must put duplicate - otherwise we might deadlock */ if (transfer_to[cnt] != NODQUOT) - dqput(transfer_to[cnt]); + dqputduplicate(transfer_to[cnt]); if (transfer_from[cnt] != NODQUOT) dqput(transfer_from[cnt]); } diff --git a/include/linux/quota.h b/include/linux/quota.h index b2d5de7368f6..0b4bb2b87580 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -41,6 +41,9 @@ #include +#define __DQUOT_VERSION__ "dquot_6.5.1" +#define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 + /* * Convert diskblocks to blocks and the other way around. 
*/ @@ -161,7 +164,8 @@ struct dquot { struct list_head dq_free; /* Free list element */ wait_queue_head_t dq_wait_lock; /* Pointer to waitqueue on dquot lock */ wait_queue_head_t dq_wait_free; /* Pointer to waitqueue for quota to be unused */ - int dq_count; /* Reference count */ + int dq_count; /* Use count */ + int dq_dup_ref; /* Number of duplicated references */ /* fields after this point are cleared when invalidating */ struct super_block *dq_sb; /* superblock this applies to */ -- cgit v1.2.3 From b80d2549d1fadb8e5498bb8202793e80d9b4d7b7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:33:56 -0700 Subject: [PATCH] [2/13] quota-2-formats This patch removes most format dependent code from dquot.c and quota.h and puts calls of callback functions instead. --- fs/dquot.c | 266 +++++++++----------------------------------------- include/linux/fs.h | 10 +- include/linux/quota.h | 83 +++++++++++----- 3 files changed, 108 insertions(+), 251 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index ce707f3d7afa..55245e75b2d9 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -71,7 +71,7 @@ int nr_dquots, nr_free_dquots; static char *quotatypes[] = INITQFNAMES; -static inline struct quota_mount_options *sb_dqopt(struct super_block *sb) +static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } @@ -121,7 +121,7 @@ static struct dqstats dqstats; static void dqput(struct dquot *); static struct dquot *dqduplicate(struct dquot *); -static inline char is_enabled(struct quota_mount_options *dqopt, short type) +static inline char is_enabled(struct quota_info *dqopt, short type) { switch (type) { case USRQUOTA: @@ -296,73 +296,28 @@ repeat: current->state = TASK_RUNNING; } -/* - * We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */ -static void write_dquot(struct dquot *dquot) +static int read_dqblk(struct dquot *dquot) { - short type = dquot->dq_type; - struct file *filp; - mm_segment_t fs; - loff_t offset; - ssize_t ret; - struct semaphore *sem = &dquot->dq_sb->s_dquot.dqio_sem; - struct dqblk dqbuf; - - down(sem); - filp = dquot->dq_sb->s_dquot.files[type]; - offset = dqoff(dquot->dq_id); - fs = get_fs(); - set_fs(KERNEL_DS); + int ret; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - /* - * Note: clear the DQ_MOD flag unconditionally, - * so we don't loop forever on failure. - */ - memcpy(&dqbuf, &dquot->dq_dqb, sizeof(struct dqblk)); - dquot->dq_flags &= ~DQ_MOD; - ret = 0; - if (filp) - ret = filp->f_op->write(filp, (char *)&dqbuf, - sizeof(struct dqblk), &offset); - if (ret != sizeof(struct dqblk)) - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", - dquot->dq_sb->s_id); - - set_fs(fs); - up(sem); - dqstats.writes++; + lock_dquot(dquot); + down(&dqopt->dqio_sem); + ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot); + up(&dqopt->dqio_sem); + unlock_dquot(dquot); + return ret; } -static void read_dquot(struct dquot *dquot) +static int commit_dqblk(struct dquot *dquot) { - short type = dquot->dq_type; - struct file *filp; - mm_segment_t fs; - loff_t offset; + int ret; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - filp = dquot->dq_sb->s_dquot.files[type]; - if (filp == (struct file *)NULL) - return; - - lock_dquot(dquot); - if (!dquot->dq_sb) /* Invalidated quota? 
*/ - goto out_lock; - /* Now we are sure filp is valid - the dquot isn't invalidated */ - down(&dquot->dq_sb->s_dquot.dqio_sem); - offset = dqoff(dquot->dq_id); - fs = get_fs(); - set_fs(KERNEL_DS); - filp->f_op->read(filp, (char *)&dquot->dq_dqb, sizeof(struct dqblk), &offset); - up(&dquot->dq_sb->s_dquot.dqio_sem); - set_fs(fs); - - if (dquot->dq_bhardlimit == 0 && dquot->dq_bsoftlimit == 0 && - dquot->dq_ihardlimit == 0 && dquot->dq_isoftlimit == 0) - dquot->dq_flags |= DQ_FAKE; - dqstats.reads++; -out_lock: - unlock_dquot(dquot); + down(&dqopt->dqio_sem); + ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); + up(&dqopt->dqio_sem); + return ret; } /* Invalidate all dquots on the list, wait for all users. Note that this function is called @@ -412,7 +367,7 @@ restart: continue; if (!dquot->dq_sb) /* Invalidated? */ continue; - if (!(dquot->dq_flags & (DQ_MOD | DQ_LOCKED))) + if (!dquot_dirty(dquot) && !(dquot->dq_flags & DQ_LOCKED)) continue; /* Get reference to quota so it won't be invalidated. 
get_dquot_ref() * is enough since if dquot is locked/modified it can't be @@ -420,8 +375,8 @@ restart: get_dquot_ref(dquot); if (dquot->dq_flags & DQ_LOCKED) wait_on_dquot(dquot); - if (dquot->dq_flags & DQ_MOD) - write_dquot(dquot); + if (dquot_dirty(dquot)) + commit_dqblk(dquot); dqput(dquot); goto restart; } @@ -502,8 +457,8 @@ we_slept: put_dquot_ref(dquot); return; } - if (dquot->dq_flags & DQ_MOD) { - write_dquot(dquot); + if (dquot_dirty(dquot)) { + commit_dqblk(dquot); goto we_slept; } @@ -520,7 +475,7 @@ we_slept: wake_up(&dquot->dq_wait_free); } -static struct dquot *get_empty_dquot(void) +static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; @@ -534,6 +489,8 @@ static struct dquot *get_empty_dquot(void) INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); INIT_LIST_HEAD(&dquot->dq_hash); + dquot->dq_sb = sb; + dquot->dq_type = type; dquot->dq_count = 1; /* all dquots go on the inuse_list */ put_inuse(dquot); @@ -545,7 +502,7 @@ static struct dquot *dqget(struct super_block *sb, unsigned int id, short type) { unsigned int hashent = hashfn(sb, id, type); struct dquot *dquot, *empty = NODQUOT; - struct quota_mount_options *dqopt = sb_dqopt(sb); + struct quota_info *dqopt = sb_dqopt(sb); we_slept: if (!is_enabled(dqopt, type)) { @@ -556,17 +513,15 @@ we_slept: if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { if (empty == NODQUOT) { - if ((empty = get_empty_dquot()) == NODQUOT) + if ((empty = get_empty_dquot(sb, type)) == NODQUOT) schedule(); /* Try to wait for a moment... 
*/ goto we_slept; } dquot = empty; dquot->dq_id = id; - dquot->dq_type = type; - dquot->dq_sb = sb; /* hash it first so it can be found */ insert_dquot_hash(dquot); - read_dquot(dquot); + read_dqblk(dquot); } else { if (!dquot->dq_count) remove_free_dquot(dquot); @@ -720,13 +675,13 @@ void put_dquot_list(struct list_head *tofree_head) static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) { dquot->dq_curinodes += number; - dquot->dq_flags |= DQ_MOD; + mark_dquot_dirty(dquot); } static inline void dquot_incr_blocks(struct dquot *dquot, unsigned long number) { dquot->dq_curblocks += number; - dquot->dq_flags |= DQ_MOD; + mark_dquot_dirty(dquot); } static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) @@ -738,7 +693,7 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) if (dquot->dq_curinodes < dquot->dq_isoftlimit) dquot->dq_itime = (time_t) 0; dquot->dq_flags &= ~DQ_INODES; - dquot->dq_flags |= DQ_MOD; + mark_dquot_dirty(dquot); } static inline void dquot_decr_blocks(struct dquot *dquot, unsigned long number) @@ -750,7 +705,7 @@ static inline void dquot_decr_blocks(struct dquot *dquot, unsigned long number) if (dquot->dq_curblocks < dquot->dq_bsoftlimit) dquot->dq_btime = (time_t) 0; dquot->dq_flags &= ~DQ_BLKS; - dquot->dq_flags |= DQ_MOD; + mark_dquot_dirty(dquot); } static inline int need_print_warning(struct dquot *dquot, int flag) @@ -823,7 +778,7 @@ static inline void flush_warnings(struct dquot **dquots, char *warntype) static inline char ignore_hardlimit(struct dquot *dquot) { - return capable(CAP_SYS_RESOURCE) && !dquot->dq_sb->s_dquot.rsquash[dquot->dq_type]; + return capable(CAP_SYS_RESOURCE); } static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) @@ -851,7 +806,7 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) (dquot->dq_curinodes + inodes) > dquot->dq_isoftlimit && dquot->dq_itime == 0) { *warntype = ISOFTWARN; - 
dquot->dq_itime = CURRENT_TIME + dquot->dq_sb->s_dquot.inode_expire[dquot->dq_type]; + dquot->dq_itime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } return QUOTA_OK; @@ -885,7 +840,7 @@ static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *war dquot->dq_btime == 0) { if (!prealloc) { *warntype = BSOFTWARN; - dquot->dq_btime = CURRENT_TIME + dquot->dq_sb->s_dquot.block_expire[dquot->dq_type]; + dquot->dq_btime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; } else /* @@ -898,83 +853,6 @@ static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *war return QUOTA_OK; } -/* - * Initialize a dquot-struct with new quota info. This is used by the - * system call interface functions. - */ -static int set_dqblk(struct super_block *sb, int id, short type, int flags, struct dqblk *dqblk) -{ - struct dquot *dquot; - int error = -EFAULT; - struct dqblk dq_dqblk; - - if (copy_from_user(&dq_dqblk, dqblk, sizeof(struct dqblk))) - return error; - - if (sb && (dquot = dqget(sb, id, type)) != NODQUOT) { - /* We can't block while changing quota structure... 
*/ - if (id > 0 && ((flags & SET_QUOTA) || (flags & SET_QLIMIT))) { - dquot->dq_bhardlimit = dq_dqblk.dqb_bhardlimit; - dquot->dq_bsoftlimit = dq_dqblk.dqb_bsoftlimit; - dquot->dq_ihardlimit = dq_dqblk.dqb_ihardlimit; - dquot->dq_isoftlimit = dq_dqblk.dqb_isoftlimit; - } - - if ((flags & SET_QUOTA) || (flags & SET_USE)) { - if (dquot->dq_isoftlimit && - dquot->dq_curinodes < dquot->dq_isoftlimit && - dq_dqblk.dqb_curinodes >= dquot->dq_isoftlimit) - dquot->dq_itime = CURRENT_TIME + dquot->dq_sb->s_dquot.inode_expire[type]; - dquot->dq_curinodes = dq_dqblk.dqb_curinodes; - if (dquot->dq_curinodes < dquot->dq_isoftlimit) - dquot->dq_flags &= ~DQ_INODES; - if (dquot->dq_bsoftlimit && - dquot->dq_curblocks < dquot->dq_bsoftlimit && - dq_dqblk.dqb_curblocks >= dquot->dq_bsoftlimit) - dquot->dq_btime = CURRENT_TIME + dquot->dq_sb->s_dquot.block_expire[type]; - dquot->dq_curblocks = dq_dqblk.dqb_curblocks; - if (dquot->dq_curblocks < dquot->dq_bsoftlimit) - dquot->dq_flags &= ~DQ_BLKS; - } - - if (id == 0) { - dquot->dq_sb->s_dquot.block_expire[type] = dquot->dq_btime = dq_dqblk.dqb_btime; - dquot->dq_sb->s_dquot.inode_expire[type] = dquot->dq_itime = dq_dqblk.dqb_itime; - } - - if (dq_dqblk.dqb_bhardlimit == 0 && dq_dqblk.dqb_bsoftlimit == 0 && - dq_dqblk.dqb_ihardlimit == 0 && dq_dqblk.dqb_isoftlimit == 0) - dquot->dq_flags |= DQ_FAKE; - else - dquot->dq_flags &= ~DQ_FAKE; - - dquot->dq_flags |= DQ_MOD; - dqput(dquot); - } - return 0; -} - -static int get_quota(struct super_block *sb, int id, short type, struct dqblk *dqblk) -{ - struct dquot *dquot; - struct dqblk data; - int error = -ESRCH; - - if (!sb || !sb_has_quota_enabled(sb, type)) - goto out; - dquot = dqget(sb, id, type); - if (dquot == NODQUOT) - goto out; - - memcpy(&data, &dquot->dq_dqb, sizeof(struct dqblk)); /* We copy data to preserve them from changing */ - dqput(dquot); - error = -EFAULT; - if (dqblk && !copy_to_user(dqblk, &data, sizeof(struct dqblk))) - error = 0; -out: - return error; -} - static 
int get_stats(caddr_t addr) { int error = -EFAULT; @@ -990,47 +868,6 @@ static int get_stats(caddr_t addr) return error; } -static int quota_root_squash(struct super_block *sb, short type, int *addr) -{ - int new_value, error; - - if (!sb) - return(-ENODEV); - - error = -EFAULT; - if (!copy_from_user(&new_value, addr, sizeof(int))) { - sb_dqopt(sb)->rsquash[type] = new_value; - error = 0; - } - return error; -} - -#if 0 /* We are not going to support filesystems without i_blocks... */ -/* - * This is a simple algorithm that calculates the size of a file in blocks. - * This is only used on filesystems that do not have an i_blocks count. - */ -static u_long isize_to_blocks(loff_t isize, size_t blksize_bits) -{ - u_long blocks; - u_long indirect; - - if (!blksize_bits) - blksize_bits = BLOCK_SIZE_BITS; - blocks = (isize >> blksize_bits) + ((isize & ~((1 << blksize_bits)-1)) ? 1 : 0); - if (blocks > 10) { - indirect = ((blocks - 11) >> 8) + 1; /* single indirect blocks */ - if (blocks > (10 + 256)) { - indirect += ((blocks - 267) >> 16) + 1; /* double indirect blocks */ - if (blocks > (10 + 256 + (256 << 8))) - indirect++; /* triple indirect blocks */ - } - blocks += indirect; - } - return blocks; -} -#endif - /* * Externally referenced functions through dquot_operations in inode. 
* @@ -1346,7 +1183,7 @@ struct dquot_operations dquot_operations = { transfer: dquot_transfer }; -static inline void set_enable_flags(struct quota_mount_options *dqopt, short type) +static inline void set_enable_flags(struct quota_info *dqopt, short type) { switch (type) { case USRQUOTA: @@ -1358,7 +1195,7 @@ static inline void set_enable_flags(struct quota_mount_options *dqopt, short typ } } -static inline void reset_enable_flags(struct quota_mount_options *dqopt, short type) +static inline void reset_enable_flags(struct quota_info *dqopt, short type) { switch (type) { case USRQUOTA: @@ -1380,7 +1217,7 @@ int quota_off(struct super_block *sb, short type) { struct file *filp; short cnt; - struct quota_mount_options *dqopt = sb_dqopt(sb); + struct quota_info *dqopt = sb_dqopt(sb); lock_kernel(); if (!sb) @@ -1398,11 +1235,15 @@ int quota_off(struct super_block *sb, short type) /* Note: these are blocking operations */ remove_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); + if (info_dirty(&dqopt->info[cnt])) + dqopt->ops[cnt]->write_file_info(sb, cnt); filp = dqopt->files[cnt]; dqopt->files[cnt] = (struct file *)NULL; - dqopt->inode_expire[cnt] = 0; - dqopt->block_expire[cnt] = 0; + dqopt->info[cnt].dqi_flags = 0; + dqopt->info[cnt].dqi_igrace = 0; + dqopt->info[cnt].dqi_bgrace = 0; + dqopt->ops[cnt] = NULL; fput(filp); } up(&dqopt->dqoff_sem); @@ -1411,20 +1252,12 @@ out: return 0; } -static inline int check_quotafile_size(loff_t size) -{ - ulong blocks = size >> BLOCK_SIZE_BITS; - size_t off = size & (BLOCK_SIZE - 1); - - return !(((blocks % sizeof(struct dqblk)) * BLOCK_SIZE + off % sizeof(struct dqblk)) % sizeof(struct dqblk)); -} - static int quota_on(struct super_block *sb, short type, char *path) { struct file *f; struct inode *inode; struct dquot *dquot; - struct quota_mount_options *dqopt = sb_dqopt(sb); + struct quota_info *dqopt = sb_dqopt(sb); char *tmp; int error; @@ -1451,7 +1284,7 @@ static int quota_on(struct super_block *sb, short type, char *path) 
if (!S_ISREG(inode->i_mode)) goto out_f; error = -EINVAL; - if (inode->i_size == 0 || !check_quotafile_size(inode->i_size)) + if (!check_quota_file(sb, type)) goto out_f; /* We don't want quota on quota files */ dquot_drop(inode); @@ -1461,11 +1294,6 @@ static int quota_on(struct super_block *sb, short type, char *path) sb->dq_op = &dquot_operations; set_enable_flags(dqopt, type); - dquot = dqget(sb, 0, type); - dqopt->inode_expire[type] = (dquot != NODQUOT) ? dquot->dq_itime : MAX_IQ_TIME; - dqopt->block_expire[type] = (dquot != NODQUOT) ? dquot->dq_btime : MAX_DQ_TIME; - dqput(dquot); - add_dquot_ref(sb, type); up(&dqopt->dqoff_sem); diff --git a/include/linux/fs.h b/include/linux/fs.h index cf2020f10428..aaa5e8910ce1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -586,15 +586,13 @@ struct nameidata { #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ #define DQUOT_GRP_ENABLED 0x02 /* Group diskquotas enabled */ -struct quota_mount_options -{ +struct quota_info { unsigned int flags; /* Flags for diskquotas on this device */ struct semaphore dqio_sem; /* lock device while I/O in progress */ struct semaphore dqoff_sem; /* serialize quota_off() and quota_on() on device */ struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ - time_t inode_expire[MAXQUOTAS]; /* expiretime for inode-quota */ - time_t block_expire[MAXQUOTAS]; /* expiretime for block-quota */ - char rsquash[MAXQUOTAS]; /* for quotas threat root as any other user */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each format */ }; /* @@ -643,7 +641,7 @@ struct super_block { struct block_device *s_bdev; struct list_head s_instances; - struct quota_mount_options s_dquot; /* Diskquota specific options */ + struct quota_info s_dquot; /* Diskquota specific options */ char s_id[32]; /* Informational name */ diff --git a/include/linux/quota.h b/include/linux/quota.h index 0b4bb2b87580..820b561ba086 
100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -40,10 +40,13 @@ #define _LINUX_QUOTA_ #include +#include #define __DQUOT_VERSION__ "dquot_6.5.1" #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 +typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ + /* * Convert diskblocks to blocks and the other way around. */ @@ -94,32 +97,49 @@ #define SUBCMDSHIFT 8 #define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) -#define Q_QUOTAON 0x0100 /* enable quotas */ -#define Q_QUOTAOFF 0x0200 /* disable quotas */ -#define Q_GETQUOTA 0x0300 /* get limits and usage */ -#define Q_SETQUOTA 0x0400 /* set limits and usage */ -#define Q_SETUSE 0x0500 /* set usage */ #define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ -#define Q_SETQLIM 0x0700 /* set limits */ -#define Q_GETSTATS 0x0800 /* get collected stats */ -#define Q_RSQUASH 0x1000 /* set root_squash option */ /* - * The following structure defines the format of the disk quota file - * (as it appears on disk) - the file is an array of these structures - * indexed by user or group number. 
+ * Data for one user/group kept in memory */ -struct dqblk { +struct mem_dqblk { __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ __u32 dqb_curblocks; /* current block count */ __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __u32 dqb_isoftlimit; /* preferred inode limit */ __u32 dqb_curinodes; /* current # allocated inodes */ - time_t dqb_btime; /* time limit for excessive disk use */ - time_t dqb_itime; /* time limit for excessive inode use */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ }; +/* + * Data for one quotafile kept in memory + */ +struct mem_dqinfo { + int dqi_flags; + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + union { + } u; +}; + +#ifdef __KERNEL__ + +#define DQF_MASK 0xffff /* Mask for format specific flags */ +#define DQF_INFO_DIRTY 0x10000 /* Is info dirty? */ + +extern inline void mark_info_dirty(struct mem_dqinfo *info) +{ + info->dqi_flags |= DQF_INFO_DIRTY; +} + +#define info_dirty(info) ((info)->dqi_flags & DQF_INFO_DIRTY) + +#define sb_dqopt(sb) (&(sb)->s_dquot) + +#endif /* __KERNEL__ */ + /* * Shorthand notation. 
*/ @@ -134,6 +154,11 @@ struct dqblk { #define dqoff(UID) ((loff_t)((UID) * sizeof (struct dqblk))) +#ifdef __KERNEL__ + +extern int nr_dquots, nr_free_dquots; +extern int dquot_root_squash; + struct dqstats { __u32 lookups; __u32 drops; @@ -145,10 +170,6 @@ struct dqstats { __u32 syncs; }; -#ifdef __KERNEL__ - -extern int dquot_root_squash; - #define NR_DQHASH 43 /* Just an arbitrary number */ #define DQ_LOCKED 0x01 /* dquot under IO */ @@ -174,21 +195,31 @@ struct dquot { short dq_flags; /* See DQ_* */ unsigned long dq_referenced; /* Number of times this dquot was referenced during its lifetime */ - struct dqblk dq_dqb; /* Diskquota usage */ + struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -#define NODQUOT (struct dquot *)NULL +extern inline void mark_dquot_dirty(struct dquot *dquot) +{ + dquot->dq_flags |= DQ_MOD; +} -/* - * Flags used for set_dqblk. - */ -#define SET_QUOTA 0x02 -#define SET_USE 0x04 -#define SET_QLIMIT 0x08 +#define dquot_dirty(dquot) ((dquot)->dq_flags & DQ_MOD) + +#define NODQUOT (struct dquot *)NULL #define QUOTA_OK 0 #define NO_QUOTA 1 +/* Operations which must be implemented by each quota format */ +struct quota_format_ops { + int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ + int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file */ + int (*write_file_info)(struct super_block *sb, int type); /* Write main info about file */ + int (*free_file_info)(struct super_block *sb, int type); /* Called on quotaoff() */ + int (*read_dqblk)(struct dquot *dquot); /* Read structure for one user */ + int (*commit_dqblk)(struct dquot *dquot); /* Write (or delete) structure for one user */ +}; + #else # /* nodep */ include -- cgit v1.2.3 From 48c39f24034b8c3050a016e0d02b3b4704da6835 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:00 -0700 Subject: [PATCH] [3/13] quota-3-register This patch implements list 'quota_formats' with registered quota formats 
and functions register_quota_format() and unregister_quota_format() for manipulating the list. --- fs/Makefile | 2 +- fs/dquot.c | 55 +++++++++++++++++++++++++++++++++++++++------------ include/linux/quota.h | 15 ++++++++++---- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index 2449b05e367a..83769de6e2e4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,7 +7,7 @@ O_TARGET := fs.o -export-objs := filesystems.o open.o dcache.o buffer.o bio.o +export-objs := filesystems.o open.o dcache.o buffer.o bio.o dquot.o mod-subdirs := nls obj-y := open.o read_write.o devices.o file_table.o buffer.o \ diff --git a/fs/dquot.c b/fs/dquot.c index 55245e75b2d9..f4d8fbe1c165 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -70,10 +70,36 @@ int nr_dquots, nr_free_dquots; static char *quotatypes[] = INITQFNAMES; +static struct quota_format_type *quota_formats; /* List of registered formats */ -static inline struct quota_info *sb_dqopt(struct super_block *sb) +int register_quota_format(struct quota_format_type *fmt) { - return &sb->s_dquot; + lock_kernel(); + fmt->qf_next = quota_formats; + quota_formats = fmt; + unlock_kernel(); + return 0; +} + +void unregister_quota_format(struct quota_format_type *fmt) +{ + struct quota_format_type **actqf; + + lock_kernel(); + for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next); + if (*actqf) + *actqf = (*actqf)->qf_next; + unlock_kernel(); +} + +static struct quota_format_type *find_quota_format(int id) +{ + struct quota_format_type *actqf; + + lock_kernel(); + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + unlock_kernel(); + return actqf; } /* @@ -1237,6 +1263,8 @@ int quota_off(struct super_block *sb, short type) invalidate_dquots(sb, cnt); if (info_dirty(&dqopt->info[cnt])) dqopt->ops[cnt]->write_file_info(sb, cnt); + if (dqopt->ops[cnt]->free_file_info) + dqopt->ops[cnt]->free_file_info(sb, cnt); filp = 
dqopt->files[cnt]; dqopt->files[cnt] = (struct file *)NULL; @@ -1252,30 +1280,27 @@ out: return 0; } -static int quota_on(struct super_block *sb, short type, char *path) +static int quota_on(struct super_block *sb, int type, int format_id, char *path) { struct file *f; struct inode *inode; - struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); - char *tmp; + struct quota_format_type *fmt = find_quota_format(format_id); int error; + if (!fmt) + return -EINVAL; if (is_enabled(dqopt, type)) return -EBUSY; down(&dqopt->dqoff_sem); - tmp = getname(path); - error = PTR_ERR(tmp); - if (IS_ERR(tmp)) - goto out_lock; - f = filp_open(tmp, O_RDWR, 0600); - putname(tmp); + f = filp_open(path, O_RDWR, 0600); error = PTR_ERR(f); if (IS_ERR(f)) goto out_lock; + dqopt->files[type] = f; error = -EIO; if (!f->f_op || !f->f_op->read || !f->f_op->write) goto out_f; @@ -1284,13 +1309,16 @@ static int quota_on(struct super_block *sb, short type, char *path) if (!S_ISREG(inode->i_mode)) goto out_f; error = -EINVAL; - if (!check_quota_file(sb, type)) + if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_f; /* We don't want quota on quota files */ dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - dqopt->files[type] = f; + dqopt->ops[type] = fmt->qf_ops; + dqopt->info[type].dqi_format = format_id; + if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) + goto out_f; sb->dq_op = &dquot_operations; set_enable_flags(dqopt, type); @@ -1301,6 +1329,7 @@ static int quota_on(struct super_block *sb, short type, char *path) out_f: filp_close(f, NULL); + dqopt->files[type] = NULL; out_lock: up(&dqopt->dqoff_sem); diff --git a/include/linux/quota.h b/include/linux/quota.h index 820b561ba086..c6f1eacb9bc2 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -42,9 +42,6 @@ #include #include -#define __DQUOT_VERSION__ "dquot_6.5.1" -#define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 - typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ /* @@ -116,7 
+113,10 @@ struct mem_dqblk { /* * Data for one quotafile kept in memory */ +struct quota_format_type; + struct mem_dqinfo { + struct quota_format_type *dqi_format; int dqi_flags; unsigned int dqi_bgrace; unsigned int dqi_igrace; @@ -157,7 +157,6 @@ extern inline void mark_info_dirty(struct mem_dqinfo *info) #ifdef __KERNEL__ extern int nr_dquots, nr_free_dquots; -extern int dquot_root_squash; struct dqstats { __u32 lookups; @@ -170,6 +169,8 @@ struct dqstats { __u32 syncs; }; +extern struct dqstats dqstats; + #define NR_DQHASH 43 /* Just an arbitrary number */ #define DQ_LOCKED 0x01 /* dquot under IO */ @@ -220,6 +221,12 @@ struct quota_format_ops { int (*commit_dqblk)(struct dquot *dquot); /* Write (or delete) structure for one user */ }; +struct quota_format_type { + int qf_fmt_id; /* Quota format id */ + struct quota_format_ops *qf_ops; /* Operations of format */ + struct quota_format_type *qf_next; +}; + #else # /* nodep */ include -- cgit v1.2.3 From f48acc23db77f14f59e8337ad6e015d9fe8d47b2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:05 -0700 Subject: [PATCH] [4/13] quota-4-getstats This patch moves reporting of quota statistics from Q_GETSTATS call to /proc/fs/quota. Also reporting of registered quota formats is added. 
--- fs/dquot.c | 103 ++++++++++++++++++++++++++++++-------------------- include/linux/quota.h | 3 ++ 2 files changed, 65 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index f4d8fbe1c165..b824ce109c57 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -62,11 +62,10 @@ #include #include #include +#include #include -#define __DQUOT_VERSION__ "dquot_6.4.0" - int nr_dquots, nr_free_dquots; static char *quotatypes[] = INITQFNAMES; @@ -879,21 +878,6 @@ static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *war return QUOTA_OK; } -static int get_stats(caddr_t addr) -{ - int error = -EFAULT; - struct dqstats stats; - - dqstats.allocated_dquots = nr_dquots; - dqstats.free_dquots = nr_free_dquots; - - /* make a copy, in case we page-fault in user space */ - memcpy(&stats, &dqstats, sizeof(struct dqstats)); - if (!copy_to_user(addr, &stats, sizeof(struct dqstats))) - error = 0; - return error; -} - /* * Externally referenced functions through dquot_operations in inode. * @@ -1172,30 +1156,6 @@ warn_put_all: return ret; } -static ctl_table fs_table[] = { - {FS_NRDQUOT, "dquot-nr", &nr_dquots, 2*sizeof(int), - 0444, NULL, &proc_dointvec}, - {}, -}; - -static ctl_table dquot_table[] = { - {CTL_FS, "fs", NULL, 0, 0555, fs_table}, - {}, -}; - -static int __init dquot_init(void) -{ - int i; - - register_sysctl_table(dquot_table, 0); - - for (i = 0; i < NR_DQHASH; i++) - INIT_LIST_HEAD(dquot_hash + i); - printk(KERN_NOTICE "VFS: Diskquotas version %s initialized\n", __DQUOT_VERSION__); - return 0; -} -__initcall(dquot_init); - /* * Definitions of diskquota operations. 
*/ @@ -1439,3 +1399,64 @@ out: unlock_kernel(); return ret; } + +#ifdef CONFIG_PROC_FS +static int read_stats(char *buffer, char **start, off_t offset, int count, int *eof, void *data) +{ + int len; + struct quota_format_type *actqf; + + dqstats.allocated_dquots = nr_dquots; + dqstats.free_dquots = nr_free_dquots; + + len = sprintf(buffer, "Version %u\n", __DQUOT_NUM_VERSION__); + len += sprintf(buffer + len, "Formats"); + lock_kernel(); + for (actqf = quota_formats; actqf; actqf = actqf->qf_next) + len += sprintf(buffer + len, " %u", actqf->qf_id); + unlock_kernel(); + len += sprintf(buffer + len, "\n%u %u %u %u %u %u %u %u\n", + dqstats.lookups, dqstats.drops, + dqstats.reads, dqstats.writes, + dqstats.cache_hits, dqstats.allocated_dquots, + dqstats.free_dquots, dqstats.syncs); + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} +#endif + +static ctl_table fs_table[] = { + {FS_NRDQUOT, "dquot-nr", &nr_dquots, 2*sizeof(int), + 0444, NULL, &proc_dointvec}, + {}, +}; + +static ctl_table dquot_table[] = { + {CTL_FS, "fs", NULL, 0, 0555, fs_table}, + {}, +}; + +static int __init dquot_init(void) +{ + int i; + + register_sysctl_table(dquot_table, 0); + for (i = 0; i < NR_DQHASH; i++) + INIT_LIST_HEAD(dquot_hash + i); + printk(KERN_NOTICE "VFS: Diskquotas version %s initialized\n", __DQUOT_VERSION__); +#ifdef CONFIG_PROC_FS + create_proc_read_entry("fs/quota", 0, 0, read_stats, NULL); +#endif + return 0; +} +__initcall(dquot_init); diff --git a/include/linux/quota.h b/include/linux/quota.h index c6f1eacb9bc2..b17397cb51ba 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -42,6 +42,9 @@ #include #include +#define __DQUOT_VERSION__ "dquot_6.5.1" +#define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 + typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ /* -- cgit v1.2.3 From f0071c7be1b10da1f9341fefdea1c2fab72880ac 
Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:10 -0700 Subject: [PATCH] [5/13] quota-5-space This patch implements accounting of used space in bytes. --- fs/dquot.c | 50 +++++++++++++++++----------------- include/linux/fs.h | 4 +-- include/linux/quota.h | 36 ++++++++----------------- include/linux/quotaops.h | 70 +++++++++++++++++++++++++++--------------------- 4 files changed, 77 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index b824ce109c57..d697e4b18e16 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -703,9 +703,9 @@ static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) mark_dquot_dirty(dquot); } -static inline void dquot_incr_blocks(struct dquot *dquot, unsigned long number) +static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) { - dquot->dq_curblocks += number; + dquot->dq_curspace += number; mark_dquot_dirty(dquot); } @@ -721,13 +721,13 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) mark_dquot_dirty(dquot); } -static inline void dquot_decr_blocks(struct dquot *dquot, unsigned long number) +static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) { - if (dquot->dq_curblocks > number) - dquot->dq_curblocks -= number; + if (dquot->dq_curspace > number) + dquot->dq_curspace -= number; else - dquot->dq_curblocks = 0; - if (dquot->dq_curblocks < dquot->dq_bsoftlimit) + dquot->dq_curspace = 0; + if (toqb(dquot->dq_curspace) < dquot->dq_bsoftlimit) dquot->dq_btime = (time_t) 0; dquot->dq_flags &= ~DQ_BLKS; mark_dquot_dirty(dquot); @@ -837,14 +837,14 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) return QUOTA_OK; } -static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *warntype) +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) { *warntype = 0; - if (blocks <= 0 || dquot->dq_flags & DQ_FAKE) + if (space <= 0 || 
dquot->dq_flags & DQ_FAKE) return QUOTA_OK; if (dquot->dq_bhardlimit && - (dquot->dq_curblocks + blocks) > dquot->dq_bhardlimit && + toqb(dquot->dq_curspace + space) > dquot->dq_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = BHARDWARN; @@ -852,7 +852,7 @@ static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *war } if (dquot->dq_bsoftlimit && - (dquot->dq_curblocks + blocks) > dquot->dq_bsoftlimit && + toqb(dquot->dq_curspace + space) > dquot->dq_bsoftlimit && dquot->dq_btime && CURRENT_TIME >= dquot->dq_btime && !ignore_hardlimit(dquot)) { if (!prealloc) @@ -861,7 +861,7 @@ static int check_bdq(struct dquot *dquot, ulong blocks, char prealloc, char *war } if (dquot->dq_bsoftlimit && - (dquot->dq_curblocks + blocks) > dquot->dq_bsoftlimit && + toqb(dquot->dq_curspace + space) > dquot->dq_bsoftlimit && dquot->dq_btime == 0) { if (!prealloc) { *warntype = BSOFTWARN; @@ -948,7 +948,7 @@ void dquot_drop(struct inode *inode) /* * This operation can block, but only after everything is updated */ -int dquot_alloc_block(struct inode *inode, unsigned long number, char warn) +int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) { int cnt, ret = NO_QUOTA; struct dquot *dquot[MAXQUOTAS]; @@ -970,9 +970,9 @@ int dquot_alloc_block(struct inode *inode, unsigned long number, char warn) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquot[cnt] == NODQUOT) continue; - dquot_incr_blocks(dquot[cnt], number); + dquot_incr_space(dquot[cnt], number); } - inode->i_blocks += number << (BLOCK_SIZE_BITS - 9); + inode->i_blocks += number >> 9; /* NOBLOCK End */ ret = QUOTA_OK; warn_put_all: @@ -1026,7 +1026,7 @@ warn_put_all: /* * This is a non-blocking operation. 
*/ -void dquot_free_block(struct inode *inode, unsigned long number) +void dquot_free_space(struct inode *inode, qsize_t number) { unsigned short cnt; struct dquot *dquot; @@ -1037,10 +1037,10 @@ void dquot_free_block(struct inode *inode, unsigned long number) dquot = dqduplicate(inode->i_dquot[cnt]); if (dquot == NODQUOT) continue; - dquot_decr_blocks(dquot, number); + dquot_decr_space(dquot, number); dqputduplicate(dquot); } - inode->i_blocks -= number << (BLOCK_SIZE_BITS - 9); + inode->i_blocks -= number >> 9; unlock_kernel(); /* NOBLOCK End */ } @@ -1073,7 +1073,7 @@ void dquot_free_inode(const struct inode *inode, unsigned long number) */ int dquot_transfer(struct inode *inode, struct iattr *iattr) { - unsigned long blocks; + qsize_t space; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, @@ -1103,7 +1103,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } } /* NOBLOCK START: From now on we shouldn't block */ - blocks = (inode->i_blocks >> 1); + space = ((qsize_t)inode->i_blocks) << 9; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* The second test can fail when quotaoff is in progress... */ @@ -1113,7 +1113,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (transfer_from[cnt] == NODQUOT) /* Can happen on quotafiles (quota isn't initialized on them)... 
*/ continue; if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA || - check_bdq(transfer_to[cnt], blocks, 0, warntype+cnt) == NO_QUOTA) + check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA) goto warn_put_all; } @@ -1128,10 +1128,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) continue; dquot_decr_inodes(transfer_from[cnt], 1); - dquot_decr_blocks(transfer_from[cnt], blocks); + dquot_decr_space(transfer_from[cnt], space); dquot_incr_inodes(transfer_to[cnt], 1); - dquot_incr_blocks(transfer_to[cnt], blocks); + dquot_incr_space(transfer_to[cnt], space); if (inode->i_dquot[cnt] == NODQUOT) BUG(); @@ -1162,9 +1162,9 @@ warn_put_all: struct dquot_operations dquot_operations = { initialize: dquot_initialize, /* mandatory */ drop: dquot_drop, /* mandatory */ - alloc_block: dquot_alloc_block, + alloc_block: dquot_alloc_space, alloc_inode: dquot_alloc_inode, - free_block: dquot_free_block, + free_block: dquot_free_space, free_inode: dquot_free_inode, transfer: dquot_transfer }; diff --git a/include/linux/fs.h b/include/linux/fs.h index aaa5e8910ce1..0f789728ce3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -847,9 +847,9 @@ static inline void mark_inode_dirty_sync(struct inode *inode) struct dquot_operations { void (*initialize) (struct inode *, short); void (*drop) (struct inode *); - int (*alloc_block) (struct inode *, unsigned long, char); + int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, unsigned long); - void (*free_block) (struct inode *, unsigned long); + void (*free_space) (struct inode *, qsize_t); void (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); }; diff --git a/include/linux/quota.h b/include/linux/quota.h index b17397cb51ba..7e481591efc4 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -46,30 +46,16 @@ #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 typedef __kernel_uid32_t qid_t; /* Type in 
which we store ids in memory */ +typedef __u64 qsize_t; /* Type in which we store sizes */ -/* - * Convert diskblocks to blocks and the other way around. - */ -#define dbtob(num) (num << BLOCK_SIZE_BITS) -#define btodb(num) (num >> BLOCK_SIZE_BITS) - -/* - * Convert count of filesystem blocks to diskquota blocks, meant - * for filesystems where i_blksize != BLOCK_SIZE - */ -#define fs_to_dq_blocks(num, blksize) (((num) * (blksize)) / BLOCK_SIZE) +/* Size of blocks in which are counted size limits */ +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) -/* - * Definitions for disk quotas imposed on the average user - * (big brother finally hits Linux). - * - * The following constants define the amount of time given a user - * before the soft limits are treated as hard limits (usually resulting - * in an allocation failure). The timer is started when the user crosses - * their soft limit, it is reset when they go below their soft limit. - */ -#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ -#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +/* Conversion routines from and to quota blocks */ +#define qb2kb(x) ((x) << (QUOTABLOCK_BITS-10)) +#define kb2qb(x) ((x) >> (QUOTABLOCK_BITS-10)) +#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS) #define MAXQUOTAS 2 #define USRQUOTA 0 /* element used for user quotas */ @@ -105,7 +91,7 @@ typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ struct mem_dqblk { __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ - __u32 dqb_curblocks; /* current block count */ + qsize_t dqb_curspace; /* current used space */ __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __u32 dqb_isoftlimit; /* preferred inode limit */ __u32 dqb_curinodes; /* current # allocated inodes */ @@ -119,7 +105,7 @@ struct mem_dqblk { struct quota_format_type; struct mem_dqinfo { - struct quota_format_type *dqi_format; + 
struct quota_format_type * dqi_format; int dqi_flags; unsigned int dqi_bgrace; unsigned int dqi_igrace; @@ -148,7 +134,7 @@ extern inline void mark_info_dirty(struct mem_dqinfo *info) */ #define dq_bhardlimit dq_dqb.dqb_bhardlimit #define dq_bsoftlimit dq_dqb.dqb_bsoftlimit -#define dq_curblocks dq_dqb.dqb_curblocks +#define dq_curspace dq_dqb.dqb_curspace #define dq_ihardlimit dq_dqb.dqb_ihardlimit #define dq_isoftlimit dq_dqb.dqb_isoftlimit #define dq_curinodes dq_dqb.dqb_curinodes diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0a1df9e1fe56..ab702b2607cf 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -25,10 +25,10 @@ extern void dquot_drop(struct inode *inode); extern int quota_off(struct super_block *sb, short type); extern int sync_dquots(struct super_block *sb, short type); -extern int dquot_alloc_block(struct inode *inode, unsigned long number, char prealloc); +extern int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); extern int dquot_alloc_inode(const struct inode *inode, unsigned long number); -extern void dquot_free_block(struct inode *inode, unsigned long number); +extern void dquot_free_space(struct inode *inode, qsize_t number); extern void dquot_free_inode(const struct inode *inode, unsigned long number); extern int dquot_transfer(struct inode *inode, struct iattr *iattr); @@ -59,50 +59,50 @@ static __inline__ void DQUOT_DROP(struct inode *inode) unlock_kernel(); } -static __inline__ int DQUOT_PREALLOC_BLOCK_NODIRTY(struct inode *inode, int nr) +static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); if (sb_any_quota_enabled(inode->i_sb)) { - /* Number of used blocks is updated in alloc_block() */ - if (inode->i_sb->dq_op->alloc_block(inode, fs_to_dq_blocks(nr, inode->i_sb->s_blocksize), 1) == NO_QUOTA) { + /* Used space is updated in alloc_space() */ + if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) { unlock_kernel(); 
return 1; } } else - inode->i_blocks += nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks += nr >> 9; unlock_kernel(); return 0; } -static __inline__ int DQUOT_PREALLOC_BLOCK(struct inode *inode, int nr) +static __inline__ int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) { int ret; - if (!(ret = DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr))) + if (!(ret = DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr))) mark_inode_dirty(inode); return ret; } -static __inline__ int DQUOT_ALLOC_BLOCK_NODIRTY(struct inode *inode, int nr) +static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); if (sb_any_quota_enabled(inode->i_sb)) { - /* Number of used blocks is updated in alloc_block() */ - if (inode->i_sb->dq_op->alloc_block(inode, fs_to_dq_blocks(nr, inode->i_sb->s_blocksize), 0) == NO_QUOTA) { + /* Used space is updated in alloc_space() */ + if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) { unlock_kernel(); return 1; } } else - inode->i_blocks += nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks += nr >> 9; unlock_kernel(); return 0; } -static __inline__ int DQUOT_ALLOC_BLOCK(struct inode *inode, int nr) +static __inline__ int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) { int ret; - if (!(ret = DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr))) + if (!(ret = DQUOT_ALLOC_SPACE_NODIRTY(inode, nr))) mark_inode_dirty(inode); return ret; } @@ -121,19 +121,19 @@ static __inline__ int DQUOT_ALLOC_INODE(struct inode *inode) return 0; } -static __inline__ void DQUOT_FREE_BLOCK_NODIRTY(struct inode *inode, int nr) +static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); if (sb_any_quota_enabled(inode->i_sb)) - inode->i_sb->dq_op->free_block(inode, fs_to_dq_blocks(nr, inode->i_sb->s_blocksize)); + inode->i_sb->dq_op->free_space(inode, nr); else - inode->i_blocks -= nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks -= nr >> 9; unlock_kernel(); } -static __inline__ void 
DQUOT_FREE_BLOCK(struct inode *inode, int nr) +static __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) { - DQUOT_FREE_BLOCK_NODIRTY(inode, nr); + DQUOT_FREE_SPACE_NODIRTY(inode, nr); mark_inode_dirty(inode); } @@ -174,48 +174,56 @@ static __inline__ int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) #define DQUOT_SYNC(sb) do { } while(0) #define DQUOT_OFF(sb) do { } while(0) #define DQUOT_TRANSFER(inode, iattr) (0) -extern __inline__ int DQUOT_PREALLOC_BLOCK_NODIRTY(struct inode *inode, int nr) +extern __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks += nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks += nr >> 9; unlock_kernel(); return 0; } -extern __inline__ int DQUOT_PREALLOC_BLOCK(struct inode *inode, int nr) +extern __inline__ int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) { - DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr); + DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr); mark_inode_dirty(inode); return 0; } -extern __inline__ int DQUOT_ALLOC_BLOCK_NODIRTY(struct inode *inode, int nr) +extern __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks += nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks += nr >> 9; unlock_kernel(); return 0; } -extern __inline__ int DQUOT_ALLOC_BLOCK(struct inode *inode, int nr) +extern __inline__ int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) { - DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr); + DQUOT_ALLOC_SPACE_NODIRTY(inode, nr); mark_inode_dirty(inode); return 0; } -extern __inline__ void DQUOT_FREE_BLOCK_NODIRTY(struct inode *inode, int nr) +extern __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks -= nr << (inode->i_sb->s_blocksize_bits - 9); + inode->i_blocks -= nr >> 9; unlock_kernel(); } -extern __inline__ void DQUOT_FREE_BLOCK(struct inode *inode, int nr) +extern __inline__ void DQUOT_FREE_SPACE(struct 
inode *inode, qsize_t nr) { - DQUOT_FREE_BLOCK_NODIRTY(inode, nr); + DQUOT_FREE_SPACE_NODIRTY(inode, nr); mark_inode_dirty(inode); } #endif /* CONFIG_QUOTA */ + +#define DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr) DQUOT_PREALLOC_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_PREALLOC_BLOCK(inode, nr) DQUOT_PREALLOC_SPACE(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr) DQUOT_ALLOC_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_ALLOC_BLOCK(inode, nr) DQUOT_ALLOC_SPACE(inode, fs_to_dq_blocks(nr, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_FREE_BLOCK_NODIRTY(inode, nr) DQUOT_FREE_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_FREE_BLOCK(inode, nr) DQUOT_FREE_SPACE(inode, fs_to_dq_blocks(nr, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) + #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From ce9fb13943c8e4d932f152dbd08097ed36c62a54 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:14 -0700 Subject: [PATCH] [6/13] quota-6-bytes This patch implements counting of used space in inodes in bytes. New field i_bytes is added and used space modulo 512 is kept in it (rest is still kept in i_blocks). Functions manipulating both i_blocks and i_bytes are implemented (inode_add_bytes(), inode_sub_bytes() and inode_set_bytes()). Filesystems allocating only in whole blocks can safely ignore i_bytes field and continue using i_blocks... 
--- fs/dquot.c | 10 +++++----- fs/inode.c | 1 + include/linux/fs.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/quotaops.h | 16 ++++++++-------- 4 files changed, 48 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index d697e4b18e16..1503708a4657 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -972,7 +972,7 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) continue; dquot_incr_space(dquot[cnt], number); } - inode->i_blocks += number >> 9; + inode_add_bytes(inode, number); /* NOBLOCK End */ ret = QUOTA_OK; warn_put_all: @@ -1040,7 +1040,7 @@ void dquot_free_space(struct inode *inode, qsize_t number) dquot_decr_space(dquot, number); dqputduplicate(dquot); } - inode->i_blocks -= number >> 9; + inode_sub_bytes(inode, number); unlock_kernel(); /* NOBLOCK End */ } @@ -1103,7 +1103,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } } /* NOBLOCK START: From now on we shouldn't block */ - space = ((qsize_t)inode->i_blocks) << 9; + space = inode_get_bytes(inode); /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* The second test can fail when quotaoff is in progress... 
*/ @@ -1162,9 +1162,9 @@ warn_put_all: struct dquot_operations dquot_operations = { initialize: dquot_initialize, /* mandatory */ drop: dquot_drop, /* mandatory */ - alloc_block: dquot_alloc_space, + alloc_space: dquot_alloc_space, alloc_inode: dquot_alloc_inode, - free_block: dquot_free_space, + free_space: dquot_free_space, free_inode: dquot_free_inode, transfer: dquot_transfer }; diff --git a/fs/inode.c b/fs/inode.c index 61e3f6678737..0ceabe7b934b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -95,6 +95,7 @@ static struct inode *alloc_inode(struct super_block *sb) atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_blocks = 0; + inode->i_bytes = 0; inode->i_generation = 0; memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); inode->i_pipe = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0f789728ce3a..f50388ea30dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -370,6 +370,7 @@ struct inode { unsigned long i_blksize; unsigned long i_blocks; unsigned long i_version; + unsigned short i_bytes; struct semaphore i_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ @@ -427,6 +428,39 @@ struct fown_struct { int signum; /* posix.1b rt signal to be delivered on IO */ }; +static inline void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks += bytes >> 9; + bytes &= 511; + inode->i_bytes += bytes; + if (inode->i_bytes >= 512) { + inode->i_blocks++; + inode->i_bytes -= 512; + } +} + +static inline void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks -= bytes >> 9; + bytes &= 511; + if (inode->i_bytes < bytes) { + inode->i_blocks--; + inode->i_bytes += 512; + } + inode->i_bytes -= bytes; +} + +static inline loff_t inode_get_bytes(struct inode *inode) +{ + return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; +} + +static inline void inode_set_bytes(struct inode *inode, loff_t bytes) +{ + inode->i_blocks = bytes >> 9; + inode->i_bytes 
= bytes & 511; +} + /* * Track a single file's readahead state */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index ab702b2607cf..b5da5ff5bb9a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -70,7 +70,7 @@ static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t } } else - inode->i_blocks += nr >> 9; + inode_add_bytes(inode, nr); unlock_kernel(); return 0; } @@ -94,7 +94,7 @@ static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) } } else - inode->i_blocks += nr >> 9; + inode_add_bytes(inode, nr); unlock_kernel(); return 0; } @@ -127,7 +127,7 @@ static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) if (sb_any_quota_enabled(inode->i_sb)) inode->i_sb->dq_op->free_space(inode, nr); else - inode->i_blocks -= nr >> 9; + inode_sub_bytes(inode, nr); unlock_kernel(); } @@ -177,7 +177,7 @@ static __inline__ int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) extern __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks += nr >> 9; + inode_add_bytes(inode, nr); unlock_kernel(); return 0; } @@ -192,7 +192,7 @@ extern __inline__ int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) extern __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks += nr >> 9; + inode_add_bytes(inode, nr); unlock_kernel(); return 0; } @@ -207,7 +207,7 @@ extern __inline__ int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) extern __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { lock_kernel(); - inode->i_blocks -= nr >> 9; + inode_sub_bytes(inode, nr); unlock_kernel(); } @@ -222,8 +222,8 @@ extern __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) #define DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr) DQUOT_PREALLOC_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) #define DQUOT_PREALLOC_BLOCK(inode, 
nr) DQUOT_PREALLOC_SPACE(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) #define DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr) DQUOT_ALLOC_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) -#define DQUOT_ALLOC_BLOCK(inode, nr) DQUOT_ALLOC_SPACE(inode, fs_to_dq_blocks(nr, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_ALLOC_BLOCK(inode, nr) DQUOT_ALLOC_SPACE(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) #define DQUOT_FREE_BLOCK_NODIRTY(inode, nr) DQUOT_FREE_SPACE_NODIRTY(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) -#define DQUOT_FREE_BLOCK(inode, nr) DQUOT_FREE_SPACE(inode, fs_to_dq_blocks(nr, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) +#define DQUOT_FREE_BLOCK(inode, nr) DQUOT_FREE_SPACE(inode, ((qsize_t)(nr)) << (inode)->i_sb->s_blocksize_bits) #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From b5abbc1f844d101615d6d2c5abac8a692f028d01 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:20 -0700 Subject: [PATCH] [7/13] quota-7-quotactl This is probably the largest chunk in quota patches. It removes old quotactl interface and implements new one. New interface should not need arch specific conversions so they are removed. All quota interface stuff is moved to quota.c so we can easily separate things which should be compiled even if quota is disabled (mainly because XFS needs some interface even if standard VFS quota is disabled). Callbacks to filesystem on quota_on() and quota_off() are implemented (needed by Ext3), quota operations callbacks are now set in super.c on superblock initialization and not on quota_on(). This way it starts to make sense to have callbacks on alloc_space(), alloc_inode() etc. as filesystem can override them on read_super(). This will be used later for implementing journalled quota. 
--- arch/ia64/ia32/ia32_entry.S | 2 +- arch/ia64/ia32/sys_ia32.c | 54 ------ arch/s390x/kernel/linux32.c | 58 ------- arch/s390x/kernel/wrapper32.S | 2 +- arch/sparc64/kernel/sys_sparc32.c | 56 ------- arch/sparc64/kernel/systbls.S | 4 +- fs/Makefile | 2 +- fs/dquot.c | 344 ++++++++++++++++++++------------------ fs/quota.c | 263 +++++++++++++++++++++++++++++ fs/super.c | 3 + include/linux/fs.h | 13 +- include/linux/quota.h | 130 +++++++++++--- include/linux/quotaops.h | 24 ++- include/linux/xqm.h | 159 ++++++++++++++++++ 14 files changed, 739 insertions(+), 375 deletions(-) create mode 100644 fs/quota.c create mode 100644 include/linux/xqm.h (limited to 'include') diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S index f3e7e950b7ac..f9595d3f43ed 100644 --- a/arch/ia64/ia32/ia32_entry.S +++ b/arch/ia64/ia32/ia32_entry.S @@ -310,7 +310,7 @@ ia32_syscall_table: data8 sys32_ni_syscall /* init_module */ data8 sys32_ni_syscall /* delete_module */ data8 sys32_ni_syscall /* get_kernel_syms */ /* 130 */ - data8 sys32_quotactl + data8 sys_quotactl data8 sys_getpgid data8 sys_fchdir data8 sys32_ni_syscall /* sys_bdflush */ diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 0bcd16ac271e..c3852b487a11 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -3669,60 +3669,6 @@ getname32 (const char *filename) return result; } -struct dqblk32 { - __u32 dqb_bhardlimit; - __u32 dqb_bsoftlimit; - __u32 dqb_curblocks; - __u32 dqb_ihardlimit; - __u32 dqb_isoftlimit; - __u32 dqb_curinodes; - __kernel_time_t32 dqb_btime; - __kernel_time_t32 dqb_itime; -}; - -asmlinkage long -sys32_quotactl (int cmd, unsigned int special, int id, struct dqblk32 *addr) -{ - extern asmlinkage long sys_quotactl (int, const char *, int, caddr_t); - int cmds = cmd >> SUBCMDSHIFT; - mm_segment_t old_fs; - struct dqblk d; - char *spec; - long err; - - switch (cmds) { - case Q_GETQUOTA: - break; - case Q_SETQUOTA: - case Q_SETUSE: - case Q_SETQLIM: - 
if (copy_from_user (&d, addr, sizeof(struct dqblk32))) - return -EFAULT; - d.dqb_itime = ((struct dqblk32 *)&d)->dqb_itime; - d.dqb_btime = ((struct dqblk32 *)&d)->dqb_btime; - break; - default: - return sys_quotactl(cmd, (void *) A(special), id, (caddr_t) addr); - } - spec = getname32((void *) A(special)); - err = PTR_ERR(spec); - if (IS_ERR(spec)) - return err; - old_fs = get_fs (); - set_fs(KERNEL_DS); - err = sys_quotactl(cmd, (const char *)spec, id, (caddr_t)&d); - set_fs(old_fs); - putname(spec); - if (cmds == Q_GETQUOTA) { - __kernel_time_t b = d.dqb_btime, i = d.dqb_itime; - ((struct dqblk32 *)&d)->dqb_itime = i; - ((struct dqblk32 *)&d)->dqb_btime = b; - if (copy_to_user(addr, &d, sizeof(struct dqblk32))) - return -EFAULT; - } - return err; -} - asmlinkage long sys32_sched_rr_get_interval (pid_t pid, struct timespec32 *interval) { diff --git a/arch/s390x/kernel/linux32.c b/arch/s390x/kernel/linux32.c index a125f2d41043..e06f1958dd10 100644 --- a/arch/s390x/kernel/linux32.c +++ b/arch/s390x/kernel/linux32.c @@ -897,64 +897,6 @@ asmlinkage long sys32_fcntl64(unsigned int fd, unsigned int cmd, unsigned long a return sys32_fcntl(fd, cmd, arg); } -struct mem_dqblk32 { - __u32 dqb_ihardlimit; - __u32 dqb_isoftlimit; - __u32 dqb_curinodes; - __u32 dqb_bhardlimit; - __u32 dqb_bsoftlimit; - __u64 dqb_curspace; - __kernel_time_t32 dqb_btime; - __kernel_time_t32 dqb_itime; -}; - -extern asmlinkage long sys_quotactl(int cmd, const char *special, int id, __kernel_caddr_t addr); - -asmlinkage int sys32_quotactl(int cmd, const char *special, int id, unsigned long addr) -{ - int cmds = cmd >> SUBCMDSHIFT; - int err; - struct mem_dqblk d; - mm_segment_t old_fs; - char *spec; - - switch (cmds) { - case Q_GETQUOTA: - break; - case Q_SETQUOTA: - case Q_SETUSE: - case Q_SETQLIM: - if (copy_from_user (&d, (struct mem_dqblk32 *)addr, - sizeof (struct mem_dqblk32))) - return -EFAULT; - d.dqb_itime = ((struct mem_dqblk32 *)&d)->dqb_itime; - d.dqb_btime = ((struct mem_dqblk32 
*)&d)->dqb_btime; - break; - default: - return sys_quotactl(cmd, special, - id, (__kernel_caddr_t)addr); - } - spec = getname (special); - err = PTR_ERR(spec); - if (IS_ERR(spec)) return err; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_quotactl(cmd, (const char *)spec, id, (__kernel_caddr_t)&d); - set_fs (old_fs); - putname (spec); - if (err) - return err; - if (cmds == Q_GETQUOTA) { - __kernel_time_t b = d.dqb_btime, i = d.dqb_itime; - ((struct mem_dqblk32 *)&d)->dqb_itime = i; - ((struct mem_dqblk32 *)&d)->dqb_btime = b; - if (copy_to_user ((struct mem_dqblk32 *)addr, &d, - sizeof (struct mem_dqblk32))) - return -EFAULT; - } - return 0; -} - static inline int put_statfs (struct statfs32 *ubuf, struct statfs *kbuf) { int err; diff --git a/arch/s390x/kernel/wrapper32.S b/arch/s390x/kernel/wrapper32.S index a11ee19b21ff..8a66558332b1 100644 --- a/arch/s390x/kernel/wrapper32.S +++ b/arch/s390x/kernel/wrapper32.S @@ -586,7 +586,7 @@ sys32_quotactl_wrapper: llgtr %r3,%r3 # const char * lgfr %r4,%r4 # int llgtr %r5,%r5 # caddr_t - jg sys32_quotactl # branch to system call + jg sys_quotactl # branch to system call .globl sys32_getpgid_wrapper sys32_getpgid_wrapper: diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 0e4720e33966..224387833d54 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -889,62 +889,6 @@ asmlinkage long sys32_fcntl64(unsigned int fd, unsigned int cmd, unsigned long a return sys32_fcntl(fd, cmd, arg); } -struct dqblk32 { - __u32 dqb_bhardlimit; - __u32 dqb_bsoftlimit; - __u32 dqb_curblocks; - __u32 dqb_ihardlimit; - __u32 dqb_isoftlimit; - __u32 dqb_curinodes; - __kernel_time_t32 dqb_btime; - __kernel_time_t32 dqb_itime; -}; - -extern asmlinkage int sys_quotactl(int cmd, const char *special, int id, caddr_t addr); - -asmlinkage int sys32_quotactl(int cmd, const char *special, int id, unsigned long addr) -{ - int cmds = cmd >> SUBCMDSHIFT; - int err; - struct 
dqblk d; - mm_segment_t old_fs; - char *spec; - - switch (cmds) { - case Q_GETQUOTA: - break; - case Q_SETQUOTA: - case Q_SETUSE: - case Q_SETQLIM: - if (copy_from_user (&d, (struct dqblk32 *)addr, - sizeof (struct dqblk32))) - return -EFAULT; - d.dqb_itime = ((struct dqblk32 *)&d)->dqb_itime; - d.dqb_btime = ((struct dqblk32 *)&d)->dqb_btime; - break; - default: - return sys_quotactl(cmd, special, - id, (caddr_t)addr); - } - spec = getname (special); - err = PTR_ERR(spec); - if (IS_ERR(spec)) return err; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_quotactl(cmd, (const char *)spec, id, (caddr_t)&d); - set_fs (old_fs); - putname (spec); - if (cmds == Q_GETQUOTA) { - __kernel_time_t b = d.dqb_btime, i = d.dqb_itime; - ((struct dqblk32 *)&d)->dqb_itime = i; - ((struct dqblk32 *)&d)->dqb_btime = b; - if (copy_to_user ((struct dqblk32 *)addr, &d, - sizeof (struct dqblk32))) - return -EFAULT; - } - return err; -} - static inline int put_statfs (struct statfs32 *ubuf, struct statfs *kbuf) { int err; diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index b7b5414167bd..6138ce2fca94 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -52,7 +52,7 @@ sys_call_table32: /*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount /*160*/ .word sys32_sched_setaffinity, sys32_sched_getaffinity, sys_getdomainname, sys_setdomainname, sys_nis_syscall - .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr + .word sys_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr /*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr /*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module @@ -194,7 +194,7 @@ sunos_sys_table: .word 
sunos_getdirentries, sys32_statfs, sys32_fstatfs .word sys_oldumount, sunos_nosys, sunos_nosys .word sys_getdomainname, sys_setdomainname - .word sunos_nosys, sys32_quotactl, sunos_nosys + .word sunos_nosys, sys_quotactl, sunos_nosys .word sunos_mount, sys_ustat, sunos_semsys .word sunos_nosys, sunos_shmsys, sunos_audit .word sunos_nosys, sunos_getdents, sys_setsid diff --git a/fs/Makefile b/fs/Makefile index 83769de6e2e4..e843c709297e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o namespace.o seq_file.o xattr.o libfs.o \ - fs-writeback.o + fs-writeback.o quota.o ifneq ($(CONFIG_NFSD),n) ifneq ($(CONFIG_NFSD),) diff --git a/fs/dquot.c b/fs/dquot.c index 1503708a4657..b6125801dd94 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -62,12 +62,11 @@ #include #include #include +#include #include #include -int nr_dquots, nr_free_dquots; - static char *quotatypes[] = INITQFNAMES; static struct quota_format_type *quota_formats; /* List of registered formats */ @@ -97,10 +96,18 @@ static struct quota_format_type *find_quota_format(int id) lock_kernel(); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + if (actqf && !try_inc_mod_count(actqf->qf_owner)) + actqf = NULL; unlock_kernel(); return actqf; } +static void put_quota_format(struct quota_format_type *fmt) +{ + if (fmt->qf_owner) + __MOD_DEC_USE_COUNT(fmt->qf_owner); +} + /* * Dquot List Management: * The quota code uses three lists for dquot management: the inuse_list, @@ -141,27 +148,11 @@ static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static struct list_head dquot_hash[NR_DQHASH]; -static struct dqstats dqstats; +struct dqstats dqstats; static void dqput(struct dquot *); static struct dquot *dqduplicate(struct dquot *); -static inline char is_enabled(struct 
quota_info *dqopt, short type) -{ - switch (type) { - case USRQUOTA: - return((dqopt->flags & DQUOT_USR_ENABLED) != 0); - case GRPQUOTA: - return((dqopt->flags & DQUOT_GRP_ENABLED) != 0); - } - return(0); -} - -static inline char sb_has_quota_enabled(struct super_block *sb, short type) -{ - return is_enabled(sb_dqopt(sb), type); -} - static inline void get_dquot_ref(struct dquot *dquot) { dquot->dq_count++; @@ -405,6 +396,7 @@ restart: dqput(dquot); goto restart; } + /* FIXME: Here we should also sync all file info */ dqstats.syncs++; unlock_kernel(); return 0; @@ -619,9 +611,6 @@ static void add_dquot_ref(struct super_block *sb, short type) { struct list_head *p; - if (!sb->dq_op) - return; /* nothing to do */ - restart: file_list_lock(); list_for_each(p, &sb->s_files) { @@ -699,36 +688,36 @@ void put_dquot_list(struct list_head *tofree_head) static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) { - dquot->dq_curinodes += number; + dquot->dq_dqb.dqb_curinodes += number; mark_dquot_dirty(dquot); } static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) { - dquot->dq_curspace += number; + dquot->dq_dqb.dqb_curspace += number; mark_dquot_dirty(dquot); } static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) { - if (dquot->dq_curinodes > number) - dquot->dq_curinodes -= number; + if (dquot->dq_dqb.dqb_curinodes > number) + dquot->dq_dqb.dqb_curinodes -= number; else - dquot->dq_curinodes = 0; - if (dquot->dq_curinodes < dquot->dq_isoftlimit) - dquot->dq_itime = (time_t) 0; + dquot->dq_dqb.dqb_curinodes = 0; + if (dquot->dq_dqb.dqb_curinodes < dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_itime = (time_t) 0; dquot->dq_flags &= ~DQ_INODES; mark_dquot_dirty(dquot); } static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) { - if (dquot->dq_curspace > number) - dquot->dq_curspace -= number; + if (dquot->dq_dqb.dqb_curspace > number) + dquot->dq_dqb.dqb_curspace -= number; else - 
dquot->dq_curspace = 0; - if (toqb(dquot->dq_curspace) < dquot->dq_bsoftlimit) - dquot->dq_btime = (time_t) 0; + dquot->dq_dqb.dqb_curspace = 0; + if (toqb(dquot->dq_dqb.dqb_curspace) < dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time_t) 0; dquot->dq_flags &= ~DQ_BLKS; mark_dquot_dirty(dquot); } @@ -812,26 +801,26 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) if (inodes <= 0 || dquot->dq_flags & DQ_FAKE) return QUOTA_OK; - if (dquot->dq_ihardlimit && - (dquot->dq_curinodes + inodes) > dquot->dq_ihardlimit && + if (dquot->dq_dqb.dqb_ihardlimit && + (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { *warntype = IHARDWARN; return NO_QUOTA; } - if (dquot->dq_isoftlimit && - (dquot->dq_curinodes + inodes) > dquot->dq_isoftlimit && - dquot->dq_itime && CURRENT_TIME >= dquot->dq_itime && + if (dquot->dq_dqb.dqb_isoftlimit && + (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime && CURRENT_TIME >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { *warntype = ISOFTLONGWARN; return NO_QUOTA; } - if (dquot->dq_isoftlimit && - (dquot->dq_curinodes + inodes) > dquot->dq_isoftlimit && - dquot->dq_itime == 0) { + if (dquot->dq_dqb.dqb_isoftlimit && + (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_itime == 0) { *warntype = ISOFTWARN; - dquot->dq_itime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; + dquot->dq_dqb.dqb_itime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } return QUOTA_OK; @@ -843,29 +832,29 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war if (space <= 0 || dquot->dq_flags & DQ_FAKE) return QUOTA_OK; - if (dquot->dq_bhardlimit && - toqb(dquot->dq_curspace + space) > dquot->dq_bhardlimit && + if (dquot->dq_dqb.dqb_bhardlimit && + toqb(dquot->dq_dqb.dqb_curspace + space) > 
dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = BHARDWARN; return NO_QUOTA; } - if (dquot->dq_bsoftlimit && - toqb(dquot->dq_curspace + space) > dquot->dq_bsoftlimit && - dquot->dq_btime && CURRENT_TIME >= dquot->dq_btime && + if (dquot->dq_dqb.dqb_bsoftlimit && + toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime && CURRENT_TIME >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = BSOFTLONGWARN; return NO_QUOTA; } - if (dquot->dq_bsoftlimit && - toqb(dquot->dq_curspace + space) > dquot->dq_bsoftlimit && - dquot->dq_btime == 0) { + if (dquot->dq_dqb.dqb_bsoftlimit && + toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { *warntype = BSOFTWARN; - dquot->dq_btime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; + dquot->dq_dqb.dqb_btime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; } else /* @@ -1199,10 +1188,9 @@ extern void remove_dquot_ref(struct super_block *, short); /* * Turn quota off on a device. 
type == -1 ==> quotaoff for all types (umount) */ -int quota_off(struct super_block *sb, short type) +int vfs_quota_off(struct super_block *sb, int type) { - struct file *filp; - short cnt; + int cnt; struct quota_info *dqopt = sb_dqopt(sb); lock_kernel(); @@ -1222,17 +1210,17 @@ int quota_off(struct super_block *sb, short type) remove_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); if (info_dirty(&dqopt->info[cnt])) - dqopt->ops[cnt]->write_file_info(sb, cnt); + dqopt->ops[cnt]->write_file_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); + put_quota_format(dqopt->info[cnt].dqi_format); - filp = dqopt->files[cnt]; + fput(dqopt->files[cnt]); dqopt->files[cnt] = (struct file *)NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; - fput(filp); } up(&dqopt->dqoff_sem); out: @@ -1240,9 +1228,9 @@ out: return 0; } -static int quota_on(struct super_block *sb, int type, int format_id, char *path) +int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) { - struct file *f; + struct file *f = NULL; struct inode *inode; struct quota_info *dqopt = sb_dqopt(sb); struct quota_format_type *fmt = find_quota_format(format_id); @@ -1250,8 +1238,10 @@ static int quota_on(struct super_block *sb, int type, int format_id, char *path) if (!fmt) return -EINVAL; - if (is_enabled(dqopt, type)) - return -EBUSY; + if (is_enabled(dqopt, type)) { + error = -EBUSY; + goto out_fmt; + } down(&dqopt->dqoff_sem); @@ -1276,10 +1266,9 @@ static int quota_on(struct super_block *sb, int type, int format_id, char *path) inode->i_flags |= S_NOQUOTA; dqopt->ops[type] = fmt->qf_ops; - dqopt->info[type].dqi_format = format_id; + dqopt->info[type].dqi_format = fmt; if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) goto out_f; - sb->dq_op = &dquot_operations; set_enable_flags(dqopt, type); add_dquot_ref(sb, type); @@ -1288,116 +1277,137 @@ static int 
quota_on(struct super_block *sb, int type, int format_id, char *path) return 0; out_f: - filp_close(f, NULL); + if (f) + filp_close(f, NULL); dqopt->files[type] = NULL; out_lock: up(&dqopt->dqoff_sem); +out_fmt: + put_quota_format(fmt); return error; } -/* - * This is the system call interface. This communicates with - * the user-level programs. Currently this only supports diskquota - * calls. Maybe we need to add the process quotas etc. in the future, - * but we probably should use rlimits for that. - */ -asmlinkage long sys_quotactl(int cmd, const char *special, int id, caddr_t addr) +int vfs_quota_sync(struct super_block *sb, int type) { - int cmds = 0, type = 0, flags = 0; - kdev_t dev; - struct super_block *sb = NULL; - int ret = -EINVAL; + return sync_dquots(sb, type); +} - lock_kernel(); - cmds = cmd >> SUBCMDSHIFT; - type = cmd & SUBCMDMASK; +/* Generic routine for getting common part of quota structure */ +static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; - if ((u_int) type >= MAXQUOTAS) - goto out; - if (id & ~0xFFFF) - goto out; + di->dqb_bhardlimit = dm->dqb_bhardlimit; + di->dqb_bsoftlimit = dm->dqb_bsoftlimit; + di->dqb_curspace = dm->dqb_curspace; + di->dqb_ihardlimit = dm->dqb_ihardlimit; + di->dqb_isoftlimit = dm->dqb_isoftlimit; + di->dqb_curinodes = dm->dqb_curinodes; + di->dqb_btime = dm->dqb_btime; + di->dqb_itime = dm->dqb_itime; + di->dqb_valid = QIF_ALL; +} - ret = -EPERM; - switch (cmds) { - case Q_SYNC: - case Q_GETSTATS: - break; - case Q_GETQUOTA: - if (((type == USRQUOTA && current->euid != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - goto out; - break; - default: - if (!capable(CAP_SYS_ADMIN)) - goto out; - } +int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) +{ + struct dquot *dquot = dqget(sb, id, type); - ret = -EINVAL; - dev = NODEV; - if (special != NULL || (cmds != Q_SYNC && cmds != Q_GETSTATS)) { - 
mode_t mode; - struct nameidata nd; - - ret = user_path_walk(special, &nd); - if (ret) - goto out; - - dev = nd.dentry->d_inode->i_rdev; - mode = nd.dentry->d_inode->i_mode; - path_release(&nd); - - ret = -ENOTBLK; - if (!S_ISBLK(mode)) - goto out; - ret = -ENODEV; - sb = get_super(dev); - if (!sb) - goto out; - } + if (!dquot) + return -EINVAL; + do_get_dqblk(dquot, di); + dqput(dquot); + return 0; +} - ret = -EINVAL; - switch (cmds) { - case Q_QUOTAON: - ret = quota_on(sb, type, (char *) addr); - goto out; - case Q_QUOTAOFF: - ret = quota_off(sb, type); - goto out; - case Q_GETQUOTA: - ret = get_quota(sb, id, type, (struct dqblk *) addr); - goto out; - case Q_SETQUOTA: - flags |= SET_QUOTA; - break; - case Q_SETUSE: - flags |= SET_USE; - break; - case Q_SETQLIM: - flags |= SET_QLIMIT; - break; - case Q_SYNC: - ret = sync_dquots(sb, type); - goto out; - case Q_GETSTATS: - ret = get_stats(addr); - goto out; - case Q_RSQUASH: - ret = quota_root_squash(sb, type, (int *) addr); - goto out; - default: - goto out; +/* Generic routine for setting common part of quota structure */ +static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) +{ + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; + + if (di->dqb_valid & QIF_SPACE) { + dm->dqb_curspace = di->dqb_curspace; + check_blim = 1; + } + if (di->dqb_valid & QIF_BLIMITS) { + dm->dqb_bsoftlimit = di->dqb_bsoftlimit; + dm->dqb_bhardlimit = di->dqb_bhardlimit; + check_blim = 1; + } + if (di->dqb_valid & QIF_INODES) { + dm->dqb_curinodes = di->dqb_curinodes; + check_ilim = 1; + } + if (di->dqb_valid & QIF_ILIMITS) { + dm->dqb_isoftlimit = di->dqb_isoftlimit; + dm->dqb_ihardlimit = di->dqb_ihardlimit; + check_ilim = 1; + } + if (di->dqb_valid & QIF_BTIME) + dm->dqb_btime = di->dqb_btime; + if (di->dqb_valid & QIF_ITIME) + dm->dqb_itime = di->dqb_itime; + + if (check_blim) { + if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { + dm->dqb_btime = 0; + dquot->dq_flags &= 
~DQ_BLKS; + } + else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ + dm->dqb_btime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { + dm->dqb_itime = 0; + dquot->dq_flags &= ~DQ_INODES; + } + else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... */ + dm->dqb_itime = CURRENT_TIME + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) + dquot->dq_flags &= ~DQ_FAKE; + else + dquot->dq_flags |= DQ_FAKE; + dquot->dq_flags |= DQ_MOD; +} - ret = -ENODEV; - if (sb && sb_has_quota_enabled(sb, type)) - ret = set_dqblk(sb, id, type, flags, (struct dqblk *) addr); -out: - if (sb) - drop_super(sb); - unlock_kernel(); - return ret; +int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) +{ + struct dquot *dquot = dqget(sb, id, type); + + if (!dquot) + return -EINVAL; + do_set_dqblk(dquot, di); + dqput(dquot); + return 0; +} + +/* Generic routine for getting common part of quota file information */ +int vfs_get_info(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi = sb_dqopt(sb)->info + type; + + ii->dqi_bgrace = mi->dqi_bgrace; + ii->dqi_igrace = mi->dqi_igrace; + ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_valid = IIF_ALL; + return 0; +} + +/* Generic routine for setting common part of quota file information */ +int vfs_set_info(struct super_block *sb, int type, struct if_dqinfo *ii) +{ + struct mem_dqinfo *mi = sb_dqopt(sb)->info + type; + + if (ii->dqi_valid & IIF_BGRACE) + mi->dqi_bgrace = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + mi->dqi_igrace = ii->dqi_igrace; + if (ii->dqi_valid & IIF_FLAGS) + mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK); + mark_info_dirty(mi); + return 
0; } #ifdef CONFIG_PROC_FS @@ -1413,7 +1423,7 @@ static int read_stats(char *buffer, char **start, off_t offset, int count, int * len += sprintf(buffer + len, "Formats"); lock_kernel(); for (actqf = quota_formats; actqf; actqf = actqf->qf_next) - len += sprintf(buffer + len, " %u", actqf->qf_id); + len += sprintf(buffer + len, " %u", actqf->qf_fmt_id); unlock_kernel(); len += sprintf(buffer + len, "\n%u %u %u %u %u %u %u %u\n", dqstats.lookups, dqstats.drops, @@ -1435,6 +1445,16 @@ static int read_stats(char *buffer, char **start, off_t offset, int count, int * } #endif +struct quotactl_ops vfs_quotactl_ops = { + quota_on: vfs_quota_on, + quota_off: vfs_quota_off, + quota_sync: vfs_quota_sync, + get_info: vfs_get_info, + set_info: vfs_set_info, + get_dqblk: vfs_get_dqblk, + set_dqblk: vfs_set_dqblk +}; + static ctl_table fs_table[] = { {FS_NRDQUOT, "dquot-nr", &nr_dquots, 2*sizeof(int), 0444, NULL, &proc_dointvec}, @@ -1460,3 +1480,7 @@ static int __init dquot_init(void) return 0; } __initcall(dquot_init); + +EXPORT_SYMBOL(register_quota_format); +EXPORT_SYMBOL(unregister_quota_format); +EXPORT_SYMBOL(dqstats); diff --git a/fs/quota.c b/fs/quota.c new file mode 100644 index 000000000000..88d54e8ade7b --- /dev/null +++ b/fs/quota.c @@ -0,0 +1,263 @@ +/* + * Quota code necessary even when VFS quota support is not compiled + * into the kernel. The interesting stuff is over in dquot.c, here + * we have symbols for initial quotactl(2) handling, the sysctl(2) + * variables, etc - things needed even when quota support is disabled. + */ + +#include +#include +#include +#include +#include +#include + +int nr_dquots, nr_free_dquots; + +/* Check validity of quotactl */ +static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +{ + if (type >= MAXQUOTAS) + return -EINVAL; + /* Is operation supported? 
*/ + if (!sb->s_qcop) + return -ENOSYS; + + switch (cmd) { + case Q_GETFMT: + break; + case Q_QUOTAON: + if (!sb->s_qcop->quota_on) + return -ENOSYS; + break; + case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + break; + case Q_SETINFO: + if (!sb->s_qcop->set_info) + return -ENOSYS; + break; + case Q_GETINFO: + if (!sb->s_qcop->get_info) + return -ENOSYS; + break; + case Q_SETQUOTA: + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + break; + case Q_GETQUOTA: + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + break; + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + break; + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + break; + case Q_XGETQSTAT: + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + break; + case Q_XSETQLIM: + if (!sb->s_qcop->set_xquota) + return -ENOSYS; + break; + case Q_XGETQUOTA: + if (!sb->s_qcop->get_xquota) + return -ENOSYS; + break; + default: + return -EINVAL; + } + + /* Is quota turned on for commands which need it? 
*/ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_QUOTAOFF: + case Q_SETINFO: + case Q_SETQUOTA: + case Q_GETQUOTA: + if (!sb_has_quota_enabled(sb, type)) + return -ESRCH; + } + /* Check privileges */ + if (cmd == Q_GETQUOTA || cmd == Q_XGETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO && cmd != Q_XGETQSTAT) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + +/* Resolve device pathname to superblock */ +static struct super_block *resolve_dev(const char *path) +{ + int ret; + mode_t mode; + struct nameidata nd; + kdev_t dev; + struct super_block *sb; + + ret = user_path_walk(path, &nd); + if (ret) + goto out; + + dev = nd.dentry->d_inode->i_rdev; + mode = nd.dentry->d_inode->i_mode; + path_release(&nd); + + ret = -ENOTBLK; + if (!S_ISBLK(mode)) + goto out; + ret = -ENODEV; + sb = get_super(dev); + if (!sb) + goto out; + return sb; +out: + return ERR_PTR(ret); +} + +/* Copy parameters and call proper function */ +static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, caddr_t addr) +{ + int ret; + + switch (cmd) { + case Q_QUOTAON: { + char *pathname; + + if (IS_ERR(pathname = getname(addr))) + return PTR_ERR(pathname); + ret = sb->s_qcop->quota_on(sb, type, id, pathname); + putname(pathname); + return ret; + } + case Q_QUOTAOFF: + return sb->s_qcop->quota_off(sb, type); + + case Q_GETFMT: { + __u32 fmt; + + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; + } + case Q_GETINFO: { + struct if_dqinfo info; + + if ((ret = sb->s_qcop->get_info(sb, type, &info))) + return ret; + if (copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return 0; + } + case Q_SETINFO: { + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + return 
sb->s_qcop->set_info(sb, type, &info); + } + case Q_GETQUOTA: { + struct if_dqblk idq; + + if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq))) + return ret; + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; + return 0; + } + case Q_SETQUOTA: { + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + return sb->s_qcop->set_dqblk(sb, type, id, &idq); + } + case Q_SYNC: + return sb->s_qcop->quota_sync(sb, type); + + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: { + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + return sb->s_qcop->set_xstate(sb, flags, cmd); + } + case Q_XGETQSTAT: { + struct fs_quota_stat fqs; + + if ((ret = sb->s_qcop->get_xstate(sb, &fqs))) + return ret; + if (copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return 0; + } + case Q_XSETQLIM: { + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + return sb->s_qcop->set_xquota(sb, type, id, &fdq); + } + case Q_XGETQUOTA: { + struct fs_disk_quota fdq; + + if ((ret = sb->s_qcop->get_xquota(sb, type, id, &fdq))) + return ret; + if (copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return 0; + } + /* We never reach here unless validity check is broken */ + default: + BUG(); + } + return 0; +} + +/* + * This is the system call interface. This communicates with + * the user-level programs. Currently this only supports diskquota + * calls. Maybe we need to add the process quotas etc. in the future, + * but we probably should use rlimits for that. 
+ */ +asmlinkage long sys_quotactl(unsigned int cmd, const char *special, qid_t id, caddr_t addr) +{ + uint cmds, type; + struct super_block *sb = NULL; + int ret = -EINVAL; + + lock_kernel(); + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + + if (IS_ERR(sb = resolve_dev(special))) { + ret = PTR_ERR(sb); + sb = NULL; + goto out; + } + if ((ret = check_quotactl_valid(sb, type, cmds, id)) < 0) + goto out; + ret = do_quotactl(sb, type, cmds, id, addr); +out: + if (sb) + drop_super(sb); + unlock_kernel(); + return ret; +} diff --git a/fs/super.c b/fs/super.c index 76d3db53cace..269eaeb3cba5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -27,6 +27,7 @@ #include #include #include +#include #include void get_filesystem(struct file_system_type *fs); @@ -62,6 +63,8 @@ static struct super_block *alloc_super(void) sema_init(&s->s_dquot.dqio_sem, 1); sema_init(&s->s_dquot.dqoff_sem, 1); s->s_maxbytes = MAX_NON_LFS; + s->dq_op = sb_dquot_ops; + s->s_qcop = sb_quotactl_ops; } return s; } diff --git a/include/linux/fs.h b/include/linux/fs.h index f50388ea30dc..f30905c9a4f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,7 +626,7 @@ struct quota_info { struct semaphore dqoff_sem; /* serialize quota_off() and quota_on() on device */ struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ - struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each format */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; /* @@ -659,6 +659,7 @@ struct super_block { struct file_system_type *s_type; struct super_operations *s_op; struct dquot_operations *dq_op; + struct quotactl_ops *s_qcop; struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_magic; @@ -878,16 +879,6 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } -struct dquot_operations { - void (*initialize) (struct inode *, short); - 
void (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, unsigned long); - void (*free_space) (struct inode *, qsize_t); - void (*free_inode) (const struct inode *, unsigned long); - int (*transfer) (struct inode *, struct iattr *); -}; - /** * &export_operations - for nfsd to communicate with file systems diff --git a/include/linux/quota.h b/include/linux/quota.h index 7e481591efc4..49ceac92bc29 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -70,9 +70,6 @@ typedef __u64 qsize_t; /* Type in which we store sizes */ "undefined", \ }; -#define QUOTAFILENAME "quota" -#define QUOTAGROUP "staff" - /* * Command definitions for the 'quotactl' system call. * The commands are broken into a main command defined below @@ -83,7 +80,61 @@ typedef __u64 qsize_t; /* Type in which we store sizes */ #define SUBCMDSHIFT 8 #define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) -#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ +#define Q_SYNC 0x800001 /* sync disk copy of a filesystems quotas */ +#define Q_QUOTAON 0x800002 /* turn quotas on */ +#define Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define Q_GETFMT 0x800004 /* get quota format used on given filesystem */ +#define Q_GETINFO 0x800005 /* get information about quota files */ +#define Q_SETINFO 0x800006 /* set information about quota files */ +#define Q_GETQUOTA 0x800007 /* get user quota structure */ +#define Q_SETQUOTA 0x800008 /* set user quota structure */ + +/* + * Quota structure used for communication with userspace via quotactl + * Following flags are used to specify which fields are valid + */ +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | 
QIF_USAGE | QIF_TIMES) + +struct if_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; +}; + +/* + * Structure used for setting quota information about file via quotactl + * Following flags are used to specify which fields are valid + */ +#define IIF_BGRACE 1 +#define IIF_IGRACE 2 +#define IIF_FLAGS 4 +#define IIF_ALL (IIF_BGRACE | IIF_IGRACE | IIF_FLAGS) + +struct if_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +#ifdef __KERNEL__ + +#include /* * Data for one user/group kept in memory @@ -105,7 +156,7 @@ struct mem_dqblk { struct quota_format_type; struct mem_dqinfo { - struct quota_format_type * dqi_format; + struct quota_format_type *dqi_format; int dqi_flags; unsigned int dqi_bgrace; unsigned int dqi_igrace; @@ -113,8 +164,6 @@ struct mem_dqinfo { } u; }; -#ifdef __KERNEL__ - #define DQF_MASK 0xffff /* Mask for format specific flags */ #define DQF_INFO_DIRTY 0x10000 /* Is info dirty? */ @@ -127,24 +176,6 @@ extern inline void mark_info_dirty(struct mem_dqinfo *info) #define sb_dqopt(sb) (&(sb)->s_dquot) -#endif /* __KERNEL__ */ - -/* - * Shorthand notation. 
- */ -#define dq_bhardlimit dq_dqb.dqb_bhardlimit -#define dq_bsoftlimit dq_dqb.dqb_bsoftlimit -#define dq_curspace dq_dqb.dqb_curspace -#define dq_ihardlimit dq_dqb.dqb_ihardlimit -#define dq_isoftlimit dq_dqb.dqb_isoftlimit -#define dq_curinodes dq_dqb.dqb_curinodes -#define dq_btime dq_dqb.dqb_btime -#define dq_itime dq_dqb.dqb_itime - -#define dqoff(UID) ((loff_t)((UID) * sizeof (struct dqblk))) - -#ifdef __KERNEL__ - extern int nr_dquots, nr_free_dquots; struct dqstats { @@ -203,25 +234,70 @@ extern inline void mark_dquot_dirty(struct dquot *dquot) /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ - int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file */ + int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file - called on quotaon() */ int (*write_file_info)(struct super_block *sb, int type); /* Write main info about file */ int (*free_file_info)(struct super_block *sb, int type); /* Called on quotaoff() */ int (*read_dqblk)(struct dquot *dquot); /* Read structure for one user */ int (*commit_dqblk)(struct dquot *dquot); /* Write (or delete) structure for one user */ }; +/* Operations working with dquots */ +struct dquot_operations { + void (*initialize) (struct inode *, short); + void (*drop) (struct inode *); + int (*alloc_space) (struct inode *, qsize_t, int); + int (*alloc_inode) (const struct inode *, unsigned long); + void (*free_space) (struct inode *, qsize_t); + void (*free_inode) (const struct inode *, unsigned long); + int (*transfer) (struct inode *, struct iattr *); +}; + +/* Operations handling requests from userspace */ +struct quotactl_ops { + int (*quota_on)(struct super_block *, int, int, char *); + int (*quota_off)(struct super_block *, int); + int (*quota_sync)(struct super_block *, int); + int (*get_info)(struct super_block *, int, struct 
if_dqinfo *); + int (*set_info)(struct super_block *, int, struct if_dqinfo *); + int (*get_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *); + int (*set_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *); + int (*get_xstate)(struct super_block *, struct fs_quota_stat *); + int (*set_xstate)(struct super_block *, unsigned int, int); + int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); + int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); +}; + struct quota_format_type { int qf_fmt_id; /* Quota format id */ struct quota_format_ops *qf_ops; /* Operations of format */ + struct module *qf_owner; /* Module implementing quota format */ struct quota_format_type *qf_next; }; +static inline int is_enabled(struct quota_info *dqopt, int type) +{ + switch (type) { + case USRQUOTA: + return dqopt->flags & DQUOT_USR_ENABLED; + case GRPQUOTA: + return dqopt->flags & DQUOT_GRP_ENABLED; + } + return 0; +} + +#define sb_any_quota_enabled(sb) (is_enabled(sb_dqopt(sb), USRQUOTA) | is_enabled(sb_dqopt(sb), GRPQUOTA)) + +#define sb_has_quota_enabled(sb, type) (is_enabled(sb_dqopt(sb), type)) + +int register_quota_format(struct quota_format_type *fmt); +void unregister_quota_format(struct quota_format_type *fmt); + #else # /* nodep */ include __BEGIN_DECLS -long quotactl __P ((int, const char *, int, caddr_t)); +long quotactl __P ((unsigned int, const char *, int, caddr_t)); __END_DECLS #endif /* __KERNEL__ */ diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index b5da5ff5bb9a..0cd58a4fa275 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,10 +20,10 @@ /* * declaration of quota_function calls in kernel. 
*/ +extern int sync_dquots(kdev_t dev, short type); + extern void dquot_initialize(struct inode *inode, short type); extern void dquot_drop(struct inode *inode); -extern int quota_off(struct super_block *sb, short type); -extern int sync_dquots(struct super_block *sb, short type); extern int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); extern int dquot_alloc_inode(const struct inode *inode, unsigned long number); @@ -36,7 +36,11 @@ extern int dquot_transfer(struct inode *inode, struct iattr *iattr); /* * Operations supported for diskquotas. */ -#define sb_any_quota_enabled(sb) ((sb)->s_dquot.flags & (DQUOT_USR_ENABLED | DQUOT_GRP_ENABLED)) +extern struct dquot_operations dquot_operations; +extern struct quotactl_ops vfs_quotactl_ops; + +#define sb_dquot_ops (&dquot_operations) +#define sb_quotactl_ops (&vfs_quotactl_ops) static __inline__ void DQUOT_INIT(struct inode *inode) { @@ -160,13 +164,25 @@ static __inline__ int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) } #define DQUOT_SYNC(sb) sync_dquots(sb, -1) -#define DQUOT_OFF(sb) quota_off(sb, -1) + +static __inline__ int DQUOT_OFF(struct super_block *sb) +{ + int ret = -ENOSYS; + + lock_kernel(); + if (sb->s_qcop && sb->s_qcop->quota_off) + ret = sb->s_qcop->quota_off(sb, -1); + unlock_kernel(); + return ret; +} #else /* * NO-OP when quota not configured. */ +#define sb_dquot_ops (NULL) +#define sb_quotactl_ops (NULL) #define DQUOT_INIT(inode) do { } while(0) #define DQUOT_DROP(inode) do { } while(0) #define DQUOT_ALLOC_INODE(inode) (0) diff --git a/include/linux/xqm.h b/include/linux/xqm.h new file mode 100644 index 000000000000..d077bc18a424 --- /dev/null +++ b/include/linux/xqm.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 1995-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, + * USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef _LINUX_XQM_H +#define _LINUX_XQM_H + +#include + +/* + * Disk quota - quotactl(2) commands for the XFS Quota Manager (XQM). + */ + +#define XQM_CMD(x) (('X'<<8)+(x)) /* note: forms first QCMD argument */ +#define Q_XQUOTAON XQM_CMD(0x1) /* enable accounting/enforcement */ +#define Q_XQUOTAOFF XQM_CMD(0x2) /* disable accounting/enforcement */ +#define Q_XGETQUOTA XQM_CMD(0x3) /* get disk limits and usage */ +#define Q_XSETQLIM XQM_CMD(0x4) /* set disk limits */ +#define Q_XGETQSTAT XQM_CMD(0x5) /* get quota subsystem status */ +#define Q_XQUOTARM XQM_CMD(0x6) /* free disk space used by dquots */ + +/* + * fs_disk_quota structure: + * + * This contains the current quota information regarding a user/proj/group. + * It is 64-bit aligned, and all the blk units are in BBs (Basic Blocks) of + * 512 bytes. 
+ */ +#define FS_DQUOT_VERSION 1 /* fs_disk_quota.d_version */ +typedef struct fs_disk_quota { + __s8 d_version; /* version of this structure */ + __s8 d_flags; /* XFS_{USER,PROJ,GROUP}_QUOTA */ + __u16 d_fieldmask; /* field specifier */ + __u32 d_id; /* user, project, or group ID */ + __u64 d_blk_hardlimit;/* absolute limit on disk blks */ + __u64 d_blk_softlimit;/* preferred limit on disk blks */ + __u64 d_ino_hardlimit;/* maximum # allocated inodes */ + __u64 d_ino_softlimit;/* preferred inode limit */ + __u64 d_bcount; /* # disk blocks owned by the user */ + __u64 d_icount; /* # inodes owned by the user */ + __s32 d_itimer; /* zero if within inode limits */ + /* if not, we refuse service */ + __s32 d_btimer; /* similar to above; for disk blocks */ + __u16 d_iwarns; /* # warnings issued wrt num inodes */ + __u16 d_bwarns; /* # warnings issued wrt disk blocks */ + __s32 d_padding2; /* padding2 - for future use */ + __u64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __u64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __u64 d_rtbcount; /* # realtime blocks owned */ + __s32 d_rtbtimer; /* similar to above; for RT disk blks */ + __u16 d_rtbwarns; /* # warnings issued wrt RT disk blks */ + __s16 d_padding3; /* padding3 - for future use */ + char d_padding4[8]; /* yet more padding */ +} fs_disk_quota_t; + +/* + * These fields are sent to Q_XSETQLIM to specify fields that need to change. + */ +#define FS_DQ_ISOFT (1<<0) +#define FS_DQ_IHARD (1<<1) +#define FS_DQ_BSOFT (1<<2) +#define FS_DQ_BHARD (1<<3) +#define FS_DQ_RTBSOFT (1<<4) +#define FS_DQ_RTBHARD (1<<5) +#define FS_DQ_LIMIT_MASK (FS_DQ_ISOFT | FS_DQ_IHARD | FS_DQ_BSOFT | \ + FS_DQ_BHARD | FS_DQ_RTBSOFT | FS_DQ_RTBHARD) +/* + * These timers can only be set in super user's dquot. For others, timers are + * automatically started and stopped. Superusers timer values set the limits + * for the rest. In case these values are zero, the DQ_{F,B}TIMELIMIT values + * defined below are used. 
+ * These values also apply only to the d_fieldmask field for Q_XSETQLIM. + */ +#define FS_DQ_BTIMER (1<<6) +#define FS_DQ_ITIMER (1<<7) +#define FS_DQ_RTBTIMER (1<<8) +#define FS_DQ_TIMER_MASK (FS_DQ_BTIMER | FS_DQ_ITIMER | FS_DQ_RTBTIMER) + +/* + * The following constants define the default amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). These may be modified by the quotactl(2) + * system call with the Q_XSETQLIM command. + */ +#define DQ_FTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define DQ_BTIMELIMIT (7 * 24*60*60) /* 1 week */ + +/* + * Various flags related to quotactl(2). Only relevant to XFS filesystems. + */ +#define XFS_QUOTA_UDQ_ACCT (1<<0) /* user quota accounting */ +#define XFS_QUOTA_UDQ_ENFD (1<<1) /* user quota limits enforcement */ +#define XFS_QUOTA_GDQ_ACCT (1<<2) /* group quota accounting */ +#define XFS_QUOTA_GDQ_ENFD (1<<3) /* group quota limits enforcement */ + +#define XFS_USER_QUOTA (1<<0) /* user quota type */ +#define XFS_PROJ_QUOTA (1<<1) /* (IRIX) project quota type */ +#define XFS_GROUP_QUOTA (1<<2) /* group quota type */ + +/* + * fs_quota_stat is the struct returned in Q_XGETQSTAT for a given file system. + * Provides a centralized way to get meta information about the quota subsystem. + * e.g. space taken up for user and group quotas, number of dquots currently + * incore. + */ +#define FS_QSTAT_VERSION 1 /* fs_quota_stat.qs_version */ + +/* + * Some basic information about 'quota files'. 
+ */ +typedef struct fs_qfilestat { + __u64 qfs_ino; /* inode number */ + __u64 qfs_nblks; /* number of BBs 512-byte-blks */ + __u32 qfs_nextents; /* number of extents */ +} fs_qfilestat_t; + +typedef struct fs_quota_stat { + __s8 qs_version; /* version number for future changes */ + __u16 qs_flags; /* XFS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */ + __s8 qs_pad; /* unused */ + fs_qfilestat_t qs_uquota; /* user quota storage information */ + fs_qfilestat_t qs_gquota; /* group quota storage information */ + __u32 qs_incoredqs; /* number of dquots incore */ + __s32 qs_btimelimit; /* limit for blks timer */ + __s32 qs_itimelimit; /* limit for inodes timer */ + __s32 qs_rtbtimelimit;/* limit for rt blks timer */ + __u16 qs_bwarnlimit; /* limit for num warnings */ + __u16 qs_iwarnlimit; /* limit for num warnings */ +} fs_quota_stat_t; + +#endif /* _LINUX_XQM_H */ -- cgit v1.2.3 From dcfb81117226d9862242115242b822faca3cf737 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:25 -0700 Subject: [PATCH] [8/13] quota-8-format1 Implementation of old quota format. All the code for old format is now in quota_v1.c. Code mostly remained the same as in older kernels (just minor changes were needed to bind it with quota interface). --- fs/Config.help | 5 + fs/Config.in | 1 + fs/Makefile | 1 + fs/dquot.c | 5 +- fs/quota_v1.c | 239 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/dqblk_v1.h | 18 ++++ include/linux/quota.h | 2 + include/linux/quotaio_v1.h | 33 +++++++ 8 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 fs/quota_v1.c create mode 100644 include/linux/dqblk_v1.h create mode 100644 include/linux/quotaio_v1.h (limited to 'include') diff --git a/fs/Config.help b/fs/Config.help index 2b5b7758f2d4..e72b03b76f27 100644 --- a/fs/Config.help +++ b/fs/Config.help @@ -6,6 +6,11 @@ CONFIG_QUOTA . Probably the quota support is only useful for multi user systems. If unsure, say N. 
+CONFIG_QFMT_V1 + This quota format was (is) used by kernels earlier than 2.4.??. If + you have quota working and you don't want to convert to new quota + format say Y here. + CONFIG_MINIX_FS Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/Config.in b/fs/Config.in index ef117fa232c4..44f8ba30b625 100644 --- a/fs/Config.in +++ b/fs/Config.in @@ -5,6 +5,7 @@ mainmenu_option next_comment comment 'File systems' bool 'Quota support' CONFIG_QUOTA +dep_tristate ' Old quota format support' CONFIG_QFMT_V1 $CONFIG_QUOTA tristate 'Kernel automounter support' CONFIG_AUTOFS_FS tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS diff --git a/fs/Makefile b/fs/Makefile index e843c709297e..4d945a800f57 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -82,6 +82,7 @@ obj-y += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o # persistent filesystems obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) diff --git a/fs/dquot.c b/fs/dquot.c index b6125801dd94..17090a75a5f3 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -792,7 +792,10 @@ static inline void flush_warnings(struct dquot **dquots, char *warntype) static inline char ignore_hardlimit(struct dquot *dquot) { - return capable(CAP_SYS_RESOURCE); + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & V1_DQF_RSQUASH)); } static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) diff --git a/fs/quota_v1.c b/fs/quota_v1.c new file mode 100644 index 000000000000..aa3b7842399d --- /dev/null +++ b/fs/quota_v1.c @@ -0,0 +1,239 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk 
*d) +{ + m->dqb_ihardlimit = d->dqb_ihardlimit; + m->dqb_isoftlimit = d->dqb_isoftlimit; + m->dqb_curinodes = d->dqb_curinodes; + m->dqb_bhardlimit = d->dqb_bhardlimit; + m->dqb_bsoftlimit = d->dqb_bsoftlimit; + m->dqb_curspace = d->dqb_curblocks << QUOTABLOCK_BITS; + m->dqb_itime = d->dqb_itime; + m->dqb_btime = d->dqb_btime; +} + +static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) +{ + d->dqb_ihardlimit = m->dqb_ihardlimit; + d->dqb_isoftlimit = m->dqb_isoftlimit; + d->dqb_curinodes = m->dqb_curinodes; + d->dqb_bhardlimit = m->dqb_bhardlimit; + d->dqb_bsoftlimit = m->dqb_bsoftlimit; + d->dqb_curblocks = toqb(m->dqb_curspace); + d->dqb_itime = m->dqb_itime; + d->dqb_btime = m->dqb_btime; +} + +static int v1_read_dqblk(struct dquot *dquot) +{ + int type = dquot->dq_type; + struct file *filp; + mm_segment_t fs; + loff_t offset; + struct v1_disk_dqblk dqblk; + + filp = sb_dqopt(dquot->dq_sb)->files[type]; + if (filp == (struct file *)NULL) + return -EINVAL; + + /* Now we are sure filp is valid */ + offset = v1_dqoff(dquot->dq_id); + fs = get_fs(); + set_fs(KERNEL_DS); + filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); + set_fs(fs); + + v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); + if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 && + dquot->dq_dqb.dqb_ihardlimit == 0 && dquot->dq_dqb.dqb_isoftlimit == 0) + dquot->dq_flags |= DQ_FAKE; + dqstats.reads++; + return 0; +} + +static int v1_commit_dqblk(struct dquot *dquot) +{ + short type = dquot->dq_type; + struct file *filp; + mm_segment_t fs; + loff_t offset; + ssize_t ret; + struct v1_disk_dqblk dqblk; + + filp = sb_dqopt(dquot->dq_sb)->files[type]; + offset = v1_dqoff(dquot->dq_id); + fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Note: clear the DQ_MOD flag unconditionally, + * so we don't loop forever on failure. 
+ */ + v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); + dquot->dq_flags &= ~DQ_MOD; + if (dquot->dq_id == 0) { + dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; + dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; + } + ret = 0; + if (filp) + ret = filp->f_op->write(filp, (char *)&dqblk, + sizeof(struct v1_disk_dqblk), &offset); + if (ret != sizeof(struct v1_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + kdevname(dquot->dq_sb->s_dev)); + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + +out: + set_fs(fs); + dqstats.writes++; + return ret; +} + +/* Magics of new quota format */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +/* Header of new quota format */ +struct v2_disk_dqheader { + __u32 dqh_magic; /* Magic number identifying file */ + __u32 dqh_version; /* File version */ +}; + +static int v1_check_quota_file(struct super_block *sb, int type) +{ + struct file *f = sb_dqopt(sb)->files[type]; + struct inode *inode = f->f_dentry->d_inode; + ulong blocks; + size_t off; + struct v2_disk_dqheader dqhead; + mm_segment_t fs; + ssize_t size; + loff_t offset = 0; + static const uint quota_magics[] = V2_INITQMAGICS; + + if (!inode->i_size) + return 0; + blocks = inode->i_size >> BLOCK_SIZE_BITS; + off = inode->i_size & (BLOCK_SIZE - 1); + if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct v1_disk_dqblk)) + return 0; + /* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */ + fs = get_fs(); + set_fs(KERNEL_DS); + size = f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); + set_fs(fs); + if (size != sizeof(struct v2_disk_dqheader)) + return 1; /* Probably not new format */ + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) + return 1; /* Definitely not new format */ + printk(KERN_INFO "VFS: %s: Refusing to turn on old quota format on given file. 
It probably contains newer quota format.\n", kdevname(sb->s_dev)); + return 0; /* Seems like a new format file -> refuse it */ +} + +static int v1_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + mm_segment_t fs; + loff_t offset; + struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; + int ret; + + down(&dqopt->dqio_sem); + offset = v1_dqoff(0); + fs = get_fs(); + set_fs(KERNEL_DS); + if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + ret = 0; + dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; +out: + up(&dqopt->dqio_sem); + set_fs(fs); + return ret; +} + +static int v1_write_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + mm_segment_t fs; + struct file *filp = dqopt->files[type]; + struct v1_disk_dqblk dqblk; + loff_t offset; + int ret; + + down(&dqopt->dqio_sem); + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + offset = v1_dqoff(0); + fs = get_fs(); + set_fs(KERNEL_DS); + if ((ret = filp->f_op->read(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset)) != sizeof(struct v1_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + goto out; + } + dqblk.dqb_itime = dqopt->info[type].dqi_igrace; + dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + offset = v1_dqoff(0); + ret = filp->f_op->write(filp, (char *)&dqblk, sizeof(struct v1_disk_dqblk), &offset); + if (ret == sizeof(struct v1_disk_dqblk)) + ret = 0; + else if (ret > 0) + ret = -EIO; +out: + up(&dqopt->dqio_sem); + set_fs(fs); + return ret; +} + +static struct quota_format_ops v1_format_ops = { + check_quota_file: v1_check_quota_file, + read_file_info: v1_read_file_info, + write_file_info: v1_write_file_info, + free_file_info: NULL, + read_dqblk: v1_read_dqblk, + 
commit_dqblk: v1_commit_dqblk, +}; + +static struct quota_format_type v1_quota_format = { + qf_fmt_id: QFMT_VFS_OLD, + qf_ops: &v1_format_ops, + qf_owner: THIS_MODULE +}; + +static int __init init_v1_quota_format(void) +{ + return register_quota_format(&v1_quota_format); +} + +static void __exit exit_v1_quota_format(void) +{ + unregister_quota_format(&v1_quota_format); +} + +EXPORT_NO_SYMBOLS; + +module_init(init_v1_quota_format); +module_exit(exit_v1_quota_format); + diff --git a/include/linux/dqblk_v1.h b/include/linux/dqblk_v1.h new file mode 100644 index 000000000000..42fbf4797156 --- /dev/null +++ b/include/linux/dqblk_v1.h @@ -0,0 +1,18 @@ +/* + * File with in-memory structures of old quota format + */ + +#ifndef _LINUX_DQBLK_V1_H +#define _LINUX_DQBLK_V1_H + +/* Id of quota format */ +#define QFMT_VFS_OLD 1 + +/* Root squash turned on */ +#define V1_DQF_RSQUASH 1 + +/* Special information about quotafile */ +struct v1_mem_dqinfo { +}; + +#endif /* _LINUX_DQBLK_V1_H */ diff --git a/include/linux/quota.h b/include/linux/quota.h index 49ceac92bc29..9b7ba43576d8 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -135,6 +135,7 @@ struct if_dqinfo { #ifdef __KERNEL__ #include +#include /* * Data for one user/group kept in memory @@ -161,6 +162,7 @@ struct mem_dqinfo { unsigned int dqi_bgrace; unsigned int dqi_igrace; union { + struct v1_mem_dqinfo v1_i; } u; }; diff --git a/include/linux/quotaio_v1.h b/include/linux/quotaio_v1.h new file mode 100644 index 000000000000..746654b5de70 --- /dev/null +++ b/include/linux/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. 
+ */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. + */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ -- cgit v1.2.3 From 8ea6f99ab22d47df786d4bf486055b7cff827e6b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:29 -0700 Subject: [PATCH] [9/13] quota-9-format2 Implementation of new quota format. The code is almost the same as in -ac versions of kernel. All the code for new format is in quota_v2.c --- fs/Config.help | 13 +- fs/Config.in | 1 + fs/Makefile | 1 + fs/quota_v2.c | 690 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/dqblk_v2.h | 20 ++ include/linux/quota.h | 3 + include/linux/quotaio_v2.h | 79 ++++++ 7 files changed, 804 insertions(+), 3 deletions(-) create mode 100644 fs/quota_v2.c create mode 100644 include/linux/dqblk_v2.h create mode 100644 include/linux/quotaio_v2.h (limited to 'include') diff --git a/fs/Config.help b/fs/Config.help index e72b03b76f27..94e6693f25d4 100644 --- a/fs/Config.help +++ b/fs/Config.help @@ -1,8 +1,10 @@ CONFIG_QUOTA If you say Y here, you will be able to set per user limits for disk - usage (also called disk quotas). Currently, it works only for the - ext2 file system. 
You need additional software in order to use quota - support; for details, read the Quota mini-HOWTO, available from + usage (also called disk quotas). Currently, it works for the + ext2, ext3, and reiserfs file system. You need additional software + in order to use quota support (you can download sources from + ). For further details, read + the Quota mini-HOWTO, available from . Probably the quota support is only useful for multi user systems. If unsure, say N. @@ -11,6 +13,11 @@ CONFIG_QFMT_V1 you have quota working and you don't want to convert to new quota format say Y here. +CONFIG_QFMT_V2 + This quota format allows using quotas with 32-bit UIDs/GIDs. If you + need this functionality say Y here. Note that you will need latest + quota utilities for new quota format with this kernel. + CONFIG_MINIX_FS Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/Config.in b/fs/Config.in index 44f8ba30b625..318e3a9814df 100644 --- a/fs/Config.in +++ b/fs/Config.in @@ -6,6 +6,7 @@ comment 'File systems' bool 'Quota support' CONFIG_QUOTA dep_tristate ' Old quota format support' CONFIG_QFMT_V1 $CONFIG_QUOTA +dep_tristate ' VFS v0 quota format support' CONFIG_QFMT_V2 $CONFIG_QUOTA tristate 'Kernel automounter support' CONFIG_AUTOFS_FS tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS diff --git a/fs/Makefile b/fs/Makefile index 4d945a800f57..44a5fcaf48c9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o # persistent filesystems obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) diff --git a/fs/quota_v2.c b/fs/quota_v2.c new file mode 100644 index 000000000000..e28bee8c52ab --- /dev/null +++ b/fs/quota_v2.c @@ -0,0 +1,690 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include +#include 
+#include +#include +#include +#include +#include +#include + +#include +#include + +#define __QUOTA_V2_PARANOIA + +typedef char *dqbuf_t; + +#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) +#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) + +/* Check whether given file is really vfsv0 quotafile */ +static int v2_check_quota_file(struct super_block *sb, int type) +{ + struct v2_disk_dqheader dqhead; + struct file *f = sb_dqopt(sb)->files[type]; + mm_segment_t fs; + ssize_t size; + loff_t offset = 0; + static const uint quota_magics[] = V2_INITQMAGICS; + static const uint quota_versions[] = V2_INITQVERSIONS; + + fs = get_fs(); + set_fs(KERNEL_DS); + size = f->f_op->read(f, (char *)&dqhead, sizeof(struct v2_disk_dqheader), &offset); + set_fs(fs); + if (size != sizeof(struct v2_disk_dqheader)) + return 0; + if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || + le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) + return 0; + return 1; +} + +/* Read information header from quota file */ +static int v2_read_file_info(struct super_block *sb, int type) +{ + mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; + loff_t offset = V2_DQINFOOFF; + + fs = get_fs(); + set_fs(KERNEL_DS); + size = f->f_op->read(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); + set_fs(fs); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't read info structure on device %s.\n", + kdevname(f->f_dentry->d_sb->s_dev)); + return -1; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); + info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + info->u.v2_i.dqi_free_entry = 
le32_to_cpu(dinfo.dqi_free_entry); + return 0; +} + +/* Write information header to quota file */ +static int v2_write_file_info(struct super_block *sb, int type) +{ + mm_segment_t fs; + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct file *f = sb_dqopt(sb)->files[type]; + ssize_t size; + loff_t offset = V2_DQINFOOFF; + + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); + fs = get_fs(); + set_fs(KERNEL_DS); + size = f->f_op->write(f, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), &offset); + set_fs(fs); + if (size != sizeof(struct v2_disk_dqinfo)) { + printk(KERN_WARNING "Can't write info structure on device %s.\n", + kdevname(f->f_dentry->d_sb->s_dev)); + return -1; + } + return 0; +} + +static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) +{ + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); +} + +static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) +{ + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); + d->dqb_curspace = 
cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(id); +} + +static dqbuf_t getdqbuf(void) +{ + dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_KERNEL); + if (!buf) + printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static inline void freedqbuf(dqbuf_t buf) +{ + kfree(buf); +} + +static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) +{ + mm_segment_t fs; + ssize_t ret; + loff_t offset = blk<<V2_DQBLKSIZE_BITS; + + fs = get_fs(); + set_fs(KERNEL_DS); + ret = filp->f_op->read(filp, (char *)buf, V2_DQBLKSIZE, &offset); + set_fs(fs); + return ret; +} + +static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) +{ + mm_segment_t fs; + ssize_t ret; + loff_t offset = blk<<V2_DQBLKSIZE_BITS; + + fs = get_fs(); + set_fs(KERNEL_DS); + ret = filp->f_op->write(filp, (char *)buf, V2_DQBLKSIZE, &offset); + set_fs(fs); + return ret; + +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct file *filp, struct mem_dqinfo *info) +{ + dqbuf_t buf = getdqbuf(); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->u.v2_i.dqi_free_blk) { + blk = info->u.v2_i.dqi_free_blk; + if ((ret = read_blk(filp, blk, buf)) < 0) + goto out_buf; + info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, V2_DQBLKSIZE); + if ((ret = write_blk(filp, info->u.v2_i.dqi_blocks, buf)) < 0) /* Assure block allocation...
*/ + goto out_buf; + blk = info->u.v2_i.dqi_blocks++; + } + mark_info_dirty(info); + ret = blk; +out_buf: + freedqbuf(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct file *filp, struct mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + info->u.v2_i.dqi_free_blk = blk; + mark_info_dirty(info); + if ((err = write_blk(filp, blk, buf)) < 0) /* Some strange block. We had better leave it... */ + return err; + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct file *filp, struct mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + if ((err = read_blk(filp, nextblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; + if ((err = write_blk(filp, nextblk, tmpbuf)) < 0) + goto out_buf; + } + if (prevblk) { + if ((err = read_blk(filp, prevblk, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; + if ((err = write_blk(filp, prevblk, tmpbuf)) < 0) + goto out_buf; + } + else { + info->u.v2_i.dqi_free_entry = nextblk; + mark_info_dirty(info); + } + freedqbuf(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + if (write_blk(filp, blk, buf) < 0) /* No matter whether write succeeds block is out of list */ + printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Insert given block to the beginning of 
list with free entries */ +static int insert_free_dqentry(struct file *filp, struct mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(); + struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + if ((err = write_blk(filp, blk, buf)) < 0) + goto out_buf; + if (info->u.v2_i.dqi_free_entry) { + if ((err = read_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); + if ((err = write_blk(filp, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) + goto out_buf; + } + freedqbuf(tmpbuf); + info->u.v2_i.dqi_free_entry = blk; + mark_info_dirty(info); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Find space for dquot */ +static uint find_free_dqentry(struct dquot *dquot, int *err) +{ + struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info+dquot->dq_type; + uint blk, i; + struct v2_disk_dqdbheader *dh; + struct v2_disk_dqblk *ddquot; + struct v2_disk_dqblk fakedquot; + dqbuf_t buf; + + *err = 0; + if (!(buf = getdqbuf())) { + *err = -ENOMEM; + return 0; + } + dh = (struct v2_disk_dqdbheader *)buf; + ddquot = GETENTRIES(buf); + if (info->u.v2_i.dqi_free_entry) { + blk = info->u.v2_i.dqi_free_entry; + if ((*err = read_blk(filp, blk, buf)) < 0) + goto out_buf; + } + else { + blk = get_free_dqblk(filp, info); + if ((int)blk < 0) { + *err = blk; + return 0; + } + memset(buf, 0, V2_DQBLKSIZE); + info->u.v2_i.dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */ + mark_info_dirty(info); + } + if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? 
*/ + if ((*err = remove_free_dqentry(filp, info, buf, blk)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); + goto out_buf; + } + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); + memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + /* Find free structure in block */ + for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); +#ifdef __QUOTA_V2_PARANOIA + if (i == V2_DQSTRINBLK) { + printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); + *err = -EIO; + goto out_buf; + } +#endif + if ((*err = write_blk(filp, blk, buf)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); + goto out_buf; + } + dquot->dq_off = (blk<dq_sb)->files[dquot->dq_type]; + struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info + dquot->dq_type; + dqbuf_t buf; + int ret = 0, newson = 0, newact = 0; + u32 *ref; + uint newblk; + + if (!(buf = getdqbuf())) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(filp, info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, V2_DQBLKSIZE); + newact = 1; + } + else { + if ((ret = read_blk(filp, *treeblk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); + goto out_buf; + } + } + ref = (u32 *)buf; + newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == V2_DQTREEDEPTH-1) { +#ifdef __QUOTA_V2_PARANOIA + if (newblk) { + printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", ref[GETIDINDEX(dquot->dq_id, depth)]); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(dquot, &ret); + } + else + ret = do_insert_tree(dquot, &newblk, depth+1); + if (newson && ret >= 0) { + ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); + ret = write_blk(filp, *treeblk, buf); + } + else if (newact && ret < 0) + put_free_dqblk(filp, info, 
buf, *treeblk); +out_buf: + freedqbuf(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct dquot *dquot) +{ + int tmp = V2_DQTREEOFF; + return do_insert_tree(dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota files... + */ +static int v2_write_dquot(struct dquot *dquot) +{ + int type = dquot->dq_type; + struct file *filp; + mm_segment_t fs; + loff_t offset; + ssize_t ret; + struct v2_disk_dqblk ddquot; + + if (!dquot->dq_off) + if ((ret = dq_insert_tree(dquot)) < 0) { + printk(KERN_ERR "VFS: Error %d occured while creating quota.\n", ret); + return ret; + } + filp = sb_dqopt(dquot->dq_sb)->files[type]; + offset = dquot->dq_off; + mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); + fs = get_fs(); + set_fs(KERNEL_DS); + ret = filp->f_op->write(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset); + set_fs(fs); + if (ret != sizeof(struct v2_disk_dqblk)) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", kdevname(dquot->dq_sb->s_dev)); + if (ret >= 0) + ret = -ENOSPC; + } + else + ret = 0; + dqstats.writes++; + return ret; +} + +/* Free dquot entry in data block */ +static int free_dqentry(struct dquot *dquot, uint blk) +{ + struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info + dquot->dq_type; + struct v2_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { + printk(KERN_ERR "VFS: Quota structure has offset to other block (%u) than it should (%u).\n", blk, (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); + goto out_buf; + } + if ((ret = read_blk(filp, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + goto out_buf; + } + dh = (struct v2_disk_dqdbheader *)buf; + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1); + if 
(!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ + if ((ret = remove_free_dqentry(filp, info, buf, blk)) < 0 || + (ret = put_free_dqblk(filp, info, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't move quota data block (%u) to free list.\n", blk); + goto out_buf; + } + } + else { + memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, sizeof(struct v2_disk_dqblk)); + if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { + /* Insert will write block itself */ + if ((ret = insert_free_dqentry(filp, info, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); + goto out_buf; + } + } + else + if ((ret = write_blk(filp, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't write quota data block %u\n", blk); + goto out_buf; + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + freedqbuf(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct dquot *dquot, uint *blk, int depth) +{ + struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + struct mem_dqinfo *info = sb_dqopt(dquot->dq_sb)->info + dquot->dq_type; + dqbuf_t buf = getdqbuf(); + int ret = 0; + uint newblk; + u32 *ref = (u32 *)buf; + + if (!buf) + return -ENOMEM; + if ((ret = read_blk(filp, *blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); + if (depth == V2_DQTREEDEPTH-1) { + ret = free_dqentry(dquot, newblk); + newblk = 0; + } + else + ret = remove_tree(dquot, &newblk, depth+1); + if (ret >= 0 && !newblk) { + int i; + ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0); + for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? 
*/ + if (i == V2_DQBLKSIZE) { + put_free_dqblk(filp, info, buf, *blk); + *blk = 0; + } + else + if ((ret = write_blk(filp, *blk, buf)) < 0) + printk(KERN_ERR "VFS: Can't write quota tree block %u.\n", *blk); + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Delete dquot from tree */ +static int v2_delete_dquot(struct dquot *dquot) +{ + uint tmp = V2_DQTREEOFF; + + if (!dquot->dq_off) /* Not even allocated? */ + return 0; + return remove_tree(dquot, &tmp, 0); +} + +/* Find entry in block */ +static loff_t find_block_dqentry(struct dquot *dquot, uint blk) +{ + struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + int i; + struct v2_disk_dqblk *ddquot = GETENTRIES(buf); + + if (!buf) + return -ENOMEM; + if ((ret = read_blk(filp, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + if (dquot->dq_id) + for (i = 0; i < V2_DQSTRINBLK && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); + else { /* ID 0 has a bit more complicated searching...
*/ + struct v2_disk_dqblk fakedquot; + + memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + for (i = 0; i < V2_DQSTRINBLK; i++) + if (!le32_to_cpu(ddquot[i].dqb_id) && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) + break; + } + if (i == V2_DQSTRINBLK) { + printk(KERN_ERR "VFS: Quota for id %u referenced but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } + else + ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) +{ + struct file *filp = sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + u32 *ref = (u32 *)buf; + + if (!buf) + return -ENOMEM; + if ((ret = read_blk(filp, blk, buf)) < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < V2_DQTREEDEPTH-1) + ret = find_tree_dqentry(dquot, blk, depth+1); + else + ret = find_block_dqentry(dquot, blk); +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct dquot *dquot) +{ + return find_tree_dqentry(dquot, V2_DQTREEOFF, 0); +} + +static int v2_read_dquot(struct dquot *dquot) +{ + int type = dquot->dq_type; + struct file *filp; + mm_segment_t fs; + loff_t offset; + struct v2_disk_dqblk ddquot; + int ret = 0; + + filp = sb_dqopt(dquot->dq_sb)->files[type]; + +#ifdef __QUOTA_V2_PARANOIA + if (!filp || !dquot->dq_sb) { /* Invalidated quota? */ + printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + return -EIO; + } +#endif + offset = find_dqentry(dquot); + if (offset <= 0) { /* Entry not present? 
*/ + if (offset < 0) + printk(KERN_ERR "VFS: Can't read quota structure for id %u.\n", dquot->dq_id); + dquot->dq_off = 0; + dquot->dq_flags |= DQ_FAKE; + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + } + else { + dquot->dq_off = offset; + fs = get_fs(); + set_fs(KERNEL_DS); + if ((ret = filp->f_op->read(filp, (char *)&ddquot, sizeof(struct v2_disk_dqblk), &offset)) != sizeof(struct v2_disk_dqblk)) { + if (ret >= 0) + ret = -EIO; + printk(KERN_ERR "VFS: Error while reading quota structure for id %u.\n", dquot->dq_id); + memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); + } + else + ret = 0; + set_fs(fs); + disk2memdqb(&dquot->dq_dqb, &ddquot); + } + dqstats.reads++; + return ret; +} + +/* Commit changes of dquot to disk - it might also mean deleting it when quota became fake one and user has no blocks... */ +static int v2_commit_dquot(struct dquot *dquot) +{ + /* We clear the flag everytime so we don't loop when there was an IO error... */ + dquot->dq_flags &= ~DQ_MOD; + if (dquot->dq_flags & DQ_FAKE && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return v2_delete_dquot(dquot); + else + return v2_write_dquot(dquot); +} + +static struct quota_format_ops v2_format_ops = { + check_quota_file: v2_check_quota_file, + read_file_info: v2_read_file_info, + write_file_info: v2_write_file_info, + free_file_info: NULL, + read_dqblk: v2_read_dquot, + commit_dqblk: v2_commit_dquot, +}; + +static struct quota_format_type v2_quota_format = { + qf_fmt_id: QFMT_VFS_V0, + qf_ops: &v2_format_ops, + qf_owner: THIS_MODULE +}; + +static int __init init_v2_quota_format(void) +{ + return register_quota_format(&v2_quota_format); +} + +static void __exit exit_v2_quota_format(void) +{ + unregister_quota_format(&v2_quota_format); +} + +EXPORT_NO_SYMBOLS; + +module_init(init_v2_quota_format); +module_exit(exit_v2_quota_format); diff --git a/include/linux/dqblk_v2.h b/include/linux/dqblk_v2.h new file mode 100644 index 000000000000..4a6c5f6867bb --- 
/dev/null +++ b/include/linux/dqblk_v2.h @@ -0,0 +1,20 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_DQBLK_V2_H +#define _LINUX_DQBLK_V2_H + +#include + +/* id numbers of quota format */ +#define QFMT_VFS_V0 2 + +/* Inmemory copy of version specific information */ +struct v2_mem_dqinfo { + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +#endif /* _LINUX_DQBLK_V2_H */ diff --git a/include/linux/quota.h b/include/linux/quota.h index 9b7ba43576d8..4ed31fe2a7f7 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -136,6 +136,7 @@ struct if_dqinfo { #include #include +#include /* * Data for one user/group kept in memory @@ -163,6 +164,7 @@ struct mem_dqinfo { unsigned int dqi_igrace; union { struct v1_mem_dqinfo v1_i; + struct v2_mem_dqinfo v2_i; } u; }; @@ -214,6 +216,7 @@ struct dquot { /* fields after this point are cleared when invalidating */ struct super_block *dq_sb; /* superblock this applies to */ unsigned int dq_id; /* ID this applies to (uid, gid) */ + loff_t dq_off; /* Offset of dquot on disk */ short dq_type; /* Type of quota */ short dq_flags; /* See DQ_* */ unsigned long dq_referenced; /* Number of times this dquot was diff --git a/include/linux/quotaio_v2.h b/include/linux/quotaio_v2.h new file mode 100644 index 000000000000..da4e02730bc8 --- /dev/null +++ b/include/linux/quotaio_v2.h @@ -0,0 +1,79 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include +#include + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 0, /* USRQUOTA */\ + 0 /* GRPQUOTA */\ +} + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. 
+ */ +struct v2_disk_dqblk { + __u32 dqb_id; /* id this quota applies to */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + __u32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __u32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __u64 dqb_curspace; /* current space occupied (in bytes) */ + __u64 dqb_btime; /* time limit for excessive disk use */ + __u64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* + * Here are header structures as written on disk and their in-memory copies + */ +/* First generic header */ +struct v2_disk_dqheader { + __u32 dqh_magic; /* Magic number identifying file */ + __u32 dqh_version; /* File version */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __u32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __u32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __u32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __u32 dqi_blocks; /* Number of blocks in file */ + __u32 dqi_free_blk; /* Number of first free block in the list */ + __u32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +/* + * Structure of header of block with quota structures. 
It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct v2_disk_dqdbheader { + __u32 dqdh_next_free; /* Number of next block with free entry */ + __u32 dqdh_prev_free; /* Number of previous block with free entry */ + __u16 dqdh_entries; /* Number of valid entries in block */ + __u16 dqdh_pad1; + __u32 dqdh_pad2; +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 +#define V2_DQBLKSIZE (1 << V2_DQBLKSIZE_BITS) /* Size of block with quota structures */ +#define V2_DQTREEOFF 1 /* Offset of tree in file in blocks */ +#define V2_DQTREEDEPTH 4 /* Depth of quota tree */ +#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk)) /* Number of entries in one block */ + +#endif /* _LINUX_QUOTAIO_V2_H */ -- cgit v1.2.3 From 0c532315fb1317ce45be1af9f8b32586b4bc95e9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:35 -0700 Subject: [PATCH] [10/13] quota-10-inttype Remove use of 'short' in parameters of functions. 'int' is used instead.
--- fs/dquot.c | 34 +++++++++++++++++----------------- fs/inode.c | 4 ++-- include/linux/quota.h | 2 +- include/linux/quotaops.h | 4 ++-- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index 17090a75a5f3..f7e5b77e43d1 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -173,7 +173,7 @@ static inline void put_dquot_dup_ref(struct dquot *dquot) dquot->dq_dup_ref--; } -static inline int const hashfn(struct super_block *sb, unsigned int id, short type) +static inline int const hashfn(struct super_block *sb, unsigned int id, int type) { return((HASHDEV(sb->s_dev) ^ id) * (MAXQUOTAS - type)) % NR_DQHASH; } @@ -190,7 +190,7 @@ static inline void remove_dquot_hash(struct dquot *dquot) INIT_LIST_HEAD(&dquot->dq_hash); } -static inline struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, unsigned int id, short type) +static inline struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, unsigned int id, int type) { struct list_head *head; struct dquot *dquot; @@ -339,7 +339,7 @@ static int commit_dqblk(struct dquot *dquot) /* Invalidate all dquots on the list, wait for all users. Note that this function is called * after quota is disabled so no new quota might be created. As we only insert to the end of * inuse list, we don't have to restart searching... 
*/ -static void invalidate_dquots(struct super_block *sb, short type) +static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot; struct list_head *head; @@ -368,7 +368,7 @@ restart: } } -int sync_dquots(struct super_block *sb, short type) +int sync_dquots(struct super_block *sb, int type) { struct list_head *head; struct dquot *dquot; @@ -515,7 +515,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) return dquot; } -static struct dquot *dqget(struct super_block *sb, unsigned int id, short type) +static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); struct dquot *dquot, *empty = NODQUOT; @@ -593,7 +593,7 @@ static void dqputduplicate(struct dquot *dquot) dqstats.drops++; } -static int dqinit_needed(struct inode *inode, short type) +static int dqinit_needed(struct inode *inode, int type) { int cnt; @@ -607,7 +607,7 @@ static int dqinit_needed(struct inode *inode, short type) return 0; } -static void add_dquot_ref(struct super_block *sb, short type) +static void add_dquot_ref(struct super_block *sb, int type) { struct list_head *p; @@ -641,7 +641,7 @@ static inline int dqput_blocks(struct dquot *dquot) } /* Remove references to dquots from inode - add dquot to list for freeing if needed */ -int remove_inode_dquot_ref(struct inode *inode, short type, struct list_head *tofree_head) +int remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head) { struct dquot *dquot = inode->i_dquot[type]; int cnt; @@ -875,11 +875,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war * * Note: this is a blocking operation. 
*/ -void dquot_initialize(struct inode *inode, short type) +void dquot_initialize(struct inode *inode, int type) { struct dquot *dquot[MAXQUOTAS]; unsigned int id = 0; - short cnt; + int cnt; if (IS_NOQUOTA(inode)) return; @@ -925,7 +925,7 @@ void dquot_initialize(struct inode *inode, short type) void dquot_drop(struct inode *inode) { struct dquot *dquot; - short cnt; + int cnt; inode->i_flags &= ~S_QUOTA; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1020,7 +1020,7 @@ warn_put_all: */ void dquot_free_space(struct inode *inode, qsize_t number) { - unsigned short cnt; + unsigned int cnt; struct dquot *dquot; /* NOBLOCK Start */ @@ -1042,7 +1042,7 @@ void dquot_free_space(struct inode *inode, qsize_t number) */ void dquot_free_inode(const struct inode *inode, unsigned long number) { - unsigned short cnt; + unsigned int cnt; struct dquot *dquot; /* NOBLOCK Start */ @@ -1161,7 +1161,7 @@ struct dquot_operations dquot_operations = { transfer: dquot_transfer }; -static inline void set_enable_flags(struct quota_info *dqopt, short type) +static inline void set_enable_flags(struct quota_info *dqopt, int type) { switch (type) { case USRQUOTA: @@ -1173,7 +1173,7 @@ static inline void set_enable_flags(struct quota_info *dqopt, short type) } } -static inline void reset_enable_flags(struct quota_info *dqopt, short type) +static inline void reset_enable_flags(struct quota_info *dqopt, int type) { switch (type) { case USRQUOTA: @@ -1186,7 +1186,7 @@ static inline void reset_enable_flags(struct quota_info *dqopt, short type) } /* Function in inode.c - remove pointers to dquots in icache */ -extern void remove_dquot_ref(struct super_block *, short); +extern void remove_dquot_ref(struct super_block *, int); /* * Turn quota off on a device. 
type == -1 ==> quotaoff for all types (umount) @@ -1448,7 +1448,7 @@ static int read_stats(char *buffer, char **start, off_t offset, int count, int * } #endif -struct quotactl_ops vfs_quotactl_ops { +struct quotactl_ops vfs_quotactl_ops = { quota_on: vfs_quota_on, quota_off: vfs_quota_off, quota_sync: vfs_quota_sync, diff --git a/fs/inode.c b/fs/inode.c index 0ceabe7b934b..52fdf5dacb1f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -878,9 +878,9 @@ void update_atime (struct inode *inode) /* Functions back in dquot.c */ void put_dquot_list(struct list_head *); -int remove_inode_dquot_ref(struct inode *, short, struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); -void remove_dquot_ref(struct super_block *sb, short type) +void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; struct list_head *act_head; diff --git a/include/linux/quota.h b/include/linux/quota.h index 4ed31fe2a7f7..544c4531e69a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -248,7 +248,7 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { - void (*initialize) (struct inode *, short); + void (*initialize) (struct inode *, int); void (*drop) (struct inode *); int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, unsigned long); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0cd58a4fa275..fac031c896c1 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,9 +20,9 @@ /* * declaration of quota_function calls in kernel. 
*/ -extern int sync_dquots(kdev_t dev, short type); +extern int sync_dquots(kdev_t dev, int type); -extern void dquot_initialize(struct inode *inode, short type); +extern void dquot_initialize(struct inode *inode, int type); extern void dquot_drop(struct inode *inode); extern int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -- cgit v1.2.3 From 736e690ebc2f1b885eb0182432c9f9d753e51b6f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:39 -0700 Subject: [PATCH] [11/13] quota-11-sync Implemented proper syncing of dquots - ie. also global information about quota files are synced. We find info to sync by walking through all superblocks... --- fs/dquot.c | 68 +++++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 12 --------- include/linux/quota.h | 31 +++++++++++++++++----- include/linux/quotaops.h | 2 +- 4 files changed, 83 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/fs/dquot.c b/fs/dquot.c index f7e5b77e43d1..b585d30fee7a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -368,12 +368,13 @@ restart: } } -int sync_dquots(struct super_block *sb, int type) +static int vfs_quota_sync(struct super_block *sb, int type) { struct list_head *head; struct dquot *dquot; + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; - lock_kernel(); restart: list_for_each(head, &inuse_list) { dquot = list_entry(head, struct dquot, dq_inuse); @@ -396,12 +397,64 @@ restart: dqput(dquot); goto restart; } - /* FIXME: Here we should also sync all file info */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)) + dqopt->info[cnt].dqi_flags &= ~DQF_ANY_DQUOT_DIRTY; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) && info_dirty(&dqopt->info[cnt])) + dqopt->ops[cnt]->write_file_info(sb, cnt); dqstats.syncs++; - unlock_kernel(); return 0; } +static struct super_block *get_super_to_sync(int type) +{ + struct 
list_head *head; + int cnt, dirty; + +restart: + spin_lock(&sb_lock); + list_for_each(head, &super_blocks) { + struct super_block *sb = list_entry(head, struct super_block, s_list); + + for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) + if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) + && sb_dqopt(sb)->info[cnt].dqi_flags & DQF_ANY_DQUOT_DIRTY) + dirty = 1; + if (!dirty) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (!sb->s_root) { + drop_super(sb); + goto restart; + } + return sb; + } + spin_unlock(&sb_lock); + return NULL; +} + +void sync_dquots(struct super_block *sb, int type) +{ + if (sb) { + lock_kernel(); + if (sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, type); + unlock_kernel(); + } + else { + while ((sb = get_super_to_sync(type))) { + lock_kernel(); + if (sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, type); + unlock_kernel(); + drop_super(sb); + } + } +} + /* Free unused dquots from cache */ static void prune_dqcache(int count) { @@ -1212,7 +1265,7 @@ int vfs_quota_off(struct super_block *sb, int type) /* Note: these are blocking operations */ remove_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); - if (info_dirty(&dqopt->info[cnt])) + if (info_dirty(&dqopt->info[cnt])) dqopt->ops[cnt]->write_file_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); @@ -1291,11 +1344,6 @@ out_fmt: return error; } -int vfs_quota_sync(struct super_block *sb, int type) -{ - return sync_dquots(sb, type); -} - /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { diff --git a/include/linux/fs.h b/include/linux/fs.h index f30905c9a4f1..9a5c7d4bb2a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -617,18 +617,6 @@ struct nameidata { struct vfsmount *old_mnt; }; -#define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -#define DQUOT_GRP_ENABLED 0x02 /* Group diskquotas 
enabled */ - -struct quota_info { - unsigned int flags; /* Flags for diskquotas on this device */ - struct semaphore dqio_sem; /* lock device while I/O in progress */ - struct semaphore dqoff_sem; /* serialize quota_off() and quota_on() on device */ - struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ - struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ - struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ -}; - /* * Umount options */ diff --git a/include/linux/quota.h b/include/linux/quota.h index 544c4531e69a..0a36a5e59caf 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -170,6 +170,7 @@ struct mem_dqinfo { #define DQF_MASK 0xffff /* Mask for format specific flags */ #define DQF_INFO_DIRTY 0x10000 /* Is info dirty? */ +#define DQF_ANY_DQUOT_DIRTY 0x20000 /* Is any dquot dirty? */ extern inline void mark_info_dirty(struct mem_dqinfo *info) { @@ -178,6 +179,9 @@ extern inline void mark_info_dirty(struct mem_dqinfo *info) #define info_dirty(info) ((info)->dqi_flags & DQF_INFO_DIRTY) +#define info_any_dirty(info) ((info)->dqi_flags & DQF_INFO_DIRTY ||\ + (info)->dqi_flags & DQF_ANY_DQUOT_DIRTY) + #define sb_dqopt(sb) (&(sb)->s_dquot) extern int nr_dquots, nr_free_dquots; @@ -224,13 +228,6 @@ struct dquot { struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -extern inline void mark_dquot_dirty(struct dquot *dquot) -{ - dquot->dq_flags |= DQ_MOD; -} - -#define dquot_dirty(dquot) ((dquot)->dq_flags & DQ_MOD) - #define NODQUOT (struct dquot *)NULL #define QUOTA_OK 0 @@ -279,6 +276,26 @@ struct quota_format_type { struct quota_format_type *qf_next; }; +#define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +#define DQUOT_GRP_ENABLED 0x02 /* Group diskquotas enabled */ + +struct quota_info { + unsigned int flags; /* Flags for diskquotas on this device */ + struct semaphore dqio_sem; /* lock device while I/O in progress */ + struct semaphore dqoff_sem; /* serialize quota_off() and quota_on() on device */ + 
struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ +}; + +/* Inline would be better but we need to dereference super_block which is not defined yet */ +#define mark_dquot_dirty(dquot) do {\ + dquot->dq_flags |= DQ_MOD;\ + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_flags |= DQF_ANY_DQUOT_DIRTY;\ +} while (0) + +#define dquot_dirty(dquot) ((dquot)->dq_flags & DQ_MOD) + static inline int is_enabled(struct quota_info *dqopt, int type) { switch (type) { diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index fac031c896c1..31b24e37c159 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -20,7 +20,7 @@ /* * declaration of quota_function calls in kernel. */ -extern int sync_dquots(kdev_t dev, int type); +extern void sync_dquots(struct super_block *sb, int type); extern void dquot_initialize(struct inode *inode, int type); extern void dquot_drop(struct inode *inode); -- cgit v1.2.3 From 1c5bbffec39a2c2342b0b49da916c9a9c147f0d5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:44 -0700 Subject: [PATCH] [12/13] quota-12-compat This patch implements configurable backward compatible quota interface. Maybe this isn't needed in 2.5 but as some people want to use patches in 2.4 where it's necessary I have implemented it. 
--- arch/ia64/ia32/ia32_entry.S | 2 +- arch/ia64/ia32/sys_ia32.c | 91 +++++++++ arch/s390x/kernel/linux32.c | 91 +++++++++ arch/s390x/kernel/wrapper32.S | 2 +- arch/sparc64/kernel/sys_sparc32.c | 91 +++++++++ arch/sparc64/kernel/systbls.S | 2 +- fs/Config.help | 15 ++ fs/Config.in | 6 + fs/quota.c | 395 +++++++++++++++++++++++++++++++++++++- include/linux/quotacompat.h | 86 +++++++++ 10 files changed, 777 insertions(+), 4 deletions(-) create mode 100644 include/linux/quotacompat.h (limited to 'include') diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S index f9595d3f43ed..f3e7e950b7ac 100644 --- a/arch/ia64/ia32/ia32_entry.S +++ b/arch/ia64/ia32/ia32_entry.S @@ -310,7 +310,7 @@ ia32_syscall_table: data8 sys32_ni_syscall /* init_module */ data8 sys32_ni_syscall /* delete_module */ data8 sys32_ni_syscall /* get_kernel_syms */ /* 130 */ - data8 sys_quotactl + data8 sys32_quotactl data8 sys_getpgid data8 sys_fchdir data8 sys32_ni_syscall /* sys_bdflush */ diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index c3852b487a11..95b7e49bf51e 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -3669,6 +3669,97 @@ getname32 (const char *filename) return result; } +extern asmlinkage long sys_quotactl(int cmd, const char *special, int id, caddr_t addr); + +#ifdef CONFIG_QIFACE_COMPAT +#ifdef CONFIG_QIFACE_V1 +struct user_dqblk32 { + __u32 dqb_bhardlimit; + __u32 dqb_bsoftlimit; + __u32 dqb_curblocks; + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __kernel_time_t32 dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v1c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V1_GETQUOTA +#define Q_COMP_SETQUOTA Q_V1_SETQUOTA +#define Q_COMP_SETQLIM Q_V1_SETQLIM +#define Q_COMP_SETUSE Q_V1_SETUSE +#else +struct user_dqblk32 { + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __u32 dqb_bhardlimit; + __u32 dqb_bsoftlimit; + __u64 dqb_curspace; + __kernel_time_t32 
dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v2c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V2_GETQUOTA +#define Q_COMP_SETQUOTA Q_V2_SETQUOTA +#define Q_COMP_SETQLIM Q_V2_SETQLIM +#define Q_COMP_SETUSE Q_V2_SETUSE +#endif + +asmlinkage long sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + int cmds = cmd >> SUBCMDSHIFT; + long err; + comp_dqblk_t d; + mm_segment_t old_fs; + char *spec; + + switch (cmds) { + case Q_COMP_GETQUOTA: + break; + case Q_COMP_SETQUOTA: + case Q_COMP_SETUSE: + case Q_COMP_SETQLIM: + if (copy_from_user(&d, (struct user_dqblk32 *)addr, + sizeof (struct user_dqblk32))) + return -EFAULT; + d.dqb_itime = ((struct user_dqblk32 *)&d)->dqb_itime; + d.dqb_btime = ((struct user_dqblk32 *)&d)->dqb_btime; + break; + default: + return sys_quotactl(cmd, special, id, (__kernel_caddr_t)addr); + } + spec = getname (special); + err = PTR_ERR(spec); + if (IS_ERR(spec)) return err; + old_fs = get_fs(); + set_fs (KERNEL_DS); + err = sys_quotactl(cmd, (const char *)spec, id, (__kernel_caddr_t)&d); + set_fs (old_fs); + putname (spec); + if (err) + return err; + if (cmds == Q_COMP_GETQUOTA) { + __kernel_time_t b = d.dqb_btime, i = d.dqb_itime; + ((struct user_dqblk32 *)&d)->dqb_itime = i; + ((struct user_dqblk32 *)&d)->dqb_btime = b; + if (copy_to_user ((struct user_dqblk32 *)addr, &d, + sizeof (struct user_dqblk32))) + return -EFAULT; + } + return 0; +} + +#else +/* No conversion needed for new interface */ +asmlinkage long sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + return sys_quotactl(cmd, special, id, addr); +} +#endif + asmlinkage long sys32_sched_rr_get_interval (pid_t pid, struct timespec32 *interval) { diff --git a/arch/s390x/kernel/linux32.c b/arch/s390x/kernel/linux32.c index e06f1958dd10..f9da89b329bb 100644 --- a/arch/s390x/kernel/linux32.c +++ b/arch/s390x/kernel/linux32.c @@ -897,6 +897,97 @@ asmlinkage long sys32_fcntl64(unsigned int fd, unsigned int cmd, unsigned long a 
return sys32_fcntl(fd, cmd, arg); } +extern asmlinkage int sys_quotactl(int cmd, const char *special, int id, caddr_t addr); + +#ifdef CONFIG_QIFACE_COMPAT +#ifdef CONFIG_QIFACE_V1 +struct user_dqblk32 { + __u32 dqb_bhardlimit; + __u32 dqb_bsoftlimit; + __u32 dqb_curblocks; + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __kernel_time_t32 dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v1c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V1_GETQUOTA +#define Q_COMP_SETQUOTA Q_V1_SETQUOTA +#define Q_COMP_SETQLIM Q_V1_SETQLIM +#define Q_COMP_SETUSE Q_V1_SETUSE +#else +struct user_dqblk32 { + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __u32 dqb_bhardlimit; + __u32 dqb_bsoftlimit; + __u64 dqb_curspace; + __kernel_time_t32 dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v2c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V2_GETQUOTA +#define Q_COMP_SETQUOTA Q_V2_SETQUOTA +#define Q_COMP_SETQLIM Q_V2_SETQLIM +#define Q_COMP_SETUSE Q_V2_SETUSE +#endif + +asmlinkage int sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + int cmds = cmd >> SUBCMDSHIFT; + int err; + comp_dqblk_t d; + mm_segment_t old_fs; + char *spec; + + switch (cmds) { + case Q_COMP_GETQUOTA: + break; + case Q_COMP_SETQUOTA: + case Q_COMP_SETUSE: + case Q_COMP_SETQLIM: + if (copy_from_user(&d, (struct user_dqblk32 *)addr, + sizeof (struct user_dqblk32))) + return -EFAULT; + d.dqb_itime = ((struct user_dqblk32 *)&d)->dqb_itime; + d.dqb_btime = ((struct user_dqblk32 *)&d)->dqb_btime; + break; + default: + return sys_quotactl(cmd, special, id, (__kernel_caddr_t)addr); + } + spec = getname (special); + err = PTR_ERR(spec); + if (IS_ERR(spec)) return err; + old_fs = get_fs(); + set_fs (KERNEL_DS); + err = sys_quotactl(cmd, (const char *)spec, id, (__kernel_caddr_t)&d); + set_fs (old_fs); + putname (spec); + if (err) + return err; + if (cmds == Q_COMP_GETQUOTA) { + __kernel_time_t b = d.dqb_btime, i = 
d.dqb_itime; + ((struct user_dqblk32 *)&d)->dqb_itime = i; + ((struct user_dqblk32 *)&d)->dqb_btime = b; + if (copy_to_user ((struct user_dqblk32 *)addr, &d, + sizeof (struct user_dqblk32))) + return -EFAULT; + } + return 0; +} + +#else +/* No conversion needed for new interface */ +asmlinkage int sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + return sys_quotactl(cmd, special, id, addr); +} +#endif + static inline int put_statfs (struct statfs32 *ubuf, struct statfs *kbuf) { int err; diff --git a/arch/s390x/kernel/wrapper32.S b/arch/s390x/kernel/wrapper32.S index 8a66558332b1..a11ee19b21ff 100644 --- a/arch/s390x/kernel/wrapper32.S +++ b/arch/s390x/kernel/wrapper32.S @@ -586,7 +586,7 @@ sys32_quotactl_wrapper: llgtr %r3,%r3 # const char * lgfr %r4,%r4 # int llgtr %r5,%r5 # caddr_t - jg sys_quotactl # branch to system call + jg sys32_quotactl # branch to system call .globl sys32_getpgid_wrapper sys32_getpgid_wrapper: diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 224387833d54..ebf671149c95 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -889,6 +889,97 @@ asmlinkage long sys32_fcntl64(unsigned int fd, unsigned int cmd, unsigned long a return sys32_fcntl(fd, cmd, arg); } +extern asmlinkage int sys_quotactl(int cmd, const char *special, int id, caddr_t addr); + +#ifdef CONFIG_QIFACE_COMPAT +#ifdef CONFIG_QIFACE_V1 +struct user_dqblk32 { + __u32 dqb_bhardlimit; + __u32 dqb_bsoftlimit; + __u32 dqb_curblocks; + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __kernel_time_t32 dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v1c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V1_GETQUOTA +#define Q_COMP_SETQUOTA Q_V1_SETQUOTA +#define Q_COMP_SETQLIM Q_V1_SETQLIM +#define Q_COMP_SETUSE Q_V1_SETUSE +#else +struct user_dqblk32 { + __u32 dqb_ihardlimit; + __u32 dqb_isoftlimit; + __u32 dqb_curinodes; + __u32 dqb_bhardlimit; + 
__u32 dqb_bsoftlimit; + __u64 dqb_curspace; + __kernel_time_t32 dqb_btime; + __kernel_time_t32 dqb_itime; +}; +typedef struct v2c_mem_dqblk comp_dqblk_t; + +#define Q_COMP_GETQUOTA Q_V2_GETQUOTA +#define Q_COMP_SETQUOTA Q_V2_SETQUOTA +#define Q_COMP_SETQLIM Q_V2_SETQLIM +#define Q_COMP_SETUSE Q_V2_SETUSE +#endif + +asmlinkage int sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + int cmds = cmd >> SUBCMDSHIFT; + int err; + comp_dqblk_t d; + mm_segment_t old_fs; + char *spec; + + switch (cmds) { + case Q_COMP_GETQUOTA: + break; + case Q_COMP_SETQUOTA: + case Q_COMP_SETUSE: + case Q_COMP_SETQLIM: + if (copy_from_user(&d, (struct user_dqblk32 *)addr, + sizeof (struct user_dqblk32))) + return -EFAULT; + d.dqb_itime = ((struct user_dqblk32 *)&d)->dqb_itime; + d.dqb_btime = ((struct user_dqblk32 *)&d)->dqb_btime; + break; + default: + return sys_quotactl(cmd, special, id, (__kernel_caddr_t)addr); + } + spec = getname (special); + err = PTR_ERR(spec); + if (IS_ERR(spec)) return err; + old_fs = get_fs(); + set_fs (KERNEL_DS); + err = sys_quotactl(cmd, (const char *)spec, id, (__kernel_caddr_t)&d); + set_fs (old_fs); + putname (spec); + if (err) + return err; + if (cmds == Q_COMP_GETQUOTA) { + __kernel_time_t b = d.dqb_btime, i = d.dqb_itime; + ((struct user_dqblk32 *)&d)->dqb_itime = i; + ((struct user_dqblk32 *)&d)->dqb_btime = b; + if (copy_to_user ((struct user_dqblk32 *)addr, &d, + sizeof (struct user_dqblk32))) + return -EFAULT; + } + return 0; +} + +#else +/* No conversion needed for new interface */ +asmlinkage int sys32_quotactl(int cmd, const char *special, int id, caddr_t addr) +{ + return sys_quotactl(cmd, special, id, addr); +} +#endif + static inline int put_statfs (struct statfs32 *ubuf, struct statfs *kbuf) { int err; diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 6138ce2fca94..8efe92296056 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -52,7 +52,7 @@ sys_call_table32: 
/*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount /*160*/ .word sys32_sched_setaffinity, sys32_sched_getaffinity, sys_getdomainname, sys_setdomainname, sys_nis_syscall - .word sys_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr + .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr /*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr /*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module diff --git a/fs/Config.help b/fs/Config.help index 94e6693f25d4..58eee1f8ea80 100644 --- a/fs/Config.help +++ b/fs/Config.help @@ -18,6 +18,21 @@ CONFIG_QFMT_V2 need this functionality say Y here. Note that you will need latest quota utilities for new quota format with this kernel. +CONFIG_QIFACE_COMPAT + This option will enable old quota interface in kernel. + If you have old quota tools (version <= 3.04) and you don't want to + upgrade them say Y here. + +CONFIG_QIFACE_V1 + This is the oldest quota interface. It was used for old quota format. + If you have old quota tools and you use old quota format choose this + interface (if unsure, this interface is the best one to choose). + +CONFIG_QIFACE_V2 + This quota interface was used by VFS v0 quota format. If you need + support for VFS v0 quota format (eg. you're using quota on ReiserFS) + and you don't want to upgrade quota tools, choose this interface. + CONFIG_MINIX_FS Minix is a simple operating system used in many classes about OS's. 
The minix file system (method to organize files on a hard disk diff --git a/fs/Config.in b/fs/Config.in index 318e3a9814df..e66a3b7e8472 100644 --- a/fs/Config.in +++ b/fs/Config.in @@ -7,6 +7,12 @@ comment 'File systems' bool 'Quota support' CONFIG_QUOTA dep_tristate ' Old quota format support' CONFIG_QFMT_V1 $CONFIG_QUOTA dep_tristate ' VFS v0 quota format support' CONFIG_QFMT_V2 $CONFIG_QUOTA +dep_mbool ' Compatible quota interfaces' CONFIG_QIFACE_COMPAT $CONFIG_QUOTA +if [ "$CONFIG_QUOTA" = "y" -a "$CONFIG_QIFACE_COMPAT" = "y" ]; then + choice ' Compatible quota interfaces' \ + "Original CONFIG_QIFACE_V1 \ + VFSv0 CONFIG_QIFACE_V2" Original +fi tristate 'Kernel automounter support' CONFIG_AUTOFS_FS tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS diff --git a/fs/quota.c b/fs/quota.c index 88d54e8ade7b..45ef892edc5c 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -11,6 +11,10 @@ #include #include #include +#ifdef CONFIG_QIFACE_COMPAT +#include +#endif + int nr_dquots, nr_free_dquots; @@ -49,7 +53,7 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t case Q_GETQUOTA: if (!sb->s_qcop->get_dqblk) return -ENOSYS; - break; + break; case Q_SYNC: if (!sb->s_qcop->quota_sync) return -ENOSYS; @@ -231,6 +235,381 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, cadd return 0; } +#ifdef CONFIG_QIFACE_COMPAT +static int check_compat_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +{ + if (type >= MAXQUOTAS) + return -EINVAL; + /* Is operation supported? 
*/ + /* sb==NULL for GETSTATS calls */ + if (sb && !sb->s_qcop) + return -ENOSYS; + + switch (cmd) { + case Q_COMP_QUOTAON: + if (!sb->s_qcop->quota_on) + return -ENOSYS; + break; + case Q_COMP_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + break; + case Q_COMP_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + break; +#ifdef CONFIG_QIFACE_V2 + case Q_V2_SETFLAGS: + case Q_V2_SETGRACE: + case Q_V2_SETINFO: + if (!sb->s_qcop->set_info) + return -ENOSYS; + break; + case Q_V2_GETINFO: + if (!sb->s_qcop->get_info) + return -ENOSYS; + break; + case Q_V2_SETQLIM: + case Q_V2_SETUSE: + case Q_V2_SETQUOTA: + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + break; + case Q_V2_GETQUOTA: + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + break; + case Q_V2_GETSTATS: + return 0; /* GETSTATS need no other checks */ +#endif +#ifdef CONFIG_QIFACE_V1 + case Q_V1_SETQLIM: + case Q_V1_SETUSE: + case Q_V1_SETQUOTA: + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + break; + case Q_V1_GETQUOTA: + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + break; + case Q_V1_RSQUASH: + if (!sb->s_qcop->set_info) + return -ENOSYS; + break; + case Q_V1_GETSTATS: + return 0; /* GETSTATS need no other checks */ +#endif + default: + return -EINVAL; + } + + /* Is quota turned on for commands which need it? 
*/ + switch (cmd) { + case Q_V2_SETFLAGS: + case Q_V2_SETGRACE: + case Q_V2_SETINFO: + case Q_V2_GETINFO: + case Q_COMP_QUOTAOFF: + case Q_V1_RSQUASH: + case Q_V1_SETQUOTA: + case Q_V1_SETQLIM: + case Q_V1_SETUSE: + case Q_V2_SETQUOTA: + /* Q_V2_SETQLIM: collision with Q_V1_SETQLIM */ + case Q_V2_SETUSE: + case Q_V1_GETQUOTA: + case Q_V2_GETQUOTA: + if (!sb_has_quota_enabled(sb, type)) + return -ESRCH; + } +#ifdef CONFIG_QIFACE_V1 + if (cmd != Q_COMP_QUOTAON && cmd != Q_COMP_QUOTAOFF && cmd != Q_COMP_SYNC && sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id != QFMT_VFS_OLD) +#else + if (cmd != Q_COMP_QUOTAON && cmd != Q_COMP_QUOTAOFF && cmd != Q_COMP_SYNC && sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id != QFMT_VFS_V0) +#endif + return -ESRCH; + + /* Check privileges */ + if (cmd == Q_V1_GETQUOTA || cmd == Q_V2_GETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_V1_GETSTATS && cmd != Q_V2_GETSTATS && cmd != Q_V2_GETINFO && cmd != Q_COMP_SYNC) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + +#ifdef CONFIG_QIFACE_V1 +static int v1_set_rsquash(struct super_block *sb, int type, int flag) +{ + struct if_dqinfo info; + + info.dqi_valid = IIF_FLAGS; + info.dqi_flags = flag ? 
V1_DQF_RSQUASH : 0; + return sb->s_qcop->set_info(sb, type, &info); +} + +static int v1_get_dqblk(struct super_block *sb, int type, qid_t id, struct v1c_mem_dqblk *mdq) +{ + struct if_dqblk idq; + int ret; + + if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq)) < 0) + return ret; + mdq->dqb_ihardlimit = idq.dqb_ihardlimit; + mdq->dqb_isoftlimit = idq.dqb_isoftlimit; + mdq->dqb_curinodes = idq.dqb_curinodes; + mdq->dqb_bhardlimit = idq.dqb_bhardlimit; + mdq->dqb_bsoftlimit = idq.dqb_bsoftlimit; + mdq->dqb_curblocks = toqb(idq.dqb_curspace); + mdq->dqb_itime = idq.dqb_itime; + mdq->dqb_btime = idq.dqb_btime; + if (id == 0) { /* Times for id 0 are in fact grace times */ + struct if_dqinfo info; + + if ((ret = sb->s_qcop->get_info(sb, type, &info)) < 0) + return ret; + mdq->dqb_btime = info.dqi_bgrace; + mdq->dqb_itime = info.dqi_igrace; + } + return 0; +} + +static int v1_set_dqblk(struct super_block *sb, int type, int cmd, qid_t id, struct v1c_mem_dqblk *mdq) +{ + struct if_dqblk idq; + int ret; + + idq.dqb_valid = 0; + if (cmd == Q_V1_SETQUOTA || cmd == Q_V1_SETQLIM) { + idq.dqb_ihardlimit = mdq->dqb_ihardlimit; + idq.dqb_isoftlimit = mdq->dqb_isoftlimit; + idq.dqb_bhardlimit = mdq->dqb_bhardlimit; + idq.dqb_bsoftlimit = mdq->dqb_bsoftlimit; + idq.dqb_valid |= QIF_LIMITS; + } + if (cmd == Q_V1_SETQUOTA || cmd == Q_V1_SETUSE) { + idq.dqb_curinodes = mdq->dqb_curinodes; + idq.dqb_curspace = ((qsize_t)mdq->dqb_curblocks) << QUOTABLOCK_BITS; + idq.dqb_valid |= QIF_USAGE; + } + ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); + if (!ret && id == 0 && cmd == Q_V1_SETQUOTA) { /* Times for id 0 are in fact grace times */ + struct if_dqinfo info; + + info.dqi_bgrace = mdq->dqb_btime; + info.dqi_igrace = mdq->dqb_itime; + info.dqi_valid = IIF_BGRACE | IIF_IGRACE; + ret = sb->s_qcop->set_info(sb, type, &info); + } + return ret; +} + +static void v1_get_stats(struct v1c_dqstats *dst) +{ + memcpy(dst, &dqstats, sizeof(dqstats)); +} +#endif + +#ifdef CONFIG_QIFACE_V2 +static 
int v2_get_info(struct super_block *sb, int type, struct v2c_mem_dqinfo *oinfo) +{ + struct if_dqinfo info; + int ret; + + if ((ret = sb->s_qcop->get_info(sb, type, &info)) < 0) + return ret; + oinfo->dqi_bgrace = info.dqi_bgrace; + oinfo->dqi_igrace = info.dqi_igrace; + oinfo->dqi_flags = info.dqi_flags; + oinfo->dqi_blocks = sb_dqopt(sb)->info[type].u.v2_i.dqi_blocks; + oinfo->dqi_free_blk = sb_dqopt(sb)->info[type].u.v2_i.dqi_free_blk; + oinfo->dqi_free_entry = sb_dqopt(sb)->info[type].u.v2_i.dqi_free_entry; + return 0; +} + +static int v2_set_info(struct super_block *sb, int type, int cmd, struct v2c_mem_dqinfo *oinfo) +{ + struct if_dqinfo info; + + info.dqi_valid = 0; + if (cmd == Q_V2_SETGRACE || cmd == Q_V2_SETINFO) { + info.dqi_bgrace = oinfo->dqi_bgrace; + info.dqi_igrace = oinfo->dqi_igrace; + info.dqi_valid |= IIF_BGRACE | IIF_IGRACE; + } + if (cmd == Q_V2_SETFLAGS || cmd == Q_V2_SETINFO) { + info.dqi_flags = oinfo->dqi_flags; + info.dqi_valid |= IIF_FLAGS; + } + /* We don't simulate deadly effects of setting other parameters ;-) */ + return sb->s_qcop->set_info(sb, type, &info); +} + +static int v2_get_dqblk(struct super_block *sb, int type, qid_t id, struct v2c_mem_dqblk *mdq) +{ + struct if_dqblk idq; + int ret; + + if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq)) < 0) + return ret; + mdq->dqb_ihardlimit = idq.dqb_ihardlimit; + mdq->dqb_isoftlimit = idq.dqb_isoftlimit; + mdq->dqb_curinodes = idq.dqb_curinodes; + mdq->dqb_bhardlimit = idq.dqb_bhardlimit; + mdq->dqb_bsoftlimit = idq.dqb_bsoftlimit; + mdq->dqb_curspace = idq.dqb_curspace; + mdq->dqb_itime = idq.dqb_itime; + mdq->dqb_btime = idq.dqb_btime; + return 0; +} + +static int v2_set_dqblk(struct super_block *sb, int type, int cmd, qid_t id, struct v2c_mem_dqblk *mdq) +{ + struct if_dqblk idq; + + idq.dqb_valid = 0; + if (cmd == Q_V2_SETQUOTA || cmd == Q_V2_SETQLIM) { + idq.dqb_ihardlimit = mdq->dqb_ihardlimit; + idq.dqb_isoftlimit = mdq->dqb_isoftlimit; + idq.dqb_bhardlimit = 
mdq->dqb_bhardlimit; + idq.dqb_bsoftlimit = mdq->dqb_bsoftlimit; + idq.dqb_valid |= QIF_LIMITS; + } + if (cmd == Q_V2_SETQUOTA || cmd == Q_V2_SETUSE) { + idq.dqb_curinodes = mdq->dqb_curinodes; + idq.dqb_curspace = mdq->dqb_curspace; + idq.dqb_valid |= QIF_USAGE; + } + return sb->s_qcop->set_dqblk(sb, type, id, &idq); +} + +static void v2_get_stats(struct v2c_dqstats *dst) +{ + memcpy(dst, &dqstats, sizeof(dqstats)); + dst->version = __DQUOT_NUM_VERSION__; +} +#endif + +/* Handle requests to old interface */ +static int do_compat_quotactl(struct super_block *sb, int type, int cmd, qid_t id, caddr_t addr) +{ + int ret; + + switch (cmd) { + case Q_COMP_QUOTAON: { + char *pathname; + + if (IS_ERR(pathname = getname(addr))) + return PTR_ERR(pathname); +#ifdef CONFIG_QIFACE_V1 + ret = sb->s_qcop->quota_on(sb, type, QFMT_VFS_OLD, pathname); +#else + ret = sb->s_qcop->quota_on(sb, type, QFMT_VFS_V0, pathname); +#endif + putname(pathname); + return ret; + } + case Q_COMP_QUOTAOFF: + return sb->s_qcop->quota_off(sb, type); + case Q_COMP_SYNC: + return sb->s_qcop->quota_sync(sb, type); +#ifdef CONFIG_QIFACE_V1 + case Q_V1_RSQUASH: { + int flag; + + if (copy_from_user(&flag, addr, sizeof(flag))) + return -EFAULT; + return v1_set_rsquash(sb, type, flag); + } + case Q_V1_GETQUOTA: { + struct v1c_mem_dqblk mdq; + + if ((ret = v1_get_dqblk(sb, type, id, &mdq))) + return ret; + if (copy_to_user(addr, &mdq, sizeof(mdq))) + return -EFAULT; + return 0; + } + case Q_V1_SETQLIM: + case Q_V1_SETUSE: + case Q_V1_SETQUOTA: { + struct v1c_mem_dqblk mdq; + + if (copy_from_user(&mdq, addr, sizeof(mdq))) + return -EFAULT; + return v1_set_dqblk(sb, type, cmd, id, &mdq); + } + case Q_V1_GETSTATS: { + struct v1c_dqstats dst; + + v1_get_stats(&dst); + if (copy_to_user(addr, &dst, sizeof(dst))) + return -EFAULT; + return 0; + } +#endif +#ifdef CONFIG_QIFACE_V2 + case Q_V2_GETINFO: { + struct v2c_mem_dqinfo info; + + if ((ret = v2_get_info(sb, type, &info))) + return ret; + if (copy_to_user(addr, 
&info, sizeof(info))) + return -EFAULT; + return 0; + } + case Q_V2_SETFLAGS: + case Q_V2_SETGRACE: + case Q_V2_SETINFO: { + struct v2c_mem_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + + return v2_set_info(sb, type, cmd, &info); + } + case Q_V2_GETQUOTA: { + struct v2c_mem_dqblk mdq; + + if ((ret = v2_get_dqblk(sb, type, id, &mdq))) + return ret; + if (copy_to_user(addr, &mdq, sizeof(mdq))) + return -EFAULT; + return 0; + } + case Q_V2_SETUSE: + case Q_V2_SETQLIM: + case Q_V2_SETQUOTA: { + struct v2c_mem_dqblk mdq; + + if (copy_from_user(&mdq, addr, sizeof(mdq))) + return -EFAULT; + return v2_set_dqblk(sb, type, cmd, id, &mdq); + } + case Q_V2_GETSTATS: { + struct v2c_dqstats dst; + + v2_get_stats(&dst); + if (copy_to_user(addr, &dst, sizeof(dst))) + return -EFAULT; + return 0; + } +#endif + } + BUG(); + return 0; +} +#endif + +/* Macros for short-circuiting the compatibility tests */ +#define NEW_COMMAND(c) ((c) & (0x80 << 16)) +#define XQM_COMMAND(c) (((c) & ('X' << 8)) == ('X' << 8)) + /* * This is the system call interface. This communicates with * the user-level programs. 
Currently this only supports diskquota @@ -247,11 +626,25 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char *special, qid_t id, ca cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; +#ifdef CONFIG_QIFACE_COMPAT + if (cmds != Q_V1_GETSTATS && cmds != Q_V2_GETSTATS && IS_ERR(sb = resolve_dev(special))) { + ret = PTR_ERR(sb); + sb = NULL; + goto out; + } + if (!NEW_COMMAND(cmds) && !XQM_COMMAND(cmds)) { + if ((ret = check_compat_quotactl_valid(sb, type, cmds, id)) < 0) + goto out; + ret = do_compat_quotactl(sb, type, cmds, id, addr); + goto out; + } +#else if (IS_ERR(sb = resolve_dev(special))) { ret = PTR_ERR(sb); sb = NULL; goto out; } +#endif if ((ret = check_quotactl_valid(sb, type, cmds, id)) < 0) goto out; ret = do_quotactl(sb, type, cmds, id, addr); diff --git a/include/linux/quotacompat.h b/include/linux/quotacompat.h new file mode 100644 index 000000000000..484aac17efc9 --- /dev/null +++ b/include/linux/quotacompat.h @@ -0,0 +1,86 @@ +/* + * Definition of symbols used for backward compatible interface + */ + +#ifndef _LINUX_QUOTACOMPAT_ +#define _LINUX_QUOTACOMPAT_ + +#include +#include + +struct v1c_mem_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* maximum # allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive files */ +}; + +struct v1c_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; +}; + +struct v2c_mem_dqblk { + unsigned int dqb_ihardlimit; + unsigned int dqb_isoftlimit; + unsigned int dqb_curinodes; + unsigned int dqb_bhardlimit; + unsigned int dqb_bsoftlimit; + qsize_t dqb_curspace; + 
__kernel_time_t dqb_btime; + __kernel_time_t dqb_itime; +}; + +struct v2c_mem_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + unsigned int dqi_flags; + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +struct v2c_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; + __u32 version; +}; + +#define Q_COMP_QUOTAON 0x0100 /* enable quotas */ +#define Q_COMP_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_COMP_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ + +#define Q_V1_GETQUOTA 0x0300 /* get limits and usage */ +#define Q_V1_SETQUOTA 0x0400 /* set limits and usage */ +#define Q_V1_SETUSE 0x0500 /* set usage */ +#define Q_V1_SETQLIM 0x0700 /* set limits */ +#define Q_V1_GETSTATS 0x0800 /* get collected stats */ +#define Q_V1_RSQUASH 0x1000 /* set root_squash option */ + +#define Q_V2_SETQLIM 0x0700 /* set limits */ +#define Q_V2_GETINFO 0x0900 /* get info about quotas - graces, flags... */ +#define Q_V2_SETINFO 0x0A00 /* set info about quotas */ +#define Q_V2_SETGRACE 0x0B00 /* set inode and block grace */ +#define Q_V2_SETFLAGS 0x0C00 /* set flags for quota */ +#define Q_V2_GETQUOTA 0x0D00 /* get limits and usage */ +#define Q_V2_SETQUOTA 0x0E00 /* set limits and usage */ +#define Q_V2_SETUSE 0x0F00 /* set usage */ +#define Q_V2_GETSTATS 0x1100 /* get collected stats */ + +#endif -- cgit v1.2.3 From ad447df32d4f9fb48ecb91c7bf1c7eb41f2acc0e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 19 May 2002 19:34:49 -0700 Subject: [PATCH] [13/13] quota-13-ioctl This patch implements ioctl() for getting space used by file. I agree it's ioctl() abuse, it doesn't work on links and has other ugly properties. Better would be to change 'struct stat' but changing it just due to this is overkill and it will take some time before there will be enough changes which will provoke yet another struct stat :). 
So this is temporary solution... If you don't like it, simply reject it. The function it provides is not fundamental... So that should be all patches. Any comments (or decision about including/not including) welcome. Honza --- fs/ioctl.c | 10 ++++++++++ include/asm-alpha/ioctls.h | 1 + include/asm-cris/ioctls.h | 1 + include/asm-i386/ioctls.h | 1 + include/asm-ia64/ioctls.h | 1 + include/asm-m68k/ioctls.h | 1 + include/asm-parisc/ioctls.h | 1 + include/asm-sh/ioctls.h | 1 + include/asm-sparc/ioctls.h | 1 + include/asm-sparc64/ioctls.h | 1 + 10 files changed, 19 insertions(+) (limited to 'include') diff --git a/fs/ioctl.c b/fs/ioctl.c index a36c61f18769..8d38a2f2539c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -102,6 +102,16 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) filp->f_flags &= ~FASYNC; break; + case FIOQSIZE: + if (S_ISDIR(filp->f_dentry->d_inode->i_mode) || + S_ISREG(filp->f_dentry->d_inode->i_mode) || + S_ISLNK(filp->f_dentry->d_inode->i_mode)) { + loff_t res = inode_get_bytes(filp->f_dentry->d_inode); + error = copy_to_user((loff_t *)arg, &res, sizeof(res)) ? 
-EFAULT : 0; + } + else + error = -ENOTTY; + break; default: error = -ENOTTY; if (S_ISREG(filp->f_dentry->d_inode->i_mode)) diff --git a/include/asm-alpha/ioctls.h b/include/asm-alpha/ioctls.h index 2cad3d5a8ec2..a363c50ce6d7 100644 --- a/include/asm-alpha/ioctls.h +++ b/include/asm-alpha/ioctls.h @@ -9,6 +9,7 @@ #define FIONBIO _IOW('f', 126, int) #define FIONREAD _IOR('f', 127, int) #define TIOCINQ FIONREAD +#define FIOQSIZE _IOR('f', 128, loff_t) #define TIOCGETP _IOR('t', 8, struct sgttyb) #define TIOCSETP _IOW('t', 9, struct sgttyb) diff --git a/include/asm-cris/ioctls.h b/include/asm-cris/ioctls.h index 634628728762..21957ec44d7e 100644 --- a/include/asm-cris/ioctls.h +++ b/include/asm-cris/ioctls.h @@ -69,6 +69,7 @@ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +#define FIOQSIZE 0x5460 /* Used for packet mode */ #define TIOCPKT_DATA 0 diff --git a/include/asm-i386/ioctls.h b/include/asm-i386/ioctls.h index 97b41f2feddb..ea0e6ae58c29 100644 --- a/include/asm-i386/ioctls.h +++ b/include/asm-i386/ioctls.h @@ -67,6 +67,7 @@ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +#define FIOQSIZE 0x5460 /* Used for packet mode */ #define TIOCPKT_DATA 0 diff --git a/include/asm-ia64/ioctls.h b/include/asm-ia64/ioctls.h index 5e95e8b7f104..e727e4a67189 100644 --- a/include/asm-ia64/ioctls.h +++ b/include/asm-ia64/ioctls.h @@ -72,6 +72,7 @@ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +#define FIOQSIZE 0x5460 /* Used for packet mode */ #define TIOCPKT_DATA 0 diff --git a/include/asm-m68k/ioctls.h 
b/include/asm-m68k/ioctls.h index 213bd58df199..89c0df1262ed 100644 --- a/include/asm-m68k/ioctls.h +++ b/include/asm-m68k/ioctls.h @@ -65,6 +65,7 @@ #define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ +#define FIOQSIZE 0x545E /* Used for packet mode */ #define TIOCPKT_DATA 0 diff --git a/include/asm-parisc/ioctls.h b/include/asm-parisc/ioctls.h index 332027f38424..9210a0f87c3f 100644 --- a/include/asm-parisc/ioctls.h +++ b/include/asm-parisc/ioctls.h @@ -67,6 +67,7 @@ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +#define FIOQSIZE 0x5460 /* Get exact space used by quota */ /* Used for packet mode */ #define TIOCPKT_DATA 0 diff --git a/include/asm-sh/ioctls.h b/include/asm-sh/ioctls.h index 5d3e2cd245f5..f535202e8380 100644 --- a/include/asm-sh/ioctls.h +++ b/include/asm-sh/ioctls.h @@ -9,6 +9,7 @@ #define FIONBIO _IOW('f', 126, int) #define FIONREAD _IOR('f', 127, int) #define TIOCINQ FIONREAD +#define FIOQSIZE _IOR('f', 128, loff_t) #define TCGETS 0x5401 #define TCSETS 0x5402 diff --git a/include/asm-sparc/ioctls.h b/include/asm-sparc/ioctls.h index aa2b9ea89a6a..531460ef91ff 100644 --- a/include/asm-sparc/ioctls.h +++ b/include/asm-sparc/ioctls.h @@ -86,6 +86,7 @@ #define FIONBIO _IOW('f', 126, int) #define FIONREAD _IOR('f', 127, int) #define TIOCINQ FIONREAD +#define FIOQSIZE _IOR('f', 128, loff_t) /* SCARY Rutgers local SunOS kernel hackery, perhaps I will support it * someday. This is completely bogus, I know... 
diff --git a/include/asm-sparc64/ioctls.h b/include/asm-sparc64/ioctls.h index ebec66679415..0c8068dc32a9 100644 --- a/include/asm-sparc64/ioctls.h +++ b/include/asm-sparc64/ioctls.h @@ -87,6 +87,7 @@ #define FIONBIO _IOW('f', 126, int) #define FIONREAD _IOR('f', 127, int) #define TIOCINQ FIONREAD +#define FIOQSIZE _IOR('f', 128, loff_t) /* SCARY Rutgers local SunOS kernel hackery, perhaps I will support it * someday. This is completely bogus, I know... -- cgit v1.2.3 From bd2b0c85edfa015fdb4990ad07ad10e94ea885a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 May 2002 19:40:16 -0700 Subject: [PATCH] get rid of <linux/locks.h> The locks.h header contained some hand-crafted locking routines from the pre-SMP days. In 2.5 only lock_super/unlock_super are left, guarded by a number of completely unrelated (!) includes. This patch moves lock_super/unlock_super to fs.h, which defines struct super_block that is needed for them to operate on, removes locks.h and updates all callers to not include it and add the missing, previously nested includes where needed. 
--- drivers/block/DAC960.c | 1 - drivers/block/block_ioctl.c | 1 - drivers/block/cpqarray.h | 1 - drivers/block/ll_rw_blk.c | 1 - drivers/isdn/capi/capifs.c | 1 - drivers/isdn/capi/kcapi.c | 1 - drivers/md/lvm.c | 1 - drivers/md/raid5.c | 1 - drivers/media/video/i2c-old.c | 1 - fs/adfs/inode.c | 1 - fs/adfs/super.c | 1 - fs/affs/amigaffs.c | 1 - fs/affs/bitmap.c | 1 - fs/affs/file.c | 1 - fs/affs/inode.c | 1 - fs/affs/namei.c | 1 - fs/affs/super.c | 1 - fs/autofs/inode.c | 1 - fs/autofs4/inode.c | 2 +- fs/bfs/dir.c | 1 - fs/bfs/file.c | 1 - fs/bfs/inode.c | 1 - fs/binfmt_em86.c | 1 - fs/block_dev.c | 1 - fs/buffer.c | 1 - fs/coda/cache.c | 1 - fs/coda/coda_linux.c | 1 - fs/coda/dir.c | 1 - fs/coda/file.c | 1 - fs/coda/inode.c | 1 - fs/coda/pioctl.c | 1 - fs/coda/symlink.c | 2 +- fs/coda/upcall.c | 1 - fs/cramfs/inode.c | 1 - fs/devpts/inode.c | 1 - fs/efs/super.c | 1 - fs/ext2/balloc.c | 1 - fs/ext2/fsync.c | 1 - fs/ext2/ialloc.c | 1 - fs/ext2/inode.c | 2 +- fs/ext2/super.c | 1 - fs/ext3/balloc.c | 1 - fs/ext3/file.c | 1 - fs/ext3/ialloc.c | 1 - fs/ext3/inode.c | 5 +++-- fs/ext3/namei.c | 1 - fs/ext3/super.c | 1 - fs/fat/file.c | 1 - fs/fat/inode.c | 2 +- fs/hpfs/file.c | 1 + fs/hpfs/hpfs_fn.h | 1 - fs/hpfs/inode.c | 1 + fs/hpfs/namei.c | 1 + fs/intermezzo/cache.c | 1 - fs/intermezzo/dcache.c | 1 - fs/intermezzo/dir.c | 1 - fs/intermezzo/ext_attr.c | 1 - fs/intermezzo/file.c | 1 - fs/intermezzo/inode.c | 2 -- fs/intermezzo/journal.c | 1 - fs/intermezzo/journal_ext2.c | 1 - fs/intermezzo/journal_ext3.c | 1 - fs/intermezzo/journal_obdfs.c | 1 - fs/intermezzo/journal_reiserfs.c | 1 - fs/intermezzo/journal_xfs.c | 1 - fs/intermezzo/methods.c | 1 - fs/intermezzo/presto.c | 1 - fs/intermezzo/super.c | 1 - fs/intermezzo/upcall.c | 1 - fs/isofs/compress.c | 1 - fs/isofs/dir.c | 1 - fs/isofs/inode.c | 1 - fs/jbd/checkpoint.c | 1 - fs/jbd/commit.c | 1 - fs/jbd/journal.c | 2 +- fs/jbd/recovery.c | 1 - fs/jbd/revoke.c | 1 - fs/jbd/transaction.c | 2 +- fs/jffs/inode-v23.c | 1 - 
fs/jffs/intrep.c | 1 - fs/jfs/file.c | 1 - fs/jfs/inode.c | 1 - fs/jfs/jfs_dtree.c | 1 - fs/jfs/jfs_imap.c | 1 - fs/jfs/jfs_logmgr.c | 1 - fs/jfs/jfs_txnmgr.c | 1 - fs/jfs/jfs_xtree.c | 1 - fs/jfs/namei.c | 1 - fs/jfs/super.c | 1 - fs/minix/inode.c | 1 - fs/minix/itree_v1.c | 1 - fs/minix/itree_v2.c | 1 - fs/ncpfs/dir.c | 1 - fs/ncpfs/file.c | 1 - fs/ncpfs/inode.c | 1 - fs/nfs/inode.c | 1 - fs/nfsd/nfs3proc.c | 1 - fs/nfsd/nfsproc.c | 1 - fs/nfsd/vfs.c | 1 - fs/ntfs/aops.c | 1 - fs/ntfs/compress.c | 1 - fs/ntfs/mft.c | 1 - fs/ntfs/super.c | 1 - fs/openpromfs/inode.c | 1 - fs/proc/inode.c | 1 - fs/qnx4/fsync.c | 1 - fs/qnx4/inode.c | 2 +- fs/qnx4/truncate.c | 1 - fs/ramfs/inode.c | 1 - fs/reiserfs/bitmap.c | 1 - fs/reiserfs/buffer2.c | 1 - fs/reiserfs/fix_node.c | 1 - fs/reiserfs/inode.c | 2 +- fs/reiserfs/ioctl.c | 2 +- fs/reiserfs/journal.c | 1 - fs/reiserfs/objectid.c | 1 - fs/reiserfs/procfs.c | 1 - fs/reiserfs/resize.c | 1 - fs/reiserfs/stree.c | 1 - fs/reiserfs/super.c | 1 - fs/reiserfs/tail_conversion.c | 1 - fs/romfs/inode.c | 2 +- fs/smbfs/inode.c | 1 - fs/super.c | 1 - fs/sysv/balloc.c | 1 - fs/sysv/ialloc.c | 2 +- fs/sysv/inode.c | 1 - fs/sysv/itree.c | 1 - fs/udf/balloc.c | 1 - fs/udf/file.c | 2 +- fs/udf/fsync.c | 1 - fs/udf/ialloc.c | 1 - fs/udf/inode.c | 2 +- fs/udf/namei.c | 1 - fs/udf/super.c | 1 - fs/ufs/balloc.c | 1 - fs/ufs/cylinder.c | 1 - fs/ufs/dir.c | 1 - fs/ufs/file.c | 1 - fs/ufs/ialloc.c | 1 - fs/ufs/inode.c | 1 - fs/ufs/super.c | 1 - fs/ufs/truncate.c | 1 - fs/ufs/util.c | 1 - include/linux/amigaffs.h | 1 - include/linux/blk.h | 1 - include/linux/fs.h | 13 +++++++++++++ include/linux/hfs_sysdep.h | 2 +- include/linux/locks.h | 28 ---------------------------- include/linux/nbd.h | 1 - include/linux/raid/md.h | 1 - include/linux/swap.h | 1 + include/linux/ufs_fs.h | 1 + kernel/ksyms.c | 1 - mm/page_io.c | 2 +- mm/shmem.c | 1 - net/khttpd/datasending.c | 1 - 157 files changed, 36 insertions(+), 180 deletions(-) delete mode 100644 
include/linux/locks.h (limited to 'include') diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 7ba55877b5a9..b539b367cbc7 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/block_ioctl.c b/drivers/block/block_ioctl.c index 6c204d48ea53..7801e021c1bf 100644 --- a/drivers/block/block_ioctl.c +++ b/drivers/block/block_ioctl.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/cpqarray.h b/drivers/block/cpqarray.h index 80b4dba8b83e..a6118b3de22b 100644 --- a/drivers/block/cpqarray.h +++ b/drivers/block/cpqarray.h @@ -27,7 +27,6 @@ #ifdef __KERNEL__ #include -#include #include #include #include diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 51fd5be00995..0ea76d978992 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index f0c79911456f..5021b597997d 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index a609563d7c84..e9d33b415ca5 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -26,7 +26,6 @@ printk(KERN_DEBUG __FUNCTION__ ": " format "\n" , ## arg); \ #include #include #include -#include #include #include #include diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c index 1c31e2058143..dfc256c6a2ec 100644 --- a/drivers/md/lvm.c +++ b/drivers/md/lvm.c @@ -212,7 +212,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8133b7c0952f..9402b0c779b9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -18,7 +18,6 @@ 
#include #include -#include #include #include #include diff --git a/drivers/media/video/i2c-old.c b/drivers/media/video/i2c-old.c index 52dc8ebb17d5..bd731be97aa1 100644 --- a/drivers/media/video/i2c-old.c +++ b/drivers/media/video/i2c-old.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 8f7403772d0c..fd736d937020 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 55309e14720c..8f59cf69efbf 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index bc229488cfbf..fb4545696be9 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2ff2854b0d74..b22cadd6e86f 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/affs/file.c b/fs/affs/file.c index 86a98ea9d4bf..3d35848490c7 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 39f18dc29465..e831d12f1aeb 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 80578e97be18..63bcbb7f8162 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/affs/super.c b/fs/affs/super.c index 01041d693028..68af4188327c 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -20,7 +20,6 @@ #include #include #include -#include #include 
#include #include diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 16f434801863..5a83e2ce5ad1 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "autofs_i.h" #define __NO_VERSION__ diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 86f73230fa29..c17dcb637608 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include "autofs_i.h" #define __NO_VERSION__ diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index c36aff10c3c2..5913276d8d07 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "bfs_defs.h" diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 6413652035c2..313e5e4009ab 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include "bfs_defs.h" diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 31823866a9c2..df4cf556785a 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 8f1a2752feef..d651e875b01e 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/block_dev.c b/fs/block_dev.c index 76c5e5cf0555..654d98a256b0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/buffer.c b/fs/buffer.c index f9923e470bb3..904fec39dd60 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 47f8ebae639e..5e526d018e23 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/coda/coda_linux.c 
b/fs/coda/coda_linux.c index c50dae543692..ee14f574233b 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/fs/coda/dir.c b/fs/coda/dir.c index d18a8ad385bd..40398f8b66c7 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/fs/coda/file.c b/fs/coda/file.c index f74655873fa8..7d6dd4b5de74 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 621074e23410..5066d9a04984 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index a6a11d615fcf..edfb9aa96544 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #define __NO_VERSION__ #include diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index eed35ddd28a1..764a64ee8332 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f3b8699ad5f1..72700a2dcb3c 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index c9a6374289cd..0e9e2600a6db 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 231bc91cd79d..0727e719279d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/efs/super.c b/fs/efs/super.c index 4af82d06d5bf..51cca8ecfa9d 
100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7cb0c303a6ea..985fb0f71bf3 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -13,7 +13,6 @@ #include #include "ext2.h" -#include #include /* diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c index 5ba02176b331..4528b40c31c9 100644 --- a/fs/ext2/fsync.c +++ b/fs/ext2/fsync.c @@ -23,7 +23,6 @@ */ #include "ext2.h" -#include #include diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index aa2f3ad28fe4..be520e2ad23c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -14,7 +14,6 @@ #include #include "ext2.h" -#include #include diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 55592347a48c..592db3d7937f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -23,10 +23,10 @@ */ #include "ext2.h" -#include #include #include #include +#include #include #include diff --git a/fs/ext2/super.c b/fs/ext2/super.c index db59722a8c30..7e162a913b34 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -22,7 +22,6 @@ #include "ext2.h" #include #include -#include #include #include #include diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 2da50dce0434..ea4bd4510319 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -17,7 +17,6 @@ #include #include #include -#include #include /* diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 3ed85a1adcf8..d5040f1cdbcd 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index cd57f9f5757d..f190708fd710 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6f764bf1eec9..4be6e5eaa4ea 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,15 +22,16 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 
*/ +#include #include #include #include #include -#include #include #include +#include #include -#include +#include /* * SEARCH_FROM_ZERO forces each block allocation to search from the start diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 58e3d8f89b61..2587c77a1d88 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 3ffa9fecf652..1c90e699030a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fat/file.c b/fs/fat/file.c index 32a79bfaa66a..ee581867252c 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 03fe0dc5b3d9..81568264732c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -12,11 +12,11 @@ #include #include -#include #include #include #include #include +#include //#include #include diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 323e66dd6b08..986e479e433b 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "hpfs_fn.h" #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index a6cb5a596827..b5c09a1d66c3 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 61578e25673e..08125d97bf3a 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -6,6 +6,7 @@ * inode VFS functions */ +#include #include #include #include "hpfs_fn.h" diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 85ddc8aef4a6..2a54665058f8 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -6,6 +6,7 @@ * adding & removing files & directories */ +#include #include #include "hpfs_fn.h" diff --git a/fs/intermezzo/cache.c 
b/fs/intermezzo/cache.c index 7c3d32d4152f..93a534473ea2 100644 --- a/fs/intermezzo/cache.c +++ b/fs/intermezzo/cache.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/fs/intermezzo/dcache.c b/fs/intermezzo/dcache.c index eca114afc4fa..6e7bd681ae47 100644 --- a/fs/intermezzo/dcache.c +++ b/fs/intermezzo/dcache.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c index 7617c500c35f..c8a8c1988f16 100644 --- a/fs/intermezzo/dir.c +++ b/fs/intermezzo/dir.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #define __NO_VERSION__ diff --git a/fs/intermezzo/ext_attr.c b/fs/intermezzo/ext_attr.c index 398c6d50554a..3c317baa7911 100644 --- a/fs/intermezzo/ext_attr.c +++ b/fs/intermezzo/ext_attr.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/fs/intermezzo/file.c b/fs/intermezzo/file.c index 68084e55ef73..67c34b7bce4a 100644 --- a/fs/intermezzo/file.c +++ b/fs/intermezzo/file.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/inode.c b/fs/intermezzo/inode.c index 111721845226..ace8cacad054 100644 --- a/fs/intermezzo/inode.c +++ b/fs/intermezzo/inode.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -24,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/journal.c b/fs/intermezzo/journal.c index df8f31533479..049760de033c 100644 --- a/fs/intermezzo/journal.c +++ b/fs/intermezzo/journal.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/journal_ext2.c b/fs/intermezzo/journal_ext2.c index 2a4a5d7b40f9..74888cb0c367 100644 --- a/fs/intermezzo/journal_ext2.c +++ b/fs/intermezzo/journal_ext2.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git 
a/fs/intermezzo/journal_ext3.c b/fs/intermezzo/journal_ext3.c index 58ab5f50dccc..46bebc15d30b 100644 --- a/fs/intermezzo/journal_ext3.c +++ b/fs/intermezzo/journal_ext3.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/journal_obdfs.c b/fs/intermezzo/journal_obdfs.c index c6d239b2685e..2ce2d08f73ab 100644 --- a/fs/intermezzo/journal_obdfs.c +++ b/fs/intermezzo/journal_obdfs.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #ifdef CONFIG_OBDFS_FS diff --git a/fs/intermezzo/journal_reiserfs.c b/fs/intermezzo/journal_reiserfs.c index 6531887ad3f5..23804ec54fff 100644 --- a/fs/intermezzo/journal_reiserfs.c +++ b/fs/intermezzo/journal_reiserfs.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #if 0 diff --git a/fs/intermezzo/journal_xfs.c b/fs/intermezzo/journal_xfs.c index 0ec4372e918a..70aad72b613e 100644 --- a/fs/intermezzo/journal_xfs.c +++ b/fs/intermezzo/journal_xfs.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #ifdef CONFIG_FS_XFS diff --git a/fs/intermezzo/methods.c b/fs/intermezzo/methods.c index 05af7f28bdf9..276f4be455cd 100644 --- a/fs/intermezzo/methods.c +++ b/fs/intermezzo/methods.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #define __NO_VERSION__ diff --git a/fs/intermezzo/presto.c b/fs/intermezzo/presto.c index 12243f3e199b..5c8514c2c665 100644 --- a/fs/intermezzo/presto.c +++ b/fs/intermezzo/presto.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/intermezzo/super.c b/fs/intermezzo/super.c index 4a9358f82c07..f1804c2a7860 100644 --- a/fs/intermezzo/super.c +++ b/fs/intermezzo/super.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #define __NO_VERSION__ diff --git a/fs/intermezzo/upcall.c b/fs/intermezzo/upcall.c index 37491a4c3dd0..604705413305 100644 --- a/fs/intermezzo/upcall.c +++ b/fs/intermezzo/upcall.c 
@@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 4ce2fb2a5f96..a5728e0f7d6a 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 19a643b650e6..610462dd9cc9 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 08d4a148495b..9058e9763e88 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index d809febc5abc..17a94591d021 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,7 +22,6 @@ #include #include #include -#include extern spinlock_t journal_datalist_lock; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 59c3b994d47a..e4ce53b05a55 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -18,7 +18,6 @@ #include #include #include -#include #include extern spinlock_t journal_datalist_lock; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index baafedb4afa5..a89f7e547581 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -28,11 +28,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 48af2d532d0d..e6a96d3c30ce 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -21,7 +21,6 @@ #include #include #include -#include #endif /* diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 464e828db9d1..7cecb0237988 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 2245e396ebd9..cf10a8ce12e3 100644 --- 
a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -22,10 +22,10 @@ #include #include #include -#include #include #include #include +#include extern spinlock_t journal_datalist_lock; diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 3e11c2dd55b8..fe077a51775e 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index ec26515ce2b4..cc3f7895b294 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jfs/file.c b/fs/jfs/file.c index b926fb7ffacd..ec8ea1484ff5 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -17,7 +17,6 @@ */ #include -#include #include "jfs_incore.h" #include "jfs_txnmgr.h" #include "jfs_debug.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index eb34bd53ff04..032d52434350 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -17,7 +17,6 @@ */ #include -#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_imap.h" diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 9742962de44f..1b223e2275ad 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -101,7 +101,6 @@ */ #include -#include #include #include "jfs_incore.h" #include "jfs_superblock.h" diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 9360c94d857f..becd4caa108a 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -42,7 +42,6 @@ */ #include -#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_dinode.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a2e91a853516..219d5dbb2d18 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -59,7 +59,6 @@ */ #include -#include #include #include #include diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 47b4b1f763f1..d7c441edbffa 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -43,7 +43,6 @@ #include -#include 
#include #include #include diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index ffdb26471641..c09af0a01b3b 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -20,7 +20,6 @@ */ #include -#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 4653128b97ef..69beb37da170 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -17,7 +17,6 @@ */ #include -#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dinode.h" diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 46f0cceb3cbe..248ab7a6be33 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -17,7 +17,6 @@ */ #include -#include #include #include #include diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 10fdf3f6973e..c8d490c5d24f 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -12,7 +12,6 @@ #include #include "minix.h" #include -#include #include #include diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 83064f69675f..26216b80ab5a 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -1,5 +1,4 @@ #include "minix.h" -#include enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 80885cc9b437..732a878b1e4c 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -1,5 +1,4 @@ #include "minix.h" -#include enum {DIRECT = 7, DEPTH = 4}; /* Have triple indirect */ diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 32ec4f105c24..fd44657ee616 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 0b5f437715e2..81c224f1558f 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 854b599fde90..1c19c0269f9e 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -21,7 +21,6 @@ 
#include #include #include -#include #include #include #include diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ea03c0e8a850..d5bced0cdbde 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d0e7f24fe3fd..0e3de324ec19 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index f81849743d7b..5527f0d6b5d4 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7384bc06463e..b9d0bbfb333a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 21e2c0095bd2..5393901948bf 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ntfs.h" diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index b4896ab4008b..395cf7fec9d2 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -21,7 +21,6 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include "ntfs.h" diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index f5ff98b3c1ad..9db0fb99f901 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -20,7 +20,6 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include "ntfs.h" diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 546eb46bb51a..1ee4c7b74aa9 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include /* For bdev_hardsect_size(). 
*/ #include diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c9ee76ba5d06..2c4cf1dcc5f5 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 638d218a7f15..8495f3e1fe41 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #define __NO_VERSION__ diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c index 56136f136665..76c4cb0d5f0f 100644 --- a/fs/qnx4/fsync.c +++ b/fs/qnx4/fsync.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 82b411437b4f..a839e24bfa9b 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -20,10 +20,10 @@ #include #include #include -#include #include #include #include +#include #include diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c index 88d9e181565a..681bdeb50a7c 100644 --- a/fs/qnx4/truncate.c +++ b/fs/qnx4/truncate.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 6d31e937127e..35ee0a1c2604 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 6968c41a4680..95fccb745e06 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/buffer2.c b/fs/reiserfs/buffer2.c index b7025254a64b..62ec8424eca1 100644 --- a/fs/reiserfs/buffer2.c +++ b/fs/reiserfs/buffer2.c @@ -4,7 +4,6 @@ #include #include -#include #include #include #include diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 9a5dd50cfaf7..802eefb3825d 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -38,7 +38,6 @@ #include #include #include -#include 
#include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2c1e3ee268b0..4a757803c44b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -5,8 +5,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index ef41742ba491..6957b5f69ce4 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -6,8 +6,8 @@ #include #include #include +#include #include -#include /* ** reiserfs_ioctl - handler for ioctl for inode diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9de865554b51..de6e7de3068f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 503ef628aa5b..8d47a4edabd9 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -4,7 +4,6 @@ #include #include -#include #include #include #include diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 359f66b1351b..19d7e4f36976 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 9b073804f3f4..59cdfa57b354 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index e5271b25ae23..8f067bd52f2c 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b52e704d6c7f..c9dd2d3b5d5a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index defa8d3dd6b0..3393998bb7ab 100644 --- 
a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -6,7 +6,6 @@ #include #include #include -#include /* access to tail : when one is going to read tail it must make sure, that is not running. direct2indirect and indirect2direct can not run concurrently */ diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 25d55c453f93..3beb8e661e1b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -70,8 +70,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 6807dd38a288..22a8e371b0c3 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/super.c b/fs/super.c index a8184f155bdd..13f1b7a7e34d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c index 9194ccf7fdeb..568b6231b2ae 100644 --- a/fs/sysv/balloc.c +++ b/fs/sysv/balloc.c @@ -19,7 +19,6 @@ * This file contains code for allocating/freeing blocks. */ -#include #include "sysv.h" /* We don't trust the value of diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index afda24dcf2d0..183358d83b43 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -21,9 +21,9 @@ #include #include +#include #include #include -#include #include "sysv.h" /* We don't trust the value of diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 29ab0cc3c3f1..953a70c01df4 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -21,7 +21,6 @@ * the superblock. 
*/ -#include #include #include #include diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 75beb1554f98..bc6690583a1b 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -5,7 +5,6 @@ * AV, Sep--Dec 2000 */ -#include #include "sysv.h" enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index eab06b2999d7..09411fa827d2 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -26,7 +26,6 @@ #include "udfdecl.h" -#include #include #include diff --git a/fs/udf/file.c b/fs/udf/file.c index 30e38892a753..6e5c92c5d275 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -37,8 +37,8 @@ #include #include /* memset */ #include -#include #include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c index c36daeee6d10..e81448d0c875 100644 --- a/fs/udf/fsync.c +++ b/fs/udf/fsync.c @@ -26,7 +26,6 @@ #include "udfdecl.h" #include -#include #include /* diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 5f58afa05f8a..d22e26bed2af 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -25,7 +25,6 @@ #include "udfdecl.h" #include -#include #include #include diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1c229f5c912b..40696bade927 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -34,10 +34,10 @@ */ #include "udfdecl.h" -#include #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 60b9a37438e3..28db72d58da5 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -33,7 +33,6 @@ #include #include #include -#include #include static inline int udf_match(int len, const char * const name, struct qstr *qs) diff --git a/fs/udf/super.c b/fs/udf/super.c index bc8a0576ec45..0858d29c3418 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 5f7102461577..6154c29324f8 100644 --- a/fs/ufs/balloc.c +++ 
b/fs/ufs/balloc.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c index a0729feed07f..daf11e4dcf66 100644 --- a/fs/ufs/cylinder.c +++ b/fs/ufs/cylinder.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 9bbd31501b72..bd3c40da7d9e 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/fs/ufs/file.c b/fs/ufs/file.c index fd6332175401..f282ea559c80 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index c0a435a09a26..d82fd117b869 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e10b5a35dca9..8e5bcf749231 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include diff --git a/fs/ufs/super.c b/fs/ufs/super.c index cdf4ad6a84b2..5971709836a6 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -77,7 +77,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index a2b6ed7a016c..758fc57b5574 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/fs/ufs/util.c b/fs/ufs/util.c index b40e7ab8524d..6e859dc6afef 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -8,7 +8,6 @@ #include #include -#include #include #include "swab.h" diff --git a/include/linux/amigaffs.h b/include/linux/amigaffs.h index 535c3bf41b9a..f02e8cbd0131 100644 --- a/include/linux/amigaffs.h +++ b/include/linux/amigaffs.h @@ -2,7 +2,6 @@ #define AMIGAFFS_H #include -#include #include diff --git a/include/linux/blk.h 
b/include/linux/blk.h index 9be0913f6069..62d37b2b4c17 100644 --- a/include/linux/blk.h +++ b/include/linux/blk.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/include/linux/fs.h b/include/linux/fs.h index df5c5fdb0c6f..ab0a05dc8e26 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -682,6 +682,19 @@ struct super_block { struct semaphore s_vfs_rename_sem; /* Kludge */ }; +/* + * Superblock locking. + */ +static inline void lock_super(struct super_block * sb) +{ + down(&sb->s_lock); +} + +static inline void unlock_super(struct super_block * sb) +{ + up(&sb->s_lock); +} + /* * VFS helper functions.. */ diff --git a/include/linux/hfs_sysdep.h b/include/linux/hfs_sysdep.h index 62fcf2ea311f..a08d5aa9e39d 100644 --- a/include/linux/hfs_sysdep.h +++ b/include/linux/hfs_sysdep.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include diff --git a/include/linux/locks.h b/include/linux/locks.h deleted file mode 100644 index a380c5e4f0bb..000000000000 --- a/include/linux/locks.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _LINUX_LOCKS_H -#define _LINUX_LOCKS_H - -#ifndef _LINUX_MM_H -#include -#endif -#ifndef _LINUX_PAGEMAP_H -#include -#endif - -/* - * super-block locking. Again, interrupts may only unlock - * a super-block (although even this isn't done right now. - * nfs may need it). 
- */ - -static inline void lock_super(struct super_block * sb) -{ - down(&sb->s_lock); -} - -static inline void unlock_super(struct super_block * sb) -{ - up(&sb->s_lock); -} - -#endif /* _LINUX_LOCKS_H */ - diff --git a/include/linux/nbd.h b/include/linux/nbd.h index b6120317731d..556b847804ca 100644 --- a/include/linux/nbd.h +++ b/include/linux/nbd.h @@ -22,7 +22,6 @@ #ifdef MAJOR_NR -#include #include #define LOCAL_END_REQUEST diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index bf586df47298..cb6332482af2 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/swap.h b/include/linux/swap.h index 1674b5acd6f7..3a376842c21c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -91,6 +91,7 @@ struct swap_info_struct { int next; /* next entry on swap list */ }; +struct inode; extern int nr_swap_pages; /* Swap 50% full? Release swapcache more aggressively.. 
*/ diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h index 609d0dab2c6f..faccf5ad22d5 100644 --- a/include/linux/ufs_fs.h +++ b/include/linux/ufs_fs.h @@ -31,6 +31,7 @@ #include #include #include +#include #define UFS_BBLOCK 0 #define UFS_BBSIZE 8192 diff --git a/kernel/ksyms.c b/kernel/ksyms.c index c00aef0e313e..add2ac9dd8f3 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/page_io.c b/mm/page_io.c index 05594b07aba9..85bb9049ee0f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -12,8 +12,8 @@ #include #include +#include #include -#include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index 615b0051bbcf..fa365c456448 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/net/khttpd/datasending.c b/net/khttpd/datasending.c index d1663cc21ef0..c78ef11b39b3 100644 --- a/net/khttpd/datasending.c +++ b/net/khttpd/datasending.c @@ -36,7 +36,6 @@ Return value: #include #include -#include #include #include -- cgit v1.2.3