diff options
| author | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-04 20:19:17 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-04 20:19:17 -0800 |
| commit | e2f6721a0a1b07612c0682d8240d3e9bc0a445a4 (patch) | |
| tree | aa6410ac8fc965f7d695031fa4c467347138c485 | |
| parent | 269f8f707739116e68aee38d78d0cfb3d896b856 (diff) | |
v2.4.9.14 -> v2.4.9.15
- Jan Harkes: make Coda work with arbitrary host filesystems, not
just filesystems that use generic_file_read/write
- Al Viro: block device cleanups
- Hugh Dickins: swap device lock fixes - fix swap readahead race
- me, Andrea: more reference bit cleanups
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | drivers/block/rd.c | 3 | ||||
| -rw-r--r-- | drivers/char/raw.c | 1 | ||||
| -rw-r--r-- | drivers/char/tty_io.c | 2 | ||||
| -rw-r--r-- | drivers/ide/hptraid.c | 21 | ||||
| -rw-r--r-- | drivers/ide/pdcraid.c | 19 | ||||
| -rw-r--r-- | drivers/md/md.c | 8 | ||||
| -rw-r--r-- | fs/block_dev.c | 68 | ||||
| -rw-r--r-- | fs/coda/file.c | 57 | ||||
| -rw-r--r-- | fs/coda/psdev.c | 2 | ||||
| -rw-r--r-- | fs/devfs/base.c | 11 | ||||
| -rw-r--r-- | fs/devices.c | 1 | ||||
| -rw-r--r-- | fs/inode.c | 9 | ||||
| -rw-r--r-- | fs/partitions/ibm.c | 10 | ||||
| -rw-r--r-- | fs/super.c | 12 | ||||
| -rw-r--r-- | include/linux/fs.h | 4 | ||||
| -rw-r--r-- | include/linux/mm.h | 32 | ||||
| -rw-r--r-- | include/linux/swap.h | 19 | ||||
| -rw-r--r-- | mm/filemap.c | 8 | ||||
| -rw-r--r-- | mm/memory.c | 25 | ||||
| -rw-r--r-- | mm/shmem.c | 50 | ||||
| -rw-r--r-- | mm/swap.c | 2 | ||||
| -rw-r--r-- | mm/swap_state.c | 70 | ||||
| -rw-r--r-- | mm/swapfile.c | 151 | ||||
| -rw-r--r-- | mm/vmscan.c | 108 |
25 files changed, 373 insertions, 322 deletions
@@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 10 -EXTRAVERSION =-pre14 +EXTRAVERSION =-pre15 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff --git a/drivers/block/rd.c b/drivers/block/rd.c index 369a776ce33b..4bf4286168b2 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -491,7 +491,6 @@ static void __exit rd_cleanup (void) bdev->bd_cache_openers--; truncate_inode_pages(bdev->bd_inode->i_mapping, 0); blkdev_put(bdev, BDEV_FILE); - bdput(bdev); } destroy_buffers(MKDEV(MAJOR_NR, i)); } @@ -780,7 +779,7 @@ static void __init rd_load_image(kdev_t device, int offset, int unit) if (i && (i % devblocks == 0)) { printk("done disk #%d.\n", i/devblocks); rotate = 0; - if (blkdev_close(inode, &infile) != 0) { + if (infile.f_op->release(inode, &infile) != 0) { printk("Error closing the disk.\n"); goto noclose_input; } diff --git a/drivers/char/raw.c b/drivers/char/raw.c index c9c5b6e4ff7c..d90d9723c1f6 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -103,6 +103,7 @@ int raw_open(struct inode *inode, struct file *filp) if (!bdev) goto out; + atomic_inc(&bdev->bd_count); rdev = to_kdev_t(bdev->bd_dev); err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW); if (err) diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 9b2d90a6cf4e..4a688d830362 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -270,6 +270,8 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) return 0; } +EXPORT_SYMBOL(tty_register_ldisc); + /* Set the discipline of a tty line. */ static int tty_set_ldisc(struct tty_struct *tty, int ldisc) { diff --git a/drivers/ide/hptraid.c b/drivers/ide/hptraid.c index d70fab90e451..bbdf1d70983f 100644 --- a/drivers/ide/hptraid.c +++ b/drivers/ide/hptraid.c @@ -279,6 +279,7 @@ static void __init probedisk(int major, int minor,int device) int i; struct highpoint_raid_conf *prom; static unsigned char block[4096]; + struct block_device *bdev; if (maxsectors(major,minor)==0) return; @@ -301,12 +302,12 @@ static void __init probedisk(int major, int minor,int device) if (i>8) return; - raid[device].disk[i].bdev = bdget(MKDEV(major,minor)); - if (raid[device].disk[i].bdev != NULL) { + bdev = bdget(MKDEV(major,minor)); + if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) { int j=0; struct gendisk *gd; - /* This is supposed to prevent others from stealing our underlying disks */ - blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + + raid[device].disk[i].bdev = bdev; /* now blank the /proc/partitions table for the wrong partition table, so that scripts don't accidentally mount it and crash the kernel */ /* XXX: the 0 is an utter hack --hch */ @@ -408,12 +409,12 @@ static void __exit hptraid_exit (void) { int i,device; for (device = 0; device<16; device++) { - for (i=0;i<8;i++) - if (raid[device].disk[i].bdev) { - blkdev_put(raid[device].disk[i].bdev, BDEV_RAW); - bdput(raid[device].disk[i].bdev); - raid[device].disk[i].bdev = NULL; - } + for (i=0;i<8;i++) { + struct block_device *bdev = raid[device].disk[i].bdev; + raid[device].disk[i].bdev = NULL; + if (bdev) + blkdev_put(bdev, BDEV_RAW); + } if (raid[device].sectors) ataraid_release_device(device); } diff --git a/drivers/ide/pdcraid.c b/drivers/ide/pdcraid.c index b12fb0be5861..f21fda755a5f 100644 --- a/drivers/ide/pdcraid.c +++ b/drivers/ide/pdcraid.c @@ -311,12 +311,12 @@ static void __init probedisk(int major, int minor,int device) for (i=0;(i<prom->raid.total_disks)&&(i<8);i++) { if ( (prom->raid.disk[i].channel== prom->raid.channel) && (prom->raid.disk[i].device == prom->raid.device) ) { - raid[device].disk[i].bdev = bdget(MKDEV(major,minor)); - if (raid[device].disk[i].bdev != NULL) { + struct block_device *bdev = bdget(MKDEV(major,minor)); + if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) { struct gendisk *gd; int j; /* This is supposed to prevent others from stealing our underlying disks */ - blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + raid[device].disk[i].bdev = bdev; gd=get_gendisk(major); if (gd!=NULL) { for (j=1+(minor<<gd->minor_shift);j<((minor+1)<<gd->minor_shift);j++) @@ -418,13 +418,12 @@ static void __exit pdcraid_exit (void) { int i,device; for (device = 0; device<16; device++) { - for (i=0;i<8;i++) - if (raid[device].disk[i].bdev) { - blkdev_put(raid[device].disk[i].bdev, BDEV_RAW); - bdput(raid[device].disk[i].bdev); - raid[device].disk[i].bdev = NULL; - - } + for (i=0;i<8;i++) { + struct block_device *bdev = raid[device].disk[i].bdev; + raid[device].disk[i].bdev = NULL; + if (bdev) + blkdev_put(bdev, BDEV_RAW); + } if (raid[device].sectors) ataraid_release_device(device); } diff --git a/drivers/md/md.c b/drivers/md/md.c index d4d51fd5f43d..19c30390d089 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -649,11 +649,11 @@ static int lock_rdev (mdk_rdev_t *rdev) static void unlock_rdev (mdk_rdev_t *rdev) { - if (!rdev->bdev) - MD_BUG(); - blkdev_put(rdev->bdev, BDEV_RAW); - bdput(rdev->bdev); + struct block_device *bdev = rdev->bdev; rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, BDEV_RAW); } void md_autodetect_dev (kdev_t dev); diff --git a/fs/block_dev.c b/fs/block_dev.c index 52f35224795e..b2d374ec457b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -404,7 +404,6 @@ static int get_inode(struct block_device *bdev) if (!inode) return -ENOMEM; inode->i_rdev = to_kdev_t(bdev->bd_dev); - atomic_inc(&bdev->bd_count); /* will go away */ inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; bdev->bd_inode = inode; @@ -437,6 +436,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) { memset(bdev, 0, sizeof(*bdev)); sema_init(&bdev->bd_sem, 1); + INIT_LIST_HEAD(&bdev->bd_inodes); } } @@ -522,17 +522,58 @@ struct block_device *bdget(dev_t dev) void bdput(struct block_device *bdev) { - if (atomic_dec_and_test(&bdev->bd_count)) { + if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) { + struct list_head *p; if (bdev->bd_openers) BUG(); if (bdev->bd_cache_openers) BUG(); - spin_lock(&bdev_lock); list_del(&bdev->bd_hash); + while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { + struct inode *inode; + inode = list_entry(p, struct inode, i_devices); + list_del_init(p); + inode->i_bdev = NULL; + } spin_unlock(&bdev_lock); destroy_bdev(bdev); } } + +int bd_acquire(struct inode *inode) +{ + struct block_device *bdev; + spin_lock(&bdev_lock); + if (inode->i_bdev) { + atomic_inc(&inode->i_bdev->bd_count); + spin_unlock(&bdev_lock); + return 0; + } + spin_unlock(&bdev_lock); + bdev = bdget(kdev_t_to_nr(inode->i_rdev)); + if (!bdev) + return -ENOMEM; + spin_lock(&bdev_lock); + if (!inode->i_bdev) { + inode->i_bdev = bdev; + list_add(&inode->i_devices, &bdev->bd_inodes); + } else if (inode->i_bdev != bdev) + BUG(); + spin_unlock(&bdev_lock); + return 0; +} + +/* Call when you free inode */ + +void bd_forget(struct inode *inode) +{ + spin_lock(&bdev_lock); + if (inode->i_bdev) { + list_del_init(&inode->i_devices); + inode->i_bdev = NULL; + } + spin_unlock(&bdev_lock); +} static struct { const char *name; @@ -706,13 +747,15 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind) } unlock_kernel(); up(&bdev->bd_sem); + if (ret) + bdput(bdev); return ret; } int blkdev_open(struct inode * inode, struct file * filp) { - int ret = -ENXIO; - struct block_device *bdev = inode->i_bdev; + int ret; + struct block_device *bdev; /* * Preserve backwards compatibility and allow large file access @@ -722,13 +765,15 @@ int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; + bd_acquire(inode); + bdev = inode->i_bdev; down(&bdev->bd_sem); - if (get_inode(bdev)) { - up(&bdev->bd_sem); - return -ENOMEM; - } + ret = get_inode(bdev); + if (ret) + goto out; + ret = -ENXIO; lock_kernel(); if (!bdev->bd_op) bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev)); @@ -749,7 +794,10 @@ int blkdev_open(struct inode * inode, struct file * filp) } } unlock_kernel(); +out: up(&bdev->bd_sem); + if (ret) + bdput(bdev); return ret; } @@ -777,6 +825,7 @@ int blkdev_put(struct block_device *bdev, int kind) } unlock_kernel(); up(&bdev->bd_sem); + bdput(bdev); return ret; } @@ -841,6 +890,7 @@ int blkdev_close(struct inode * inode, struct file * filp) } unlock_kernel(); up(&bdev->bd_sem); + bdput(bdev); return ret; } diff --git a/fs/coda/file.c b/fs/coda/file.c index 6c2610f0ab87..fa1d150e32d0 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -31,28 +31,65 @@ int use_coda_close; static ssize_t -coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) +coda_file_read(struct file *file, char *buf, size_t count, loff_t *ppos) { + struct inode *inode = file->f_dentry->d_inode; + struct coda_inode_info *cii = ITOC(inode); struct file *cfile; + + cfile = cii->c_container; + if (!cfile) BUG(); + + if (!cfile->f_op || !cfile->f_op->read) + return -EINVAL; + + return cfile->f_op->read(cfile, buf, count, ppos); +} + +static ssize_t +coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) +{ struct inode *cinode, *inode = file->f_dentry->d_inode; struct coda_inode_info *cii = ITOC(inode); - ssize_t n; + struct file *cfile; + ssize_t ret; + int flags; cfile = cii->c_container; if (!cfile) BUG(); - if (!cfile->f_op || cfile->f_op->write != generic_file_write) - BUG(); + if (!cfile->f_op || !cfile->f_op->write) + return -EINVAL; cinode = cfile->f_dentry->d_inode; - down(&cinode->i_sem); + down(&inode->i_sem); + flags = cfile->f_flags; + cfile->f_flags |= file->f_flags & (O_APPEND | O_SYNC); - n = generic_file_write(file, buf, count, ppos); + ret = cfile->f_op->write(cfile, buf, count, ppos); + + cfile->f_flags = flags; inode->i_size = cinode->i_size; + up(&inode->i_sem); + + return ret; +} + +static int +coda_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file->f_dentry->d_inode; + struct coda_inode_info *cii = ITOC(inode); + struct file *cfile; + + cfile = cii->c_container; + + if (!cfile) BUG(); - up(&cinode->i_sem); + if (!cfile->f_op || !cfile->f_op->mmap) + return -ENODEV; - return n; + return cfile->f_op->mmap(cfile, vma); } int coda_open(struct inode *i, struct file *f) @@ -237,9 +274,9 @@ int coda_fsync(struct file *file, struct dentry *dentry, int datasync) struct file_operations coda_file_operations = { llseek: generic_file_llseek, - read: generic_file_read, + read: coda_file_read, write: coda_file_write, - mmap: generic_file_mmap, + mmap: coda_file_mmap, open: coda_open, flush: coda_flush, release: coda_release, diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 59c530d6d8f3..2fe942f7cad9 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -414,7 +414,7 @@ MODULE_AUTHOR("Peter J. Braam <braam@cs.cmu.edu>"); static int __init init_coda(void) { int status; - printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.14, coda@cs.cmu.edu\n"); + printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.15, coda@cs.cmu.edu\n"); status = init_coda_psdev(); if ( status ) { diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 58aedeca1907..974379d14d4d 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2291,9 +2291,16 @@ static int devfs_statfs (struct super_block *sb, struct statfs *buf) return 0; } /* End Function devfs_statfs */ +static void devfs_clear_inode(struct inode *inode) +{ + if (S_ISBLK(inode->i_mode)) + bdput(inode->i_bdev); +} + static struct super_operations devfs_sops = { put_inode: force_delete, + clear_inode: devfs_clear_inode, statfs: devfs_statfs, }; @@ -2351,9 +2358,7 @@ static struct inode *get_vfs_inode (struct super_block *sb, { inode->i_rdev = MKDEV (de->u.fcb.u.device.major, de->u.fcb.u.device.minor); - inode->i_bdev = bdget ( kdev_t_to_nr (inode->i_rdev) ); - inode->i_mapping->a_ops = &def_blk_aops; - if (inode->i_bdev) + if (bd_acquire(inode) == 0) { if (!inode->i_bdev->bd_op && de->u.fcb.ops) inode->i_bdev->bd_op = de->u.fcb.ops; diff --git a/fs/devices.c b/fs/devices.c index 875f0e9f364c..3b4448e8d0e8 100644 --- a/fs/devices.c +++ b/fs/devices.c @@ -207,7 +207,6 @@ void init_special_inode(struct inode *inode, umode_t mode, int rdev) } else if (S_ISBLK(mode)) { inode->i_fop = &def_blk_fops; inode->i_rdev = to_kdev_t(rdev); - inode->i_bdev = bdget(rdev); } else if (S_ISFIFO(mode)) inode->i_fop = &def_fifo_fops; else if (S_ISSOCK(mode)) diff --git a/fs/inode.c b/fs/inode.c index e034073de731..f9783a67b454 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -106,6 +106,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_dirty_buffers); INIT_LIST_HEAD(&inode->i_dirty_data_buffers); + INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); spin_lock_init(&inode->i_data.i_shared_lock); @@ -516,11 +517,9 @@ void clear_inode(struct inode *inode) DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); - if (inode->i_bdev) { - bdput(inode->i_bdev); - inode->i_bdev = NULL; - } - if (inode->i_cdev) { + if (inode->i_bdev) + bd_forget(inode); + else if (inode->i_cdev) { cdput(inode->i_cdev); inode->i_cdev = NULL; } diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index ce313bbe89eb..8e2b6d5da229 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -47,9 +47,10 @@ get_drive_geometry(int kdev,struct hd_geometry *geo) { struct block_device *bdev = bdget(kdev_t_to_nr(kdev)); int rc = blkdev_get(bdev, 0, 1, BDEV_FILE); - if ( rc == 0 ) + if ( rc == 0 ) { rc = ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo); - blkdev_put(bdev,BDEV_FILE); + blkdev_put(bdev, BDEV_FILE); + } return rc; } @@ -58,9 +59,10 @@ get_drive_info(int kdev,dasd_information_t *info) { struct block_device *bdev = bdget(kdev_t_to_nr(kdev)); int rc = blkdev_get(bdev, 0, 1, BDEV_FILE); - if ( rc == 0 ) + if ( rc == 0 ) { rc = ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)(info)); - blkdev_put(bdev,BDEV_FILE); + blkdev_put(bdev, BDEV_FILE); + } return rc; } diff --git a/fs/super.c b/fs/super.c index 5541b10196fc..4736ded323c6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -925,6 +925,7 @@ static struct super_block *get_sb_bdev(struct file_system_type *fs_type, error = -EACCES; if (nd.mnt->mnt_flags & MNT_NODEV) goto out; + bd_acquire(inode); bdev = inode->i_bdev; bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) ); if (bdops) bdev->bd_op = bdops; @@ -982,8 +983,6 @@ restart: if (!fs_type->read_super(s, data, 0)) goto out_fail; unlock_super(s); - /* tell bdcache that we are going to keep this one */ - atomic_inc(&bdev->bd_count); get_filesystem(fs_type); path_release(&nd); return s; @@ -1128,10 +1127,9 @@ static void kill_super(struct super_block *sb) sb->s_type = NULL; unlock_super(sb); unlock_kernel(); - if (bdev) { + if (bdev) blkdev_put(bdev, BDEV_FS); - bdput(bdev); - } else + else put_unnamed_dev(dev); spin_lock(&sb_lock); list_del(&sb->s_list); @@ -1718,6 +1716,7 @@ skip_nfs: if (!ROOT_DEV) panic("I have no root and I want to scream"); +retry: bdev = bdget(kdev_t_to_nr(ROOT_DEV)); if (!bdev) panic(__FUNCTION__ ": unable to allocate root device"); @@ -1729,7 +1728,7 @@ skip_nfs: retval = blkdev_get(bdev, mode, 0, BDEV_FS); if (retval == -EROFS) { root_mountflags |= MS_RDONLY; - retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS); + goto retry; } if (retval) { /* @@ -1977,6 +1976,7 @@ int __init change_root(kdev_t new_root_dev,const char *put_old) int blivet; struct block_device *ramdisk = old_rootmnt->mnt_sb->s_bdev; + atomic_inc(&ramdisk->bd_count); blivet = blkdev_get(ramdisk, FMODE_READ, 0, BDEV_FS); printk(KERN_NOTICE "Trying to unmount old root ... "); if (!blivet) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 31a2167afac8..9eca17f8ee1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -415,6 +415,7 @@ struct block_device { int bd_cache_openers; const struct block_device_operations *bd_op; struct semaphore bd_sem; /* open/close mutex */ + struct list_head bd_inodes; }; struct inode { @@ -452,6 +453,7 @@ struct inode { int i_mapping_overload; struct dquot *i_dquot[MAXQUOTAS]; /* These three should probably be a union */ + struct list_head i_devices; struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct char_device *i_cdev; @@ -1046,6 +1048,8 @@ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW}; extern int register_blkdev(unsigned int, const char *, struct block_device_operations *); extern int unregister_blkdev(unsigned int, const char *); extern struct block_device *bdget(dev_t); +extern int bd_acquire(struct inode *inode); +extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct char_device *cdget(dev_t); extern void cdput(struct char_device *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f9e8bfbce88..494d025d143d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -274,7 +274,6 @@ typedef struct page { #define PG_active 6 #define PG_inactive 7 #define PG_slab 8 -#define PG_swap_cache 9 #define PG_skip 10 #define PG_highmem 11 #define PG_checked 12 /* kill me in 2.5.<early>. */ @@ -326,18 +325,9 @@ static inline void set_page_dirty(struct page * page) #define SetPageDecrAfter(page) set_bit(PG_decr_after, &(page)->flags) #define PageTestandClearDecrAfter(page) test_and_clear_bit(PG_decr_after, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags) -#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) - #define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) -#define PageSetSwapCache(page) set_bit(PG_swap_cache, &(page)->flags) - -#define PageTestandSetSwapCache(page) test_and_set_bit(PG_swap_cache, &(page)->flags) - -#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) -#define PageClearSwapCache(page) clear_bit(PG_swap_cache, &(page)->flags) - -#define PageTestandClearSwapCache(page) test_and_clear_bit(PG_swap_cache, &(page)->flags) +#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) +#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) @@ -465,6 +455,9 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); +extern struct address_space swapper_space; +#define PageSwapCache(page) ((page)->mapping == &swapper_space) + static inline int is_page_cache_freeable(struct page * page) { return page_count(page) - !!page->buffers == 1; @@ -476,15 +469,13 @@ static inline int is_page_cache_freeable(struct page * page) */ static inline int exclusive_swap_page(struct page *page) { - unsigned int count; - if (!PageLocked(page)) BUG(); if (!PageSwapCache(page)) return 0; - count = page_count(page) - !!page->buffers; /* 2: us + swap cache */ - count += swap_count(page); /* +1: just swap cache */ - return count == 3; /* =3: total */ + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + return swap_count(page) == 1; /* 1: just cache */ } extern void __free_pte(pte_t); @@ -565,11 +556,10 @@ extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); #define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO \ - | __GFP_FS | __GFP_HIGHMEM) +#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM) #define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_KSWAPD ( __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) /* Flag - indicates that the buffer will be suitable for DMA. Ignored on some platforms, used as appropriate on others */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ce8a374a11d..0282b6bac60c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -84,7 +84,6 @@ extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; extern int nr_inactive_pages; extern atomic_t nr_async_pages; -extern struct address_space swapper_space; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; extern spinlock_t pagecache_lock; @@ -122,35 +121,27 @@ extern void rw_swap_page_nolock(int, swp_entry_t, char *); /* linux/mm/swap_state.c */ extern void show_swap_cache_info(void); extern void add_to_swap_cache(struct page *, swp_entry_t); -extern int swap_check_entry(unsigned long); +extern void __delete_from_swap_cache(struct page *page); +extern void delete_from_swap_cache(struct page *page); +extern void free_page_and_swap_cache(struct page *page); extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ extern void oom_kill(void); -/* - * Make these inline later once they are working properly. - */ -extern void __delete_from_swap_cache(struct page *page); -extern void delete_from_swap_cache(struct page *page); -extern void delete_from_swap_cache_nolock(struct page *page); -extern void free_page_and_swap_cache(struct page *page); - /* linux/mm/swapfile.c */ extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; extern int is_swap_partition(kdev_t); extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t __get_swap_page(unsigned short); +extern swp_entry_t get_swap_page(void); extern void get_swaphandle_info(swp_entry_t, unsigned long *, kdev_t *, struct inode **); extern int swap_duplicate(swp_entry_t); extern int swap_count(struct page *); extern int valid_swaphandles(swp_entry_t, unsigned long *); -#define get_swap_page() __get_swap_page(1) -extern void __swap_free(swp_entry_t, unsigned short); -#define swap_free(entry) __swap_free((entry), 1) +extern void swap_free(swp_entry_t); struct swap_list_t { int head; /* head of priority-ordered swapfile list */ int next; /* swapfile to be used next */ diff --git a/mm/filemap.c b/mm/filemap.c index 3d6817ce077c..609e3bb04d94 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1704,6 +1704,7 @@ success: * and possibly copy it over to another page.. */ old_page = page; + mark_page_accessed(page); if (no_share) { struct page *new_page = alloc_page(GFP_HIGHUSER); @@ -2553,7 +2554,6 @@ repeat: } if (cached_page) page_cache_release(cached_page); - mark_page_accessed(page); return page; } @@ -2571,7 +2571,10 @@ struct page *read_cache_page(struct address_space *mapping, retry: page = __read_cache_page(mapping, index, filler, data); - if (IS_ERR(page) || Page_Uptodate(page)) + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (Page_Uptodate(page)) goto out; lock_page(page); @@ -2835,6 +2838,7 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ + SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff --git a/mm/memory.c b/mm/memory.c index 3987ece050c0..efd520264e75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -85,8 +85,6 @@ void __free_pte(pte_t pte) if (page->mapping) { if (pte_dirty(pte)) set_page_dirty(page); - if (pte_young(pte)) - mark_page_accessed(page); } free_page_and_swap_cache(page); @@ -939,10 +937,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, break; /* Recheck swapcachedness once the page is locked */ can_reuse = exclusive_swap_page(old_page); -#if 1 if (can_reuse) - delete_from_swap_cache_nolock(old_page); -#endif + delete_from_swap_cache(old_page); UnlockPage(old_page); if (!can_reuse) break; @@ -1088,23 +1084,19 @@ void swapin_readahead(swp_entry_t entry) unsigned long offset; /* - * Get the number of handles we should do readahead io to. Also, - * grab temporary references on them, releasing them as io completes. + * Get the number of handles we should do readahead io to. */ num = valid_swaphandles(entry, &offset); for (i = 0; i < num; offset++, i++) { /* Don't block on I/O for read-ahead */ - if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster - * (1 << page_cluster)) { - while (i++ < num) - swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++)); + if (atomic_read(&nr_async_pages) >= + pager_daemon.swap_cluster << page_cluster) break; - } /* Ok, do the async read-ahead now */ new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); - if (new_page != NULL) - page_cache_release(new_page); - swap_free(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); } return; } @@ -1164,11 +1156,12 @@ static int do_swap_page(struct mm_struct * mm, pte = mk_pte(page, vma->vm_page_prot); swap_free(entry); + mark_page_accessed(page); if (exclusive_swap_page(page)) { if (vma->vm_flags & VM_WRITE) pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); - delete_from_swap_cache_nolock(page); + delete_from_swap_cache(page); } UnlockPage(page); diff --git a/mm/shmem.c b/mm/shmem.c index 1e4efc4a0ff5..a24c868edca0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -234,45 +234,55 @@ static int shmem_writepage(struct page * page) int error; struct shmem_inode_info *info; swp_entry_t *entry, swap; + struct address_space *mapping; + unsigned long index; struct inode *inode; if (!PageLocked(page)) BUG(); - - inode = page->mapping->host; + + mapping = page->mapping; + index = page->index; + inode = mapping->host; info = &inode->u.shmem_i; - swap = __get_swap_page(2); - error = -ENOMEM; - if (!swap.val) { - activate_page(page); - SetPageDirty(page); - goto out; - } spin_lock(&info->lock); - entry = shmem_swp_entry(info, page->index); - if (IS_ERR(entry)) /* this had been allocted on page allocation */ + entry = shmem_swp_entry(info, index); + if (IS_ERR(entry)) /* this had been allocated on page allocation */ BUG(); - shmem_recalc_inode(page->mapping->host); - error = -EAGAIN; + shmem_recalc_inode(inode); if (entry->val) BUG(); - *entry = swap; - error = 0; - /* Remove the from the page cache */ + /* Remove it from the page cache */ lru_cache_del(page); remove_inode_page(page); + swap_list_lock(); + swap = get_swap_page(); + + if (!swap.val) { + swap_list_unlock(); + /* Add it back to the page cache */ + add_to_page_cache_locked(page, mapping, index); + activate_page(page); + SetPageDirty(page); + error = -ENOMEM; + goto out; + } + /* Add it to the swap cache */ add_to_swap_cache(page, swap); - page_cache_release(page); - info->swapped++; + swap_list_unlock(); - spin_unlock(&info->lock); set_page_dirty(page); + info->swapped++; + *entry = swap; + error = 0; out: + spin_unlock(&info->lock); UnlockPage(page); + page_cache_release(page); return error; } @@ -356,7 +366,7 @@ repeat: swap_free(*entry); *entry = (swp_entry_t) {0}; - delete_from_swap_cache_nolock(page); + delete_from_swap_cache(page); flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); diff --git a/mm/swap.c b/mm/swap.c index 18b504deb45c..37b9ea1babb6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,7 +54,6 @@ void deactivate_page_nolock(struct page * page) del_page_from_active_list(page); add_page_to_inactive_list(page); } - ClearPageReferenced(page); } void deactivate_page(struct page * page) @@ -73,7 +72,6 @@ void activate_page_nolock(struct page * page) del_page_from_inactive_list(page); add_page_to_active_list(page); } - SetPageReferenced(page); } void activate_page(struct page * page) diff --git a/mm/swap_state.c b/mm/swap_state.c index 0963ca7a9f41..ed712d227fc9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,17 +23,11 @@ */ static int swap_writepage(struct page *page) { - /* One for the page cache, one for this user, one for page->buffers */ - if (page_count(page) > 2 + !!page->buffers) - goto in_use; - if (swap_count(page) > 1) - goto in_use; - - delete_from_swap_cache_nolock(page); - UnlockPage(page); - return 0; - -in_use: + if (exclusive_swap_page(page)) { + delete_from_swap_cache(page); + UnlockPage(page); + return 0; + } rw_swap_page(WRITE, page); return 0; } @@ -75,8 +69,6 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry) #endif if (!PageLocked(page)) BUG(); - if (PageTestandSetSwapCache(page)) - BUG(); if (page->mapping) BUG(); @@ -92,51 +84,42 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry) */ void __delete_from_swap_cache(struct page *page) { - struct address_space *mapping = page->mapping; - swp_entry_t entry; - #ifdef SWAP_CACHE_INFO swap_cache_del_total++; #endif - if (mapping != &swapper_space) + if (!PageLocked(page)) BUG(); - if (!PageSwapCache(page) || !PageLocked(page)) + if (!PageSwapCache(page)) BUG(); - entry.val = page->index; - PageClearSwapCache(page); ClearPageDirty(page); __remove_inode_page(page); - swap_free(entry); } /* - * This will never put the page into the free list, the caller has - * a reference on the page. + * This must be called only on pages that have + * been verified to be in the swap cache and locked. + * It will never put the page into the free list, + * the caller has a reference on the page. */ -void delete_from_swap_cache_nolock(struct page *page) +void delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + if (!PageLocked(page)) BUG(); if (block_flushpage(page, 0)) lru_cache_del(page); + entry.val = page->index; + spin_lock(&pagecache_lock); __delete_from_swap_cache(page); spin_unlock(&pagecache_lock); - page_cache_release(page); -} -/* - * This must be called only on pages that have - * been verified to be in the swap cache and locked. - */ -void delete_from_swap_cache(struct page *page) -{ - lock_page(page); - delete_from_swap_cache_nolock(page); - UnlockPage(page); + swap_free(entry); + page_cache_release(page); } /* @@ -156,7 +139,7 @@ void free_page_and_swap_cache(struct page *page) */ if (PageSwapCache(page) && !TryLockPage(page)) { if (exclusive_swap_page(page)) - delete_from_swap_cache_nolock(page); + delete_from_swap_cache(page); UnlockPage(page); } page_cache_release(page); @@ -213,19 +196,24 @@ struct page * read_swap_cache_async(swp_entry_t entry) new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto out; /* Out of memory */ + if (TryLockPage(new_page)) + BUG(); /* * Check the swap cache again, in case we stalled above. - * The BKL is guarding against races between this check + * swap_list_lock is guarding against races between this check * and where the new page is added to the swap cache below. + * It is also guarding against race where try_to_swap_out + * allocates entry with get_swap_page then adds to cache. */ + swap_list_lock(); found_page = __find_get_page(&swapper_space, entry.val, hash); if (found_page) goto out_free_page; /* * Make sure the swap entry is still in use. It could have gone - * while caller waited for BKL, or while allocating page above, + * since caller dropped page_table_lock, while allocating page above, * or while allocating page in prior call via swapin_readahead. */ if (!swap_duplicate(entry)) /* Account for the swap cache */ @@ -234,13 +222,15 @@ struct page * read_swap_cache_async(swp_entry_t entry) /* * Add it to the swap cache and read its contents. */ - if (TryLockPage(new_page)) - BUG(); add_to_swap_cache(new_page, entry); + swap_list_unlock(); + rw_swap_page(READ, new_page); return new_page; out_free_page: + swap_list_unlock(); + UnlockPage(new_page); page_cache_release(new_page); out: return found_page; diff --git a/mm/swapfile.c b/mm/swapfile.c index c9783aa2dd70..f3b73b43abe6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/shm.h> +#include <linux/compiler.h> #include <asm/pgtable.h> @@ -33,7 +34,7 @@ struct swap_info_struct swap_info[MAX_SWAPFILES]; #define SWAPFILE_CLUSTER 256 -static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count) +static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; /* @@ -86,7 +87,8 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = count; + /* Initial count 1 for user reference + 1 for swap cache */ + si->swap_map[offset] = 2; nr_swap_pages--; si->cluster_next = offset+1; return offset; @@ -96,7 +98,12 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun return 0; } -swp_entry_t __get_swap_page(unsigned short count) +/* + * Callers of get_swap_page must hold swap_list_lock across the call, + * and across the following add_to_swap_cache, to guard against races + * with read_swap_cache_async. + */ +swp_entry_t get_swap_page(void) { struct swap_info_struct * p; unsigned long offset; @@ -104,20 +111,17 @@ swp_entry_t __get_swap_page(unsigned short count) int type, wrapped = 0; entry.val = 0; /* Out of memory */ - if (count >= SWAP_MAP_MAX) - goto bad_count; - swap_list_lock(); type = swap_list.next; if (type < 0) goto out; - if (nr_swap_pages == 0) + if (nr_swap_pages <= 0) goto out; while (1) { p = &swap_info[type]; if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { swap_device_lock(p); - offset = scan_swap_map(p, count); + offset = scan_swap_map(p); swap_device_unlock(p); if (offset) { entry = SWP_ENTRY(type,offset); @@ -142,21 +146,14 @@ swp_entry_t __get_swap_page(unsigned short count) goto out; /* out of swap space */ } out: - swap_list_unlock(); - return entry; - -bad_count: - printk(KERN_ERR "get_swap_page: bad count %hd from %p\n", - count, __builtin_return_address(0)); return entry; } - /* * Caller has made sure that the swapdevice corresponding to entry * is still around or has not been recycled. */ -void __swap_free(swp_entry_t entry, unsigned short count) +void swap_free(swp_entry_t entry) { struct swap_info_struct * p; unsigned long offset, type; @@ -180,9 +177,7 @@ void __swap_free(swp_entry_t entry, unsigned short count) swap_list.next = type; swap_device_lock(p); if (p->swap_map[offset] < SWAP_MAP_MAX) { - if (p->swap_map[offset] < count) - goto bad_count; - if (!(p->swap_map[offset] -= count)) { + if (!--(p->swap_map[offset])) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -207,11 +202,6 @@ bad_offset: bad_free: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; -bad_count: - swap_device_unlock(p); - swap_list_unlock(); - printk(KERN_ERR "swap_free: Bad count %hd current count %hd\n", count, p->swap_map[offset]); - goto out; } /* @@ -229,9 +219,9 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, { pte_t pte = *dir; - if (pte_to_swp_entry(pte).val != entry.val) + if (likely(pte_to_swp_entry(pte).val != entry.val)) return; - if (pte_none(pte) || pte_present(pte)) + if (unlikely(pte_none(pte) || pte_present(pte))) return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); @@ -458,7 +448,7 @@ static int try_to_unuse(unsigned int type) */ lock_page(page); if (PageSwapCache(page)) - delete_from_swap_cache_nolock(page); + delete_from_swap_cache(page); SetPageDirty(page); UnlockPage(page); flush_page_to_ram(page); @@ -567,14 +557,8 @@ asmlinkage long sys_swapoff(const char * specialfile) for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - if (p->swap_file) { - if (p->swap_file == nd.dentry) - break; - } else { - if (S_ISBLK(nd.dentry->d_inode->i_mode) - && (p->swap_device == nd.dentry->d_inode->i_rdev)) - break; - } + if (p->swap_file == nd.dentry) + break; } prev = type; } @@ -616,19 +600,21 @@ asmlinkage long sys_swapoff(const char * specialfile) goto out_dput; } if (p->swap_device) - blkdev_put(nd.dentry->d_inode->i_bdev, BDEV_SWAP); + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); path_release(&nd); swap_list_lock(); - nd.dentry = p->swap_file; - p->swap_file = NULL; + swap_device_lock(p); nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; p->swap_vfsmnt = NULL; + p->swap_file = NULL; p->swap_device = 0; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + swap_device_unlock(p); swap_list_unlock(); vfree(swap_map); err = 0; @@ -711,6 +697,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) unsigned long maxpages = 1; int swapfilesize; struct block_device *bdev = NULL; + unsigned short *swap_map; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -760,6 +747,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) p->swap_device = dev; set_blocksize(dev, PAGE_SIZE); + bd_acquire(swap_inode); bdev = swap_inode->i_bdev; bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); if (bdops) bdev->bd_op = bdops; @@ -772,29 +760,24 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) if (!dev || (blk_size[MAJOR(dev)] && !blk_size[MAJOR(dev)][MINOR(dev)])) goto bad_swap; - error = -EBUSY; - for (i = 0 ; i < nr_swapfiles ; i++) { - if (i == type) - continue; - if (dev == swap_info[i].swap_device) - goto bad_swap; - } swapfilesize = 0; if (blk_size[MAJOR(dev)]) swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] >> (PAGE_SHIFT - 10); - } else if (S_ISREG(swap_inode->i_mode)) { - error = -EBUSY; - for (i = 0 ; i < nr_swapfiles ; i++) { - if (i == type || !swap_info[i].swap_file) - continue; - if (swap_inode == swap_info[i].swap_file->d_inode) - goto bad_swap; - } + } else if (S_ISREG(swap_inode->i_mode)) swapfilesize = swap_inode->i_size >> PAGE_SHIFT; - } else + else goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + swap_header = (void *) __get_free_page(GFP_USER); if (!swap_header) { printk("Unable to start swapping: out of memory :-)\n"); @@ -900,6 +883,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) } p->swap_map[0] = SWAP_MAP_BAD; swap_list_lock(); + swap_device_lock(p); p->max = maxpages; p->flags = SWP_WRITEOK; p->pages = nr_good_pages; @@ -922,6 +906,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) } else { swap_info[prev].next = p - swap_info; } + swap_device_unlock(p); swap_list_unlock(); error = 0; goto out; @@ -929,11 +914,10 @@ bad_swap: if (bdev) blkdev_put(bdev, BDEV_SWAP); bad_swap_2: - if (p->swap_map) - vfree(p->swap_map); + swap_list_lock(); + swap_map = p->swap_map; nd.mnt = p->swap_vfsmnt; nd.dentry = p->swap_file; - swap_list_lock(); p->swap_device = 0; p->swap_file = NULL; p->swap_vfsmnt = NULL; @@ -942,6 +926,8 @@ bad_swap_2: if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; swap_list_unlock(); + if (swap_map) + vfree(swap_map); path_release(&nd); out: if (swap_header) @@ -987,43 +973,31 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; - /* Swap entry 0 is illegal */ - if (!entry.val) - goto out; type = SWP_TYPE(entry); if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; offset = SWP_OFFSET(entry); - if (offset >= p->max) - goto bad_offset; - if (!p->swap_map[offset]) - goto bad_unused; - /* - * Entry is valid, so increment the map count. - */ + swap_device_lock(p); - if (p->swap_map[offset] < SWAP_MAP_MAX) - p->swap_map[offset]++; - else { - if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } } swap_device_unlock(p); - result = 1; out: return result; bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; -bad_offset: - /* Don't report: can happen in read_swap_cache_async after swapoff */ - goto out; -bad_unused: - /* Don't report: can happen in read_swap_cache_async after blocking */ - goto out; } /* @@ -1068,7 +1042,7 @@ bad_unused: } /* - * Kernel_lock protects against swap device deletion. + * Prior swap_duplicate protects against swap device deletion. */ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, kdev_t *dev, struct inode **swapf) @@ -1108,8 +1082,8 @@ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, } /* - * Kernel_lock protects against swap device deletion. Grab an extra - * reference on the swaphandle so that it dos not become unused. + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { @@ -1117,20 +1091,23 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) unsigned long toff; struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; - *offset = SWP_OFFSET(entry); - toff = *offset = (*offset >> page_cluster) << page_cluster; + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; swap_device_lock(swapdev); do { /* Don't read-ahead past the end of the swap area */ if (toff >= swapdev->max) break; - /* Don't read in bad or busy pages */ + /* Don't read in free or bad pages */ if (!swapdev->swap_map[toff]) break; if (swapdev->swap_map[toff] == SWAP_MAP_BAD) break; - swapdev->swap_map[toff]++; toff++; ret++; } while (--i); diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d222ea4e2e..1c3f6b3e2f92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -52,14 +52,9 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { flush_tlb_page(vma, address); - mark_page_accessed(page); return 0; } - /* Don't bother with it if the page is otherwise active */ - if (PageActive(page)) - return 0; - if (TryLockPage(page)) return 0; @@ -85,8 +80,8 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* entry.val = page->index; if (pte_dirty(pte)) set_page_dirty(page); -set_swap_pte: swap_duplicate(entry); +set_swap_pte: set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: mm->rss--; @@ -130,16 +125,18 @@ drop_pte: * we have the swap cache set up to associate the * page with that swap entry. */ + swap_list_lock(); entry = get_swap_page(); - if (!entry.val) - goto out_unlock_restore; /* No swap space left */ - - /* Add it to the swap cache and mark it dirty */ - add_to_swap_cache(page, entry); - set_page_dirty(page); - goto set_swap_pte; + if (entry.val) { + /* Add it to the swap cache and mark it dirty */ + add_to_swap_cache(page, entry); + swap_list_unlock(); + set_page_dirty(page); + goto set_swap_pte; + } -out_unlock_restore: + /* No swap space left */ + swap_list_unlock(); set_pte(page_table, pte); UnlockPage(page); return 0; @@ -243,9 +240,9 @@ static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vm struct mm_struct *swap_mm = &init_mm; /* - * Returns non-zero if we scanned all `count' pages + * Returns remaining count of pages to be swapped out by followup call. */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone) +static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) { unsigned long address; struct vm_area_struct* vma; @@ -255,11 +252,12 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone * and ptes. */ spin_lock(&mm->page_table_lock); - *race = 1; - if (swap_mm != mm) - goto out_unlock; - *race = 0; address = mm->swap_address; + if (address == TASK_SIZE || swap_mm != mm) { + /* We raced: don't count this mm but try again */ + ++*mmcounter; + goto out_unlock; + } vma = find_vma(mm, address); if (vma) { if (address < vma->vm_start) @@ -267,31 +265,26 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone for (;;) { count = swap_out_vma(mm, vma, address, count, classzone); - if (!count) - goto out_unlock; vma = vma->vm_next; if (!vma) break; + if (!count) + goto out_unlock; address = vma->vm_start; } } - /* Reset to 0 when we reach the end of address space */ - mm->swap_address = 0; - - spin_lock(&mmlist_lock); - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - spin_unlock(&mmlist_lock); + /* Indicate that we reached the end of address space */ + mm->swap_address = TASK_SIZE; out_unlock: spin_unlock(&mm->page_table_lock); - return count; } static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - int counter, race; + int counter; struct mm_struct *mm; /* Then, look at the other mm's */ @@ -304,9 +297,10 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_ spin_lock(&mmlist_lock); mm = swap_mm; - if (mm == &init_mm) { + while (mm->swap_address == TASK_SIZE || mm == &init_mm) { + mm->swap_address = 0; mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == &init_mm) + if (mm == swap_mm) goto empty; swap_mm = mm; } @@ -315,13 +309,13 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_ atomic_inc(&mm->mm_users); spin_unlock(&mmlist_lock); - nr_pages = swap_out_mm(mm, nr_pages, &race, classzone); + nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); mmput(mm); if (!nr_pages) return 1; - } while (race || --counter >= 0); + } while (--counter >= 0); return 0; @@ -330,15 +324,15 @@ empty: return 0; } -static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)); -static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask) +static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask) { struct list_head * entry; - int __max_scan = *max_scan; spin_lock(&pagemap_lru_lock); - while (__max_scan && this_max_scan && (entry = lru->prev) != lru) { + while (max_scan && (entry = inactive_list.prev) != &inactive_list) { struct page * page; + swp_entry_t swap; if (unlikely(current->need_resched)) { spin_unlock(&pagemap_lru_lock); @@ -353,18 +347,16 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca if (unlikely(!PageInactive(page) && !PageActive(page))) BUG(); - this_max_scan--; - list_del(entry); - list_add(entry, lru); + list_add(entry, &inactive_list); if (PageTestandClearReferenced(page)) continue; + max_scan--; + if (unlikely(!memclass(page->zone, classzone))) continue; - __max_scan--; - /* Racy check to avoid trylocking when not worthwhile */ if (!page->buffers && page_count(page) != 1) continue; @@ -479,14 +471,24 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca } /* point of no return */ - if (likely(!PageSwapCache(page))) + if (likely(!PageSwapCache(page))) { + swap.val = 0; __remove_inode_page(page); - else + } else { + swap.val = page->index; __delete_from_swap_cache(page); + } spin_unlock(&pagecache_lock); __lru_cache_del(page); + if (unlikely(swap.val != 0)) { + /* must drop lru lock if getting swap_list lock */ + spin_unlock(&pagemap_lru_lock); + swap_free(swap); + spin_lock(&pagemap_lru_lock); + } + UnlockPage(page); /* effectively free the page here */ @@ -498,7 +500,6 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca } spin_unlock(&pagemap_lru_lock); - *max_scan = __max_scan; return nr_pages; } @@ -509,14 +510,10 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca * We move them the other way when we see the * reference bit on the page. */ -static void balance_inactive(int nr_pages) +static void refill_inactive(int nr_pages) { struct list_head * entry; - /* If we have more inactive pages than active don't do anything */ - if (nr_active_pages < nr_inactive_pages) - return; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; while (nr_pages-- && entry != &active_list) { @@ -541,14 +538,17 @@ static void balance_inactive(int nr_pages) static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - int max_scan = (nr_inactive_pages + nr_active_pages / DEF_PRIORITY) / priority; + int max_scan = nr_inactive_pages / priority; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - balance_inactive(nr_pages); - nr_pages = shrink_cache(&inactive_list, &max_scan, nr_inactive_pages, nr_pages, classzone, gfp_mask); + /* Do we want to age the active list? */ + if (nr_inactive_pages < nr_active_pages*2) + refill_inactive(nr_pages); + + nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask); if (nr_pages <= 0) return 0; |
