diff options
| author | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-04 18:13:58 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-04 18:13:58 -0800 |
| commit | 7216d3e927c3b6c5d28e5ffaa54afbb34649debb (patch) | |
| tree | 2b81b74fda9084131cd90731b0ec9e93e8edb853 | |
| parent | 4095b99c09e3db837b17f031da096a0213cdd527 (diff) | |
v2.4.3.8 -> v2.4.4
- Andrea Arcangeli: raw-io fixes
- Johannes Erdfelt: USB updates
- reiserfs update
- Al Viro: fsync/umount race fix
- Rusty Russell: netfilter sync
91 files changed, 1220 insertions, 615 deletions
@@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 4 -EXTRAVERSION =-pre8 +EXTRAVERSION = KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c index 49b0bb5463d9..969f09546ff9 100644 --- a/arch/arm/kernel/semaphore.c +++ b/arch/arm/kernel/semaphore.c @@ -165,3 +165,94 @@ int __down_trylock(struct semaphore * sem) spin_unlock_irqrestore(&semaphore_lock, flags); return 1; } + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * ip contains the semaphore pointer on entry. Save the C-clobbered + * registers (r0 to r3 and lr), but not ip, as we use it as a return + * value in some cases.. + */ +#ifdef CONFIG_CPU_26 +asm(" .section .text.lock, \"ax\" + .align 5 + .globl __down_failed +__down_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down + ldmfd sp!, {r0 - r3, pc}^ + + .align 5 + .globl __down_interruptible_failed +__down_interruptible_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down_interruptible + mov ip, r0 + ldmfd sp!, {r0 - r3, pc}^ + + .align 5 + .globl __down_trylock_failed +__down_trylock_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down_trylock + mov ip, r0 + ldmfd sp!, {r0 - r3, pc}^ + + .align 5 + .globl __up_wakeup +__up_wakeup: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __up + ldmfd sp!, {r0 - r3, pc}^ + + .previous + "); + +#else +/* 32 bit version */ +asm(" .section .text.lock, \"ax\" + .align 5 + .globl __down_failed +__down_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down + ldmfd sp!, {r0 - r3, pc} + + .align 5 + .globl __down_interruptible_failed +__down_interruptible_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down_interruptible + mov ip, r0 + ldmfd sp!, {r0 - r3, pc} + + .align 5 + .globl __down_trylock_failed 
+__down_trylock_failed: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __down_trylock + mov ip, r0 + ldmfd sp!, {r0 - r3, pc} + + .align 5 + .globl __up_wakeup +__up_wakeup: + stmfd sp!, {r0 - r3, lr} + mov r0, ip + bl __up + ldmfd sp!, {r0 - r3, pc} + + .previous + "); + +#endif diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 279f01f99e6d..85a2caa89124 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -241,7 +241,7 @@ void __init setup_ramdisk(int doload, int prompt, int image_start, unsigned int rd_sz) { #ifdef CONFIG_BLK_DEV_RAM - extern int rd_doload, rd_prompt, rd_image_start, rd_size; + extern int rd_size; rd_image_start = image_start; rd_prompt = prompt; diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index a082393de565..85af7212b06a 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -41,12 +41,6 @@ struct screen_info screen_info; unsigned char aux_device_present; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - extern int root_mountflags; extern char _etext, _edata, _end; diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 8a8e09d14217..250cf3eec0ca 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -5,8 +5,6 @@ CONFIG_X86=y CONFIG_ISA=y # CONFIG_SBUS is not set CONFIG_UID16=y -# CONFIG_RWSEM_GENERIC_SPINLOCK is not set -CONFIG_RWSEM_XCHGADD_ALGORITHM=y # # Code maturity level options @@ -44,6 +42,8 @@ CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y +# CONFIG_RWSEM_GENERIC_SPINLOCK is not set +CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index c2d87bf73d3d..05080042d2b1 100644 --- a/arch/i386/kernel/setup.c +++ 
b/arch/i386/kernel/setup.c @@ -142,12 +142,6 @@ struct e820map e820; unsigned char aux_device_present; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - extern int root_mountflags; extern char _text, _etext, _edata, _end; extern unsigned long cpu_khz; diff --git a/arch/ppc/kernel/chrp_setup.c b/arch/ppc/kernel/chrp_setup.c index b7136d00179c..0ba0c6c68873 100644 --- a/arch/ppc/kernel/chrp_setup.c +++ b/arch/ppc/kernel/chrp_setup.c @@ -95,12 +95,6 @@ static int max_width; unsigned long empty_zero_page[1024]; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - static const char *gg2_memtypes[4] = { "FPM", "SDRAM", "EDO", "BEDO" }; diff --git a/arch/ppc/kernel/m8260_setup.c b/arch/ppc/kernel/m8260_setup.c index 4185e5b44d16..5b4f8e85a386 100644 --- a/arch/ppc/kernel/m8260_setup.c +++ b/arch/ppc/kernel/m8260_setup.c @@ -67,12 +67,6 @@ extern unsigned long loops_per_jiffy; unsigned char __res[sizeof(bd_t)]; unsigned long empty_zero_page[1024]; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - extern char saved_command_line[256]; extern unsigned long find_available_memory(void); diff --git a/arch/ppc/kernel/m8xx_setup.c b/arch/ppc/kernel/m8xx_setup.c index 1d3260d784f4..9cbce65ac609 100644 --- a/arch/ppc/kernel/m8xx_setup.c +++ b/arch/ppc/kernel/m8xx_setup.c @@ -112,12 +112,6 @@ ide_pio_timings_t ide_pio_clocks[6]; #endif /* CONFIG_BLK_DEV_MPC8xx_IDE */ #endif /* CONFIG_BLK_DEV_IDE || CONFIG_BLK_DEV_IDE_MODULE */ -#ifdef CONFIG_BLK_DEV_RAM 
-extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - extern char saved_command_line[256]; extern unsigned long find_available_memory(void); diff --git a/arch/ppc/kernel/prep_setup.c b/arch/ppc/kernel/prep_setup.c index 772cceb61586..cf6edf78e556 100644 --- a/arch/ppc/kernel/prep_setup.c +++ b/arch/ppc/kernel/prep_setup.c @@ -103,12 +103,6 @@ extern unsigned long Hash_size, Hash_mask; extern int probingmem; extern unsigned long loops_per_jiffy; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - int __prep prep_get_cpuinfo(char *buffer) { diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 20b0bf018643..ba1337bec8dc 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -52,12 +52,6 @@ struct sh_cpuinfo boot_cpu_data = { CPU_SH_NONE, 0, 0, 0, }; struct screen_info screen_info; unsigned char aux_device_present = 0xaa; -#ifdef CONFIG_BLK_DEV_RAM -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ -#endif - #if defined(CONFIG_SH_GENERIC) || defined(CONFIG_SH_UNKNOWN) struct sh_machine_vector sh_mv; #endif diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 25f97d211463..a0bfe6c1231e 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -616,7 +616,6 @@ int __init chr_dev_init(void) printk("unable to get major %d for memory devs\n", MEM_MAJOR); memory_devfs_register(); rand_initialize(); - raw_init(); #ifdef CONFIG_I2C i2c_init_all(); #endif diff --git a/drivers/char/raw.c b/drivers/char/raw.c index ae4f0bbd3ec1..0b382aa4fceb 100644 --- a/drivers/char/raw.c +++ 
b/drivers/char/raw.c @@ -19,10 +19,15 @@ #define dprintk(x...) -static struct block_device *raw_device_bindings[256]; -static int raw_device_inuse[256]; -static int raw_device_sector_size[256]; -static int raw_device_sector_bits[256]; +typedef struct raw_device_data_s { + struct kiobuf * iobuf; + long iobuf_lock; + struct block_device *binding; + int inuse, sector_size, sector_bits; + struct semaphore mutex; +} raw_device_data_t; + +static raw_device_data_t raw_devices[256]; static ssize_t rw_raw_dev(int rw, struct file *, char *, size_t, loff_t *); @@ -45,11 +50,19 @@ static struct file_operations raw_ctl_fops = { open: raw_open, }; -void __init raw_init(void) +static int __init raw_init(void) { + int i; register_chrdev(RAW_MAJOR, "raw", &raw_fops); + + for (i = 0; i < 256; i++) + init_MUTEX(&raw_devices[i].mutex); + + return 0; } +__initcall(raw_init); + /* * Open/close code for raw IO. */ @@ -74,28 +87,43 @@ int raw_open(struct inode *inode, struct file *filp) return 0; } + down(&raw_devices[minor].mutex); /* * No, it is a normal raw device. All we need to do on open is * to check that the device is bound, and force the underlying * block device to a sector-size blocksize. 
*/ - bdev = raw_device_bindings[minor]; + bdev = raw_devices[minor].binding; + err = -ENODEV; if (!bdev) - return -ENODEV; + goto out; rdev = to_kdev_t(bdev->bd_dev); err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW); if (err) - return err; + goto out; /* * Don't change the blocksize if we already have users using * this device */ - if (raw_device_inuse[minor]++) - return 0; + if (raw_devices[minor].inuse++) + goto out; + + /* + * We'll just use one kiobuf + */ + + err = alloc_kiovec(1, &raw_devices[minor].iobuf); + if (err) { + raw_devices[minor].inuse--; + up(&raw_devices[minor].mutex); + blkdev_put(bdev, BDEV_RAW); + return err; + } + /* * Don't interfere with mounted devices: we cannot safely set @@ -112,13 +140,16 @@ int raw_open(struct inode *inode, struct file *filp) } set_blocksize(rdev, sector_size); - raw_device_sector_size[minor] = sector_size; + raw_devices[minor].sector_size = sector_size; for (sector_bits = 0; !(sector_size & 1); ) sector_size>>=1, sector_bits++; - raw_device_sector_bits[minor] = sector_bits; + raw_devices[minor].sector_bits = sector_bits; + + out: + up(&raw_devices[minor].mutex); - return 0; + return err; } int raw_release(struct inode *inode, struct file *filp) @@ -127,11 +158,12 @@ int raw_release(struct inode *inode, struct file *filp) struct block_device *bdev; minor = MINOR(inode->i_rdev); - lock_kernel(); - bdev = raw_device_bindings[minor]; + down(&raw_devices[minor].mutex); + bdev = raw_devices[minor].binding; + if (!--raw_devices[minor].inuse) + free_kiovec(1, &raw_devices[minor].iobuf); + up(&raw_devices[minor].mutex); blkdev_put(bdev, BDEV_RAW); - raw_device_inuse[minor]--; - unlock_kernel(); return 0; } @@ -184,26 +216,30 @@ int raw_ctl_ioctl(struct inode *inode, * major/minor numbers make sense. 
*/ - if (rq.block_major == NODEV || + if ((rq.block_major == NODEV && + rq.block_minor != NODEV) || rq.block_major > MAX_BLKDEV || rq.block_minor > MINORMASK) { err = -EINVAL; break; } - if (raw_device_inuse[minor]) { + down(&raw_devices[minor].mutex); + if (raw_devices[minor].inuse) { + up(&raw_devices[minor].mutex); err = -EBUSY; break; } - if (raw_device_bindings[minor]) - bdput(raw_device_bindings[minor]); - raw_device_bindings[minor] = + if (raw_devices[minor].binding) + bdput(raw_devices[minor].binding); + raw_devices[minor].binding = bdget(kdev_t_to_nr(MKDEV(rq.block_major, rq.block_minor))); + up(&raw_devices[minor].mutex); } else { struct block_device *bdev; kdev_t dev; - bdev = raw_device_bindings[minor]; + bdev = raw_devices[minor].binding; if (bdev) { dev = to_kdev_t(bdev->bd_dev); rq.block_major = MAJOR(dev); @@ -244,9 +280,9 @@ ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp) { struct kiobuf * iobuf; - int err; + int new_iobuf; + int err = 0; unsigned long blocknr, blocks; - unsigned long b[KIO_MAX_SECTORS]; size_t transferred; int iosize; int i; @@ -262,9 +298,23 @@ ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, */ minor = MINOR(filp->f_dentry->d_inode->i_rdev); - dev = to_kdev_t(raw_device_bindings[minor]->bd_dev); - sector_size = raw_device_sector_size[minor]; - sector_bits = raw_device_sector_bits[minor]; + + new_iobuf = 0; + iobuf = raw_devices[minor].iobuf; + if (test_and_set_bit(0, &raw_devices[minor].iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. 
+ */ + err = alloc_kiovec(1, &iobuf); + if (err) + goto out; + new_iobuf = 1; + } + + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; sector_mask = sector_size- 1; max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); @@ -275,18 +325,14 @@ ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, dprintk ("rw_raw_dev: dev %d:%d (+%d)\n", MAJOR(dev), MINOR(dev), limit); + err = -EINVAL; if ((*offp & sector_mask) || (size & sector_mask)) - return -EINVAL; - if ((*offp >> sector_bits) > limit) - return 0; - - /* - * We'll just use one kiobuf - */ - - err = alloc_kiovec(1, &iobuf); - if (err) - return err; + goto out_free; + err = 0; + if (size) + err = -ENXIO; + if ((*offp >> sector_bits) >= limit) + goto out_free; /* * Split the IO into KIO_MAX_SECTORS chunks, mapping and @@ -310,35 +356,37 @@ ssize_t rw_raw_dev(int rw, struct file *filp, char *buf, err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); if (err) break; -#if 0 - err = lock_kiovec(1, &iobuf, 1); - if (err) - break; -#endif - + for (i=0; i < blocks; i++) - b[i] = blocknr++; + iobuf->blocks[i] = blocknr++; - err = brw_kiovec(rw, 1, &iobuf, dev, b, sector_size); + err = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, sector_size); + if (rw == READ && err > 0) + mark_dirty_kiobuf(iobuf, err); + if (err >= 0) { transferred += err; size -= err; buf += err; } - unmap_kiobuf(iobuf); /* The unlock_kiobuf is implicit here */ + unmap_kiobuf(iobuf); if (err != iosize) break; } - free_kiovec(1, &iobuf); - if (transferred) { *offp += transferred; - return transferred; + err = transferred; } - + + out_free: + if (!new_iobuf) + clear_bit(0, &raw_devices[minor].iobuf_lock); + else + free_kiovec(1, &iobuf); + out: return err; } diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index dbea97bd7d05..f4c069d78aea 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -640,11 +640,11 @@ unsigned int 
__init ata66_ali15x3 (ide_hwif_t *hwif) void __init ide_init_ali15x3 (ide_hwif_t *hwif) { +#ifndef CONFIG_SPARC64 byte ideic, inmir; byte irq_routing_table[] = { -1, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; -#ifndef CONFIG_SPARC64 hwif->irq = hwif->channel ? 15 : 14; if (isa_dev) { diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c index e28ffdbe9449..20e40c022e90 100644 --- a/drivers/md/lvm-snap.c +++ b/drivers/md/lvm-snap.c @@ -246,7 +246,6 @@ int lvm_write_COW_table_block(vg_t * vg, lv_t * lv_snap) int length_tmp; ulong snap_pe_start, COW_table_sector_offset, COW_entries_per_pe, COW_chunks_per_pe, COW_entries_per_block; - ulong blocks[1]; const char * reason; kdev_t snap_phys_dev; struct kiobuf * iobuf = lv_snap->lv_iobuf; @@ -274,7 +273,7 @@ int lvm_write_COW_table_block(vg_t * vg, lv_t * lv_snap) COW_table_sector_offset = (idx % COW_entries_per_pe) / (SECTOR_SIZE / sizeof(lv_COW_table_disk_t)); /* COW table block to write next */ - blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10); + iobuf->blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10); /* store new COW_table entry */ lv_COW_table[idx_COW_table].pv_org_number = cpu_to_le64(lvm_pv_get_number(vg, lv_snap->lv_block_exception[idx].rdev_org)); @@ -290,7 +289,7 @@ int lvm_write_COW_table_block(vg_t * vg, lv_t * lv_snap) iobuf->nr_pages = 1; if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, - blocks, blksize_snap) != blksize_snap) + iobuf->blocks, blksize_snap) != blksize_snap) goto fail_raw_write; @@ -309,11 +308,11 @@ int lvm_write_COW_table_block(vg_t * vg, lv_t * lv_snap) snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; blksize_snap = lvm_get_blksize(snap_phys_dev); - blocks[0] = snap_pe_start >> (blksize_snap >> 10); - } else blocks[0]++; + iobuf->blocks[0] = snap_pe_start >> (blksize_snap >> 10); + } else 
iobuf->blocks[0]++; if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, - blocks, blksize_snap) != blksize_snap) + iobuf->blocks, blksize_snap) != blksize_snap) goto fail_raw_write; } @@ -352,7 +351,6 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off; int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; struct kiobuf * iobuf; - unsigned long blocks[KIO_MAX_SECTORS]; int blksize_snap, blksize_org, min_blksize, max_blksize; int max_sectors, nr_sectors; @@ -402,16 +400,16 @@ int lvm_snapshot_COW(kdev_t org_phys_dev, iobuf->length = nr_sectors << 9; - lvm_snapshot_prepare_blocks(blocks, org_start, + lvm_snapshot_prepare_blocks(iobuf->blocks, org_start, nr_sectors, blksize_org); if (brw_kiovec(READ, 1, &iobuf, org_phys_dev, - blocks, blksize_org) != (nr_sectors<<9)) + iobuf->blocks, blksize_org) != (nr_sectors<<9)) goto fail_raw_read; - lvm_snapshot_prepare_blocks(blocks, snap_start, + lvm_snapshot_prepare_blocks(iobuf->blocks, snap_start, nr_sectors, blksize_snap); if (brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, - blocks, blksize_snap) != (nr_sectors<<9)) + iobuf->blocks, blksize_snap) != (nr_sectors<<9)) goto fail_raw_write; } diff --git a/drivers/sbus/char/Makefile b/drivers/sbus/char/Makefile index dd0a8c59884f..12ee27686e6c 100644 --- a/drivers/sbus/char/Makefile +++ b/drivers/sbus/char/Makefile @@ -12,6 +12,7 @@ O_TARGET := sunchar.o export-objs := su.o bbc_i2c.o obj-y := sunkbd.o sunkbdmap.o sunmouse.o sunserial.o zs.o +list-multi := vfc.o bbc.o vfc-objs := vfc_dev.o vfc_i2c.o bbc-objs := bbc_i2c.o bbc_envctrl.o diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 9126933566a0..c4673d9ae094 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -717,6 +717,7 @@ int tw_findcards(Scsi_Host_Template *tw_host) continue; } + scsi_set_pci_device(host, tw_pci_dev); status_reg_value = inl(tw_dev->registers.status_reg_addr); dprintk(KERN_NOTICE "scsi%d : Found a 3ware 
Storage Controller at 0x%x, IRQ: %d P-chip: %d.%d\n", host->host_no, diff --git a/drivers/scsi/53c7,8xx.c b/drivers/scsi/53c7,8xx.c index 518352ad1426..e4ef976fa547 100644 --- a/drivers/scsi/53c7,8xx.c +++ b/drivers/scsi/53c7,8xx.c @@ -1155,7 +1155,7 @@ NCR53c7x0_init (struct Scsi_Host *host) { * Function : static int normal_init(Scsi_Host_Template *tpnt, int board, * int chip, u32 base, int io_port, int irq, int dma, int pcivalid, * unsigned char pci_bus, unsigned char pci_device_fn, - * long long options); + * struct pci_dev *pci_dev, long long options); * * Purpose : initializes a NCR53c7,8x0 based on base addresses, * IRQ, and DMA channel. @@ -1175,7 +1175,9 @@ NCR53c7x0_init (struct Scsi_Host *host) { static int __init normal_init (Scsi_Host_Template *tpnt, int board, int chip, u32 base, int io_port, int irq, int dma, int pci_valid, - unsigned char pci_bus, unsigned char pci_device_fn, long long options){ + unsigned char pci_bus, unsigned char pci_device_fn, + struct pci_dev *pci_dev, long long options) +{ struct Scsi_Host *instance; struct NCR53c7x0_hostdata *hostdata; char chip_str[80]; @@ -1319,6 +1321,7 @@ normal_init (Scsi_Host_Template *tpnt, int board, int chip, } instance->irq = irq; instance->dma_channel = dma; + scsi_set_pci_device(instance, pci_dev); hostdata->options = options; hostdata->dsa_len = dsa_len; @@ -1509,7 +1512,7 @@ ncr_pci_init (Scsi_Host_Template *tpnt, int board, int chip, } return normal_init (tpnt, board, chip, (int) base, io_port, - (int) irq, DMA_NONE, 1, bus, device_fn, options); + (int) irq, DMA_NONE, 1, bus, device_fn, pdev, options); } @@ -1553,6 +1556,7 @@ NCR53c7xx_detect(Scsi_Host_Template *tpnt){ overrides[current_override].data.normal.dma, 0 /* PCI data invalid */, 0 /* PCI bus place holder */, 0 /* PCI device_function place holder */, + NULL /* PCI pci_dev place holder */, overrides[current_override].options)) { ++count; } diff --git a/drivers/scsi/AM53C974.c b/drivers/scsi/AM53C974.c index f540892984f7..80b807bf723c 
100644 --- a/drivers/scsi/AM53C974.c +++ b/drivers/scsi/AM53C974.c @@ -680,6 +680,7 @@ static int __init AM53C974_init(Scsi_Host_Template * tpnt, struct pci_dev *pdev printk(KERN_WARNING "AM53C974: Unable to register host, aborting.\n"); return 0; } + scsi_set_pci_device(instance, pdev); hostdata = (struct AM53C974_hostdata *) instance->hostdata; instance->base = 0; instance->io_port = pci_resource_start(pdev, 0); diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index c357279d14bd..00b3eba88525 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -4827,6 +4827,9 @@ advansys_detect(Scsi_Host_Template *tpnt) if (shp == NULL) { continue; } +#ifdef CONFIG_PCI + scsi_set_pci_device(shp, pci_devp); +#endif /* Save a pointer to the Scsi_host of each board found. */ asc_host[asc_board_count++] = shp; diff --git a/drivers/scsi/aic7xxx/aic7xxx_linux.c b/drivers/scsi/aic7xxx/aic7xxx_linux.c index edaa0b3b2957..c02fc1d2f858 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_linux.c +++ b/drivers/scsi/aic7xxx/aic7xxx_linux.c @@ -1094,6 +1094,7 @@ aic7xxx_register_host(struct ahc_softc *ahc, Scsi_Host_Template *template) ahc_set_name(ahc, new_name); } host->unique_id = ahc->unit; + scsi_set_pci_device(host, ahc->dev_softc); aic7xxx_initialize_scsi_bus(ahc); ahc_unlock(ahc, &s); return (0); diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c index e21138a20b7d..0ae0dfa94306 100644 --- a/drivers/scsi/aic7xxx_old.c +++ b/drivers/scsi/aic7xxx_old.c @@ -8867,6 +8867,7 @@ aic7xxx_alloc(Scsi_Host_Template *sht, struct aic7xxx_host *temp) } DRIVER_LOCK_INIT } + scsi_set_pci_device(host, p->pdev); return (p); } diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 890d44426f7d..7f9d35696739 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -1787,6 +1787,7 @@ int atp870u_detect(Scsi_Host_Template * tpnt) shpnt->io_port = base_io; shpnt->n_io_port = 0x40; /* Number of bytes of I/O space used */ shpnt->irq = irq; + 
scsi_set_pci_device(shpnt, pdev[h]); restore_flags(flags); request_region(base_io, 0x40, "atp870u"); /* Register the IO ports that we use */ count++; diff --git a/drivers/scsi/cpqfcTSinit.c b/drivers/scsi/cpqfcTSinit.c index 434119ee1db4..83661d94d791 100644 --- a/drivers/scsi/cpqfcTSinit.c +++ b/drivers/scsi/cpqfcTSinit.c @@ -300,7 +300,7 @@ int cpqfcTS_detect(Scsi_Host_Template *ScsiHostTemplate) DEBUG_PCI(printk(" PciDev->baseaddress[]= %lx\n", PciDev->base_address[2])); DEBUG_PCI(printk(" PciDev->baseaddress[]= %lx\n", PciDev->base_address[3])); - + scsi_set_pci_device(HostAdapter, PciDev); HostAdapter->irq = PciDev->irq; // copy for Scsi layers // HP Tachlite uses two (255-byte) ranges of Port I/O (lower & upper), diff --git a/drivers/scsi/dmx3191d.c b/drivers/scsi/dmx3191d.c index 0d30d1f31574..543d380a25c8 100644 --- a/drivers/scsi/dmx3191d.c +++ b/drivers/scsi/dmx3191d.c @@ -86,6 +86,7 @@ int __init dmx3191d_detect(Scsi_Host_Template *tmpl) { release_region(port, DMX3191D_REGION); continue; } + scsi_set_pci_device(instance, pdev); instance->io_port = port; instance->irq = pdev->irq; NCR5380_init(instance, FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E); diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 884a2ea25a75..a2ff2bc74b85 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -805,7 +805,7 @@ static int fdomain_isa_detect( int *irq, int *iobase ) the PCI configuration registers. 
*/ #ifdef CONFIG_PCI -static int fdomain_pci_bios_detect( int *irq, int *iobase ) +static int fdomain_pci_bios_detect( int *irq, int *iobase, struct pci_dev **ret_pdev ) { unsigned int pci_irq; /* PCI interrupt line */ unsigned long pci_base; /* PCI I/O base address */ @@ -849,6 +849,7 @@ static int fdomain_pci_bios_detect( int *irq, int *iobase ) *irq = pci_irq; *iobase = pci_base; + *ret_pdev = pdev; #if DEBUG_DETECT printk( "scsi: <fdomain> TMC-3260 detect:" @@ -875,6 +876,7 @@ int fdomain_16x0_detect( Scsi_Host_Template *tpnt ) { int retcode; struct Scsi_Host *shpnt; + struct pci_dev *pdev = NULL; #if DO_DETECT int i = 0; int j = 0; @@ -910,7 +912,7 @@ int fdomain_16x0_detect( Scsi_Host_Template *tpnt ) #ifdef CONFIG_PCI /* Try PCI detection first */ - flag = fdomain_pci_bios_detect( &interrupt_level, &port_base ); + flag = fdomain_pci_bios_detect( &interrupt_level, &port_base, &pdev ); #endif if (!flag) { /* Then try ISA bus detection */ @@ -969,6 +971,7 @@ int fdomain_16x0_detect( Scsi_Host_Template *tpnt ) return 0; shpnt->irq = interrupt_level; shpnt->io_port = port_base; + scsi_set_pci_device(shpnt->pci_dev, pdev); shpnt->n_io_port = 0x10; print_banner( shpnt ); diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index ec5814c7fce3..eddbe8059578 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -3290,6 +3290,7 @@ int __init gdth_detect(Scsi_Host_Template *shtp) scsi_unregister(shp); continue; } + scsi_set_pci_device(shp, pcistr[ctr].pdev); shp->unchecked_isa_dma = 0; shp->irq = ha->irq; shp->dma_channel = 0xff; diff --git a/drivers/scsi/hosts.h b/drivers/scsi/hosts.h index 7d107541e8ec..b0f8412cac27 100644 --- a/drivers/scsi/hosts.h +++ b/drivers/scsi/hosts.h @@ -27,6 +27,7 @@ #include <linux/config.h> #include <linux/proc_fs.h> +#include <linux/pci.h> /* It is senseless to set SG_ALL any higher than this - the performance * does not get any better, and it wastes memory @@ -414,6 +415,12 @@ struct Scsi_Host void (*select_queue_depths)(struct 
Scsi_Host *, Scsi_Device *); /* + * For SCSI hosts which are PCI devices, set pci_dev so that + * we can do BIOS EDD 3.0 mappings + */ + struct pci_dev *pci_dev; + + /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. @@ -466,6 +473,13 @@ extern void scsi_unregister(struct Scsi_Host * i); extern void scsi_register_blocked_host(struct Scsi_Host * SHpnt); extern void scsi_deregister_blocked_host(struct Scsi_Host * SHpnt); +static inline void scsi_set_pci_device(struct Scsi_Host *SHpnt, + struct pci_dev *pdev) +{ + SHpnt->pci_dev = pdev; +} + + /* * Prototypes for functions/data in scsi_scan.c */ diff --git a/drivers/scsi/ini9100u.h b/drivers/scsi/ini9100u.h index f3f11920eecc..6b907ed6de1f 100644 --- a/drivers/scsi/ini9100u.h +++ b/drivers/scsi/ini9100u.h @@ -276,6 +276,7 @@ typedef struct Ha_Ctrl_Struc { spinlock_t HCS_AvailLock; spinlock_t HCS_SemaphLock; spinlock_t pSRB_lock; + struct pci_dev *pci_dev; } HCS; /* Bit Definition for HCB_Flags */ diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 604c1c308f6a..d4d14f97e54e 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -879,6 +879,7 @@ ips_detect(Scsi_Host_Template *SHT) { sh->cmd_per_lun = sh->hostt->cmd_per_lun; sh->unchecked_isa_dma = sh->hostt->unchecked_isa_dma; sh->use_clustering = sh->hostt->use_clustering; + scsi_set_pci_device(sh, dev[i]); #if LINUX_VERSION_CODE < LinuxVersionCode(2,3,32) sh->wish_block = FALSE; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 7989cbd0f6ec..13671dc607ec 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2510,6 +2510,7 @@ static int mega_findCard (Scsi_Host_Template * pHostTmpl, if (!host) goto err_unmap; + scsi_set_pci_device(host, pdev); megaCfg = (mega_host_config *) host->hostdata; memset (megaCfg, 0, sizeof (mega_host_config)); diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c 
index 524d8a7b6913..4df173a780b7 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c @@ -3715,6 +3715,7 @@ ncr_attach (Scsi_Host_Template *tpnt, int unit, ncr_device *device) instance->cmd_per_lun = MAX_TAGS; instance->can_queue = (MAX_START-4); instance->select_queue_depths = ncr53c8xx_select_queue_depths; + scsi_set_pci_device(instance, device->pdev); #ifdef SCSI_NCR_INTEGRITY_CHECKING np->check_integrity = 0; diff --git a/drivers/scsi/pci2000.c b/drivers/scsi/pci2000.c index 8b91bb2bf11f..38fa634e5919 100644 --- a/drivers/scsi/pci2000.c +++ b/drivers/scsi/pci2000.c @@ -711,6 +711,7 @@ int Pci2000_Detect (Scsi_Host_Template *tpnt) goto unregister; } + scsi_set_pci_device(pshost, pdev); pshost->irq = pdev->irq; setirq = 1; padapter->irqOwned = 0; diff --git a/drivers/scsi/pci2220i.c b/drivers/scsi/pci2220i.c index b217520e3700..b490a1225e82 100644 --- a/drivers/scsi/pci2220i.c +++ b/drivers/scsi/pci2220i.c @@ -2553,6 +2553,7 @@ int Pci2220i_Detect (Scsi_Host_Template *tpnt) if ( GetRegs (pshost, FALSE, pcidev) ) goto unregister; + scsi_set_pci_device(pshost, pcidev); pshost->max_id = padapter->numberOfDrives; for ( z = 0; z < padapter->numberOfDrives; z++ ) { @@ -2656,6 +2657,7 @@ unregister:; for ( z = 0; z < BIGD_MAXDRIVES; z++ ) DiskMirror[z].status = inb_p (padapter->regScratchPad + BIGD_RAID_0_STATUS + z); + scsi_set_pci_info(pshost, pcidev); pshost->max_id = padapter->numberOfDrives; padapter->failRegister = inb_p (padapter->regScratchPad + BIGD_ALARM_IMAGE); for ( z = 0; z < padapter->numberOfDrives; z++ ) diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 4656c964538e..9724f2091efd 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -814,6 +814,7 @@ qla1280_detect(Scsi_Host_Template *template) printk(KERN_WARNING "qla1280: Failed to register host, aborting.\n"); return 0; } + scsi_set_pci_device(host, pdev); ha = (scsi_qla_host_t *) host->hostdata; /* Clear our data area */ for( j =0, cp = (char *)ha; j < 
sizeof(scsi_qla_host_t); j++) diff --git a/drivers/scsi/qlogicfc.c b/drivers/scsi/qlogicfc.c index 4f5984953a31..3bf203c8dada 100644 --- a/drivers/scsi/qlogicfc.c +++ b/drivers/scsi/qlogicfc.c @@ -761,6 +761,7 @@ int isp2x00_detect(Scsi_Host_Template * tmpt) printk("qlogicfc%d : could not register host.\n", hosts); continue; } + scsi_set_pci_device(host, pdev); host->max_id = QLOGICFC_MAX_ID + 1; host->max_lun = QLOGICFC_MAX_LUN; host->hostt->use_new_eh_code = 1; diff --git a/drivers/scsi/qlogicisp.c b/drivers/scsi/qlogicisp.c index babf17608e2d..5e65cf8123ff 100644 --- a/drivers/scsi/qlogicisp.c +++ b/drivers/scsi/qlogicisp.c @@ -690,6 +690,7 @@ int isp1020_detect(Scsi_Host_Template *tmpt) memset(hostdata, 0, sizeof(struct isp1020_hostdata)); hostdata->pci_dev = pdev; + scsi_set_pci_device(host, pdev); if (isp1020_init(host)) goto fail_and_unregister; diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index ebf9a364a862..6e479d04e203 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -352,6 +352,26 @@ int scsi_ioctl_send_command(Scsi_Device * dev, Scsi_Ioctl_Command * sic) } /* + * The scsi_ioctl_get_pci() function places into arg the value + * pci_dev::slot_name (8 characters) for the PCI device (if any). + * Returns: 0 on success + * -ENXIO if there isn't a PCI device pointer + * (could be because the SCSI driver hasn't been + * updated yet, or because it isn't a SCSI + * device) + * any copy_to_user() error on failure there + */ +static int +scsi_ioctl_get_pci(Scsi_Device * dev, void *arg) +{ + + if (!dev->host->pci_dev) return -ENXIO; + return copy_to_user(arg, dev->host->pci_dev->slot_name, + sizeof(dev->host->pci_dev->slot_name)); +} + + +/* * the scsi_ioctl() function differs from most ioctls in that it does * not take a major/minor number as the dev field. Rather, it takes * a pointer to a scsi_devices[] element, a structure. 
@@ -453,6 +473,9 @@ int scsi_ioctl(Scsi_Device * dev, int cmd, void *arg) return ioctl_internal_command((Scsi_Device *) dev, scsi_cmd, START_STOP_TIMEOUT, NORMAL_RETRIES); break; + case SCSI_IOCTL_GET_PCI: + return scsi_ioctl_get_pci(dev, arg); + break; default: if (dev->host->hostt->ioctl) return dev->host->hostt->ioctl(dev, cmd, arg); diff --git a/drivers/scsi/sym53c8xx.c b/drivers/scsi/sym53c8xx.c index 9e80eeee10fd..734ada28ce69 100644 --- a/drivers/scsi/sym53c8xx.c +++ b/drivers/scsi/sym53c8xx.c @@ -5905,6 +5905,7 @@ ncr_attach (Scsi_Host_Template *tpnt, int unit, ncr_device *device) instance->dma_channel = 0; instance->cmd_per_lun = MAX_TAGS; instance->can_queue = (MAX_START-4); + scsi_set_pci_device(instance, device->pdev); np->check_integrity = 0; diff --git a/drivers/scsi/tmscsim.c b/drivers/scsi/tmscsim.c index d5937962f26f..7bcbaf62960c 100644 --- a/drivers/scsi/tmscsim.c +++ b/drivers/scsi/tmscsim.c @@ -2205,6 +2205,7 @@ static int __init DC390_init (PSHT psht, ULONG io_port, UCHAR Irq, PDEVDECL, UCH psh = scsi_register( psht, sizeof(DC390_ACB) ); if( !psh ) return( -1 ); + scsi_set_pci_device(psh, pdev); pACB = (PACB) psh->hostdata; DC390_LOCKA_INIT; DC390_LOCK_ACB; diff --git a/drivers/usb/usb-uhci.c b/drivers/usb/usb-uhci.c index b2750a6fe2e8..8f1e806332c9 100644 --- a/drivers/usb/usb-uhci.c +++ b/drivers/usb/usb-uhci.c @@ -16,7 +16,7 @@ * (C) Copyright 1999 Randy Dunlap * (C) Copyright 1999 Gregory P. 
Smith * - * $Id: usb-uhci.c,v 1.251 2000/11/30 09:47:54 acher Exp $ + * $Id: usb-uhci.c,v 1.259 2001/03/30 14:51:59 acher Exp $ */ #include <linux/config.h> @@ -52,7 +52,7 @@ /* This enables an extra UHCI slab for memory debugging */ #define DEBUG_SLAB -#define VERSTR "$Revision: 1.251 $ time " __TIME__ " " __DATE__ +#define VERSTR "$Revision: 1.259 $ time " __TIME__ " " __DATE__ #include <linux/usb.h> #include "usb-uhci.h" @@ -803,7 +803,7 @@ _static int uhci_submit_bulk_urb (urb_t *urb, urb_t *bulk_urb) { uhci_t *s = (uhci_t*) urb->dev->bus->hcpriv; urb_priv_t *urb_priv = urb->hcpriv; - uhci_desc_t *qh, *td, *nqh, *bqh, *first_td=NULL; + uhci_desc_t *qh, *td, *nqh=NULL, *bqh=NULL, *first_td=NULL; unsigned long destination, status; char *data; unsigned int pipe = urb->pipe; @@ -900,8 +900,8 @@ _static int uhci_submit_bulk_urb (urb_t *urb, urb_t *bulk_urb) data += pktsze; len -= pktsze; - - last = (len == 0 && (usb_pipein(pipe) || pktsze < maxsze || !(urb->transfer_flags & USB_DISABLE_SPD))); + // Use USB_ZERO_PACKET to finish bulk OUTs always with a zero length packet + last = (len == 0 && (usb_pipein(pipe) || pktsze < maxsze || !(urb->transfer_flags & USB_ZERO_PACKET))); if (last) td->hw.td.status |= TD_CTRL_IOC; // last one generates INT @@ -1178,6 +1178,9 @@ _static void uhci_cleanup_unlink(uhci_t *s, int force) urb_priv = (urb_priv_t*)urb->hcpriv; q = urb->urb_list.next; + if (!urb_priv) // avoid crash when URB is corrupted + break; + if (force || ((urb_priv->started != 0xffffffff) && (urb_priv->started != now))) { async_dbg("async cleanup %p",urb); @@ -1205,7 +1208,8 @@ _static void uhci_cleanup_unlink(uhci_t *s, int force) pipe = urb->pipe; // completion may destroy all... 
dev = urb->dev; urb_priv = urb->hcpriv; - + list_del (&urb->urb_list); + if (urb->complete) { spin_unlock(&s->urb_list_lock); urb->dev = NULL; @@ -1229,7 +1233,6 @@ _static void uhci_cleanup_unlink(uhci_t *s, int force) kfree (urb_priv); #endif - list_del (&urb->urb_list); } } } @@ -2282,8 +2285,11 @@ _static int process_transfer (uhci_t *s, urb_t *urb, int mode) for (; p != &qh->vertical; p = p->next) { desc = list_entry (p, uhci_desc_t, vertical); - if (desc->hw.td.status & TD_CTRL_ACTIVE) // do not process active TDs + if (desc->hw.td.status & TD_CTRL_ACTIVE) { // do not process active TDs + if (mode==2) // if called from async_unlink + uhci_clean_transfer(s, urb, qh, mode); return ret; + } actual_length = (desc->hw.td.status + 1) & 0x7ff; // extract transfer parameters from TD maxlength = (((desc->hw.td.info >> 21) & 0x7ff) + 1) & 0x7ff; @@ -2625,19 +2631,22 @@ _static int process_urb (uhci_t *s, struct list_head *p) // Completion if (urb->complete) { + int was_unlinked = (urb->status == -ENOENT); urb->dev = NULL; spin_unlock(&s->urb_list_lock); urb->complete ((struct urb *) urb); // Re-submit the URB if ring-linked - if (is_ring && (urb->status != -ENOENT) && !contains_killed) { + if (is_ring && !was_unlinked && !contains_killed) { urb->dev=usb_dev; uhci_submit_urb (urb); - } + } else + urb = 0; spin_lock(&s->urb_list_lock); } usb_dec_dev_use (usb_dev); - spin_unlock(&urb->lock); + if (urb) + spin_unlock(&urb->lock); } } @@ -2942,6 +2951,8 @@ uhci_pci_probe (struct pci_dev *dev, const struct pci_device_id *id) if (pci_enable_device(dev) < 0) return -ENODEV; + pci_set_master(dev); + /* Search for the IO base address.. 
*/ for (i = 0; i < 6; i++) { @@ -2955,8 +2966,7 @@ uhci_pci_probe (struct pci_dev *dev, const struct pci_device_id *id) break; /* disable legacy emulation */ pci_write_config_word (dev, USBLEGSUP, 0); - - pci_set_master(dev); + return alloc_uhci(dev, dev->irq, io_addr, io_size); } return -ENODEV; diff --git a/fs/buffer.c b/fs/buffer.c index 07528fb47f43..3ad092069461 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -301,6 +301,23 @@ void sync_dev(kdev_t dev) */ } +int fsync_super(struct super_block *sb) +{ + kdev_t dev = sb->s_dev; + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes_sb(sb); + lock_super(sb); + if (sb->s_dirt && sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + DQUOT_SYNC(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + int fsync_dev(kdev_t dev) { sync_buffers(dev, 0); @@ -1181,10 +1198,10 @@ static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) kmem_cache_free(bh_cachep, bh); } else { bh->b_blocknr = -1; - init_waitqueue_head(&bh->b_wait); + bh->b_this_page = NULL; + nr_unused_buffer_heads++; bh->b_next_free = unused_list; - bh->b_this_page = NULL; unused_list = bh; } } @@ -1213,8 +1230,8 @@ static struct buffer_head * get_unused_buffer_head(int async) * more buffer-heads itself. Thus SLAB_BUFFER. */ if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); + bh->b_blocknr = -1; + bh->b_this_page = NULL; return bh; } @@ -1976,7 +1993,6 @@ static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) end_kio_request(kiobuf, uptodate); } - /* * For brw_kiovec: submit a set of buffer_head temporary IOs and wait * for them to complete. Clean up the buffer_heads afterwards. 
@@ -1984,21 +2000,18 @@ static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) { - int iosize; + int iosize, err; int i; struct buffer_head *tmp; - iosize = 0; - spin_lock(&unused_list_lock); + err = 0; for (i = nr; --i >= 0; ) { iosize += size; tmp = bh[i]; if (buffer_locked(tmp)) { - spin_unlock(&unused_list_lock); wait_on_buffer(tmp); - spin_lock(&unused_list_lock); } if (!buffer_uptodate(tmp)) { @@ -2006,13 +2019,13 @@ static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) clearing iosize on error calculates the amount of IO before the first error. */ iosize = 0; + err = -EIO; } - __put_unused_buffer_head(tmp); } - spin_unlock(&unused_list_lock); - - return iosize; + if (iosize) + return iosize; + return err; } /* @@ -2041,7 +2054,7 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], unsigned long blocknr; struct kiobuf * iobuf = NULL; struct page * map; - struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; + struct buffer_head *tmp, **bhs = NULL; if (!nr) return 0; @@ -2067,22 +2080,20 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], offset = iobuf->offset; length = iobuf->length; iobuf->errno = 0; + if (!bhs) + bhs = iobuf->bh; for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { map = iobuf->maplist[pageind]; if (!map) { err = -EFAULT; - goto error; + goto finished; } while (length > 0) { blocknr = b[bufind++]; - tmp = get_unused_buffer_head(0); - if (!tmp) { - err = -ENOMEM; - goto error; - } - + tmp = bhs[bhind++]; + tmp->b_dev = B_FREE; tmp->b_size = size; set_bh_page(tmp, map, offset); @@ -2096,9 +2107,9 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], if (rw == WRITE) { set_bit(BH_Uptodate, &tmp->b_state); clear_bit(BH_Dirty, &tmp->b_state); - } + } else + set_bit(BH_Uptodate, &tmp->b_state); - bh[bhind++] = tmp; length -= size; offset += size; @@ -2109,7 +2120,8 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], * Wait 
for IO if we have got too much */ if (bhind >= KIO_MAX_SECTORS) { - err = wait_kio(rw, bhind, bh, size); + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); if (err >= 0) transferred += err; else @@ -2127,7 +2139,8 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], /* Is there any IO still left to submit? */ if (bhind) { - err = wait_kio(rw, bhind, bh, size); + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); if (err >= 0) transferred += err; else @@ -2138,16 +2151,6 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], if (transferred) return transferred; return err; - - error: - /* We got an error allocating the bh'es. Just free the current - buffer_heads and exit. */ - spin_lock(&unused_list_lock); - for (i = bhind; --i >= 0; ) { - __put_unused_buffer_head(bh[i]); - } - spin_unlock(&unused_list_lock); - goto finished; } /* @@ -2599,7 +2602,7 @@ static int sync_old_buffers(void) { lock_kernel(); sync_supers(0); - sync_inodes(0); + sync_unlocked_inodes(); unlock_kernel(); flush_dirty_buffers(1); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 5a640dda718c..fad286d0e326 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -97,7 +97,6 @@ static struct super_block * coda_read_super(struct super_block *sb, struct coda_sb_info *sbi = NULL; struct venus_comm *vc = NULL; ViceFid fid; - kdev_t dev = sb->s_dev; int error; int idx; ENTRY; @@ -139,7 +138,6 @@ static struct super_block * coda_read_super(struct super_block *sb, sb->s_blocksize = 1024; /* XXXXX what do we put here?? 
*/ sb->s_blocksize_bits = 10; sb->s_magic = CODA_SUPER_MAGIC; - sb->s_dev = dev; sb->s_op = &coda_super_operations; /* get root fid from Venus: this needs the root inode */ diff --git a/fs/dcache.c b/fs/dcache.c index 4b7ab57f4d9f..8c0c05187726 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1225,6 +1225,18 @@ static void __init dcache_init(unsigned long mempages) } while (i); } +static void init_buffer_head(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + { + struct buffer_head * bh = (struct buffer_head *) foo; + + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + } +} + /* SLAB cache for __getname() consumers */ kmem_cache_t *names_cachep; @@ -1242,7 +1254,7 @@ void __init vfs_caches_init(unsigned long mempages) { bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, init_buffer_head, NULL); if(!bh_cachep) panic("Cannot create buffer head SLAB cache"); diff --git a/fs/inode.c b/fs/inode.c index d32edf37ce00..a40c75c73dcd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -133,28 +133,26 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block * sb = inode->i_sb; - if (sb) { - /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op && sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op && sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - inode->i_state |= 
flags; - /* Only add valid (ie hashed) inodes to the dirty list */ - if (!list_empty(&inode->i_hash)) { - list_del(&inode->i_list); - list_add(&inode->i_list, &sb->s_dirty); - } + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + inode->i_state |= flags; + /* Only add valid (ie hashed) inodes to the dirty list */ + if (!(inode->i_state & I_LOCK) && !list_empty(&inode->i_hash)) { + list_del(&inode->i_list); + list_add(&inode->i_list, &sb->s_dirty); } - spin_unlock(&inode_lock); } + spin_unlock(&inode_lock); } static void __wait_on_inode(struct inode * inode) @@ -192,7 +190,7 @@ static inline void __iget(struct inode * inode) return; } atomic_inc(&inode->i_count); - if (!(inode->i_state & I_DIRTY)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); list_add(&inode->i_list, &inode_in_use); } @@ -204,9 +202,10 @@ static inline void __sync_one(struct inode *inode, int sync) unsigned dirty; list_del(&inode->i_list); - list_add(&inode->i_list, atomic_read(&inode->i_count) - ? 
&inode_in_use - : &inode_unused); + list_add(&inode->i_list, &inode->i_sb->s_locked_inodes); + + if (inode->i_state & I_LOCK) + BUG(); /* Set I_LOCK, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY; @@ -224,6 +223,17 @@ static inline void __sync_one(struct inode *inode, int sync) spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; + if (!(inode->i_state & I_FREEING)) { + struct list_head *to; + if (inode->i_state & I_DIRTY) + to = &inode->i_sb->s_dirty; + else if (atomic_read(&inode->i_count)) + to = &inode_in_use; + else + to = &inode_unused; + list_del(&inode->i_list); + list_add(&inode->i_list, to); + } wake_up(&inode->i_wait); } @@ -245,7 +255,37 @@ static inline void sync_list(struct list_head *head) struct list_head * tmp; while ((tmp = head->prev) != head) - sync_one(list_entry(tmp, struct inode, i_list), 0); + __sync_one(list_entry(tmp, struct inode, i_list), 0); +} + +static inline int wait_on_dirty(struct list_head *head) +{ + struct list_head * tmp; + list_for_each(tmp, head) { + struct inode *inode = list_entry(tmp, struct inode, i_list); + if (!inode->i_state & I_DIRTY) + continue; + __iget(inode); + spin_unlock(&inode_lock); + __wait_on_inode(inode); + iput(inode); + spin_lock(&inode_lock); + return 1; + } + return 0; +} + +static inline void wait_on_locked(struct list_head *head) +{ + struct list_head * tmp; + while ((tmp = head->prev) != head) { + struct inode *inode = list_entry(tmp, struct inode, i_list); + __iget(inode); + spin_unlock(&inode_lock); + __wait_on_inode(inode); + iput(inode); + spin_lock(&inode_lock); + } } static inline int try_to_sync_unused_list(struct list_head *head) @@ -256,8 +296,7 @@ static inline int try_to_sync_unused_list(struct list_head *head) while ((tmp = tmp->prev) != head) { inode = list_entry(tmp, struct inode, i_list); - if (!(inode->i_state & I_LOCK) - && !atomic_read(&inode->i_count)) { + if (!atomic_read(&inode->i_count)) { /* * We're under PF_MEMALLOC here, and syncing the * inode may have to allocate memory. 
To avoid @@ -287,7 +326,31 @@ static inline int try_to_sync_unused_list(struct list_head *head) * sync_inodes goes through the super block's dirty list, * writes them out, and puts them back on the normal list. */ + +/* + * caller holds exclusive lock on sb->s_umount + */ +void sync_inodes_sb(struct super_block *sb) +{ + spin_lock(&inode_lock); + sync_list(&sb->s_dirty); + wait_on_locked(&sb->s_locked_inodes); + spin_unlock(&inode_lock); +} + +void sync_unlocked_inodes(void) +{ + struct super_block * sb = sb_entry(super_blocks.next); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + if (!list_empty(&sb->s_dirty)) { + spin_lock(&inode_lock); + sync_list(&sb->s_dirty); + spin_unlock(&inode_lock); + } + } +} + void sync_inodes(kdev_t dev) { struct super_block * sb = sb_entry(super_blocks.next); @@ -295,19 +358,23 @@ void sync_inodes(kdev_t dev) /* * Search the super_blocks array for the device(s) to sync. */ - spin_lock(&inode_lock); for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { if (!sb->s_dev) continue; if (dev && sb->s_dev != dev) continue; - - sync_list(&sb->s_dirty); - + down_read(&sb->s_umount); + if (sb->s_dev && (sb->s_dev == dev || !dev)) { + spin_lock(&inode_lock); + do { + sync_list(&sb->s_dirty); + } while (wait_on_dirty(&sb->s_locked_inodes)); + spin_unlock(&inode_lock); + } + up_read(&sb->s_umount); if (dev) break; } - spin_unlock(&inode_lock); } /* @@ -517,6 +584,7 @@ int invalidate_inodes(struct super_block * sb) busy = invalidate_list(&inode_in_use, sb, &throw_away); busy |= invalidate_list(&inode_unused, sb, &throw_away); busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -557,7 +625,7 @@ free_unused: entry = entry->prev; inode = INODE(tmp); - if (inode->i_state & (I_FREEING|I_CLEAR)) + if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) BUG(); if (!CAN_UNUSE(inode)) continue; 
@@ -940,10 +1008,9 @@ void iput(struct inode *inode) BUG(); } else { if (!list_empty(&inode->i_hash)) { - if (!(inode->i_state & I_DIRTY)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { list_del(&inode->i_list); - list_add(&inode->i_list, - &inode_unused); + list_add(&inode->i_list, &inode_unused); } inodes_stat.nr_unused++; spin_unlock(&inode_lock); @@ -1086,23 +1153,25 @@ void remove_dquot_ref(kdev_t dev, short type) /* We have to be protected against other CPUs */ spin_lock(&inode_lock); - for (act_head = inode_in_use.next; act_head != &inode_in_use; act_head = act_head->next) { + list_for_each(act_head, &inode_in_use) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = inode_unused.next; act_head != &inode_unused; act_head = act_head->next) { + list_for_each(act_head, &inode_unused) { inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb != sb || !IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } - for (act_head = sb->s_dirty.next; act_head != &sb->s_dirty; act_head = act_head->next) { + list_for_each(act_head, &sb->s_dirty) { inode = list_entry(act_head, struct inode, i_list); - if (!IS_QUOTAINIT(inode)) - continue; - remove_inode_dquot_ref(inode, type, &tofree_head); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_locked_inodes) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); } spin_unlock(&inode_lock); diff --git a/fs/iobuf.c b/fs/iobuf.c index 5401243557b4..9cfd01eaf9a4 100644 --- a/fs/iobuf.c +++ b/fs/iobuf.c @@ -8,9 
+8,7 @@ #include <linux/iobuf.h> #include <linux/slab.h> - -static kmem_cache_t *kiobuf_cachep; - +#include <linux/vmalloc.h> void end_kio_request(struct kiobuf *kiobuf, int uptodate) { @@ -24,18 +22,7 @@ void end_kio_request(struct kiobuf *kiobuf, int uptodate) } } - -void __init kiobuf_setup(void) -{ - kiobuf_cachep = kmem_cache_create("kiobuf", - sizeof(struct kiobuf), - 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if(!kiobuf_cachep) - panic("Cannot create kernel iobuf cache\n"); -} - -void kiobuf_init(struct kiobuf *iobuf) +static void kiobuf_init(struct kiobuf *iobuf) { memset(iobuf, 0, sizeof(*iobuf)); init_waitqueue_head(&iobuf->wait_queue); @@ -43,18 +30,48 @@ void kiobuf_init(struct kiobuf *iobuf) iobuf->maplist = iobuf->map_array; } +int alloc_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) + if (!(kiobuf->bh[i] = kmem_cache_alloc(bh_cachep, SLAB_KERNEL))) { + while (i--) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } + return -ENOMEM; + } + return 0; +} + +void free_kiobuf_bhs(struct kiobuf * kiobuf) +{ + int i; + + for (i = 0; i < KIO_MAX_SECTORS; i++) { + kmem_cache_free(bh_cachep, kiobuf->bh[i]); + kiobuf->bh[i] = NULL; + } +} + int alloc_kiovec(int nr, struct kiobuf **bufp) { int i; struct kiobuf *iobuf; for (i = 0; i < nr; i++) { - iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL); + iobuf = vmalloc(sizeof(struct kiobuf)); if (!iobuf) { free_kiovec(i, bufp); return -ENOMEM; } kiobuf_init(iobuf); + if (alloc_kiobuf_bhs(iobuf)) { + vfree(iobuf); + free_kiovec(i, bufp); + return -ENOMEM; + } bufp[i] = iobuf; } @@ -72,7 +89,8 @@ void free_kiovec(int nr, struct kiobuf **bufp) unlock_kiovec(1, &iobuf); if (iobuf->array_len > KIO_STATIC_PAGES) kfree (iobuf->maplist); - kmem_cache_free(kiobuf_cachep, bufp[i]); + free_kiobuf_bhs(iobuf); + vfree(bufp[i]); } } @@ -115,11 +133,12 @@ void kiobuf_wait_for_io(struct kiobuf *kiobuf) add_wait_queue(&kiobuf->wait_queue, &wait); repeat: - 
run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (atomic_read(&kiobuf->io_count) != 0) { + run_task_queue(&tq_disk); schedule(); - goto repeat; + if (atomic_read(&kiobuf->io_count) != 0) + goto repeat; } tsk->state = TASK_RUNNING; remove_wait_queue(&kiobuf->wait_queue, &wait); diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index ea83fd5496af..eb9eb64b994f 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -8,6 +8,7 @@ #include <linux/reiserfs_fs.h> #include <linux/locks.h> #include <asm/bitops.h> +#include <linux/list.h> #else @@ -580,6 +581,12 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, if (p_s_inode->u.reiserfs_i.i_prealloc_count > 0) { p_s_inode->u.reiserfs_i.i_prealloc_count--; *free_blocknrs = p_s_inode->u.reiserfs_i.i_prealloc_block++; + + /* if no more preallocated blocks, remove inode from list */ + if (! p_s_inode->u.reiserfs_i.i_prealloc_count) { + list_del(&p_s_inode->u.reiserfs_i.i_prealloc_list); + } + return ret; } @@ -633,6 +640,11 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, *free_blocknrs = p_s_inode->u.reiserfs_i.i_prealloc_block; p_s_inode->u.reiserfs_i.i_prealloc_block++; + /* if inode has preallocated blocks, link him to list */ + if (p_s_inode->u.reiserfs_i.i_prealloc_count) { + list_add(&p_s_inode->u.reiserfs_i.i_prealloc_list, + &SB_JOURNAL(th->t_super)->j_prealloc_list); + } /* we did actually manage to get 1 block */ if (ret != CARRY_ON && allocated[0] > 0) { return CARRY_ON ; @@ -664,16 +676,43 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, // a portion of this function, was derived from minix or ext2's // analog. You should be able to tell which portion by looking at the // ext2 code and comparing. 
+static void __discard_prealloc (struct reiserfs_transaction_handle * th, + struct inode * inode) +{ + while (inode->u.reiserfs_i.i_prealloc_count > 0) { + reiserfs_free_block(th,inode->u.reiserfs_i.i_prealloc_block); + inode->u.reiserfs_i.i_prealloc_block++; + inode->u.reiserfs_i.i_prealloc_count --; + } + list_del (&(inode->u.reiserfs_i.i_prealloc_list)); +} + void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, struct inode * inode) { - if (inode->u.reiserfs_i.i_prealloc_count > 0) { - while (inode->u.reiserfs_i.i_prealloc_count--) { - reiserfs_free_block(th,inode->u.reiserfs_i.i_prealloc_block); - inode->u.reiserfs_i.i_prealloc_block++; - } +#ifdef CONFIG_REISERFS_CHECK + if (inode->u.reiserfs_i.i_prealloc_count < 0) + reiserfs_warning("zam-4001:" __FUNCTION__ ": inode has negative prealloc blocks count.\n"); +#endif + if (inode->u.reiserfs_i.i_prealloc_count > 0) { + __discard_prealloc(th, inode); + } +} + +void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) +{ + struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + struct inode * inode; + + while (!list_empty(plist)) { + inode = list_entry(plist->next, struct inode, u.reiserfs_i.i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!inode->u.reiserfs_i.i_prealloc_count) { + reiserfs_warning("zam-4001:" __FUNCTION__ ": inode is in prealloc list but has no preallocated blocks.\n"); } - inode->u.reiserfs_i.i_prealloc_count = 0; +#endif + __discard_prealloc(th, inode); + } } #endif diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 52237eb283f1..c2c3222791ba 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1185,13 +1185,19 @@ struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key) if (!inode) return inode ; - // if (comp_short_keys (INODE_PKEY (inode), key)) { if (is_bad_inode (inode)) { reiserfs_warning ("vs-13048: reiserfs_iget: " "bad_inode. 
Stat data of (%lu %lu) not found\n", key->on_disk_key.k_dir_id, key->on_disk_key.k_objectid); iput (inode); inode = 0; + } else if (comp_short_keys (INODE_PKEY (inode), key)) { + reiserfs_warning ("vs-13049: reiserfs_iget: " + "Looking for (%lu %lu), found inode of (%lu %lu)\n", + key->on_disk_key.k_dir_id, key->on_disk_key.k_objectid, + INODE_PKEY (inode)->k_dir_id, INODE_PKEY (inode)->k_objectid); + iput (inode); + inode = 0; } return inode; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d5513d59ffbb..bf77724ef03c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1925,8 +1925,11 @@ int journal_init(struct super_block *p_s_sb) { free_journal_ram(p_s_sb) ; return 1 ; } - SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this where it belongs */ + SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this + where it belongs */ + INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list); + if (reiserfs_dont_log (p_s_sb)) return 0; @@ -2983,6 +2986,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b flush = 1 ; } +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into + * the transaction */ +#endif + rs = SB_DISK_SUPER_BLOCK(p_s_sb) ; /* setup description block */ d_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start, p_s_sb->s_blocksize) ; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 27305305eced..ed782d909382 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -466,40 +466,42 @@ static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct in /* find the proper place for the new entry */ memset (bit_string, 0, sizeof (bit_string)); de.de_gen_number_bit_string = (char *)bit_string; - if (reiserfs_find_entry (dir, name, namelen, &path, &de) == NAME_FOUND) { + retval = reiserfs_find_entry (dir, name, namelen, &path, &de); + if 
(retval != NAME_NOT_FOUND) { if (buffer != small_buf) reiserfs_kfree (buffer, buflen, dir->i_sb); pathrelse (&path); + + if (retval != NAME_FOUND) { + reiserfs_warning ("zam-7002:" __FUNCTION__ ": \"reiserfs_find_entry\" has returned" + " unexpected value (%d)\n", retval); + } + return -EEXIST; } - if (find_first_nonzero_bit (bit_string, MAX_GENERATION_NUMBER + 1) < MAX_GENERATION_NUMBER + 1) { - /* there are few names with given hash value */ - gen_number = find_first_zero_bit (bit_string, MAX_GENERATION_NUMBER + 1); - if (gen_number > MAX_GENERATION_NUMBER) { - /* there is no free generation number */ - reiserfs_warning ("reiserfs_add_entry: Congratulations! we have got hash function screwed up\n"); - if (buffer != small_buf) - reiserfs_kfree (buffer, buflen, dir->i_sb); - pathrelse (&path); - return -EBUSY; //HASHCOLLISION;//EBADSLT - } - /* adjust offset of directory enrty */ - deh->deh_offset = cpu_to_le32 (SET_GENERATION_NUMBER (deh_offset (deh), gen_number)); - set_cpu_key_k_offset (&entry_key, le32_to_cpu (deh->deh_offset)); + gen_number = find_first_zero_bit (bit_string, MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning ("reiserfs_add_entry: Congratulations! 
we have got hash function screwed up\n"); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + deh->deh_offset = cpu_to_le32 (SET_GENERATION_NUMBER (deh_offset (deh), gen_number)); + set_cpu_key_k_offset (&entry_key, le32_to_cpu (deh->deh_offset)); - /* find place for new entry */ - if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) == NAME_FOUND) { + if (gen_number != 0) { /* we need to re-search for the insertion point */ + if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { reiserfs_warning ("vs-7032: reiserfs_add_entry: " - "entry with this key (%k) already exists", &entry_key); + "entry with this key (%k) already exists\n", &entry_key); if (buffer != small_buf) reiserfs_kfree (buffer, buflen, dir->i_sb); pathrelse (&path); return -EBUSY; } - } else { - deh->deh_offset = cpu_to_le32 (SET_GENERATION_NUMBER (le32_to_cpu (deh->deh_offset), 0)); - set_cpu_key_k_offset (&entry_key, le32_to_cpu (deh->deh_offset)); } /* perform the insertion of the entry that we have prepared */ diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 0bcbd4559c65..26c47f2cc7c0 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -18,8 +18,8 @@ // find where objectid map starts #define objectid_map(s,rs) (old_format_only (s) ? 
\ - (__u32 *)((struct reiserfs_super_block_v1 *)rs + 1) :\ - (__u32 *)(rs + 1)) + (__u32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__u32 *)((rs) + 1)) #ifdef CONFIG_REISERFS_CHECK @@ -27,7 +27,8 @@ static void check_objectid_map (struct super_block * s, __u32 * map) { if (le32_to_cpu (map[0]) != 1) - reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted"); + reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx", + le32_to_cpu (map[0])); // FIXME: add something else here } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f7ccb4206566..a9f2c6d962e7 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -77,6 +77,7 @@ void reiserfs_write_super_lockfs (struct super_block * s) lock_kernel() ; if (!(s->s_flags & MS_RDONLY)) { journal_begin(&th, s, 1) ; + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); reiserfs_block_writes(&th) ; journal_end(&th, s, 1) ; diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c index 0c5d111b3fa8..20276da095bf 100644 --- a/fs/smbfs/getopt.c +++ b/fs/smbfs/getopt.c @@ -30,8 +30,10 @@ int smb_getopt(char *caller, char **options, struct option *opts, char *val; int i; - if ( (token = strsep(options, ",")) == NULL) - return 0; + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); *optopt = token; *optarg = NULL; diff --git a/fs/super.c b/fs/super.c index 20b50923196c..861a46872d54 100644 --- a/fs/super.c +++ b/fs/super.c @@ -712,10 +712,12 @@ static struct super_block *get_empty_super(void) nr_super_blocks++; memset(s, 0, sizeof(struct super_block)); INIT_LIST_HEAD(&s->s_dirty); + INIT_LIST_HEAD(&s->s_locked_inodes); list_add (&s->s_list, super_blocks.prev); init_waitqueue_head(&s->s_wait); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_mounts); + init_rwsem(&s->s_umount); } return s; } @@ -895,13 +897,14 @@ static void kill_super(struct super_block *sb, int umount_root) struct 
file_system_type *fs = sb->s_type; struct super_operations *sop = sb->s_op; + down_write(&sb->s_umount); sb->s_root = NULL; /* Need to clean after the sucker */ if (fs->fs_flags & FS_LITTER) d_genocide(root); shrink_dcache_parent(root); dput(root); - fsync_dev(sb->s_dev); + fsync_super(sb); lock_super(sb); if (sop) { if (sop->write_super && sb->s_dirt) @@ -923,6 +926,7 @@ static void kill_super(struct super_block *sb, int umount_root) put_filesystem(fs); sb->s_type = NULL; unlock_super(sb); + up_write(&sb->s_umount); if (umount_root) { /* special: the old device driver is going to be a ramdisk and the point of this call is to free its diff --git a/include/linux/fs.h b/include/linux/fs.h index 39dc8ac3aa36..b1fa3cc77bff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -655,9 +655,11 @@ struct super_block { unsigned long s_flags; unsigned long s_magic; struct dentry *s_root; + struct rw_semaphore s_umount; wait_queue_head_t s_wait; struct list_head s_dirty; /* dirty inodes */ + struct list_head s_locked_inodes;/* inodes being synced */ struct list_head s_files; struct block_device *s_bdev; @@ -1090,9 +1092,12 @@ extern void invalidate_inode_buffers(struct inode *); #define destroy_buffers(dev) __invalidate_buffers((dev), 1) extern void __invalidate_buffers(kdev_t dev, int); extern void sync_inodes(kdev_t); +extern void sync_unlocked_inodes(void); extern void write_inode_now(struct inode *, int); extern void sync_dev(kdev_t); extern int fsync_dev(kdev_t); +extern int fsync_super(struct super_block *); +extern void sync_inodes_sb(struct super_block *); extern int fsync_inode_buffers(struct inode *); extern int osync_inode_buffers(struct inode *); extern int inode_has_buffers(struct inode *); diff --git a/include/linux/iobuf.h b/include/linux/iobuf.h index 3de43c924039..619187efec8d 100644 --- a/include/linux/iobuf.h +++ b/include/linux/iobuf.h @@ -24,8 +24,7 @@ * entire iovec. 
*/ -#define KIO_MAX_ATOMIC_IO 64 /* in kb */ -#define KIO_MAX_ATOMIC_BYTES (64 * 1024) +#define KIO_MAX_ATOMIC_IO 512 /* in kb */ #define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1) #define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2) @@ -47,8 +46,10 @@ struct kiobuf unsigned int locked : 1; /* If set, pages has been locked */ - /* Always embed enough struct pages for 64k of IO */ + /* Always embed enough struct pages for atomic IO */ struct page * map_array[KIO_STATIC_PAGES]; + struct buffer_head * bh[KIO_MAX_SECTORS]; + unsigned long blocks[KIO_MAX_SECTORS]; /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ @@ -64,17 +65,18 @@ int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len); void unmap_kiobuf(struct kiobuf *iobuf); int lock_kiovec(int nr, struct kiobuf *iovec[], int wait); int unlock_kiovec(int nr, struct kiobuf *iovec[]); +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes); /* fs/iobuf.c */ -void __init kiobuf_setup(void); -void kiobuf_init(struct kiobuf *); void end_kio_request(struct kiobuf *, int); void simple_wakeup_kiobuf(struct kiobuf *); int alloc_kiovec(int nr, struct kiobuf **); void free_kiovec(int nr, struct kiobuf **); int expand_kiobuf(struct kiobuf *, int); void kiobuf_wait_for_io(struct kiobuf *); +extern int alloc_kiobuf_bhs(struct kiobuf *); +extern void free_kiobuf_bhs(struct kiobuf *); /* fs/buffer.c */ diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c18810e526c4..46c2b72e33f3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -145,6 +145,8 @@ extern void nf_reinject(struct sk_buff *skb, struct nf_info *info, unsigned int verdict); +extern void (*ip_ct_attach)(struct sk_buff *, struct nf_ct_info *); + #ifdef CONFIG_NETFILTER_DEBUG extern void nf_dump_skb(int pf, struct sk_buff *skb); #endif diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 
35caff35f1f5..9d8a18a06048 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -32,6 +32,7 @@ enum ip_conntrack_info #include <linux/types.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ip_conntrack_tcp.h> +#include <linux/netfilter_ipv4/ip_conntrack_icmp.h> #ifdef CONFIG_NF_DEBUG #define IP_NF_ASSERT(x) \ @@ -56,12 +57,8 @@ enum ip_conntrack_status { IPS_SEEN_REPLY_BIT = 1, IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT), - /* Packet seen leaving box: bit 2 set. Can be set, not unset. */ - IPS_CONFIRMED_BIT = 2, - IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), - /* Conntrack should never be early-expired. */ - IPS_ASSURED_BIT = 4, + IPS_ASSURED_BIT = 2, IPS_ASSURED = (1 << IPS_ASSURED_BIT), }; @@ -88,7 +85,7 @@ struct ip_conntrack_expect struct ip_conntrack { - /* Usage count in here is 1 for destruct timer, 1 per skb, + /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, plus 1 for any connection(s) we are `master' for */ struct nf_conntrack ct_general; @@ -119,6 +116,7 @@ struct ip_conntrack union { struct ip_ct_tcp tcp; + struct ip_ct_icmp icmp; } proto; union { @@ -177,5 +175,13 @@ ip_ct_gather_frags(struct sk_buff *skb); extern void ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data), void *data); + +/* It's confirmed if it is, or has been in the hash table. 
*/ +static inline int is_confirmed(struct ip_conntrack *ct) +{ + return ct->tuplehash[IP_CT_DIR_ORIGINAL].list.next != NULL; +} + +extern unsigned int ip_conntrack_htable_size; #endif /* __KERNEL__ */ #endif /* _IP_CONNTRACK_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index bcf300c51d45..6ed40793af6a 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -33,10 +33,17 @@ struct ip_conntrack_tuple_hash * ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack); -/* Confirm a connection */ -void ip_conntrack_confirm(struct ip_conntrack *ct); +extern int __ip_conntrack_confirm(struct nf_ct_info *nfct); + +/* Confirm a connection: returns NF_DROP if packet must be dropped. */ +static inline int ip_conntrack_confirm(struct sk_buff *skb) +{ + if (skb->nfct + && !is_confirmed((struct ip_conntrack *)skb->nfct->master)) + return __ip_conntrack_confirm(skb->nfct); + return NF_ACCEPT; +} -extern unsigned int ip_conntrack_htable_size; extern struct list_head *ip_conntrack_hash; extern struct list_head expect_list; DECLARE_RWLOCK_EXTERN(ip_conntrack_lock); diff --git a/include/linux/netfilter_ipv4/ip_conntrack_icmp.h b/include/linux/netfilter_ipv4/ip_conntrack_icmp.h new file mode 100644 index 000000000000..f1664abbe392 --- /dev/null +++ b/include/linux/netfilter_ipv4/ip_conntrack_icmp.h @@ -0,0 +1,11 @@ +#ifndef _IP_CONNTRACK_ICMP_H +#define _IP_CONNTRACK_ICMP_H +/* ICMP tracking. */ +#include <asm/atomic.h> + +struct ip_ct_icmp +{ + /* Optimization: when number in == number out, forget immediately. 
*/ + atomic_t count; +}; +#endif /* _IP_CONNTRACK_ICMP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index f5fd96690f46..83076c3c5f25 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -38,9 +38,9 @@ struct ip_conntrack_protocol enum ip_conntrack_info ctinfo); /* Called when a new connection for this protocol found; - * returns timeout. If so, packet() called next. */ - unsigned long (*new)(struct ip_conntrack *conntrack, - struct iphdr *iph, size_t len); + * returns TRUE if it's OK. If so, packet() called next. */ + int (*new)(struct ip_conntrack *conntrack, struct iphdr *iph, + size_t len); /* Module (if any) which this is connected to. */ struct module *me; diff --git a/include/linux/raw.h b/include/linux/raw.h index a2d9b14cd302..4736390a5db5 100644 --- a/include/linux/raw.h +++ b/include/linux/raw.h @@ -13,11 +13,4 @@ struct raw_config_request __u64 block_minor; }; -#ifdef __KERNEL__ - -/* drivers/char/raw.c */ -extern void raw_init(void); - -#endif /* __KERNEL__ */ - #endif /* __LINUX_RAW_H */ diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 97dbc003473b..d02476de1485 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1572,7 +1572,7 @@ extern wait_queue_head_t reiserfs_commit_thread_wait ; #define JOURNAL_MAX_BATCH 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */ #define JOURNAL_MAX_COMMIT_AGE 30 #define JOURNAL_MAX_TRANS_AGE 30 -#define JOURNAL_PER_BALANCE_CNT 12 /* must be >= (5 + 2 * (MAX_HEIGHT-2) + 1) */ +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) /* both of these can be as low as 1, or as high as you want. The min is the ** number of 4k bitmap nodes preallocated on mount. 
New nodes are allocated @@ -1950,6 +1950,7 @@ int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, struct inode * inode); +void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th); #endif /* hashes.c */ diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h index 3a60b86675c5..39c37e2045c6 100644 --- a/include/linux/reiserfs_fs_i.h +++ b/include/linux/reiserfs_fs_i.h @@ -1,6 +1,8 @@ #ifndef _REISER_FS_I #define _REISER_FS_I +#include <linux/list.h> + /* these are used to keep track of the pages that need ** flushing before the current transaction can commit */ @@ -52,7 +54,8 @@ struct reiserfs_inode_info { //For preallocation int i_prealloc_block; int i_prealloc_count; - + struct list_head i_prealloc_list; /* per-transaction list of inodes which + * have preallocated blocks */ /* I regret that you think the below is a comment you should make.... -Hans */ //nopack-attribute diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index a6921dad0365..68d2e003b724 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -254,6 +254,7 @@ struct reiserfs_journal { struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */ struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all the transactions */ + struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */ }; #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. 
magic string to find desc blocks in the journal */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 5068f1a80656..2e1217ad1b27 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -424,6 +424,7 @@ struct usb_driver { #define USB_ASYNC_UNLINK 0x0008 #define USB_QUEUE_BULK 0x0010 #define USB_NO_FSBR 0x0020 +#define USB_ZERO_PACKET 0x0040 // Finish bulk OUTs always with zero length packet #define USB_TIMEOUT_KILLED 0x1000 // only set by HCD! typedef struct diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index ded8f2ea4637..1bcce4c08dcd 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -211,6 +211,9 @@ struct ccs_modesel_head /* Used to get the bus number for a device */ #define SCSI_IOCTL_GET_BUS_NUMBER 0x5386 +/* Used to get the PCI location of a device */ +#define SCSI_IOCTL_GET_PCI 0x5387 + /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically diff --git a/init/main.c b/init/main.c index e06a6d80354b..53c4be5c68ce 100644 --- a/init/main.c +++ b/init/main.c @@ -574,7 +574,6 @@ asmlinkage void __init start_kernel(void) #if defined(CONFIG_ARCH_S390) ccwcache_init(); #endif - kiobuf_setup(); signals_init(); bdev_init(); inode_init(mempages); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index e7dd1f35e682..5fcbe2d34109 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -386,8 +386,6 @@ EXPORT_SYMBOL(__br_write_unlock); #endif /* Kiobufs */ -EXPORT_SYMBOL(kiobuf_init); - EXPORT_SYMBOL(alloc_kiovec); EXPORT_SYMBOL(free_kiovec); EXPORT_SYMBOL(expand_kiobuf); diff --git a/lib/rwsem.c b/lib/rwsem.c index 4672fc3c342d..8d480e7e7f11 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -202,9 +202,9 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) return sem; } -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL_NOVERS(rwsem_down_read_failed); 
+EXPORT_SYMBOL_NOVERS(rwsem_down_write_failed); +EXPORT_SYMBOL_NOVERS(rwsem_wake); #if RWSEM_DEBUG EXPORT_SYMBOL(rwsemtrace); #endif diff --git a/lib/string.c b/lib/string.c index 0bdf942391c0..41a90d37e35a 100644 --- a/lib/string.c +++ b/lib/string.c @@ -326,21 +326,24 @@ char * strtok(char * s,const char * ct) * @ct: The characters to search for * * strsep() updates @s to point after the token, ready for the next call. + * + * It returns empty tokens, too, behaving exactly like the libc function + * of that name. In fact, it was stolen from glibc2 and de-fancy-fied. + * Same semantics, slimmer shape. ;) */ -char * strsep(char **s, const char * ct) +char * strsep(char **s, const char *ct) { - char *sbegin=*s; - if (!sbegin) - return NULL; - - sbegin += strspn(sbegin,ct); - if (*sbegin == '\0') + char *sbegin = *s, *end; + + if (sbegin == NULL) return NULL; - - *s = strpbrk( sbegin, ct); - if (*s && **s != '\0') - *(*s)++ = '\0'; - return (sbegin); + + end = strpbrk(sbegin, ct); + if (end) + *end++ = '\0'; + *s = end; + + return sbegin; } #endif diff --git a/mm/highmem.c b/mm/highmem.c index 7935d1280d1c..f093fb67e2c5 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -207,6 +207,10 @@ static inline void bounce_end_io (struct buffer_head *bh, int uptodate) bh_orig->b_end_io(bh_orig, uptodate); __free_page(bh->b_page); +#ifdef HIGHMEM_DEBUG + /* Don't clobber the constructed slab cache */ + init_waitqueue_head(&bh->b_wait); +#endif kmem_cache_free(bh_cachep, bh); } @@ -260,12 +264,14 @@ repeat_page: bh->b_count = bh_orig->b_count; bh->b_rdev = bh_orig->b_rdev; bh->b_state = bh_orig->b_state; +#ifdef HIGHMEM_DEBUG bh->b_flushtime = jiffies; bh->b_next_free = NULL; bh->b_prev_free = NULL; /* bh->b_this_page */ bh->b_reqnext = NULL; bh->b_pprev = NULL; +#endif /* bh->b_page */ if (rw == WRITE) { bh->b_end_io = bounce_end_io_write; @@ -274,7 +280,9 @@ repeat_page: bh->b_end_io = bounce_end_io_read; bh->b_private = (void *)bh_orig; bh->b_rsector = bh_orig->b_rsector; 
+#ifdef HIGHMEM_DEBUG memset(&bh->b_wait, -1, sizeof(bh->b_wait)); +#endif return bh; } diff --git a/mm/memory.c b/mm/memory.c index 07aa8d2475ad..4b6e70995b0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -389,20 +389,33 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s /* * Do a quick page-table lookup for a single page. */ -static struct page * follow_page(unsigned long address) +static struct page * follow_page(unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; + pte_t *ptep, pte; pgd = pgd_offset(current->mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + pmd = pmd_offset(pgd, address); - if (pmd) { - pte_t * pte = pte_offset(pmd, address); - if (pte && pte_present(*pte)) - return pte_page(*pte); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); } - - return NULL; + +out: + return 0; } /* @@ -476,15 +489,22 @@ int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) goto out_unlock; } } - if (handle_mm_fault(current->mm, vma, ptr, datain) <= 0) - goto out_unlock; spin_lock(&mm->page_table_lock); - map = follow_page(ptr); - if (!map) { + while (!(map = follow_page(ptr, datain))) { + int ret; + spin_unlock(&mm->page_table_lock); - dprintk (KERN_ERR "Missing page in map_user_kiobuf\n"); - goto out_unlock; - } + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } map = get_page_map(map); if (map) { flush_dcache_page(map); @@ -509,6 +529,37 @@ int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) return err; } +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + 
* occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. + */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} /* * Unmap all of the pages referenced by a kiobuf. We release the pages, @@ -559,7 +610,6 @@ int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) if (iobuf->locked) continue; - iobuf->locked = 1; ppage = iobuf->maplist; for (j = 0; j < iobuf->nr_pages; ppage++, j++) { @@ -567,9 +617,16 @@ int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) if (!page) continue; - if (TryLockPage(page)) + if (TryLockPage(page)) { + while (j--) { + page = *(--ppage); + if (page) + UnlockPage(page); + } goto retry; + } } + iobuf->locked = 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91e1d3643293..822f5fc3f7e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,8 +295,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) * Can we take pages directly from the inactive_clean * list? 
*/ - if (order == 0 && (gfp_mask & __GFP_WAIT) && - !(current->flags & PF_MEMALLOC)) + if (order == 0 && (gfp_mask & __GFP_WAIT)) direct_reclaim = 1; /* diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 31f9a46e92c3..ae0e27833976 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -553,6 +553,12 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, return; } +/* This does not belong here, but ipt_REJECT needs it if connection + tracking in use: without this, connection may not be in hash table, + and hence manufactured ICMP or RST packets will not be associated + with it. */ +void (*ip_ct_attach)(struct sk_buff *, struct nf_ct_info *); + void __init netfilter_init(void) { int i, h; diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 5dd141dca269..91eb091f5484 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -23,6 +23,8 @@ #include <linux/stddef.h> #include <linux/sysctl.h> #include <linux/slab.h> +/* For ERR_PTR(). Yeah, I know... --RR */ +#include <linux/fs.h> /* This rwlock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -152,7 +154,9 @@ static void clean_from_lists(struct ip_conntrack *ct) { MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); - /* Remove from both hash lists */ + /* Remove from both hash lists: must not NULL out next ptrs, + otherwise we'll look unconfirmed. Fortunately, LIST_DELETE + doesn't do this. --RR */ LIST_DELETE(&ip_conntrack_hash [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); @@ -172,24 +176,6 @@ destroy_conntrack(struct nf_conntrack *nfct) { struct ip_conntrack *ct = (struct ip_conntrack *)nfct; - /* Unconfirmed connections haven't been cleaned up by the - timer: hence they cannot be simply deleted here. 
*/ - if (!(ct->status & IPS_CONFIRMED)) { - WRITE_LOCK(&ip_conntrack_lock); - /* Race check: they can't get a reference if noone has - one and we have the write lock. */ - if (atomic_read(&ct->ct_general.use) == 0) { - clean_from_lists(ct); - WRITE_UNLOCK(&ip_conntrack_lock); - } else { - /* Either a last-minute confirmation (ie. ct - now has timer attached), or a last-minute - new skb has reference (still unconfirmed). */ - WRITE_UNLOCK(&ip_conntrack_lock); - return; - } - } - IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); @@ -207,7 +193,6 @@ static void death_by_timeout(unsigned long ul_conntrack) struct ip_conntrack *ct = (void *)ul_conntrack; WRITE_LOCK(&ip_conntrack_lock); - IP_NF_ASSERT(ct->status & IPS_CONFIRMED); clean_from_lists(ct); WRITE_UNLOCK(&ip_conntrack_lock); ip_conntrack_put(ct); @@ -253,24 +238,85 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, return h; } -/* Confirm a connection */ -void -ip_conntrack_confirm(struct ip_conntrack *ct) +static inline struct ip_conntrack * +__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo) { + struct ip_conntrack *ct + = (struct ip_conntrack *)nfct->master; + + /* ctinfo is the index of the nfct inside the conntrack */ + *ctinfo = nfct - ct->infos; + IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER); + return ct; +} + +/* Return conntrack and conntrack_info given skb->nfct->master */ +struct ip_conntrack * +ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo) +{ + if (skb->nfct) + return __ip_conntrack_get(skb->nfct, ctinfo); + return NULL; +} + +/* Confirm a connection given skb->nfct; places it in hash table */ +int +__ip_conntrack_confirm(struct nf_ct_info *nfct) +{ + unsigned int hash, repl_hash; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = __ip_conntrack_get(nfct, &ctinfo); + + /* ipt_REJECT uses ip_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. 
Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + IP_NF_ASSERT(!is_confirmed(ct)); DEBUGP("Confirming conntrack %p\n", ct); + WRITE_LOCK(&ip_conntrack_lock); - /* Race check */ - if (!(ct->status & IPS_CONFIRMED)) { - IP_NF_ASSERT(!timer_pending(&ct->timeout)); - set_bit(IPS_CONFIRMED_BIT, &ct->status); + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in wierd delay cases. 
*/ ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); + WRITE_UNLOCK(&ip_conntrack_lock); + return NF_ACCEPT; } + WRITE_UNLOCK(&ip_conntrack_lock); + return NF_DROP; } /* Returns true if a connection correspondings to the tuple (required @@ -374,30 +420,16 @@ icmp_error_track(struct sk_buff *skb, *ctinfo += IP_CT_IS_REPLY; } - /* REJECT target does this commonly, so allow locally - generated ICMP errors --RR */ - if (!(h->ctrack->status & IPS_CONFIRMED) - && hooknum != NF_IP_LOCAL_OUT) { - DEBUGP("icmp_error_track: unconfirmed\n"); - ip_conntrack_put(h->ctrack); - return NULL; - } - /* Update skb to refer to this connection */ skb->nfct = &h->ctrack->infos[*ctinfo]; return h->ctrack; } -/* There's a small race here where we may free a just-replied to +/* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ static inline int unreplied(const struct ip_conntrack_tuple_hash *i) { - /* Unconfirmed connections either really fresh or transitory - anyway */ - if (!(i->ctrack->status & IPS_ASSURED) - && (i->ctrack->status & IPS_CONFIRMED)) - return 1; - return 0; + return !(i->ctrack->status & IPS_ASSURED); } static int early_drop(struct list_head *chain) @@ -436,10 +468,9 @@ static inline int expect_cmp(const struct ip_conntrack_expect *i, return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask); } -/* Allocate a new conntrack; we set everything up, then grab write - lock and see if we lost a race. If we lost it we return 0, - indicating the controlling code should look again. */ -static int +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. 
*/ +static struct ip_conntrack_tuple_hash * init_conntrack(const struct ip_conntrack_tuple *tuple, struct ip_conntrack_protocol *protocol, struct sk_buff *skb) @@ -448,8 +479,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, struct ip_conntrack_tuple repl_tuple; size_t hash, repl_hash; struct ip_conntrack_expect *expected; - enum ip_conntrack_info ctinfo; - unsigned long extra_jiffies; int i; static unsigned int drop_next = 0; @@ -457,30 +486,31 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { - if (net_ratelimit()) - printk(KERN_WARNING "ip_conntrack: maximum limit of" - " %d entries exceeded\n", ip_conntrack_max); - /* Try dropping from random chain, or else from the chain about to put into (in case they're trying to bomb one hash chain). */ if (drop_next >= ip_conntrack_htable_size) drop_next = 0; if (!early_drop(&ip_conntrack_hash[drop_next++]) - && !early_drop(&ip_conntrack_hash[hash])) - return 1; + && !early_drop(&ip_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "ip_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } } if (!invert_tuple(&repl_tuple, tuple, protocol)) { DEBUGP("Can't invert tuple.\n"); - return 1; + return NULL; } repl_hash = hash_conntrack(&repl_tuple); conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return 1; + return ERR_PTR(-ENOMEM); } memset(conntrack, 0, sizeof(struct ip_conntrack)); @@ -493,32 +523,33 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, for (i=0; i < IP_CT_NUMBER; i++) conntrack->infos[i].master = &conntrack->ct_general; - extra_jiffies = protocol->new(conntrack, skb->nh.iph, skb->len); - if (!extra_jiffies) { + if (!protocol->new(conntrack, skb->nh.iph, skb->len)) { kmem_cache_free(ip_conntrack_cachep, conntrack); - return 1; + return NULL; } /* Don't set timer yet: wait for confirmation 
*/ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; - conntrack->timeout.expires = extra_jiffies; - /* Sew in at head of hash list. */ + /* Mark clearly that it's not in the hash table. */ + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL; + + /* Write lock required for deletion of expected. Without + this, a read-lock would do. */ WRITE_LOCK(&ip_conntrack_lock); - /* Check noone else beat us in the race... */ - if (__ip_conntrack_find(tuple, NULL)) { - WRITE_UNLOCK(&ip_conntrack_lock); - kmem_cache_free(ip_conntrack_cachep, conntrack); - return 0; - } conntrack->helper = LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, &repl_tuple); /* Need finding and deleting of expected ONLY if we win race */ expected = LIST_FIND(&expect_list, expect_cmp, struct ip_conntrack_expect *, tuple); - if (expected) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (expected && is_confirmed(expected->expectant)) { /* Welcome, Mr. Bond. We've been expecting you... 
*/ conntrack->status = IPS_EXPECTED; conntrack->master.master = &expected->expectant->ct_general; @@ -526,23 +557,13 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, LIST_DELETE(&expect_list, expected); expected->expectant = NULL; nf_conntrack_get(&conntrack->master); - ctinfo = IP_CT_RELATED; - } else { - ctinfo = IP_CT_NEW; } - list_prepend(&ip_conntrack_hash[hash], - &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &conntrack->tuplehash[IP_CT_DIR_REPLY]); atomic_inc(&ip_conntrack_count); WRITE_UNLOCK(&ip_conntrack_lock); - /* Update skb to refer to this connection */ - skb->nfct = &conntrack->infos[ctinfo]; if (expected && expected->expectfn) expected->expectfn(conntrack); - - return 1; + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; } /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ @@ -561,38 +582,18 @@ resolve_normal_ct(struct sk_buff *skb, if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto)) return NULL; - /* Loop around search/insert race */ - do { - /* look for tuple match */ - h = ip_conntrack_find_get(&tuple, NULL); - if (!h && init_conntrack(&tuple, proto, skb)) + /* look for tuple match */ + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, proto, skb); + if (!h) return NULL; - } while (!h); + if (IS_ERR(h)) + return (void *)h; + } /* It exists; we have (non-exclusive) reference. */ if (DIRECTION(h) == IP_CT_DIR_REPLY) { - /* Reply on unconfirmed connection => unclassifiable */ - if (!(h->ctrack->status & IPS_CONFIRMED)) { - /* Exception: local TCP RSTs (generated by - REJECT target). 
*/ - if (hooknum == NF_IP_LOCAL_OUT - && h->tuple.dst.protonum == IPPROTO_TCP) { - const struct tcphdr *tcph - = (const struct tcphdr *) - ((u_int32_t *)skb->nh.iph - + skb->nh.iph->ihl); - if (tcph->rst) { - *ctinfo = IP_CT_ESTABLISHED - + IP_CT_IS_REPLY; - *set_reply = 0; - goto set_skb; - } - } - DEBUGP("Reply on unconfirmed connection\n"); - ip_conntrack_put(h->ctrack); - return NULL; - } - *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; /* Please set reply bit if this packet OK */ *set_reply = 1; @@ -613,28 +614,10 @@ resolve_normal_ct(struct sk_buff *skb, } *set_reply = 0; } - set_skb: skb->nfct = &h->ctrack->infos[*ctinfo]; return h->ctrack; } -/* Return conntrack and conntrack_info a given skb */ -inline struct ip_conntrack * -ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo) -{ - if (skb->nfct) { - struct ip_conntrack *ct - = (struct ip_conntrack *)skb->nfct->master; - - /* ctinfo is the index of the nfct inside the conntrack */ - *ctinfo = skb->nfct - ct->infos; - IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER); - return ct; - } - return NULL; -} - - /* Netfilter hook itself. */ unsigned int ip_conntrack_in(unsigned int hooknum, struct sk_buff **pskb, @@ -689,6 +672,10 @@ unsigned int ip_conntrack_in(unsigned int hooknum, /* Not valid part of a connection */ return NF_ACCEPT; + if (IS_ERR(ct)) + /* Too stressed to deal. 
*/ + return NF_DROP; + IP_NF_ASSERT((*pskb)->nfct); ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo); @@ -783,23 +770,18 @@ void ip_conntrack_unexpect_related(struct ip_conntrack *related_to) int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, const struct ip_conntrack_tuple *newreply) { - unsigned int newindex = hash_conntrack(newreply); - WRITE_LOCK(&ip_conntrack_lock); if (__ip_conntrack_find(newreply, conntrack)) { WRITE_UNLOCK(&ip_conntrack_lock); return 0; } + /* Should be unconfirmed, so not in hash table yet */ + IP_NF_ASSERT(!is_confirmed(conntrack)); + DEBUGP("Altering reply tuple of %p to ", conntrack); DUMP_TUPLE(newreply); - LIST_DELETE(&ip_conntrack_hash - [hash_conntrack(&conntrack->tuplehash[IP_CT_DIR_REPLY] - .tuple)], - &conntrack->tuplehash[IP_CT_DIR_REPLY]); conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - list_prepend(&ip_conntrack_hash[newindex], - &conntrack->tuplehash[IP_CT_DIR_REPLY]); conntrack->helper = LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, newreply); @@ -861,8 +843,8 @@ void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); WRITE_LOCK(&ip_conntrack_lock); - /* Timer may not be active yet */ - if (!(ct->status & IPS_CONFIRMED)) + /* If not in hash table, timer will not be active yet */ + if (!is_confirmed(ct)) ct->timeout.expires = extra_jiffies; else { /* Need del_timer for race avoidance (may already be dying). */ @@ -914,6 +896,26 @@ ip_ct_gather_frags(struct sk_buff *skb) return skb; } +/* Used by ipt_REJECT. 
*/ +static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = __ip_conntrack_get(nfct, &ctinfo); + + /* This ICMP is in reverse direction to the packet which + caused it */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach new skbuff, and increment count */ + nskb->nfct = &ct->infos[ctinfo]; + atomic_inc(&ct->ct_general.use); +} + static inline int do_kill(const struct ip_conntrack_tuple_hash *i, int (*kill)(const struct ip_conntrack *i, void *data), @@ -953,20 +955,6 @@ ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data), /* Time to push up daises... */ if (del_timer(&h->ctrack->timeout)) death_by_timeout((unsigned long)h->ctrack); - else if (!(h->ctrack->status & IPS_CONFIRMED)) { - /* Unconfirmed connection. Clean from lists, - mark confirmed so it gets cleaned as soon - as skb freed. */ - WRITE_LOCK(&ip_conntrack_lock); - /* Lock protects race against another setting - of confirmed bit. set_bit isolates this - bit from the others. */ - if (!(h->ctrack->status & IPS_CONFIRMED)) { - clean_from_lists(h->ctrack); - set_bit(IPS_CONFIRMED_BIT, &h->ctrack->status); - } - WRITE_UNLOCK(&ip_conntrack_lock); - } /* ... else the timer will get him soon. */ ip_conntrack_put(h->ctrack); @@ -1062,6 +1050,12 @@ void ip_conntrack_cleanup(void) #ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_conntrack_sysctl_header); #endif + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... 
*/ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); i_see_dead_people: ip_ct_selective_cleanup(kill_all, NULL); @@ -1075,6 +1069,9 @@ void ip_conntrack_cleanup(void) nf_unregister_sockopt(&so_getorigdst); } +static int hashsize = 0; +MODULE_PARM(hashsize, "i"); + int __init ip_conntrack_init(void) { unsigned int i; @@ -1082,13 +1079,17 @@ int __init ip_conntrack_init(void) /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ - ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - ip_conntrack_htable_size = 8192; - if (ip_conntrack_htable_size < 16) - ip_conntrack_htable_size = 16; + if (hashsize) { + ip_conntrack_htable_size = hashsize; + } else { + ip_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + ip_conntrack_htable_size = 8192; + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } ip_conntrack_max = 8 * ip_conntrack_htable_size; printk("ip_conntrack (%u buckets, %d max)\n", @@ -1140,5 +1141,7 @@ int __init ip_conntrack_init(void) } #endif /*CONFIG_SYSCTL*/ + /* For use by ipt_REJECT */ + ip_ct_attach = ip_conntrack_attach; return ret; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c index bd566db53eee..fcc0eed71a0f 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -48,10 +48,10 @@ static int established(struct ip_conntrack *conntrack, } /* Called when a new connection for this protocol found. 
*/ -static unsigned long +static int new(struct ip_conntrack *conntrack, struct iphdr *iph, size_t len) { - return GENERIC_TIMEOUT; + return 1; } struct ip_conntrack_protocol ip_conntrack_generic_protocol diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 17e126119b9b..b0eb65891d5f 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -72,22 +72,25 @@ static int icmp_packet(struct ip_conntrack *ct, struct iphdr *iph, size_t len, enum ip_conntrack_info ctinfo) { - /* FIXME: Should keep count of orig - reply packets: if == 0, - destroy --RR */ - /* Delete connection immediately on reply: won't actually - vanish as we still have skb */ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (del_timer(&ct->timeout)) + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - } else + } else { + atomic_inc(&ct->proto.icmp.count); ip_ct_refresh(ct, ICMP_TIMEOUT); + } return NF_ACCEPT; } /* Called when a new connection for this protocol found. 
*/ -static unsigned long icmp_new(struct ip_conntrack *conntrack, - struct iphdr *iph, size_t len) +static int icmp_new(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len) { static u_int8_t valid_new[] = { [ICMP_ECHO] = 1, @@ -103,7 +106,8 @@ static unsigned long icmp_new(struct ip_conntrack *conntrack, DUMP_TUPLE(&conntrack->tuplehash[0].tuple); return 0; } - return ICMP_TIMEOUT; + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; } struct ip_conntrack_protocol ip_conntrack_protocol_icmp diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 0aa8426de3a7..4f52a027fb3c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -206,8 +206,8 @@ static int tcp_packet(struct ip_conntrack *conntrack, } /* Called when a new connection for this protocol found. */ -static unsigned long tcp_new(struct ip_conntrack *conntrack, - struct iphdr *iph, size_t len) +static int tcp_new(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len) { enum tcp_conntrack newconntrack; struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + iph->ihl); @@ -224,7 +224,7 @@ static unsigned long tcp_new(struct ip_conntrack *conntrack, } conntrack->proto.tcp.state = newconntrack; - return tcp_timeouts[conntrack->proto.tcp.state]; + return 1; } struct ip_conntrack_protocol ip_conntrack_protocol_tcp diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 644a86a1397a..86544b03d2ce 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -62,10 +62,10 @@ static int udp_packet(struct ip_conntrack *conntrack, } /* Called when a new connection for this protocol found. 
*/ -static unsigned long udp_new(struct ip_conntrack *conntrack, +static int udp_new(struct ip_conntrack *conntrack, struct iphdr *iph, size_t len) { - return UDP_TIMEOUT; + return 1; } struct ip_conntrack_protocol ip_conntrack_protocol_udp diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f1faab1be187..ba94a4d6c030 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -88,8 +88,6 @@ print_conntrack(char *buffer, const struct ip_conntrack *conntrack) proto); if (conntrack->status & IPS_ASSURED) len += sprintf(buffer + len, "[ASSURED] "); - if (!(conntrack->status & IPS_CONFIRMED)) - len += sprintf(buffer + len, "[UNCONFIRMED] "); len += sprintf(buffer + len, "use=%u ", atomic_read(&conntrack->ct_general.use)); len += sprintf(buffer + len, "\n"); @@ -169,22 +167,8 @@ static unsigned int ip_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - /* We've seen it coming out the other side: confirm. Beware - REJECT generating TCP RESET response (IP_CT_REPLY), or ICMP - errors (IP_CT_REPLY + IP_CT_RELATED). But new expected - connections must be confirmed as well (eg. ftp data, - IP_CT_RELATED). 
*/ - if ((*pskb)->nfct) { - struct ip_conntrack *ct - = (struct ip_conntrack *)(*pskb)->nfct->master; - /* ctinfo is the index of the nfct inside the conntrack */ - enum ip_conntrack_info ctinfo = (*pskb)->nfct - ct->infos; - - if ((ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED) - && !(ct->status & IPS_CONFIRMED)) - ip_conntrack_confirm(ct); - } - return NF_ACCEPT; + /* We've seen it coming out the other side: confirm it */ + return ip_conntrack_confirm(*pskb); } static unsigned int ip_refrag(unsigned int hooknum, @@ -196,7 +180,8 @@ static unsigned int ip_refrag(unsigned int hooknum, struct rtable *rt = (struct rtable *)(*pskb)->dst; /* We've seen it coming out the other side: confirm */ - ip_confirm(hooknum, pskb, in, out, okfn); + if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; /* Local packets are never produced too large for their interface. We degfragment them at LOCAL_OUT, however, @@ -345,3 +330,4 @@ EXPORT_SYMBOL(ip_ct_refresh); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); +EXPORT_SYMBOL(ip_conntrack_htable_size); diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c index 240f3b47fcc9..6489ad787110 100644 --- a/net/ipv4/netfilter/ip_fw_compat.c +++ b/net/ipv4/netfilter/ip_fw_compat.c @@ -69,21 +69,6 @@ int unregister_firewall(int pf, struct firewall_ops *fw) return 0; } -static inline void -confirm_connection(struct sk_buff *skb) -{ - if (skb->nfct) { - struct ip_conntrack *ct - = (struct ip_conntrack *)skb->nfct->master; - /* ctinfo is the index of the nfct inside the conntrack */ - enum ip_conntrack_info ctinfo = skb->nfct - ct->infos; - - if ((ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED) - && !(ct->status & IPS_CONFIRMED)) - ip_conntrack_confirm(ct); - } -} - static unsigned int fw_in(unsigned int hooknum, struct sk_buff **pskb, @@ -137,7 +122,10 @@ fw_in(unsigned int hooknum, (struct net_device *)out, 
(*pskb)->nh.raw, &redirpt, pskb); - confirm_connection(*pskb); + + /* ip_conntrack_confirm return NF_DROP or NF_ACCEPT */ + if (ip_conntrack_confirm(*pskb) == NF_DROP) + ret = FW_BLOCK; } break; } @@ -195,8 +183,7 @@ static unsigned int fw_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - confirm_connection(*pskb); - return NF_ACCEPT; + return ip_conntrack_confirm(*pskb); } extern int ip_fw_ctl(int optval, void *m, unsigned int len); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 4583332d7a37..daece0d6286c 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -12,6 +12,7 @@ #include <linux/skbuff.h> #include <linux/netfilter_ipv4.h> #include <linux/brlock.h> +#include <linux/vmalloc.h> #include <net/checksum.h> #include <net/icmp.h> #include <net/ip.h> @@ -34,11 +35,13 @@ DECLARE_RWLOCK(ip_nat_lock); -#define IP_NAT_HTABLE_SIZE 64 +/* Calculated at init based on memory size */ +static unsigned int ip_nat_htable_size; -static struct list_head bysource[IP_NAT_HTABLE_SIZE]; -static struct list_head byipsproto[IP_NAT_HTABLE_SIZE]; +static struct list_head *bysource; +static struct list_head *byipsproto; LIST_HEAD(protos); +LIST_HEAD(helpers); extern struct ip_nat_protocol unknown_nat_protocol; @@ -48,14 +51,14 @@ hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto) { /* Modified src and dst, to ensure we don't create two identical streams. */ - return (src + dst + proto) % IP_NAT_HTABLE_SIZE; + return (src + dst + proto) % ip_nat_htable_size; } static inline size_t hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto) { /* Original src, to ensure we map it consistently if poss. */ - return (manip->ip + manip->u.all + proto) % IP_NAT_HTABLE_SIZE; + return (manip->ip + manip->u.all + proto) % ip_nat_htable_size; } /* Noone using conntrack by the time this called. 
*/ @@ -269,6 +272,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, struct ip_conntrack_tuple tuple; } best = { NULL, 0xFFFFFFFF }; u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip; + static unsigned int randomness = 0; if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) { var_ipp = &tuple->src.ip; @@ -285,7 +289,8 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, IP_NF_ASSERT(mr->rangesize >= 1); for (i = 0; i < mr->rangesize; i++) { - u_int32_t minip, maxip; + /* Host order */ + u_int32_t minip, maxip, j; /* Don't do ranges which are already eliminated. */ if (mr->range[i].flags & IP_NAT_RANGE_FULL) { @@ -293,16 +298,18 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, } if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) { - minip = mr->range[i].min_ip; - maxip = mr->range[i].max_ip; + minip = ntohl(mr->range[i].min_ip); + maxip = ntohl(mr->range[i].max_ip); } else - minip = maxip = *var_ipp; + minip = maxip = ntohl(*var_ipp); - for (*var_ipp = minip; - ntohl(*var_ipp) <= ntohl(maxip); - *var_ipp = htonl(ntohl(*var_ipp) + 1)) { + randomness++; + for (j = 0; j < maxip - minip + 1; j++) { unsigned int score; + *var_ipp = htonl(minip + (randomness + j) + % (maxip - minip + 1)); + /* Reset the other ip in case it was mangled by * do_extra_mangle last time. */ *other_ipp = saved_ip; @@ -853,6 +860,16 @@ int __init ip_nat_init(void) { size_t i; + /* Leave them the same for the moment. */ + ip_nat_htable_size = ip_conntrack_htable_size; + + /* One vmalloc for both hash tables */ + bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2); + if (!bysource) { + return -ENOMEM; + } + byipsproto = bysource + ip_nat_htable_size; + /* Sew in builtin protocols. 
*/ WRITE_LOCK(&ip_nat_lock); list_append(&protos, &ip_nat_protocol_tcp); @@ -860,7 +877,7 @@ int __init ip_nat_init(void) list_append(&protos, &ip_nat_protocol_icmp); WRITE_UNLOCK(&ip_nat_lock); - for (i = 0; i < IP_NAT_HTABLE_SIZE; i++) { + for (i = 0; i < ip_nat_htable_size; i++) { INIT_LIST_HEAD(&bysource[i]); INIT_LIST_HEAD(&byipsproto[i]); } @@ -872,7 +889,15 @@ int __init ip_nat_init(void) return 0; } -void ip_nat_cleanup(void) +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int __exit clean_nat(const struct ip_conntrack *i, void *data) +{ + memset((void *)&i->nat, 0, sizeof(i->nat)); + return 0; +} + +void __exit ip_nat_cleanup(void) { + ip_ct_selective_cleanup(&clean_nat, NULL); ip_conntrack_destroyed = NULL; } diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index e35362350c68..23d1a5ed9e58 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -34,7 +34,6 @@ #endif DECLARE_LOCK(ip_nat_seqofs_lock); -LIST_HEAD(helpers); static inline int ip_nat_resize_packet(struct sk_buff **skb, diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index a22858cb3fa3..d83562c292a1 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -173,6 +173,12 @@ static int ipt_snat_checkentry(const char *tablename, return 0; } + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("SNAT: wrong table %s\n", tablename); + return 0; + } + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask); return 0; @@ -202,6 +208,12 @@ static int ipt_dnat_checkentry(const char *tablename, return 0; } + /* Only allow these for NAT. 
*/ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("SNAT: wrong table %s\n", tablename); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); return 0; diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index fd04ad40cfa4..20982c479db5 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -68,17 +68,21 @@ ip_nat_fn(unsigned int hooknum, (*pskb)->ip_summed = CHECKSUM_NONE; ct = ip_conntrack_get(*pskb, &ctinfo); - /* Can't track? Maybe out of memory: this would make NAT - unreliable. */ + /* Can't track? It's not due to stress, or conntrack would + have dropped it. Hence it's the user's responsibility to + packet filter it out, or implement conntrack/NAT for that + protocol. 8) --RR */ if (!ct) { - if (net_ratelimit()) - printk(KERN_DEBUG "NAT: %u dropping untracked packet %p %u %u.%u.%u.%u -> %u.%u.%u.%u\n", - hooknum, - *pskb, - (*pskb)->nh.iph->protocol, - NIPQUAD((*pskb)->nh.iph->saddr), - NIPQUAD((*pskb)->nh.iph->daddr)); - return NF_DROP; + /* Exception: ICMP redirect to new connection (not in + hash table yet). We must not let this through, in + case we're doing NAT to the same network. 
*/ + struct iphdr *iph = (*pskb)->nh.iph; + struct icmphdr *hdr = (struct icmphdr *) + ((u_int32_t *)iph + iph->ihl); + if (iph->protocol == IPPROTO_ICMP + && hdr->type == ICMP_REDIRECT) + return NF_DROP; + return NF_ACCEPT; } switch (ctinfo) { diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index f2a19702d729..38d619f387bc 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -127,14 +127,32 @@ device_cmp(const struct ip_conntrack *i, void *ifindex) return ret; } -int masq_device_event(struct notifier_block *this, - unsigned long event, - void *ptr) +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) { struct net_device *dev = ptr; - if (event == NETDEV_DOWN || event == NETDEV_CHANGEADDR) { - /* Device was downed/changed (diald) Search entire table for + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. */ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_selective_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (event == NETDEV_DOWN) { + /* IP address was deleted. Search entire table for conntracks which were associated with that device, and forget them. 
*/ IP_NF_ASSERT(dev->ifindex != 0); @@ -151,6 +169,12 @@ static struct notifier_block masq_dev_notifier = { 0 }; +static struct notifier_block masq_inet_notifier = { + masq_inet_event, + NULL, + 0 +}; + static struct ipt_target masquerade = { { NULL, NULL }, "MASQUERADE", masquerade_target, masquerade_check, NULL, THIS_MODULE }; @@ -164,6 +188,8 @@ static int __init init(void) if (ret == 0) { /* Register for device down reports */ register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); } return ret; @@ -173,6 +199,7 @@ static void __exit fini(void) { ipt_unregister_target(&masquerade); unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index cc5ffbc4a093..30a52697904d 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -20,6 +20,18 @@ struct in_device; #define DEBUGP(format, args...) #endif +/* If the original packet is part of a connection, but the connection + is not confirmed, our manufactured reply will not be associated + with it, so we need to do this manually. 
*/ +static void connection_attach(struct sk_buff *new_skb, struct nf_ct_info *nfct) +{ + void (*attach)(struct sk_buff *, struct nf_ct_info *); + + /* Avoid module unload race with ip_ct_attach being NULLed out */ + if (nfct && (attach = ip_ct_attach) != NULL) + attach(new_skb, nfct); +} + /* Send RST reply */ static void send_reset(struct sk_buff *oldskb, int local) { @@ -128,6 +140,8 @@ static void send_reset(struct sk_buff *oldskb, int local) if (nskb->len > nskb->dst->pmtu) goto free_nskb; + connection_attach(nskb, oldskb->nfct); + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, ip_finish_output); return; @@ -136,6 +150,127 @@ static void send_reset(struct sk_buff *oldskb, int local) kfree_skb(nskb); } +static void send_unreach(struct sk_buff *skb_in, int code) +{ + struct iphdr *iph; + struct icmphdr *icmph; + struct sk_buff *nskb; + u32 saddr; + u8 tos; + int hh_len, length; + struct rtable *rt = (struct rtable*)skb_in->dst; + unsigned char *data; + + if (!rt) + return; + + /* FIXME: Use sysctl number. --RR */ + if (!xrlim_allow(&rt->u.dst, 1*HZ)) + return; + + iph = skb_in->nh.iph; + + /* No replies to physical multicast/broadcast */ + if (skb_in->pkt_type!=PACKET_HOST) + return; + + /* Now check at the protocol level */ + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) + return; + + /* Only reply to fragment 0. */ + if (iph->frag_off&htons(IP_OFFSET)) + return; + + /* If we send an ICMP error to an ICMP error a mess would result.. */ + if (iph->protocol == IPPROTO_ICMP + && skb_in->tail-(u8*)iph >= sizeof(struct icmphdr)) { + icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + /* Between echo-reply (0) and timestamp (13), + everything except echo-request (8) is an error. + Also, anything greater than NR_ICMP_TYPES is + unknown, and hence should be treated as an error... 
*/ + if ((icmph->type < ICMP_TIMESTAMP + && icmph->type != ICMP_ECHOREPLY + && icmph->type != ICMP_ECHO) + || icmph->type > NR_ICMP_TYPES) + return; + } + + saddr = iph->daddr; + if (!(rt->rt_flags & RTCF_LOCAL)) + saddr = 0; + + tos = (iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL; + + if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) + return; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + length = skb_in->len + sizeof(struct iphdr) + sizeof(struct icmphdr); + + if (length > rt->u.dst.pmtu) + length = rt->u.dst.pmtu; + if (length > 576) + length = 576; + + hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; + + nskb = alloc_skb(hh_len+15+length, GFP_ATOMIC); + if (!nskb) { + ip_rt_put(rt); + return; + } + + nskb->priority = 0; + nskb->dst = &rt->u.dst; + skb_reserve(nskb, hh_len); + + /* Set up IP header */ + iph = nskb->nh.iph + = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + iph->version=4; + iph->ihl=5; + iph->tos=tos; + iph->tot_len = htons(length); + + /* This abbreviates icmp->send->ip_build_xmit->ip_dont_fragment */ + if (!ipv4_config.no_pmtu_disc + && !(rt->u.dst.mxlock&(1<<RTAX_MTU))) + iph->frag_off = htons(IP_DF); + else iph->frag_off = 0; + + iph->ttl = MAXTTL; + ip_select_ident(iph, &rt->u.dst, NULL); + iph->protocol=IPPROTO_ICMP; + iph->saddr=rt->rt_src; + iph->daddr=rt->rt_dst; + iph->check=0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + /* Set up ICMP header. 
*/ + icmph = nskb->h.icmph + = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); + icmph->type = ICMP_DEST_UNREACH; + icmph->code = code; + icmph->un.gateway = 0; + icmph->checksum = 0; + + /* Copy as much of original packet as will fit */ + data = skb_put(nskb, + length - sizeof(struct iphdr) - sizeof(struct icmphdr)); + /* FIXME: won't work with nonlinear skbs --RR */ + memcpy(data, skb_in->nh.iph, + length - sizeof(struct iphdr) - sizeof(struct icmphdr)); + icmph->checksum = ip_compute_csum((unsigned char *)icmph, + length - sizeof(struct iphdr)); + + connection_attach(nskb, skb_in->nfct); + + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, + ip_finish_output); +} + static unsigned int reject(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, @@ -145,51 +280,43 @@ static unsigned int reject(struct sk_buff **pskb, { const struct ipt_reject_info *reject = targinfo; + /* Our naive response construction doesn't deal with IP + options, and probably shouldn't try. */ + if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) + return NF_DROP; + /* WARNING: This code causes reentry within iptables. This means that the iptables jump stack is now crap. We must return an absolute verdict. 
--RR */ switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0); + send_unreach(*pskb, ICMP_NET_UNREACH); break; case IPT_ICMP_HOST_UNREACHABLE: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + send_unreach(*pskb, ICMP_HOST_UNREACH); break; case IPT_ICMP_PROT_UNREACHABLE: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + send_unreach(*pskb, ICMP_PROT_UNREACH); break; case IPT_ICMP_PORT_UNREACHABLE: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + send_unreach(*pskb, ICMP_PORT_UNREACH); break; case IPT_ICMP_NET_PROHIBITED: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0); + send_unreach(*pskb, ICMP_NET_ANO); break; case IPT_ICMP_HOST_PROHIBITED: - icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0); + send_unreach(*pskb, ICMP_HOST_ANO); break; - case IPT_ICMP_ECHOREPLY: - printk("REJECT: ECHOREPLY no longer supported.\n"); - break; case IPT_TCP_RESET: send_reset(*pskb, hooknum == NF_IP_LOCAL_IN); break; + case IPT_ICMP_ECHOREPLY: + /* Doesn't happen. */ } return NF_DROP; } -static inline int find_ping_match(const struct ipt_entry_match *m) -{ - const struct ipt_icmp *icmpinfo = (const struct ipt_icmp *)m->data; - - if (strcmp(m->u.kernel.match->name, "icmp") == 0 - && icmpinfo->type == ICMP_ECHO - && !(icmpinfo->invflags & IPT_ICMP_INV)) - return 1; - - return 0; -} - static int check(const char *tablename, const struct ipt_entry *e, void *targinfo, @@ -216,17 +343,8 @@ static int check(const char *tablename, } if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - /* Must specify that it's an ICMP ping packet. */ - if (e->ip.proto != IPPROTO_ICMP - || (e->ip.invflags & IPT_INV_PROTO)) { - DEBUGP("REJECT: ECHOREPLY illegal for non-icmp\n"); - return 0; - } - /* Must contain ICMP match. 
*/ - if (IPT_MATCH_ITERATE(e, find_ping_match) == 0) { - DEBUGP("REJECT: ECHOREPLY illegal for non-ping\n"); - return 0; - } + printk("REJECT: ECHOREPLY no longer supported.\n"); + return 0; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3dc1895756cb..f41e47b55c77 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: addrconf.c,v 1.61 2001/04/25 20:46:34 davem Exp $ + * $Id: addrconf.c,v 1.62 2001/04/26 19:11:59 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -285,9 +285,9 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) if ((idev = __in6_dev_get(dev)) == NULL) { if ((idev = ipv6_add_dev(dev)) == NULL) return NULL; - if (dev->flags&IFF_UP) - ipv6_mc_up(idev); } + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); return idev; } diff --git a/net/netsyms.c b/net/netsyms.c index bb254f2aaad1..1949d89287ef 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -374,8 +374,6 @@ EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(tcp_cwnd_application_limited); EXPORT_SYMBOL(tcp_sendpage); -EXPORT_SYMBOL(xrlim_allow); - EXPORT_SYMBOL(tcp_write_xmit); EXPORT_SYMBOL(tcp_v4_remember_stamp); @@ -434,6 +432,7 @@ EXPORT_SYMBOL(dev_open); /* Used by other modules */ EXPORT_SYMBOL(in_ntoa); +EXPORT_SYMBOL(xrlim_allow); EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); @@ -561,6 +560,7 @@ EXPORT_SYMBOL(nf_hook_slow); EXPORT_SYMBOL(nf_hooks); EXPORT_SYMBOL(nf_setsockopt); EXPORT_SYMBOL(nf_getsockopt); +EXPORT_SYMBOL(ip_ct_attach); #endif EXPORT_SYMBOL(register_gifconf); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 39b2989c6331..1ee0ddc0cbf8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -462,11 +462,11 @@ 
svc_udp_init(struct svc_sock *svsk) } /* - * A state change on a listening socket means there's a connection - * pending. + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. */ static void -svc_tcp_state_change1(struct sock *sk) +svc_tcp_listen_data_ready(struct sock *sk, int count_unused) { struct svc_sock *svsk; @@ -494,7 +494,7 @@ svc_tcp_state_change1(struct sock *sk) * A state change on a connected socket means it's dying or dead. */ static void -svc_tcp_state_change2(struct sock *sk) +svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk; @@ -777,10 +777,10 @@ svc_tcp_init(struct svc_sock *svsk) if (sk->state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); - sk->state_change = svc_tcp_state_change1; + sk->data_ready = svc_tcp_listen_data_ready; } else { dprintk("setting up TCP socket for reading\n"); - sk->state_change = svc_tcp_state_change2; + sk->state_change = svc_tcp_state_change; sk->data_ready = svc_tcp_data_ready; svsk->sk_reclen = 0; |
