| -rw-r--r-- | Documentation/networking/NAPI_HOWTO.txt | 749 |
| -rw-r--r-- | include/linux/netdevice.h | 87 |
| -rw-r--r-- | include/linux/netfilter_arp.h | 19 |
| -rw-r--r-- | include/linux/netfilter_arp/arp_tables.h | 342 |
| -rw-r--r-- | include/linux/netfilter_ipv4/ip_nat.h | 5 |
| -rw-r--r-- | include/linux/sysctl.h | 3 |
| -rw-r--r-- | net/core/dev.c | 281 |
| -rw-r--r-- | net/core/sysctl_net_core.c | 4 |
| -rw-r--r-- | net/ipv4/arp.c | 77 |
| -rw-r--r-- | net/ipv4/netfilter/Config.in | 6 |
| -rw-r--r-- | net/ipv4/netfilter/Makefile | 8 |
| -rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 1313 |
| -rw-r--r-- | net/ipv4/netfilter/arptable_filter.c | 174 |
| -rw-r--r-- | net/ipv4/netfilter/ip_conntrack_standalone.c | 26 |
| -rw-r--r-- | net/ipv4/netfilter/ip_nat_core.c | 9 |
| -rw-r--r-- | net/ipv4/netfilter/ip_nat_rule.c | 14 |
| -rw-r--r-- | net/ipv4/netfilter/ip_nat_standalone.c | 25 |
| -rw-r--r-- | net/netsyms.c | 1 |
18 files changed, 2993 insertions, 150 deletions
diff --git a/Documentation/networking/NAPI_HOWTO.txt b/Documentation/networking/NAPI_HOWTO.txt new file mode 100644 index 000000000000..44811d36db49 --- /dev/null +++ b/Documentation/networking/NAPI_HOWTO.txt @@ -0,0 +1,749 @@ +HISTORY: +February 16/2002 -- revision 0.2.1: +COR typo corrected +February 10/2002 -- revision 0.2: +some spell checking ;-> +January 12/2002 -- revision 0.1 +This is still work in progress so may change. +To keep up to date please watch this space. + +Introduction to NAPI +==================== + +NAPI is a proven (www.cyberus.ca/~hadi/usenix-paper.tgz) technique +to improve network performance on Linux. For more details please +read that paper. +NAPI provides a "inherent mitigation" which is bound by system capacity +as can be seen from the following data collected by Robert on Gigabit +ethernet (e1000): + + Psize Ipps Tput Rxint Txint Done Ndone + --------------------------------------------------------------- + 60 890000 409362 17 27622 7 6823 + 128 758150 464364 21 9301 10 7738 + 256 445632 774646 42 15507 21 12906 + 512 232666 994445 241292 19147 241192 1062 + 1024 119061 1000003 872519 19258 872511 0 + 1440 85193 1000003 946576 19505 946569 0 + + +Legend: +"Ipps" stands for input packets per second. +"Tput" == packets out of total 1M that made it out. +"txint" == transmit completion interrupts seen +"Done" == The number of times that the poll() managed to pull all +packets out of the rx ring. Note from this that the lower the +load the more we could clean up the rxring +"Ndone" == is the converse of "Done". Note again, that the higher +the load the more times we couldnt clean up the rxring. + +Observe that: +when the NIC receives 890Kpackets/sec only 17 rx interrupts are generated. +The system cant handle the processing at 1 interrupt/packet at that load level. +At lower rates on the other hand, rx interrupts go up and therefore the +interrupt/packet ratio goes up (as observable from that table). So there is +possibility that under low enough input, you get one poll call for each +input packet caused by a single interrupt each time. And if the system +cant handle interrupt per packet ratio of 1, then it will just have to +chug along .... + + +0) Prerequisites: +================== +A driver MAY continue using the old 2.4 technique for interfacing +to the network stack and not benefit from the NAPI changes. +NAPI additions to the kernel do not break backward compatibility. +NAPI, however, requires the following features to be available: + +A) DMA ring or enough RAM to store packets in software devices. + +B) Ability to turn off interrupts or maybe events that send packets up +the stack. + +NAPI processes packet events in what is known as dev->poll() method. +Typically, only packet receive events are processed in dev->poll(). +The rest of the events MAY be processed by the regular interrupt handler +to reduce processing latency (justified also because there are not that +many of them). +Note, however, NAPI does not enforce that dev->poll() only processes +receive events. +Tests with the tulip driver indicated slightly increased latency if +all of the interrupt handler is moved to dev->poll(). Also MII handling +gets a little trickier. +The example used in this document is to move the receive processing only +to dev->poll(); this is shown with the patch for the tulip driver. +For an example of code that moves all the interrupt driver to +dev->poll() look at the ported e1000 code. 
+ +There are caveats that might force you to go with moving everything to +dev->poll(). Different NICs work differently depending on their status/event +acknowledgement setup. +There are two types of event register ACK mechanisms. + I) what is known as Clear-on-read (COR). + when you read the status/event register, it clears everything! + The natsemi and sunbmac NICs are known to do this. + In this case your only choice is to move all to dev->poll() + + II) Clear-on-write (COW) + i) you clear the status by writting a 1 in the bit-location you want. + These are the majority of the NICs and work the best with NAPI. + Put only receive events in dev->poll(); leave the rest in + the old interrupt handler. + ii) whatever you write in the status register clears every thing ;-> + Cant seem to find any supported by Linux which do this. If + someone knows such a chip email us please. + Move all to dev->poll() + +C) Ability to detect new work correctly. +NAPI works by shutting down event interrupts when theres work and +turning them on when theres none. +New packets might show up in the small window while interrupts were being +re-enabled (refer to appendix 2). A packet might sneak in during the period +we are enabling interrupts. We only get to know about such a packet when the +next new packet arrives and generates an interrupt. +Essentially, there is a small window of opportunity for a race condition +which for clarity we'll refer to as the "rotting packet". + +This is a very important topic and appendix 2 is dedicated for more +discussion. + +Locking rules and environmental guarantees +========================================== + +-Guarantee: Only one CPU at any time can call dev->poll(); this is because +only one CPU can pick the initial interrupt and hence the initial +netif_rx_schedule(dev); +- The core layer invokes devices to send packets in a round robin format. +This implies receive is totaly lockless because of the guarantee only that +one CPU is executing it. +- contention can only be the result of some other CPU accessing the rx +ring. This happens only in close() and suspend() (when these methods +try to clean the rx ring); +****guarantee: driver authors need not worry about this; synchronization +is taken care for them by the top net layer. +-local interrupts are enabled (if you dont move all to dev->poll()). For +example link/MII and txcomplete continue functioning just same old way. +This improves the latency of processing these events. It is also assumed that +the receive interrupt is the largest cause of noise. Note this might not +always be true. +[according to Manfred Spraul, the winbond insists on sending one +txmitcomplete interrupt for each packet (although this can be mitigated)]. +For these broken drivers, move all to dev->poll(). + +For the rest of this text, we'll assume that dev->poll() only +processes receive events. + +new methods introduce by NAPI +============================= + +a) netif_rx_schedule(dev) +Called by an IRQ handler to schedule a poll for device + +b) netif_rx_schedule_prep(dev) +puts the device in a state which allows for it to be added to the +CPU polling list if it is up and running. You can look at this as +the first half of netif_rx_schedule(dev) above; the second half +being c) below. + +c) __netif_rx_schedule(dev) +Add device to the poll list for this CPU; assuming that _prep above +has already been called and returned 1. + +d) netif_rx_reschedule(dev, undo) +Called to reschedule polling for device specifically for some +deficient hardware. 
Read Appendix 2 for more details. + +e) netif_rx_complete(dev) + +Remove interface from the CPU poll list: it must be in the poll list +on current cpu. This primitive is called by dev->poll(), when +it completes its work. The device cannot be out of poll list at this +call, if it is then clearly it is a BUG(). You'll know ;-> + +All these above nethods are used below. So keep reading for clarity. + +Device driver changes to be made when porting NAPI +================================================== + +Below we describe what kind of changes are required for NAPI to work. + +1) introduction of dev->poll() method +===================================== + +This is the method that is invoked by the network core when it requests +for new packets from the driver. A driver is allowed to send upto +dev->quota packets by the current CPU before yielding to the network +subsystem (so other devices can also get opportunity to send to the stack). + +dev->poll() prototype looks as follows: +int my_poll(struct net_device *dev, int *budget) + +budget is the remaining number of packets the network subsystem on the +current CPU can send up the stack before yielding to other system tasks. +*Each driver is responsible for decrementing budget by the total number of +packets sent. + Total number of packets cannot exceed dev->quota. + +dev->poll() method is invoked by the top layer, the driver just sends if it +can to the stack the packet quantity requested. + +more on dev->poll() below after the interrupt changes are explained. + +2) registering dev->poll() method +=================================== + +dev->poll should be set in the dev->probe() method. +e.g: +dev->open = my_open; +. +. +/* two new additions */ +/* first register my poll method */ +dev->poll = my_poll; +/* next register my weight/quanta; can be overriden in /proc */ +dev->weight = 16; +. +. +dev->stop = my_close; + + + +3) scheduling dev->poll() +============================= +This involves modifying the interrupt handler and the code +path which takes the packet off the NIC and sends them to the +stack. 
+ +it's important at this point to introduce the classical D Becker +interrupt processor: + +------------------ +static void +netdevice_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + + struct net_device *dev = (struct net_device *)dev_instance; + struct my_private *tp = (struct my_private *)dev->priv; + + int work_count = my_work_count; + status = read_interrupt_status_reg(); + if (status == 0) + return; /* Shared IRQ: not us */ + if (status == 0xffff) + return; /* Hot unplug */ + if (status & error) + do_some_error_handling() + + do { + acknowledge_ints_ASAP(); + + if (status & link_interrupt) { + spin_lock(&tp->link_lock); + do_some_link_stat_stuff(); + spin_lock(&tp->link_lock); + } + + if (status & rx_interrupt) { + receive_packets(dev); + } + + if (status & rx_nobufs) { + make_rx_buffs_avail(); + } + + if (status & tx_related) { + spin_lock(&tp->lock); + tx_ring_free(dev); + if (tx_died) + restart_tx(); + spin_unlock(&tp->lock); + } + + status = read_interrupt_status_reg(); + + } while (!(status & error) || more_work_to_be_done); + +} + +---------------------------------------------------------------------- + +We now change this to what is shown below to NAPI-enable it: + +---------------------------------------------------------------------- +static void +netdevice_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct net_device *dev = (struct net_device *)dev_instance; + struct my_private *tp = (struct my_private *)dev->priv; + + status = read_interrupt_status_reg(); + if (status == 0) + return; /* Shared IRQ: not us */ + if (status == 0xffff) + return; /* Hot unplug */ + if (status & error) + do_some_error_handling(); + + do { +/************************ start note *********************************/ + acknowledge_ints_ASAP(); // dont ack rx and rxnobuff here +/************************ end note *********************************/ + + if (status & link_interrupt) { + spin_lock(&tp->link_lock); + do_some_link_stat_stuff(); + spin_unlock(&tp->link_lock); + } +/************************ start note *********************************/ + if (status & rx_interrupt || (status & rx_nobuffs)) { + if (netif_rx_schedule_prep(dev)) { + + /* disable interrupts caused + * by arriving packets */ + disable_rx_and_rxnobuff_ints(); + /* tell system we have work to be done. */ + __netif_rx_schedule(dev); + } else { + printk("driver bug! interrupt while in poll\n"); + /* FIX by disabling interrupts */ + disable_rx_and_rxnobuff_ints(); + } + } +/************************ end note note *********************************/ + + if (status & tx_related) { + spin_lock(&tp->lock); + tx_ring_free(dev); + + if (tx_died) + restart_tx(); + spin_unlock(&tp->lock); + } + + status = read_interrupt_status_reg(); + +/************************ start note *********************************/ + } while (!(status & error) || more_work_to_be_done(status)); +/************************ end note note *********************************/ + +} + +--------------------------------------------------------------------- + + +We note several things from above: + +I) Any interrupt source which is caused by arriving packets is now +turned off when it occurs. Depending on the hardware, there could be +several reasons that arriving packets would cause interrupts; these are the +interrupt sources we wish to avoid. The two common ones are a) a packet +arriving (rxint) b) a packet arriving and finding no DMA buffers available +(rxnobuff) . 
+This means also acknowledge_ints_ASAP() will not clear the status +register for those two items above; clearing is done in the place where +proper work is done within NAPI; at the poll() and refill_rx_ring() +discussed further below. +netif_rx_schedule_prep() returns 1 if device is in running state and +gets successfully added to the core poll list. If we get a zero value +we can _almost_ assume are already added to the list (instead of not running. +Logic based on the fact that you shouldnt get interrupt if not running) +We rectify this by disabling rx and rxnobuf interrupts. + +II) that receive_packets(dev) and make_rx_buffs_avail() may have dissapeared. +These functionalities are still around actually...... + +infact, receive_packets(dev) is very close to my_poll() and +make_rx_buffs_avail() is invoked from my_poll() + +4) converting receive_packets() to dev->poll() +=============================================== + +We need to convert the classical D Becker receive_packets(dev) to my_poll() + +First the typical receive_packets() below: +------------------------------------------------------------------- + +/* this is called by interrupt handler */ +static void receive_packets (struct net_device *dev) +{ + + struct my_private *tp = (struct my_private *)dev->priv; + rx_ring = tp->rx_ring; + cur_rx = tp->cur_rx; + int entry = cur_rx % RX_RING_SIZE; + int received = 0; + int rx_work_limit = tp->dirty_rx + RX_RING_SIZE - tp->cur_rx; + + while (rx_ring_not_empty) { + u32 rx_status; + unsigned int rx_size; + unsigned int pkt_size; + struct sk_buff *skb; + /* read size+status of next frame from DMA ring buffer */ + /* the number 16 and 4 are just examples */ + rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset)); + rx_size = rx_status >> 16; + pkt_size = rx_size - 4; + + /* process errors */ + if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) || + (!(rx_status & RxStatusOK))) { + netdrv_rx_err (rx_status, dev, tp, ioaddr); + return; + } + + if (--rx_work_limit < 0) + break; + + /* grab a skb */ + skb = dev_alloc_skb (pkt_size + 2); + if (skb) { + . + . + netif_rx (skb); + . + . + } else { /* OOM */ + /*seems very driver specific ... some just pass + whatever is on the ring already. */ + } + + /* move to the next skb on the ring */ + entry = (++tp->cur_rx) % RX_RING_SIZE; + received++ ; + + } + + /* store current ring pointer state */ + tp->cur_rx = cur_rx; + + /* Refill the Rx ring buffers if they are needed */ + refill_rx_ring(); + . + . + +} +------------------------------------------------------------------- +We change it to a new one below; note the additional parameter in +the call. 
+ +------------------------------------------------------------------- + +/* this is called by the network core */ +static void my_poll (struct net_device *dev, int *budget) +{ + + struct my_private *tp = (struct my_private *)dev->priv; + rx_ring = tp->rx_ring; + cur_rx = tp->cur_rx; + int entry = cur_rx % RX_BUF_LEN; + /* maximum packets to send to the stack */ +/************************ note note *********************************/ + int rx_work_limit = dev->quota; + +/************************ end note note *********************************/ + do { // outer beggining loop starts here + + clear_rx_status_register_bit(); + + while (rx_ring_not_empty) { + u32 rx_status; + unsigned int rx_size; + unsigned int pkt_size; + struct sk_buff *skb; + /* read size+status of next frame from DMA ring buffer */ + /* the number 16 and 4 are just examples */ + rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset)); + rx_size = rx_status >> 16; + pkt_size = rx_size - 4; + + /* process errors */ + if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) || + (!(rx_status & RxStatusOK))) { + netdrv_rx_err (rx_status, dev, tp, ioaddr); + return; + } + +/************************ note note *********************************/ + if (--rx_work_limit < 0) { /* we got packets, but no quota */ + /* store current ring pointer state */ + tp->cur_rx = cur_rx; + + /* Refill the Rx ring buffers if they are needed */ + refill_rx_ring(dev); + goto not_done; + } +/********************** end note **********************************/ + + /* grab a skb */ + skb = dev_alloc_skb (pkt_size + 2); + if (skb) { + . + . +/************************ note note *********************************/ + netif_receive_skb (skb); +/********************** end note **********************************/ + . + . + } else { /* OOM */ + /*seems very driver specific ... common is just pass + whatever is on the ring already. */ + } + + /* move to the next skb on the ring */ + entry = (++tp->cur_rx) % RX_RING_SIZE; + received++ ; + + } + + /* store current ring pointer state */ + tp->cur_rx = cur_rx; + + /* Refill the Rx ring buffers if they are needed */ + refill_rx_ring(dev); + + /* no packets on ring; but new ones can arrive since we last + checked */ + status = read_interrupt_status_reg(); + if (rx status is not set) { + /* If something arrives in this narrow window, + an interrupt will be generated */ + goto done; + } + /* done! at least thats what it looks like ;-> + if new packets came in after our last check on status bits + they'll be caught by the while check and we go back and clear them + since we havent exceeded our quota */ + } while (rx_status_is_set); + +done: + +/************************ note note *********************************/ + dev->quota -= received; + *budget -= received; + + /* If RX ring is not full we are out of memory. */ + if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) + goto oom; + + /* we are happy/done, no more packets on ring; put us back + to where we can start processing interrupts again */ + netif_rx_complete(dev); + enable_rx_and_rxnobuf_ints(); + + /* The last op happens after poll completion. Which means the following: + * 1. it can race with disabling irqs in irq handler (which are done to + * schedule polls) + * 2. it can race with dis/enabling irqs in other poll threads + * 3. if an irq raised after the begining of the outer beginning + * loop(marked in the code above), it will be immediately + * triggered here. 
+ * + * Summarizing: the logic may results in some redundant irqs both + * due to races in masking and due to too late acking of already + * processed irqs. The good news: no events are ever lost. + */ + + return 0; /* done */ + +not_done: + if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 || + tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) + refill_rx_ring(dev); + + if (!received) { + printk("received==0\n"); + received = 1; + } + dev->quota -= received; + *budget -= received; + return 1; /* not_done */ + +oom: + /* Start timer, stop polling, but do not enable rx interrupts. */ + start_poll_timer(dev); + return 0; /* we'll take it from here so tell core "done"*/ + +/************************ End note note *********************************/ +} +------------------------------------------------------------------- + +From above we note that: +0) rx_work_limit = dev->quota +1) refill_rx_ring() is in charge of clearing the bit for rxnobuff when +it does the work. +2) We have a done and not_done state. +3) instead of netif_rx() we call netif_receive_skb() to pass the skb. +4) we have a new way of handling oom condition +5) A new outer for (;;) loop has been added. This serves the purpose of +ensuring that if a new packet has come in, after we are all set and done, +and we have not exceeded our quota that we continue sending packets up. + + +----------------------------------------------------------- +Poll timer code will need to do the following: + +a) + + if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 || + tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) + refill_rx_ring(dev); + + /* If RX ring is not full we are still out of memory. + Restart the timer again. Else we re-add ourselves + to the master poll list. + */ + + if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL) + restart_timer(); + + else netif_rx_schedule(dev); /* we are back on the poll list */ + +5) dev->close() and dev->suspend() issues +========================================== +The driver writter neednt worry about this. The top net layer takes +care of it. + +6) Adding new Stats to /proc +============================= +In order to debug some of the new features, we introduce new stats +that need to be collected. +TODO: Fill this later. + +APPENDIX 1: discussion on using ethernet HW FC +============================================== +Most chips with FC only send a pause packet when they run out of Rx buffers. +Since packets are pulled off the DMA ring by a softirq in NAPI, +if the system is slow in grabbing them and we have a high input +rate (faster than the system's capacity to remove packets), then theoretically +there will only be one rx interrupt for all packets during a given packetstorm. +Under low load, we might have a single interrupt per packet. +FC should be programmed to apply in the case when the system cant pull out +packets fast enough i.e send a pause only when you run out of rx buffers. +Note FC in itself is a good solution but we have found it to not be +much of a commodity feature (both in NICs and switches) and hence falls +under the same category as using NIC based mitigation. Also experiments +indicate that its much harder to resolve the resource allocation +issue (aka lazy receiving that NAPI offers) and hence quantify its usefullness +proved harder. In any case, FC works even better with NAPI but is not +necessary. 
+ + +APPENDIX 2: the "rotting packet" race-window avoidance scheme +============================================================= + +There are two types of associations seen here + +1) status/int which honors level triggered IRQ + +If a status bit for receive or rxnobuff is set and the corresponding +interrupt-enable bit is not on, then no interrupts will be generated. However, +as soon as the "interrupt-enable" bit is unmasked, an immediate interrupt is +generated. [assuming the status bit was not turned off]. +Generally the concept of level triggered IRQs in association with a status and +interrupt-enable CSR register set is used to avoid the race. + +If we take the example of the tulip: +"pending work" is indicated by the status bit(CSR5 in tulip). +the corresponding interrupt bit (CSR7 in tulip) might be turned off (but +the CSR5 will continue to be turned on with new packet arrivals even if +we clear it the first time) +Very important is the fact that if we turn on the interrupt bit on when +status is set that an immediate irq is triggered. + +If we cleared the rx ring and proclaimed there was "no more work +to be done" and then went on to do a few other things; then when we enable +interrupts, there is a possibility that a new packet might sneak in during +this phase. It helps to look at the pseudo code for the tulip poll +routine: + +-------------------------- + do { + ACK; + while (ring_is_not_empty()) { + work-work-work + if quota is exceeded: exit, no touching irq status/mask + } + /* No packets, but new can arrive while we are doing this*/ + CSR5 := read + if (CSR5 is not set) { + /* If something arrives in this narrow window here, + * where the comments are ;-> irq will be generated */ + unmask irqs; + exit poll; + } + } while (rx_status_is_set); +------------------------ + +CSR5 bit of interest is only the rx status. +If you look at the last if statement: +you just finished grabbing all the packets from the rx ring .. you check if +status bit says theres more packets just in ... it says none; you then +enable rx interrupts again; if a new packet just came in during this check, +we are counting that CSR5 will be set in that small window of opportunity +and that by re-enabling interrupts, we would actually triger an interrupt +to register the new packet for processing. + +[The above description nay be very verbose, if you have better wording +that will make this more understandable, please suggest it.] + +2) non-capable hardware + +These do not generally respect level triggered IRQs. Normally, +irqs may be lost while being masked and the only way to leave poll is to do +a double check for new input after netif_rx_complete() is invoked +and re-enable polling (after seeing this new input). + +Sample code: + +--------- + . + . +restart_poll: + while (ring_is_not_empty()) { + work-work-work + if quota is exceeded: exit, not touching irq status/mask + } + . + . + . + enable_rx_interrupts() + netif_rx_complete(dev); + if (ring_has_new_packet() && netif_rx_reschedule(dev, received)) { + disable_rx_and_rxnobufs() + goto restart_poll + } while (rx_status_is_set); +--------- + +Basically netif_rx_complete() removes us from the poll list, but because a +new packet which will never be caught due to the possibility of a race +might come in, we attempt to re-add ourselves to the poll list. 
+ + + +-------------------------------------------------------------------- + +relevant sites: +================== +ftp://robur.slu.se/pub/Linux/net-development/NAPI/ + + +-------------------------------------------------------------------- +TODO: Write net-skeleton.c driver. +------------------------------------------------------------- + +Authors: +======== +Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> +Jamal Hadi Salim <hadi@cyberus.ca> +Robert Olsson <Robert.Olsson@data.slu.se> + +Acknowledgements: +================ +People who made this document better: + +Lennert Buytenhek <buytenh@gnu.org> +Andrew Morton <akpm@zip.com.au> +Manfred Spraul <manfred@colorfullife.com> +Donald Becker <becker@scyld.com> +Jeff Garzik <jgarzik@mandrakesoft.com> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 178f6a5a0fe6..32b6db3c7a2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -206,7 +206,8 @@ enum netdev_state_t __LINK_STATE_START, __LINK_STATE_PRESENT, __LINK_STATE_SCHED, - __LINK_STATE_NOCARRIER + __LINK_STATE_NOCARRIER, + __LINK_STATE_RX_SCHED }; @@ -330,6 +331,10 @@ struct net_device void *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ + struct list_head poll_list; /* Link to poll list */ + int quota; + int weight; + struct Qdisc *qdisc; struct Qdisc *qdisc_sleeping; struct Qdisc *qdisc_list; @@ -373,6 +378,7 @@ struct net_device int (*stop)(struct net_device *dev); int (*hard_start_xmit) (struct sk_buff *skb, struct net_device *dev); + int (*poll) (struct net_device *dev, int *quota); int (*hard_header) (struct sk_buff *skb, struct net_device *dev, unsigned short type, @@ -492,8 +498,11 @@ struct softnet_data int cng_level; int avg_blog; struct sk_buff_head input_pkt_queue; + struct list_head poll_list; struct net_device *output_queue; struct sk_buff *completion_queue; + + struct net_device backlog_dev; /* Sorry. 8) */ } __attribute__((__aligned__(SMP_CACHE_BYTES))); @@ -547,6 +556,7 @@ static inline int netif_running(struct net_device *dev) return test_bit(__LINK_STATE_START, &dev->state); } + /* Use this variant when it is known for sure that it * is executing from interrupt context. */ @@ -578,6 +588,8 @@ static inline void dev_kfree_skb_any(struct sk_buff *skb) extern void net_call_rx_atomic(void (*fn)(void)); #define HAVE_NETIF_RX 1 extern int netif_rx(struct sk_buff *skb); +#define HAVE_NETIF_RECEIVE_SKB 1 +extern int netif_receive_skb(struct sk_buff *skb); extern int dev_ioctl(unsigned int cmd, void *); extern int dev_change_flags(struct net_device *, unsigned); extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); @@ -695,6 +707,78 @@ enum { #define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS) #define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA) +/* Schedule rx intr now? */ + +static inline int netif_rx_schedule_prep(struct net_device *dev) +{ + return netif_running(dev) && + !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state); +} + +/* Add interface to tail of rx poll list. This assumes that _prep has + * already been called and returned 1. 
+ */ + +static inline void __netif_rx_schedule(struct net_device *dev) +{ + unsigned long flags; + int cpu = smp_processor_id(); + + local_irq_save(flags); + dev_hold(dev); + list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list); + if (dev->quota < 0) + dev->quota += dev->weight; + else + dev->quota = dev->weight; + __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ); + local_irq_restore(flags); +} + +/* Try to reschedule poll. Called by irq handler. */ + +static inline void netif_rx_schedule(struct net_device *dev) +{ + if (netif_rx_schedule_prep(dev)) + __netif_rx_schedule(dev); +} + +/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). + * Do not inline this? + */ +static inline int netif_rx_reschedule(struct net_device *dev, int undo) +{ + if (netif_rx_schedule_prep(dev)) { + unsigned long flags; + int cpu = smp_processor_id(); + + dev->quota += undo; + + local_irq_save(flags); + list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list); + __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ); + local_irq_restore(flags); + return 1; + } + return 0; +} + +/* Remove interface from poll list: it must be in the poll list + * on current cpu. This primitive is called by dev->poll(), when + * it completes the work. The device cannot be out of poll list at this + * moment, it is BUG(). + */ +static inline void netif_rx_complete(struct net_device *dev) +{ + unsigned long flags; + + local_irq_save(flags); + if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG(); + list_del(&dev->poll_list); + clear_bit(__LINK_STATE_RX_SCHED, &dev->state); + local_irq_restore(flags); +} + /* These functions live elsewhere (drivers/net/net_init.c, but related) */ extern void ether_setup(struct net_device *dev); @@ -719,6 +803,7 @@ extern void dev_mcast_init(void); extern int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev)); extern void netdev_unregister_fc(int bit); extern int netdev_max_backlog; +extern int weight_p; extern unsigned long netdev_fc_xoff; extern atomic_t netdev_dropping; extern int netdev_set_master(struct net_device *dev, struct net_device *master); diff --git a/include/linux/netfilter_arp.h b/include/linux/netfilter_arp.h new file mode 100644 index 000000000000..4f460b3b0cba --- /dev/null +++ b/include/linux/netfilter_arp.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_ARP_NETFILTER_H +#define __LINUX_ARP_NETFILTER_H + +/* ARP-specific defines for netfilter. + * (C)2002 Rusty Russell IBM -- This code is GPL. + */ + +#include <linux/config.h> +#include <linux/netfilter.h> + +/* There is no PF_ARP. */ +#define NF_ARP 0 + +/* ARP Hooks */ +#define NF_ARP_IN 0 +#define NF_ARP_OUT 1 +#define NF_ARP_NUMHOOKS 2 + +#endif /* __LINUX_ARP_NETFILTER_H */ diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h new file mode 100644 index 000000000000..7b11e236c7c9 --- /dev/null +++ b/include/linux/netfilter_arp/arp_tables.h @@ -0,0 +1,342 @@ +/* + * Format of an ARP firewall descriptor + * + * src, tgt, src_mask, tgt_mask, arpop, arpop_mask are always stored in + * network byte order. + * flags are stored in host byte order (of course). 
+ */ + +#ifndef _ARPTABLES_H +#define _ARPTABLES_H + +#ifdef __KERNEL__ +#include <linux/if.h> +#include <linux/types.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#endif + +#include <linux/netfilter_arp.h> + +#define ARPT_FUNCTION_MAXNAMELEN 30 +#define ARPT_TABLE_MAXNAMELEN 32 + +#define ARPT_DEV_ADDR_LEN_MAX 16 + +struct arpt_devaddr_info { + char addr[ARPT_DEV_ADDR_LEN_MAX]; + char mask[ARPT_DEV_ADDR_LEN_MAX]; +}; + +/* Yes, Virginia, you have to zero the padding. */ +struct arpt_arp { + /* Source and target IP addr */ + struct in_addr src, tgt; + /* Mask for src and target IP addr */ + struct in_addr smsk, tmsk; + + /* Device hw address length, src+target device addresses */ + u_int8_t arhln, arhln_mask; + struct arpt_devaddr_info src_devaddr; + struct arpt_devaddr_info tgt_devaddr; + + /* ARP operation code. */ + u_int16_t arpop, arpop_mask; + + /* ARP hardware address and protocol address format. */ + u_int16_t arhrd, arhrd_mask; + u_int16_t arpro, arpro_mask; + + /* The protocol address length is only accepted if it is 4 + * so there is no use in offering a way to do filtering on it. + */ + + char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; + unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ]; + + /* Flags word */ + u_int8_t flags; + /* Inverse flags */ + u_int16_t invflags; +}; + +struct arpt_entry_target +{ + union { + struct { + u_int16_t target_size; + + /* Used by userspace */ + char name[ARPT_FUNCTION_MAXNAMELEN]; + } user; + struct { + u_int16_t target_size; + + /* Used inside the kernel */ + struct arpt_target *target; + } kernel; + + /* Total length */ + u_int16_t target_size; + } u; + + unsigned char data[0]; +}; + +struct arpt_standard_target +{ + struct arpt_entry_target target; + int verdict; +}; + +struct arpt_counters +{ + u_int64_t pcnt, bcnt; /* Packet and byte counters */ +}; + +/* Values for "flag" field in struct arpt_ip (general arp structure). + * No flags defined yet. + */ +#define ARPT_F_MASK 0x00 /* All possible flag bits mask. */ + +/* Values for "inv" field in struct arpt_arp. */ +#define ARPT_INV_VIA_IN 0x0001 /* Invert the sense of IN IFACE. */ +#define ARPT_INV_VIA_OUT 0x0002 /* Invert the sense of OUT IFACE */ +#define ARPT_INV_SRCIP 0x0004 /* Invert the sense of SRC IP. */ +#define ARPT_INV_TGTIP 0x0008 /* Invert the sense of TGT IP. */ +#define ARPT_INV_SRCDEVADDR 0x0010 /* Invert the sense of SRC DEV ADDR. */ +#define ARPT_INV_TGTDEVADDR 0x0020 /* Invert the sense of TGT DEV ADDR. */ +#define ARPT_INV_ARPOP 0x0040 /* Invert the sense of ARP OP. */ +#define ARPT_INV_ARPHRD 0x0080 /* Invert the sense of ARP HRD. */ +#define ARPT_INV_ARPPRO 0x0100 /* Invert the sense of ARP PRO. */ +#define ARPT_INV_ARPHLN 0x0200 /* Invert the sense of ARP HLN. */ +#define ARPT_INV_MASK 0x007F /* All possible flag bits mask. */ + +/* This structure defines each of the firewall rules. Consists of 3 + parts which are 1) general ARP header stuff 2) match specific + stuff 3) the target to perform if the rule matches */ +struct arpt_entry +{ + struct arpt_arp arp; + + /* Size of arpt_entry + matches */ + u_int16_t target_offset; + /* Size of arpt_entry + matches + target */ + u_int16_t next_offset; + + /* Back pointer */ + unsigned int comefrom; + + /* Packet and byte counters. */ + struct arpt_counters counters; + + /* The matches (if any), then the target. */ + unsigned char elems[0]; +}; + +/* + * New IP firewall options for [gs]etsockopt at the RAW IP level. 
+ * Unlike BSD Linux inherits IP options so you don't have to use a raw + * socket for this. Instead we check rights in the calls. + */ +#define ARPT_BASE_CTL 96 /* base for firewall socket options */ + +#define ARPT_SO_SET_REPLACE (ARPT_BASE_CTL) +#define ARPT_SO_SET_ADD_COUNTERS (ARPT_BASE_CTL + 1) +#define ARPT_SO_SET_MAX ARPT_SO_SET_ADD_COUNTERS + +#define ARPT_SO_GET_INFO (ARPT_BASE_CTL) +#define ARPT_SO_GET_ENTRIES (ARPT_BASE_CTL + 1) +#define ARPT_SO_GET_MAX ARPT_SO_GET_ENTRIES + +/* CONTINUE verdict for targets */ +#define ARPT_CONTINUE 0xFFFFFFFF + +/* For standard target */ +#define ARPT_RETURN (-NF_MAX_VERDICT - 1) + +/* The argument to ARPT_SO_GET_INFO */ +struct arpt_getinfo +{ + /* Which table: caller fills this in. */ + char name[ARPT_TABLE_MAXNAMELEN]; + + /* Kernel fills these in. */ + /* Which hook entry points are valid: bitmask */ + unsigned int valid_hooks; + + /* Hook entry points: one per netfilter hook. */ + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + + /* Underflow points. */ + unsigned int underflow[NF_ARP_NUMHOOKS]; + + /* Number of entries */ + unsigned int num_entries; + + /* Size of entries. */ + unsigned int size; +}; + +/* The argument to ARPT_SO_SET_REPLACE. */ +struct arpt_replace +{ + /* Which table. */ + char name[ARPT_TABLE_MAXNAMELEN]; + + /* Which hook entry points are valid: bitmask. You can't + change this. */ + unsigned int valid_hooks; + + /* Number of entries */ + unsigned int num_entries; + + /* Total size of new entries */ + unsigned int size; + + /* Hook entry points. */ + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + + /* Underflow points. */ + unsigned int underflow[NF_ARP_NUMHOOKS]; + + /* Information about old entries: */ + /* Number of counters (must be equal to current number of entries). */ + unsigned int num_counters; + /* The old entries' counters. */ + struct arpt_counters *counters; + + /* The entries (hang off end: not really an array). */ + struct arpt_entry entries[0]; +}; + +/* The argument to ARPT_SO_ADD_COUNTERS. */ +struct arpt_counters_info +{ + /* Which table. */ + char name[ARPT_TABLE_MAXNAMELEN]; + + unsigned int num_counters; + + /* The counters (actually `number' of these). */ + struct arpt_counters counters[0]; +}; + +/* The argument to ARPT_SO_GET_ENTRIES. */ +struct arpt_get_entries +{ + /* Which table: user fills this in. */ + char name[ARPT_TABLE_MAXNAMELEN]; + + /* User fills this in: total entry size. */ + unsigned int size; + + /* The entries. */ + struct arpt_entry entrytable[0]; +}; + +/* Standard return verdict, or do jump. */ +#define ARPT_STANDARD_TARGET "" +/* Error verdict. */ +#define ARPT_ERROR_TARGET "ERROR" + +/* Helper functions */ +static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e) +{ + return (void *)e + e->target_offset; +} + +/* fn returns 0 to continue iteration */ +#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \ +({ \ + unsigned int __i; \ + int __ret = 0; \ + struct arpt_entry *__entry; \ + \ + for (__i = 0; __i < (size); __i += __entry->next_offset) { \ + __entry = (void *)(entries) + __i; \ + \ + __ret = fn(__entry , ## args); \ + if (__ret != 0) \ + break; \ + } \ + __ret; \ +}) + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef __KERNEL__ + +/* Registration hooks for targets. */ +struct arpt_target +{ + struct list_head list; + + const char name[ARPT_FUNCTION_MAXNAMELEN]; + + /* Returns verdict. 
*/ + unsigned int (*target)(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userdata); + + /* Called when user tries to insert an entry of this type: + hook_mask is a bitmask of hooks from which it can be + called. */ + /* Should return true or false. */ + int (*checkentry)(const char *tablename, + const struct arpt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask); + + /* Called when entry of this type deleted. */ + void (*destroy)(void *targinfo, unsigned int targinfosize); + + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; +}; + +extern int arpt_register_target(struct arpt_target *target); +extern void arpt_unregister_target(struct arpt_target *target); + +/* Furniture shopping... */ +struct arpt_table +{ + struct list_head list; + + /* A unique name... */ + char name[ARPT_TABLE_MAXNAMELEN]; + + /* Seed table: copied in register_table */ + struct arpt_replace *table; + + /* What hooks you will enter on */ + unsigned int valid_hooks; + + /* Lock for the curtain */ + rwlock_t lock; + + /* Man behind the curtain... */ + struct arpt_table_info *private; + + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; +}; + +extern int arpt_register_table(struct arpt_table *table); +extern void arpt_unregister_table(struct arpt_table *table); +extern unsigned int arpt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct arpt_table *table, + void *userdata); + +#define ARPT_ALIGN(s) (((s) + (__alignof__(struct arpt_entry)-1)) & ~(__alignof__(struct arpt_entry)-1)) +#endif /*__KERNEL__*/ +#endif /* _ARPTABLES_H */ diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h index 3a35b1fafd5d..0cfbfd7201c7 100644 --- a/include/linux/netfilter_ipv4/ip_nat.h +++ b/include/linux/netfilter_ipv4/ip_nat.h @@ -11,8 +11,13 @@ enum ip_nat_manip_type IP_NAT_MANIP_DST }; +#ifndef CONFIG_IP_NF_NAT_LOCAL /* SRC manip occurs only on POST_ROUTING */ #define HOOK2MANIP(hooknum) ((hooknum) != NF_IP_POST_ROUTING) +#else +/* SRC manip occurs POST_ROUTING or LOCAL_IN */ +#define HOOK2MANIP(hooknum) ((hooknum) != NF_IP_POST_ROUTING && (hooknum) != NF_IP_LOCAL_IN) +#endif /* 2.3.19 (I hope) will define this in linux/netfilter_ipv4.h. */ #ifndef SO_ORIGINAL_DST diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 938560387354..01829afb8e41 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -202,7 +202,8 @@ enum NET_CORE_NO_CONG_THRESH=13, NET_CORE_NO_CONG=14, NET_CORE_LO_CONG=15, - NET_CORE_MOD_CONG=16 + NET_CORE_MOD_CONG=16, + NET_CORE_DEV_WEIGHT=17 }; /* /proc/sys/net/ethernet */ diff --git a/net/core/dev.c b/net/core/dev.c index 6a510b1a8ea4..8c340f76aa56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -798,6 +798,19 @@ int dev_close(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); + /* Synchronize to scheduled poll. We cannot touch poll list, + * it can be even on different cpu. So just clear netif_running(), + * and wait when poll really will happen. Actually, the best place + * for this is inside dev->stop() after device stopped its irq + * engine, but this requires more changes in devices. */ + + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { + /* No hurry. 
*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + /* * Call the device specific close. This cannot fail. * Only if device is UP @@ -1072,6 +1085,7 @@ int dev_queue_xmit(struct sk_buff *skb) =======================================================================*/ int netdev_max_backlog = 300; +int weight_p = 64; /* old backlog weight */ /* These numbers are selected based on intuition and some * experimentatiom, if you have more scientific way of doing this * please go ahead and fix things. @@ -1237,13 +1251,11 @@ int netif_rx(struct sk_buff *skb) enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue,skb); - /* Runs from irqs or BH's, no need to wake BH */ - cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); local_irq_restore(flags); #ifndef OFFLINE_SAMPLE get_sample_stats(this_cpu); #endif - return softnet_data[this_cpu].cng_level; + return queue->cng_level; } if (queue->throttle) { @@ -1253,6 +1265,8 @@ enqueue: netdev_wakeup(); #endif } + + netif_rx_schedule(&queue->backlog_dev); goto enqueue; } @@ -1308,19 +1322,12 @@ static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int return ret; } -/* Reparent skb to master device. This function is called - * only from net_rx_action under BR_NETPROTO_LOCK. It is misuse - * of BR_NETPROTO_LOCK, but it is OK for now. - */ static __inline__ void skb_bond(struct sk_buff *skb) { struct net_device *dev = skb->dev; - - if (dev->master) { - dev_hold(dev->master); + + if (dev->master) skb->dev = dev->master; - dev_put(dev); - } } static void net_tx_action(struct softirq_action *h) @@ -1416,121 +1423,138 @@ static inline void handle_diverter(struct sk_buff *skb) } #endif /* CONFIG_NET_DIVERT */ - -static void net_rx_action(struct softirq_action *h) +int netif_receive_skb(struct sk_buff *skb) { - int this_cpu = smp_processor_id(); - struct softnet_data *queue = &softnet_data[this_cpu]; - unsigned long start_time = jiffies; - int bugdet = netdev_max_backlog; - - br_read_lock(BR_NETPROTO_LOCK); - - for (;;) { - struct sk_buff *skb; - struct net_device *rx_dev; - - local_irq_disable(); - skb = __skb_dequeue(&queue->input_pkt_queue); - local_irq_enable(); + struct packet_type *ptype, *pt_prev; + int ret = NET_RX_DROP; + unsigned short type = skb->protocol; - if (skb == NULL) - break; + if (skb->stamp.tv_sec == 0) + do_gettimeofday(&skb->stamp); - skb_bond(skb); + skb_bond(skb); - rx_dev = skb->dev; + netdev_rx_stat[smp_processor_id()].total++; #ifdef CONFIG_NET_FASTROUTE - if (skb->pkt_type == PACKET_FASTROUTE) { - netdev_rx_stat[this_cpu].fastroute_deferred_out++; - dev_queue_xmit(skb); - dev_put(rx_dev); - continue; - } + if (skb->pkt_type == PACKET_FASTROUTE) { + netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++; + return dev_queue_xmit(skb); + } #endif - skb->h.raw = skb->nh.raw = skb->data; - { - struct packet_type *ptype, *pt_prev; - unsigned short type = skb->protocol; - - pt_prev = NULL; - for (ptype = ptype_all; ptype; ptype = ptype->next) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) { - if (!pt_prev->data) { - deliver_to_old_ones(pt_prev, skb, 0); - } else { - atomic_inc(&skb->users); - pt_prev->func(skb, - skb->dev, - pt_prev); - } - } - pt_prev = ptype; + + skb->h.raw = skb->nh.raw = skb->data; + + pt_prev = NULL; + for (ptype = ptype_all; ptype; ptype = ptype->next) { + if (!ptype->dev || ptype->dev == skb->dev) { + if (pt_prev) { + if (!pt_prev->data) { + ret = deliver_to_old_ones(pt_prev, skb, 0); + } else { + atomic_inc(&skb->users); + ret = pt_prev->func(skb, 
skb->dev, pt_prev); } } + pt_prev = ptype; + } + } #ifdef CONFIG_NET_DIVERT - if (skb->dev->divert && skb->dev->divert->divert) - handle_diverter(skb); + if (skb->dev->divert && skb->dev->divert->divert) + ret = handle_diverter(skb); #endif /* CONFIG_NET_DIVERT */ - #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) - if (skb->dev->br_port != NULL && - br_handle_frame_hook != NULL) { - handle_bridge(skb, pt_prev); - dev_put(rx_dev); - continue; - } + if (skb->dev->br_port != NULL && + br_handle_frame_hook != NULL) { + return handle_bridge(skb, pt_prev); + } #endif - for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) { - if (ptype->type == type && - (!ptype->dev || ptype->dev == skb->dev)) { - if (pt_prev) { - if (!pt_prev->data) - deliver_to_old_ones(pt_prev, skb, 0); - else { - atomic_inc(&skb->users); - pt_prev->func(skb, - skb->dev, - pt_prev); - } - } - pt_prev = ptype; + for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) { + if (ptype->type == type && + (!ptype->dev || ptype->dev == skb->dev)) { + if (pt_prev) { + if (!pt_prev->data) { + ret = deliver_to_old_ones(pt_prev, skb, 0); + } else { + atomic_inc(&skb->users); + ret = pt_prev->func(skb, skb->dev, pt_prev); } } + pt_prev = ptype; + } + } - if (pt_prev) { - if (!pt_prev->data) - deliver_to_old_ones(pt_prev, skb, 1); - else - pt_prev->func(skb, skb->dev, pt_prev); - } else - kfree_skb(skb); + if (pt_prev) { + if (!pt_prev->data) { + ret = deliver_to_old_ones(pt_prev, skb, 1); + } else { + ret = pt_prev->func(skb, skb->dev, pt_prev); } + } else { + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } - dev_put(rx_dev); + return ret; +} - if (bugdet-- < 0 || jiffies - start_time > 1) - goto softnet_break; +static int process_backlog(struct net_device *backlog_dev, int *budget) +{ + int work = 0; + int quota = min(backlog_dev->quota, *budget); + int this_cpu = smp_processor_id(); + struct softnet_data *queue = &softnet_data[this_cpu]; + unsigned long start_time = jiffies; + + for (;;) { + struct sk_buff *skb; + struct net_device *dev; + + local_irq_disable(); + skb = __skb_dequeue(&queue->input_pkt_queue); + if (skb == NULL) + goto job_done; + local_irq_enable(); + + dev = skb->dev; + + netif_receive_skb(skb); + + dev_put(dev); + + work++; + + if (work >= quota || jiffies - start_time > 1) + break; #ifdef CONFIG_NET_HW_FLOWCONTROL - if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) { - if (atomic_dec_and_test(&netdev_dropping)) { - queue->throttle = 0; - netdev_wakeup(); - goto softnet_break; + if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) { + if (atomic_dec_and_test(&netdev_dropping)) { + queue->throttle = 0; + netdev_wakeup(); + break; + } } - } #endif - } - br_read_unlock(BR_NETPROTO_LOCK); - local_irq_disable(); + backlog_dev->quota -= work; + *budget -= work; + return -1; + +job_done: + backlog_dev->quota -= work; + *budget -= work; + + list_del(&backlog_dev->poll_list); + clear_bit(__LINK_STATE_RX_SCHED, &backlog_dev->state); + if (queue->throttle) { queue->throttle = 0; #ifdef CONFIG_NET_HW_FLOWCONTROL @@ -1539,21 +1563,53 @@ static void net_rx_action(struct softirq_action *h) #endif } local_irq_enable(); + return 0; +} - NET_PROFILE_LEAVE(softnet_process); - return; +static void net_rx_action(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct softnet_data *queue = &softnet_data[this_cpu]; + unsigned long start_time = jiffies; + int 
budget = netdev_max_backlog; -softnet_break: + br_read_lock(BR_NETPROTO_LOCK); + local_irq_disable(); + + while (!list_empty(&queue->poll_list)) { + struct net_device *dev; + + if (budget <= 0 || jiffies - start_time > 1) + goto softnet_break; + + local_irq_enable(); + + dev = list_entry(queue->poll_list.next, struct net_device, poll_list); + + if (dev->quota <= 0 || dev->poll(dev, &budget)) { + local_irq_disable(); + list_del(&dev->poll_list); + list_add_tail(&dev->poll_list, &queue->poll_list); + if (dev->quota < 0) + dev->quota += dev->weight; + else + dev->quota = dev->weight; + } else { + dev_put(dev); + local_irq_disable(); + } + } + + local_irq_enable(); br_read_unlock(BR_NETPROTO_LOCK); + return; - local_irq_disable(); +softnet_break: netdev_rx_stat[this_cpu].time_squeeze++; - /* This already runs in BH context, no need to wake up BH's */ - cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); - local_irq_enable(); + __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); - NET_PROFILE_LEAVE(softnet_process); - return; + local_irq_enable(); + br_read_unlock(BR_NETPROTO_LOCK); } static gifconf_func_t * gifconf_list [NPROTO]; @@ -2626,6 +2682,7 @@ int __init net_dev_init(void) if (!dev_boot_phase) return 0; + #ifdef CONFIG_NET_DIVERT dv_init(); #endif /* CONFIG_NET_DIVERT */ @@ -2643,8 +2700,13 @@ int __init net_dev_init(void) queue->cng_level = 0; queue->avg_blog = 10; /* arbitrary non-zero */ queue->completion_queue = NULL; + INIT_LIST_HEAD(&queue->poll_list); + set_bit(__LINK_STATE_START, &queue->backlog_dev.state); + queue->backlog_dev.weight = weight_p; + queue->backlog_dev.poll = process_backlog; + atomic_set(&queue->backlog_dev.refcnt, 1); } - + #ifdef CONFIG_NET_PROFILE net_profile_init(); NET_PROFILE_REGISTER(dev_queue_xmit); @@ -2744,7 +2806,6 @@ int __init net_dev_init(void) #ifdef CONFIG_NET_SCHED pktsched_init(); #endif - /* * Initialise network devices */ diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2f6090a2fc9a..2e24556de974 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -12,6 +12,7 @@ #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; +extern int weight_p; extern int no_cong_thresh; extern int no_cong; extern int lo_cong; @@ -47,6 +48,9 @@ ctl_table core_table[] = { {NET_CORE_RMEM_DEFAULT, "rmem_default", &sysctl_rmem_default, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_CORE_DEV_WEIGHT, "dev_weight", + &weight_p, sizeof(int), 0644, NULL, + &proc_dointvec}, {NET_CORE_MAX_BACKLOG, "netdev_max_backlog", &netdev_max_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59e81c1f0cf8..9e6a18144cbf 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -112,7 +112,7 @@ #include <asm/system.h> #include <asm/uaccess.h> - +#include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. @@ -561,7 +561,8 @@ void arp_send(int type, int ptype, u32 dest_ip, arp_ptr+=dev->addr_len; memcpy(arp_ptr, &dest_ip, 4); - dev_queue_xmit(skb); + /* Send it off, maybe filter it using firewalling first. */ + NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, dev, dev_queue_xmit); return; out: @@ -574,45 +575,31 @@ static void parp_redo(struct sk_buff *skb) } /* - * Receive an arp request by the device layer. + * Process an arp request. 
*/ -int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int arp_process(struct sk_buff *skb) { - struct arphdr *arp = skb->nh.arph; - unsigned char *arp_ptr= (unsigned char *)(arp+1); + struct net_device *dev = skb->dev; + struct in_device *in_dev = in_dev_get(dev); + struct arphdr *arp; + unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha, *tha; u32 sip, tip; u16 dev_type = dev->type; int addr_type; - struct in_device *in_dev = in_dev_get(dev); struct neighbour *n; -/* - * The hardware length of the packet should match the hardware length - * of the device. Similarly, the hardware types should match. The - * device should be ARP-able. Also, if pln is not 4, then the lookup - * is not from an IP number. We can't currently handle this, so toss - * it. - */ - if (in_dev == NULL || - arp->ar_hln != dev->addr_len || - dev->flags & IFF_NOARP || - skb->pkt_type == PACKET_OTHERHOST || - skb->pkt_type == PACKET_LOOPBACK || - arp->ar_pln != 4) + /* arp_rcv below verifies the ARP header, verifies the device + * is ARP'able, and linearizes the SKB (if needed). + */ + + if (in_dev == NULL) goto out; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto out_of_mem; - - if (skb_is_nonlinear(skb)) { - if (skb_linearize(skb, GFP_ATOMIC) != 0) - goto freeskb; - arp = skb->nh.arph; - arp_ptr= (unsigned char *)(arp+1); - } + arp = skb->nh.arph; + arp_ptr= (unsigned char *)(arp+1); switch (dev_type) { default: @@ -827,13 +814,41 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) out: if (in_dev) in_dev_put(in_dev); -freeskb: kfree_skb(skb); -out_of_mem: return 0; } +/* + * Receive an arp request from the device layer. + */ + +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct arphdr *arp = skb->nh.arph; + + if (arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto freeskb; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out_of_mem; + + if (skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) + goto freeskb; + } + + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + +freeskb: + kfree_skb(skb); +out_of_mem: + return 0; +} /* * User level interface (ioctl, /proc) diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in index 47c703c34ddc..7f250e431318 100644 --- a/net/ipv4/netfilter/Config.in +++ b/net/ipv4/netfilter/Config.in @@ -47,6 +47,7 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then define_bool CONFIG_IP_NF_NAT_NEEDED y dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT + bool ' NAT of local connections' CONFIG_IP_NF_NAT_LOCAL if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then dep_tristate ' Basic SNMP-ALG support (EXPERIMENTAL)' CONFIG_IP_NF_NAT_SNMP_BASIC $CONFIG_IP_NF_NAT fi @@ -79,6 +80,11 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then dep_tristate ' TCPMSS target support' CONFIG_IP_NF_TARGET_TCPMSS $CONFIG_IP_NF_IPTABLES fi +tristate 'ARP tables support' CONFIG_IP_NF_ARPTABLES +if [ "$CONFIG_IP_NF_ARPTABLES" != "n" ]; then + dep_tristate ' ARP packet filtering' CONFIG_IP_NF_ARPFILTER $CONFIG_IP_NF_ARPTABLES +fi + # Backwards compatibility modules: only if you don't build in the others. 
if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 6e5a8a1cc0b7..7e1bd4511532 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,7 +9,7 @@ O_TARGET := netfilter.o -export-objs = ip_conntrack_standalone.o ip_conntrack_ftp.o ip_fw_compat.o ip_nat_standalone.o ip_tables.o +export-objs = ip_conntrack_standalone.o ip_conntrack_ftp.o ip_fw_compat.o ip_nat_standalone.o ip_tables.o arp_tables.o # Multipart objects. list-multi := ip_conntrack.o iptable_nat.o ipfwadm.o ipchains.o @@ -75,6 +75,12 @@ obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o +# generic ARP tables +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o + +# just filtering instance of ARP tables for now +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + # backwards compatibility obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += ipchains.o obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c new file mode 100644 index 000000000000..38b4356cf5b3 --- /dev/null +++ b/net/ipv4/netfilter/arp_tables.c @@ -0,0 +1,1313 @@ +/* + * Packet matching code for ARP packets. + * + * Based heavily, if not almost entirely, upon ip_tables.c framework. + * + * Some ARP specific bits are: + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/semaphore.h> + +#include <linux/netfilter_arp/arp_tables.h> + +/*#define DEBUG_ARP_TABLES*/ +/*#define DEBUG_ARP_TABLES_USER*/ + +#ifdef DEBUG_ARP_TABLES +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_ARP_TABLES_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define ARP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("ARP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define ARP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(arpt_mutex); + +#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +struct arpt_table_info { + unsigned int size; + unsigned int number; + unsigned int initial_entries; + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + unsigned int underflow[NF_ARP_NUMHOOKS]; + char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); +}; + +static LIST_HEAD(arpt_target); +static LIST_HEAD(arpt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, + char *hdr_addr, int len) +{ + int i, ret; + + if (len > ARPT_DEV_ADDR_LEN_MAX) + len = ARPT_DEV_ADDR_LEN_MAX; + + ret = 0; + for (i = 0; i < len; i++) + ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; + + return (ret != 0); +} + +/* Returns whether packet matches rule or not. */ +static inline int arp_packet_match(const struct arphdr *arphdr, + struct net_device *dev, + const char *indev, + const char *outdev, + const struct arpt_arp *arpinfo) +{ + char *arpptr = (char *)(arphdr + 1); + char *src_devaddr, *tgt_devaddr; + u32 *src_ipaddr, *tgt_ipaddr; + int i, ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) + + if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, + ARPT_INV_ARPOP)) { + dprintf("ARP operation field mismatch.\n"); + dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", + arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + return 0; + } + + if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, + ARPT_INV_ARPHRD)) { + dprintf("ARP hardware address format mismatch.\n"); + dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", + arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + return 0; + } + + if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, + ARPT_INV_ARPPRO)) { + dprintf("ARP protocol address format mismatch.\n"); + dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", + arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + return 0; + } + + if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, + ARPT_INV_ARPHLN)) { + dprintf("ARP hardware address length mismatch.\n"); + dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", + arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + } + + src_devaddr = arpptr; + arpptr += dev->addr_len; + src_ipaddr = (u32 *) arpptr; + arpptr += sizeof(u32); + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + tgt_ipaddr = (u32 *) arpptr; + + if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), + ARPT_INV_SRCDEVADDR) || + FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), + ARPT_INV_TGTDEVADDR)) { + dprintf("Source or target device address mismatch.\n"); + + return 0; + } + + if (FWINV(((*src_ipaddr) & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, + ARPT_INV_SRCIP) || + 
FWINV((((*tgt_ipaddr) & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), + ARPT_INV_TGTIP)) { + dprintf("Source or target IP address mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(*src_ipaddr), + NIPQUAD(arpinfo->smsk.s_addr), + NIPQUAD(arpinfo->src.s_addr), + arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); + dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(*tgt_ipaddr), + NIPQUAD(arpinfo->tmsk.s_addr), + NIPQUAD(arpinfo->tgt.s_addr), + arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches; this should unroll nicely. */ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)arpinfo->iniface)[i]) + & ((const unsigned long *)arpinfo->iniface_mask)[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, arpinfo->iniface, + arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)arpinfo->outiface)[i]) + & ((const unsigned long *)arpinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, arpinfo->outiface, + arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + return 1; +} + +static inline int arp_checkentry(const struct arpt_arp *arp) +{ + if (arp->flags & ~ARPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + arp->flags & ~ARPT_F_MASK); + return 0; + } + if (arp->invflags & ~ARPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + arp->invflags & ~ARPT_INV_MASK); + return 0; + } + + return 1; +} + +static unsigned int arpt_error(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("arp_tables: error: '%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +{ + return (struct arpt_entry *)(base + offset); +} + +unsigned int arpt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct arpt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ] = { 0 }; + unsigned int verdict = NF_DROP; + struct arphdr *arp = (*pskb)->nh.arph; + int hotdrop = 0; + struct arpt_entry *e, *back; + const char *indev, *outdev; + void *table_base; + + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + read_lock_bh(&table->lock); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, + cpu_number_map(smp_processor_id())); + e = get_entry(table_base, table->private->hook_entry[hook]); + back = get_entry(table_base, table->private->underflow[hook]); + + do { + if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { + struct arpt_entry_target *t; + int hdr_len; + + hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + + (2 * (*pskb)->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); + + t = arpt_get_target(e); + + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct arpt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? 
*/ + if (v != ARPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct arpt_entry *next + = (void *)e + e->next_offset; + next->comefrom = + (void *)back - table_base; + + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + * abs. verdicts + */ + verdict = t->u.kernel.target->target(pskb, + hook, + in, out, + t->data, + userdata); + + /* Target might have changed stuff. */ + arp = (*pskb)->nh.arph; + + if (verdict == ARPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + read_unlock_bh(&table->lock); + + if (hotdrop) + return NF_DROP; + else + return verdict; +} + +static inline void *find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + char modulename[ARPT_FUNCTION_MAXNAMELEN + strlen(prefix) + 1]; + strcpy(modulename, prefix); + strcat(modulename, name); + duprintf("find_inlist: loading `%s'.\n", modulename); + request_module(modulename); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct arpt_table *find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex); +} + +static inline struct arpt_target *find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int unconditional(const struct arpt_arp *arp) +{ + unsigned int i; + + for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) + if (((__u32 *)arp)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + * there are loops. Puts hook bitmask in comefrom. + */ +static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + * to 0 as we leave), and comefrom to save source hook bitmask. + */ + for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct arpt_entry *e + = (struct arpt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct arpt_standard_target *t + = (void *)arpt_get_target(e); + + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { + printk("arptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); + + /* Unconditional return/END. 
*/ + if (e->target_offset == sizeof(struct arpt_entry) + && (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->arp)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + * big jump. + */ + do { + e->comefrom ^= (1<<NF_ARP_NUMHOOKS); + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct arpt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct arpt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct arpt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int standard_check(const struct arpt_entry_target *t, + unsigned int max_offset) +{ + struct arpt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { + duprintf("arpt_standard_check: target size %u != %Zu\n", + t->u.target_size, + ARPT_ALIGN(sizeof(struct arpt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct arpt_entry)) { + duprintf("arpt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("arpt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static struct arpt_target arpt_standard_target; + +static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + if (!arp_checkentry(&e->arp)) { + duprintf("arp_tables: arp check failed %p %s.\n", e, name); + return -EINVAL; + } + + t = arpt_get_target(e); + target = find_target_lock(t->u.user.name, &ret, &arpt_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto out; + } + if (target->me) + __MOD_INC_USE_COUNT(target->me); + t->u.kernel.target = target; + up(&arpt_mutex); + + if (t->u.kernel.target == &arpt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto out; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + if (t->u.kernel.target->me) + __MOD_DEC_USE_COUNT(t->u.kernel.target->me); + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto out; + } + + (*i)++; + return 0; + +out: + return ret; +} + +static inline int check_entry_size_and_hooks(struct arpt_entry *e, + struct arpt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct 
arpt_entry) + sizeof(struct arpt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not ARPT_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct arpt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = arpt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + if (t->u.kernel.target->me) + __MOD_DEC_USE_COUNT(t->u.kernel.target->me); + + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + * newinfo). + */ +static int translate_table(const char *name, + unsigned int valid_hooks, + struct arpt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + + /* Walk through entries, checking offsets. */ + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) { + duprintf("Looping hook\n"); + return -ELOOP; + } + + /* Finally, each sanity check must pass */ + i = 0; + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < smp_num_cpus; i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct arpt_table_info *replace_table(struct arpt_table *table, + unsigned int num_counters, + struct arpt_table_info *newinfo, + int *error) +{ + struct arpt_table_info *oldinfo; + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int add_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void get_counters(const struct arpt_table_info *t, + struct arpt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + i = 0; + ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void *userptr) +{ + unsigned int off, num, countersize; + struct arpt_entry *e; + struct arpt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + * (other than comefrom, which userspace doesn't care + * about). + */ + countersize = sizeof(struct arpt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + struct arpt_entry_target *t; + + e = (struct arpt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct arpt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + t = arpt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct arpt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int get_entries(const struct arpt_get_entries *entries, + struct arpt_get_entries *uptr) +{ + int ret; + struct arpt_table *t; + + t = find_table_lock(entries->name, &ret, &arpt_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&arpt_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int do_replace(void *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct arpt_table *t; + struct arpt_table_info *newinfo, *oldinfo; + struct arpt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(tmp.size) * smp_num_cpus); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("arp_tables: Translated table\n"); + + t = find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto free_newinfo_counters_untrans_unlock; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if (t->me && (oldinfo->number <= oldinfo->initial_entries) && + (newinfo->number > oldinfo->initial_entries)) + __MOD_INC_USE_COUNT(t->me); + else if (t->me && (oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + __MOD_DEC_USE_COUNT(t->me); + + /* Get the old counters. 
*/ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + /* Silent error: too late now. */ + copy_to_user(tmp.counters, counters, + sizeof(struct arpt_counters) * tmp.num_counters); + vfree(counters); + up(&arpt_mutex); + return 0; + + free_newinfo_counters_untrans_unlock: + up(&arpt_mutex); + free_newinfo_counters_untrans: + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. + */ +static inline int add_counter_to_entry(struct arpt_entry *e, + const struct arpt_counters addme[], + unsigned int *i) +{ + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int do_add_counters(void *user, unsigned int len) +{ + unsigned int i; + struct arpt_counters_info tmp, *paddc; + struct arpt_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + ARPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&arpt_mutex); + free: + vfree(paddc); + + return ret; +} + +static int do_arpt_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: { + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + t = find_table_lock(name, &ret, &arpt_mutex); + if (t) { + struct arpt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&arpt_mutex); + } + } + break; + + case ARPT_SO_GET_ENTRIES: { + struct arpt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len 
!= sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int arpt_register_target(struct arpt_target *target) +{ + int ret; + + MOD_INC_USE_COUNT; + ret = down_interruptible(&arpt_mutex); + if (ret != 0) { + MOD_DEC_USE_COUNT; + return ret; + } + if (!list_named_insert(&arpt_target, target)) { + duprintf("arpt_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + MOD_DEC_USE_COUNT; + } + up(&arpt_mutex); + return ret; +} + +void arpt_unregister_target(struct arpt_target *target) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_target, target); + up(&arpt_mutex); + MOD_DEC_USE_COUNT; +} + +int arpt_register_table(struct arpt_table *table) +{ + int ret; + struct arpt_table_info *newinfo; + static struct arpt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + MOD_INC_USE_COUNT; + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(table->table->size) * smp_num_cpus); + if (!newinfo) { + ret = -ENOMEM; + MOD_DEC_USE_COUNT; + return ret; + } + memcpy(newinfo->entries, table->table->entries, table->table->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, table->table->size, + table->table->num_entries, + table->table->hook_entry, + table->table->underflow); + duprintf("arpt_register_table: translate table gives %d\n", ret); + if (ret != 0) { + vfree(newinfo); + MOD_DEC_USE_COUNT; + return ret; + } + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) { + vfree(newinfo); + MOD_DEC_USE_COUNT; + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&arpt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. */ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + table->lock = RW_LOCK_UNLOCKED; + list_prepend(&arpt_tables, table); + + unlock: + up(&arpt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + MOD_DEC_USE_COUNT; + goto unlock; +} + +void arpt_unregister_table(struct arpt_table *table) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_tables, table); + up(&arpt_mutex); + + /* Decrease module usage counts and free resources */ + ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); + MOD_DEC_USE_COUNT; +} + +/* The built-in targets: standard (NULL) and error. 
*/ +static struct arpt_target arpt_standard_target += { { NULL, NULL }, ARPT_STANDARD_TARGET, NULL, NULL, NULL }; +static struct arpt_target arpt_error_target += { { NULL, NULL }, ARPT_ERROR_TARGET, arpt_error, NULL, NULL }; + +static struct nf_sockopt_ops arpt_sockopts += { { NULL, NULL }, PF_INET, ARPT_BASE_CTL, ARPT_SO_SET_MAX+1, do_arpt_set_ctl, + ARPT_BASE_CTL, ARPT_SO_GET_MAX+1, do_arpt_get_ctl, 0, NULL }; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct arpt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int arpt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&arpt_mutex) != 0) + return 0; + + LIST_FIND(&arpt_tables, print_name, struct arpt_table *, + offset, buffer, length, &pos, &count); + + up(&arpt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&arpt_mutex); + list_append(&arpt_target, &arpt_standard_target); + list_append(&arpt_target, &arpt_error_target); + up(&arpt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&arpt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + + proc = proc_net_create("arp_tables_names", 0, arpt_get_tables); + if (!proc) { + nf_unregister_sockopt(&arpt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } +#endif + + printk("arp_tables: (C) 2002 David S. Miller\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&arpt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("arp_tables_names"); +#endif +} + +EXPORT_SYMBOL(arpt_register_table); +EXPORT_SYMBOL(arpt_unregister_table); +EXPORT_SYMBOL(arpt_do_table); +EXPORT_SYMBOL(arpt_register_target); +EXPORT_SYMBOL(arpt_unregister_target); + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c new file mode 100644 index 000000000000..4e11e5b5e006 --- /dev/null +++ b/net/ipv4/netfilter/arptable_filter.c @@ -0,0 +1,174 @@ +/* + * Filtering ARP tables module. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/module.h> +#include <linux/netfilter_arp/arp_tables.h> + +#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT)) + +/* Standard entry. 
*/ +struct arpt_standard +{ + struct arpt_entry entry; + struct arpt_standard_target target; +}; + +struct arpt_error_target +{ + struct arpt_entry_target target; + char errorname[ARPT_FUNCTION_MAXNAMELEN]; +}; + +struct arpt_error +{ + struct arpt_entry entry; + struct arpt_error_target target; +}; + +static struct +{ + struct arpt_replace repl; + struct arpt_standard entries[2]; + struct arpt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 3, + sizeof(struct arpt_standard) * 2 + sizeof(struct arpt_error), + { [NF_ARP_IN] 0, + [NF_ARP_OUT] sizeof(struct arpt_standard) }, + { [NF_ARP_IN] 0, + [NF_ARP_OUT] sizeof(struct arpt_standard), }, + 0, NULL, { } }, + { + /* ARP_IN */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_OUT */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + } + }, + /* ERROR */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_error), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct arpt_table packet_filter += { { NULL, NULL }, "filter", &initial_table.repl, + FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL, THIS_MODULE }; + +/* The work comes in here from netfilter.c */ +static unsigned int arpt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops arpt_ops[] += { { { NULL, NULL }, arpt_hook, NF_ARP, NF_ARP_IN, 0 }, + { { NULL, NULL }, arpt_hook, NF_ARP, NF_ARP_OUT, 0 } +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = arpt_register_table(&packet_filter); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&arpt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&arpt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + +cleanup_hook0: + nf_unregister_hook(&arpt_ops[0]); + +cleanup_table: + arpt_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(arpt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 1846640d2837..5fa94340daf9 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -15,6 +15,7 @@ #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/version.h> +#include <linux/brlock.h> #include <net/checksum.h> #define ASSERT_READ_LOCK(x) 
MUST_BE_READ_LOCKED(&ip_conntrack_lock) @@ -35,6 +36,11 @@ struct module *ip_conntrack_module = THIS_MODULE; MODULE_LICENSE("GPL"); +static int kill_proto(const struct ip_conntrack *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].dst.protonum == + *((u_int8_t *) data)); +} static unsigned int print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple, @@ -304,12 +310,24 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) return ret; } -/* FIXME: Implement this --RR */ -#if 0 void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) { + WRITE_LOCK(&ip_conntrack_lock); + + /* find_proto() returns proto_generic in case there is no protocol + * helper. So this should be enough - HW */ + LIST_DELETE(&protocol_list, proto); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); + + /* Remove all contrack entries for this protocol */ + ip_ct_selective_cleanup(kill_proto, &proto->proto); + + MOD_DEC_USE_COUNT; } -#endif static int __init init(void) { @@ -325,6 +343,7 @@ module_init(init); module_exit(fini); EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); EXPORT_SYMBOL(ip_conntrack_destroyed); @@ -335,6 +354,7 @@ EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_selective_cleanup); EXPORT_SYMBOL(ip_ct_refresh); EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_unexpect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 07ebd39fa115..ebbbde93ead4 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -314,6 +314,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, * do_extra_mangle last time. */ *other_ipp = saved_ip; +#ifdef CONFIG_IP_NF_NAT_LOCAL if (hooknum == NF_IP_LOCAL_OUT && *var_ipp != orig_dstip && !do_extra_mangle(*var_ipp, other_ipp)) { @@ -324,6 +325,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, * anyway. */ continue; } +#endif /* Count how many others map onto this. 
*/ score = count_maps(tuple->src.ip, tuple->dst.ip, @@ -367,11 +369,13 @@ find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple, else { /* Only do extra mangle when required (breaks socket binding) */ +#ifdef CONFIG_IP_NF_NAT_LOCAL if (tuple->dst.ip != mr->range[0].min_ip && hooknum == NF_IP_LOCAL_OUT && !do_extra_mangle(mr->range[0].min_ip, &tuple->src.ip)) return NULL; +#endif tuple->dst.ip = mr->range[0].min_ip; } } @@ -494,7 +498,10 @@ helper_cmp(const struct ip_nat_helper *helper, static unsigned int opposite_hook[NF_IP_NUMHOOKS] = { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING, [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING, - [NF_IP_LOCAL_OUT] = NF_IP_POST_ROUTING +#ifdef CONFIG_IP_NF_NAT_LOCAL + [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN, + [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT, +#endif }; unsigned int diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index 9a3248e68d64..9eacf45d908c 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -140,8 +140,12 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; +#ifdef CONFIG_IP_NF_NAT_LOCAL IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT); +#else + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING); +#endif ct = ip_conntrack_get(*pskb, &ctinfo); @@ -210,7 +214,7 @@ static int ipt_dnat_checkentry(const char *tablename, /* Only allow these for NAT. */ if (strcmp(tablename, "nat") != 0) { - DEBUGP("SNAT: wrong table %s\n", tablename); + DEBUGP("DNAT: wrong table %s\n", tablename); return 0; } @@ -218,6 +222,14 @@ static int ipt_dnat_checkentry(const char *tablename, DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); return 0; } + +#ifndef CONFIG_IP_NF_NAT_LOCAL + if (hook_mask & (1 << NF_IP_LOCAL_OUT)) { + DEBUGP("DNAT: CONFIG_IP_NF_NAT_LOCAL not enabled\n"); + return 0; + } +#endif + return 1; } diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index b36339d2bc3f..b0d299703269 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -42,7 +42,8 @@ #define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \ : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \ : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \ - : "*ERROR*"))) + : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \ + : "*ERROR*"))) static unsigned int ip_nat_fn(unsigned int hooknum, @@ -95,6 +96,12 @@ ip_nat_fn(unsigned int hooknum, } /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ case IP_CT_NEW: +#ifdef CONFIG_IP_NF_NAT_LOCAL + /* LOCAL_IN hook doesn't have a chain and thus doesn't care + * about new packets -HW */ + if (hooknum == NF_IP_LOCAL_IN) + return NF_ACCEPT; +#endif info = &ct->nat.info; WRITE_LOCK(&ip_nat_lock); @@ -205,6 +212,11 @@ static struct nf_hook_ops ip_nat_out_ops static struct nf_hook_ops ip_nat_local_out_ops = { { NULL, NULL }, ip_nat_local_fn, PF_INET, NF_IP_LOCAL_OUT, NF_IP_PRI_NAT_DST }; +#ifdef CONFIG_IP_NF_NAT_LOCAL +static struct nf_hook_ops ip_nat_local_in_ops += { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_NAT_SRC }; +#endif + /* Protocol registration. 
*/ int ip_nat_protocol_register(struct ip_nat_protocol *proto) { @@ -273,6 +285,13 @@ static int init_or_cleanup(int init) printk("ip_nat_init: can't register local out hook.\n"); goto cleanup_outops; } +#ifdef CONFIG_IP_NF_NAT_LOCAL + ret = nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; + } +#endif if (ip_conntrack_module) __MOD_INC_USE_COUNT(ip_conntrack_module); return ret; @@ -280,6 +299,10 @@ static int init_or_cleanup(int init) cleanup: if (ip_conntrack_module) __MOD_DEC_USE_COUNT(ip_conntrack_module); +#ifdef CONFIG_IP_NF_NAT_LOCAL + nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: +#endif nf_unregister_hook(&ip_nat_local_out_ops); cleanup_outops: nf_unregister_hook(&ip_nat_out_ops); diff --git a/net/netsyms.c b/net/netsyms.c index b500684bdaec..abf875169c99 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -490,6 +490,7 @@ EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(skb_clone); EXPORT_SYMBOL(skb_copy); EXPORT_SYMBOL(netif_rx); +EXPORT_SYMBOL(netif_receive_skb); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_remove_pack); EXPORT_SYMBOL(dev_get); |
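
[Editor's note, not part of the patch: the arptable_filter.c hunk above shows the complete registration pattern for the new NF_ARP hook points, so a minimal sketch of the same pattern may help readers who want to tap ARP traffic without going through arptables rules. The sketch assumes only interfaces visible in this patch: NF_ARP and NF_ARP_IN from <linux/netfilter_arp.h>, the positional struct nf_hook_ops layout used for arpt_ops[], and nf_register_hook()/nf_unregister_hook(). The module itself, its name and the packet counter are invented for illustration.]

/* count_arp.c: hypothetical example module, not part of this patch. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_arp.h>

static unsigned long arp_in_count;

/* Same signature as arpt_hook() in arptable_filter.c above. */
static unsigned int count_arp(unsigned int hook,
			      struct sk_buff **pskb,
			      const struct net_device *in,
			      const struct net_device *out,
			      int (*okfn)(struct sk_buff *))
{
	/* arp_rcv() has already sanity-checked the ARP header and
	 * linearized the skb before NF_ARP_IN runs, so (*pskb)->nh.arph
	 * could be inspected here if this hook wanted to filter. */
	arp_in_count++;
	return NF_ACCEPT;
}

/* Initializer layout as in arpt_ops[] above:
 * list, hook function, protocol family, hook number, priority. */
static struct nf_hook_ops count_arp_ops
	= { { NULL, NULL }, count_arp, NF_ARP, NF_ARP_IN, 0 };

static int __init init(void)
{
	return nf_register_hook(&count_arp_ops);
}

static void __exit fini(void)
{
	printk("count_arp: saw %lu incoming ARP packets\n", arp_in_count);
	nf_unregister_hook(&count_arp_ops);
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");

Returning NF_DROP instead of NF_ACCEPT here would discard the packet before arp_process() ever sees it, which is exactly the point at which the new ARP filter table operates.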
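
[Editor's note, also illustration only: arpt_register_target() and arpt_unregister_target() above let extension targets plug into arptables the same way the built-in standard and error targets do. The sketch below is a do-nothing target written under stated assumptions: the target-function signature is copied from arpt_error(), the positional layout of struct arpt_target (list, name, target, checkentry, destroy) is inferred from the arpt_standard_target/arpt_error_target initializers, the 'me' member name is taken from check_entry(), and the "NOOP" target name and module are hypothetical.]

/* arpt_NOOP.c: hypothetical example target, not part of this patch. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter_arp/arp_tables.h>

static unsigned int noop_target(struct sk_buff **pskb,
				unsigned int hooknum,
				const struct net_device *in,
				const struct net_device *out,
				const void *targinfo,
				void *userinfo)
{
	/* Per arpt_do_table(): returning ARPT_CONTINUE would fall through
	 * to the next rule; any other value is taken as the verdict. */
	return NF_ACCEPT;
}

/* Positional layout as in the built-in targets above:
 * list, name, target, checkentry, destroy (both optional, left NULL). */
static struct arpt_target noop_reg
	= { { NULL, NULL }, "NOOP", noop_target, NULL, NULL };

static int __init init(void)
{
	/* 'me' keeps the module pinned while rules reference the target
	 * (see the __MOD_INC_USE_COUNT handling in check_entry()). */
	noop_reg.me = THIS_MODULE;
	return arpt_register_target(&noop_reg);
}

static void __exit fini(void)
{
	arpt_unregister_target(&noop_reg);
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");

Actually selecting such a target from a rule would also need a matching userspace extension for the arptables tool, which is outside the scope of this kernel patch.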
