-rw-r--r--   Documentation/networking/NAPI_HOWTO.txt        749
-rw-r--r--   include/linux/netdevice.h                        87
-rw-r--r--   include/linux/netfilter_arp.h                    19
-rw-r--r--   include/linux/netfilter_arp/arp_tables.h        342
-rw-r--r--   include/linux/netfilter_ipv4/ip_nat.h             5
-rw-r--r--   include/linux/sysctl.h                            3
-rw-r--r--   net/core/dev.c                                  281
-rw-r--r--   net/core/sysctl_net_core.c                        4
-rw-r--r--   net/ipv4/arp.c                                   77
-rw-r--r--   net/ipv4/netfilter/Config.in                      6
-rw-r--r--   net/ipv4/netfilter/Makefile                       8
-rw-r--r--   net/ipv4/netfilter/arp_tables.c                1313
-rw-r--r--   net/ipv4/netfilter/arptable_filter.c            174
-rw-r--r--   net/ipv4/netfilter/ip_conntrack_standalone.c     26
-rw-r--r--   net/ipv4/netfilter/ip_nat_core.c                  9
-rw-r--r--   net/ipv4/netfilter/ip_nat_rule.c                 14
-rw-r--r--   net/ipv4/netfilter/ip_nat_standalone.c           25
-rw-r--r--   net/netsyms.c                                     1
18 files changed, 2993 insertions, 150 deletions
diff --git a/Documentation/networking/NAPI_HOWTO.txt b/Documentation/networking/NAPI_HOWTO.txt
new file mode 100644
index 000000000000..44811d36db49
--- /dev/null
+++ b/Documentation/networking/NAPI_HOWTO.txt
@@ -0,0 +1,749 @@
+HISTORY:
+February 16/2002 -- revision 0.2.1:
+COR typo corrected
+February 10/2002 -- revision 0.2:
+some spell checking ;->
+January 12/2002 -- revision 0.1
+This is still a work in progress, so it may change.
+To keep up to date please watch this space.
+
+Introduction to NAPI
+====================
+
+NAPI is a proven (www.cyberus.ca/~hadi/usenix-paper.tgz) technique
+to improve network performance on Linux. For more details please
+read that paper.
+NAPI provides an "inherent mitigation" which is bound by system capacity
+as can be seen from the following data collected by Robert on Gigabit
+ethernet (e1000):
+
+ Psize Ipps Tput Rxint Txint Done Ndone
+ ---------------------------------------------------------------
+ 60 890000 409362 17 27622 7 6823
+ 128 758150 464364 21 9301 10 7738
+ 256 445632 774646 42 15507 21 12906
+ 512 232666 994445 241292 19147 241192 1062
+ 1024 119061 1000003 872519 19258 872511 0
+ 1440 85193 1000003 946576 19505 946569 0
+
+
+Legend:
+"Ipps" stands for input packets per second.
+"Tput" == packets out of total 1M that made it out.
+"txint" == transmit completion interrupts seen
+"Done" == The number of times that the poll() managed to pull all
+packets out of the rx ring. Note from this that the lower the
+load the more we could clean up the rxring
+"Ndone" == is the converse of "Done". Note again, that the higher
+the load the more times we couldnt clean up the rxring.
+
+Observe that:
+when the NIC receives 890K packets/sec, only 17 rx interrupts are generated.
+The system can't handle the processing at 1 interrupt/packet at that load
+level. At lower rates, on the other hand, rx interrupts go up and therefore
+the interrupt/packet ratio goes up (as observable from that table). So there
+is a possibility that under low enough input, you get one poll call for each
+input packet caused by a single interrupt each time. And if the system
+can't handle an interrupt-per-packet ratio of 1, then it will just have to
+chug along ....
+
+
+0) Prerequisites:
+==================
+A driver MAY continue using the old 2.4 technique for interfacing
+to the network stack and not benefit from the NAPI changes.
+NAPI additions to the kernel do not break backward compatibility.
+NAPI, however, requires the following features to be available:
+
+A) DMA ring or enough RAM to store packets in software devices.
+
+B) Ability to turn off interrupts or maybe events that send packets up
+the stack.
+
+NAPI processes packet events in what is known as the dev->poll() method.
+Typically, only packet receive events are processed in dev->poll().
+The rest of the events MAY be processed by the regular interrupt handler
+to reduce processing latency (justified also because there are not that
+many of them).
+Note, however, NAPI does not enforce that dev->poll() only processes
+receive events.
+Tests with the tulip driver indicated slightly increased latency if
+all of the interrupt handling is moved to dev->poll(). Also MII handling
+gets a little trickier.
+The example used in this document moves only the receive processing
+to dev->poll(); this is shown with the patch for the tulip driver.
+For an example of code that moves all of the interrupt handling to
+dev->poll(), look at the ported e1000 code.
+
+There are caveats that might force you to move everything to
+dev->poll(). Different NICs work differently depending on their status/event
+acknowledgement setup.
+There are two types of event register ACK mechanisms.
+	I) what is known as Clear-on-read (COR):
+	when you read the status/event register, it clears everything!
+	The natsemi and sunbmac NICs are known to do this.
+	In this case your only choice is to move everything to dev->poll().
+
+	II) Clear-on-write (COW):
+	i) you clear the status by writing a 1 to the bit-location you
+	want cleared; see the ack sketch below. These are the majority of
+	the NICs and work best with NAPI. Put only receive events in
+	dev->poll(); leave the rest in the old interrupt handler.
+	ii) whatever you write in the status register clears everything ;->
+	We can't seem to find any chip supported by Linux which does this.
+	If someone knows of such a chip, please email us.
+	Move everything to dev->poll().
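+
+For COW type i) chips, acking "everything except packet-arrival sources"
+looks roughly like the sketch below (the register and bit names are
+hypothetical, not from any real chip; section 3 shows where this fits
+into the full handler):
+
+------------------
+	/* writing a 1 to a bit-location clears that bit (COW type i);
+	 * leave rx and rxnobuff set so that poll()/refill_rx_ring()
+	 * can clear them when the real work has been done */
+	writel(status & ~(rx_interrupt | rx_nobuffs),
+	       ioaddr + interrupt_status_reg);
+------------------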
+
+C) Ability to detect new work correctly.
+NAPI works by shutting down event interrupts when there's work and
+turning them back on when there's none.
+New packets might show up in the small window while interrupts are being
+re-enabled (refer to appendix 2). A packet might sneak in during the period
+we are enabling interrupts. We only get to know about such a packet when the
+next new packet arrives and generates an interrupt.
+Essentially, there is a small window of opportunity for a race condition,
+which for clarity we'll refer to as the "rotting packet" race.
+
+This is a very important topic and appendix 2 is dedicated to more
+discussion of it.
+
+Locking rules and environmental guarantees
+==========================================
+
+-Guarantee: Only one CPU at any time can call dev->poll(); this is because
+only one CPU can pick up the initial interrupt and hence the initial
+netif_rx_schedule(dev);
+- The core layer invokes devices to send packets in a round-robin format.
+This implies receive is totally lockless because of the guarantee that only
+one CPU is executing it.
+- contention can only be the result of some other CPU accessing the rx
+ring. This happens only in close() and suspend() (when these methods
+try to clean the rx ring);
+****guarantee: driver authors need not worry about this; synchronization
+is taken care of for them by the top net layer.
+-local interrupts are enabled (if you don't move everything to dev->poll()).
+For example, link/MII and txcomplete continue functioning just the same old
+way. This improves the latency of processing these events. It is also assumed
+that the receive interrupt is the largest cause of noise. Note this might not
+always be true.
+[according to Manfred Spraul, the winbond insists on sending one
+txmitcomplete interrupt for each packet (although this can be mitigated)].
+For these broken drivers, move everything to dev->poll().
+
+For the rest of this text, we'll assume that dev->poll() only
+processes receive events.
+
+New methods introduced by NAPI
+==============================
+
+a) netif_rx_schedule(dev)
+Called by an IRQ handler to schedule a poll for the device.
+
+b) netif_rx_schedule_prep(dev)
+Puts the device in a state which allows it to be added to the
+CPU polling list if it is up and running. You can look at this as
+the first half of netif_rx_schedule(dev) above; the second half
+being c) below.
+
+c) __netif_rx_schedule(dev)
+Adds the device to the poll list for this CPU, assuming that _prep above
+has already been called and returned 1.
+
+d) netif_rx_reschedule(dev, undo)
+Called to reschedule polling for the device, specifically for some
+deficient hardware. Read Appendix 2 for more details.
+
+e) netif_rx_complete(dev)
+
+Removes the interface from the CPU poll list: it must be on the poll list
+of the current cpu. This primitive is called by dev->poll() when
+it completes its work. The device cannot be out of the poll list at this
+call; if it is, then clearly it is a BUG(). You'll know ;->
+
+All of the above methods are used below, so keep reading for clarity.
+
+Device driver changes to be made when porting NAPI
+==================================================
+
+Below we describe what kind of changes are required for NAPI to work.
+
+1) introduction of dev->poll() method
+=====================================
+
+This is the method that is invoked by the network core when it requests
+new packets from the driver. A driver is allowed to send up to
+dev->quota packets up the stack on the current CPU before yielding to the
+network subsystem (so other devices can also get the opportunity to send
+to the stack).
+
+dev->poll() prototype looks as follows:
+int my_poll(struct net_device *dev, int *budget)
+
+budget is the remaining number of packets the network subsystem on the
+current CPU can send up the stack before yielding to other system tasks.
+*Each driver is responsible for decrementing budget by the total number of
+packets sent.
+ The total number of packets sent cannot exceed dev->quota.
+
+The dev->poll() method is invoked by the top layer; the driver just sends
+up to the requested number of packets to the stack, if it can.
+
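+A minimal sketch of the accounting contract this implies (the helpers
+my_rx_work() and enable_rx_ints() are hypothetical; section 4 below
+shows a full my_poll()):
+
+------------------
+static int my_poll(struct net_device *dev, int *budget)
+{
+	/* never pull more packets than both limits allow */
+	int limit = min(*budget, dev->quota);
+	int received = my_rx_work(dev, limit);	/* packets sent up */
+
+	dev->quota -= received;
+	*budget -= received;
+
+	if (received < limit) {			/* ring drained in time */
+		netif_rx_complete(dev);		/* leave the poll list */
+		enable_rx_ints(dev);
+		return 0;			/* done */
+	}
+	return 1;	/* not done; the core will poll us again */
+}
+------------------
+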
+more on dev->poll() below after the interrupt changes are explained.
+
+2) registering dev->poll() method
+===================================
+
+dev->poll should be set in the dev->probe() method.
+e.g:
+dev->open = my_open;
+.
+.
+/* two new additions */
+/* first register my poll method */
+dev->poll = my_poll;
+/* next register my weight/quanta; can be overridden in /proc */
+dev->weight = 16;
+.
+.
+dev->stop = my_close;
+
+
+
+3) scheduling dev->poll()
+=============================
+This involves modifying the interrupt handler and the code
+path which takes packets off the NIC and sends them to the
+stack.
+
+It's important at this point to introduce the classical D. Becker
+interrupt processor:
+
+------------------
+static void
+netdevice_interrupt(int irq, void *dev_instance, struct pt_regs *regs)
+{
+
+ struct net_device *dev = (struct net_device *)dev_instance;
+ struct my_private *tp = (struct my_private *)dev->priv;
+
+ int work_count = my_work_count;
+ int status;
+
+ status = read_interrupt_status_reg();
+ if (status == 0)
+ return; /* Shared IRQ: not us */
+ if (status == 0xffff)
+ return; /* Hot unplug */
+ if (status & error)
+ do_some_error_handling();
+
+ do {
+ acknowledge_ints_ASAP();
+
+ if (status & link_interrupt) {
+ spin_lock(&tp->link_lock);
+ do_some_link_stat_stuff();
+ spin_unlock(&tp->link_lock);
+ }
+
+ if (status & rx_interrupt) {
+ receive_packets(dev);
+ }
+
+ if (status & rx_nobufs) {
+ make_rx_buffs_avail();
+ }
+
+ if (status & tx_related) {
+ spin_lock(&tp->lock);
+ tx_ring_free(dev);
+ if (tx_died)
+ restart_tx();
+ spin_unlock(&tp->lock);
+ }
+
+ status = read_interrupt_status_reg();
+
+ } while (!(status & error) && more_work_to_be_done);
+
+}
+
+----------------------------------------------------------------------
+
+We now change this to what is shown below to NAPI-enable it:
+
+----------------------------------------------------------------------
+static void
+netdevice_interrupt(int irq, void *dev_instance, struct pt_regs *regs)
+{
+ struct net_device *dev = (struct net_device *)dev_instance;
+ struct my_private *tp = (struct my_private *)dev->priv;
+ int status;
+
+ status = read_interrupt_status_reg();
+ if (status == 0)
+ return; /* Shared IRQ: not us */
+ if (status == 0xffff)
+ return; /* Hot unplug */
+ if (status & error)
+ do_some_error_handling();
+
+ do {
+/************************ start note *********************************/
+ acknowledge_ints_ASAP(); // don't ack rx and rxnobuff here
+/************************ end note *********************************/
+
+ if (status & link_interrupt) {
+ spin_lock(&tp->link_lock);
+ do_some_link_stat_stuff();
+ spin_unlock(&tp->link_lock);
+ }
+/************************ start note *********************************/
+ if ((status & rx_interrupt) || (status & rx_nobuffs)) {
+ if (netif_rx_schedule_prep(dev)) {
+
+ /* disable interrupts caused
+ * by arriving packets */
+ disable_rx_and_rxnobuff_ints();
+ /* tell system we have work to be done. */
+ __netif_rx_schedule(dev);
+ } else {
+ printk("driver bug! interrupt while in poll\n");
+ /* FIX by disabling interrupts */
+ disable_rx_and_rxnobuff_ints();
+ }
+ }
+/************************ end note *********************************/
+
+ if (status & tx_related) {
+ spin_lock(&tp->lock);
+ tx_ring_free(dev);
+
+ if (tx_died)
+ restart_tx();
+ spin_unlock(&tp->lock);
+ }
+
+ status = read_interrupt_status_reg();
+
+/************************ start note *********************************/
+ } while (!(status & error) && more_work_to_be_done(status));
+/************************ end note *********************************/
+
+}
+
+---------------------------------------------------------------------
+
+
+We note several things from above:
+
+I) Any interrupt source which is caused by arriving packets is now
+turned off when it occurs. Depending on the hardware, there could be
+several reasons that arriving packets would cause interrupts; these are the
+interrupt sources we wish to avoid. The two common ones are a) a packet
+arriving (rxint) and b) a packet arriving and finding no DMA buffers
+available (rxnobuff).
+This also means acknowledge_ints_ASAP() will not clear the status
+register for those two items above; clearing is done in the place where
+proper work is done within NAPI, i.e. in poll() and refill_rx_ring(),
+both discussed further below.
+netif_rx_schedule_prep() returns 1 if the device is in a running state and
+was successfully added to the core poll list. If we get a zero value
+we can _almost_ assume we are already on the list (rather than not running;
+the logic is based on the fact that you shouldn't get an interrupt if not
+running). We rectify this by disabling rx and rxnobuf interrupts.
+
+II) receive_packets(dev) and make_rx_buffs_avail() may have disappeared.
+These functionalities are actually still around.
+In fact, receive_packets(dev) is very close to my_poll() and
+make_rx_buffs_avail() is invoked from my_poll().
+
+4) converting receive_packets() to dev->poll()
+===============================================
+
+We need to convert the classical D. Becker receive_packets(dev) to my_poll().
+
+First the typical receive_packets() below:
+-------------------------------------------------------------------
+
+/* this is called by interrupt handler */
+static void receive_packets (struct net_device *dev)
+{
+
+ struct my_private *tp = (struct my_private *)dev->priv;
+ rx_ring = tp->rx_ring;
+ cur_rx = tp->cur_rx;
+ int entry = cur_rx % RX_RING_SIZE;
+ int received = 0;
+ int rx_work_limit = tp->dirty_rx + RX_RING_SIZE - tp->cur_rx;
+
+ while (rx_ring_not_empty) {
+ u32 rx_status;
+ unsigned int rx_size;
+ unsigned int pkt_size;
+ struct sk_buff *skb;
+ /* read size+status of next frame from DMA ring buffer */
+ /* the number 16 and 4 are just examples */
+ rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset));
+ rx_size = rx_status >> 16;
+ pkt_size = rx_size - 4;
+
+ /* process errors */
+ if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
+ (!(rx_status & RxStatusOK))) {
+ netdrv_rx_err (rx_status, dev, tp, ioaddr);
+ return;
+ }
+
+ if (--rx_work_limit < 0)
+ break;
+
+ /* grab a skb */
+ skb = dev_alloc_skb (pkt_size + 2);
+ if (skb) {
+ .
+ .
+ netif_rx (skb);
+ .
+ .
+ } else { /* OOM */
+ /*seems very driver specific ... some just pass
+ whatever is on the ring already. */
+ }
+
+ /* move to the next skb on the ring */
+ entry = (++cur_rx) % RX_RING_SIZE;
+ received++ ;
+
+ }
+
+ /* store current ring pointer state */
+ tp->cur_rx = cur_rx;
+
+ /* Refill the Rx ring buffers if they are needed */
+ refill_rx_ring();
+ .
+ .
+
+}
+-------------------------------------------------------------------
+We change it to a new one below; note the additional parameter in
+the call.
+
+-------------------------------------------------------------------
+
+/* this is called by the network core */
+static int my_poll (struct net_device *dev, int *budget)
+{
+
+ struct my_private *tp = (struct my_private *)dev->priv;
+ rx_ring = tp->rx_ring;
+ cur_rx = tp->cur_rx;
+ int entry = cur_rx % RX_RING_SIZE;
+ int received = 0;
+ /* maximum packets to send to the stack */
+/************************ start note *********************************/
+ int rx_work_limit = dev->quota;
+
+/************************ end note note *********************************/
+ do { // outer beginning loop starts here
+
+ clear_rx_status_register_bit();
+
+ while (rx_ring_not_empty) {
+ u32 rx_status;
+ unsigned int rx_size;
+ unsigned int pkt_size;
+ struct sk_buff *skb;
+ /* read size+status of next frame from DMA ring buffer */
+ /* the number 16 and 4 are just examples */
+ rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset));
+ rx_size = rx_status >> 16;
+ pkt_size = rx_size - 4;
+
+ /* process errors */
+ if ((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
+ (!(rx_status & RxStatusOK))) {
+ netdrv_rx_err (rx_status, dev, tp, ioaddr);
+ return;
+ }
+
+/************************ start note *********************************/
+ if (--rx_work_limit < 0) { /* we got packets, but no quota */
+ /* store current ring pointer state */
+ tp->cur_rx = cur_rx;
+
+ /* Refill the Rx ring buffers if they are needed */
+ refill_rx_ring(dev);
+ goto not_done;
+ }
+/********************** end note **********************************/
+
+ /* grab a skb */
+ skb = dev_alloc_skb (pkt_size + 2);
+ if (skb) {
+ .
+ .
+/************************ start note *********************************/
+ netif_receive_skb (skb);
+/********************** end note **********************************/
+ .
+ .
+ } else { /* OOM */
+ /* seems very driver specific ... common practice is to just
+ pass on whatever is on the ring already. */
+ }
+
+ /* move to the next skb on the ring */
+ entry = (++cur_rx) % RX_RING_SIZE;
+ received++ ;
+
+ }
+
+ /* store current ring pointer state */
+ tp->cur_rx = cur_rx;
+
+ /* Refill the Rx ring buffers if they are needed */
+ refill_rx_ring(dev);
+
+ /* no packets on ring; but new ones can arrive since we last
+ checked */
+ status = read_interrupt_status_reg();
+ if (!rx_status_is_set) {
+ /* If something arrives in this narrow window,
+ an interrupt will be generated */
+ goto done;
+ }
+ /* done! at least that's what it looks like ;->
+ if new packets came in after our last check on status bits
+ they'll be caught by the while check and we go back and clear them
+ since we haven't exceeded our quota */
+ } while (rx_status_is_set);
+
+done:
+
+/************************ start note *********************************/
+ dev->quota -= received;
+ *budget -= received;
+
+ /* If RX ring is not full we are out of memory. */
+ if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
+ goto oom;
+
+ /* we are happy/done, no more packets on ring; put us back
+ to where we can start processing interrupts again */
+ netif_rx_complete(dev);
+ enable_rx_and_rxnobuf_ints();
+
+ /* The last op happens after poll completion. Which means the following:
+ * 1. it can race with disabling irqs in irq handler (which are done to
+ * schedule polls)
+ * 2. it can race with dis/enabling irqs in other poll threads
+ * 3. if an irq is raised after the start of the outer beginning
+ * loop (marked in the code above), it will be immediately
+ * triggered here.
+ *
+ * Summarizing: the logic may result in some redundant irqs both
+ * due to races in masking and due to too late acking of already
+ * processed irqs. The good news: no events are ever lost.
+ */
+
+ return 0; /* done */
+
+not_done:
+ if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 ||
+ tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
+ refill_rx_ring(dev);
+
+ if (!received) {
+ printk("received==0\n");
+ received = 1;
+ }
+ dev->quota -= received;
+ *budget -= received;
+ return 1; /* not_done */
+
+oom:
+ /* Start timer, stop polling, but do not enable rx interrupts. */
+ start_poll_timer(dev);
+ return 0; /* we'll take it from here so tell core "done"*/
+
+/************************ end note *********************************/
+}
+-------------------------------------------------------------------
+
+From above we note that:
+0) rx_work_limit = dev->quota
+1) refill_rx_ring() is in charge of clearing the bit for rxnobuff when
+it does the work (a hedged sketch of this follows the list).
+2) We have a done and a not_done state.
+3) Instead of netif_rx() we call netif_receive_skb() to pass the skb.
+4) We have a new way of handling an oom condition.
+5) A new outer do { } while() loop has been added. This serves the purpose
+of ensuring that, if a new packet comes in after we are all set and done and
+we have not exceeded our quota, we continue sending packets up.
+
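+A hedged sketch of the refill_rx_ring() obligation from note 1) above;
+give_buffer_to_nic(), clear_rxnobuff_status_bit() and PKT_BUF_SZ are
+driver-specific placeholders, not real API:
+
+------------------
+static void refill_rx_ring(struct net_device *dev)
+{
+	struct my_private *tp = (struct my_private *)dev->priv;
+
+	while (tp->cur_rx - tp->dirty_rx > 0) {
+		unsigned int entry = tp->dirty_rx % RX_RING_SIZE;
+		struct sk_buff *skb = dev_alloc_skb(PKT_BUF_SZ);
+
+		if (skb == NULL)
+			break;	/* still OOM; the caller handles it */
+		skb->dev = dev;
+		/* hand the buffer to the NIC at 'entry' (DMA map etc.) */
+		give_buffer_to_nic(tp, entry, skb);
+		tp->dirty_rx++;
+	}
+	/* buffers are available again: only now is it safe to ack
+	 * (clear) the rxnobuff status bit */
+	clear_rxnobuff_status_bit();
+}
+------------------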
+
+-----------------------------------------------------------
+Poll timer code will need to do the following:
+
+a)
+
+ if (tp->cur_rx - tp->dirty_rx > RX_RING_SIZE/2 ||
+ tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
+ refill_rx_ring(dev);
+
+ /* If RX ring is not full we are still out of memory.
+ Restart the timer again. Else we re-add ourselves
+ to the master poll list.
+ */
+
+ if (tp->rx_buffers[tp->dirty_rx % RX_RING_SIZE].skb == NULL)
+ restart_timer();
+
+ else netif_rx_schedule(dev); /* we are back on the poll list */
+
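+A hedged sketch of the timer plumbing assumed above (the field oom_timer
+and the helpers my_poll_timer()/start_poll_timer() are hypothetical;
+init_timer(), add_timer() and HZ are the stock 2.4 timer API):
+
+------------------
+static void my_poll_timer(unsigned long data)
+{
+	struct net_device *dev = (struct net_device *)data;
+
+	/* run the a) logic above: try to refill, then either
+	 * restart the timer or get back on the poll list */
+	.
+	.
+}
+
+static void start_poll_timer(struct net_device *dev)
+{
+	struct my_private *tp = (struct my_private *)dev->priv;
+
+	init_timer(&tp->oom_timer);
+	tp->oom_timer.function = my_poll_timer;
+	tp->oom_timer.data = (unsigned long)dev;
+	tp->oom_timer.expires = jiffies + HZ/10;	/* retry in ~100ms */
+	add_timer(&tp->oom_timer);
+}
+------------------
+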
+5) dev->close() and dev->suspend() issues
+==========================================
+The driver writer needn't worry about this; the top net layer takes
+care of it.
+
+6) Adding new Stats to /proc
+=============================
+In order to debug some of the new features, we introduce new stats
+that need to be collected.
+TODO: Fill this later.
+
+APPENDIX 1: discussion on using ethernet HW FC
+==============================================
+Most chips with FC only send a pause packet when they run out of Rx buffers.
+Since packets are pulled off the DMA ring by a softirq in NAPI,
+if the system is slow in grabbing them and we have a high input
+rate (faster than the system's capacity to remove packets), then theoretically
+there will only be one rx interrupt for all packets during a given packetstorm.
+Under low load, we might have a single interrupt per packet.
+FC should be programmed to apply in the case when the system can't pull out
+packets fast enough, i.e. send a pause only when you run out of rx buffers.
+Note FC in itself is a good solution but we have found it to not be
+much of a commodity feature (both in NICs and switches) and hence it falls
+under the same category as using NIC-based mitigation. Also, experiments
+indicate that it is much harder to resolve the resource allocation
+issue (aka the lazy receiving that NAPI offers) and hence quantifying its
+usefulness proved harder. In any case, FC works even better with NAPI but
+is not necessary.
+
+
+APPENDIX 2: the "rotting packet" race-window avoidance scheme
+=============================================================
+
+There are two types of associations seen here:
+
+1) status/int which honors level-triggered IRQs
+
+If a status bit for receive or rxnobuff is set and the corresponding
+interrupt-enable bit is not on, then no interrupts will be generated. However,
+as soon as the "interrupt-enable" bit is unmasked, an immediate interrupt is
+generated [assuming the status bit was not turned off].
+Generally the concept of level-triggered IRQs in association with a status and
+interrupt-enable CSR register set is used to avoid the race.
+
+If we take the example of the tulip:
+"pending work" is indicated by the status bit (CSR5 in the tulip).
+The corresponding interrupt-enable bit (CSR7 in the tulip) might be turned
+off (but CSR5 will continue to be turned on with new packet arrivals even
+if we clear it the first time).
+Very important is the fact that if we turn the interrupt bit on when
+status is set, an immediate irq is triggered.
+
+If we cleared the rx ring and proclaimed there was "no more work
+to be done" and then went on to do a few other things, then when we enable
+interrupts there is a possibility that a new packet might sneak in during
+this phase. It helps to look at the pseudo code for the tulip poll
+routine:
+
+--------------------------
+ do {
+ ACK;
+ while (ring_is_not_empty()) {
+ work-work-work
+ if quota is exceeded: exit, no touching irq status/mask
+ }
+ /* No packets, but new can arrive while we are doing this*/
+ CSR5 := read
+ if (CSR5 is not set) {
+ /* If something arrives in this narrow window here,
+ * where the comments are ;-> irq will be generated */
+ unmask irqs;
+ exit poll;
+ }
+ } while (rx_status_is_set);
+------------------------
+
+The CSR5 bit of interest is only the rx status.
+If you look at the last if statement:
+you just finished grabbing all the packets from the rx ring; you check if
+the status bit says there are more packets just in ... it says none; you then
+enable rx interrupts again; if a new packet just came in during this check,
+we are counting on CSR5 being set in that small window of opportunity,
+so that re-enabling interrupts would actually trigger an interrupt
+to register the new packet for processing.
+
+[The above description may be very verbose; if you have better wording
+that will make this more understandable, please suggest it.]
+
+2) non-capable hardware
+
+These do not generally respect level-triggered IRQs. Normally,
+irqs may be lost while being masked and the only way to leave poll is to do
+a double check for new input after netif_rx_complete() is invoked
+and re-enable polling (after seeing this new input).
+
+Sample code:
+
+---------
+ .
+ .
+restart_poll:
+ while (ring_is_not_empty()) {
+ work-work-work
+ if quota is exceeded: exit, not touching irq status/mask
+ }
+ .
+ .
+ .
+ enable_rx_interrupts()
+ netif_rx_complete(dev);
+ if (ring_has_new_packet() && netif_rx_reschedule(dev, received)) {
+ disable_rx_and_rxnobufs();
+ goto restart_poll;
+ }
+---------
+
+Basically netif_rx_complete() removes us from the poll list, but because a
+new packet might come in during the race window (and would otherwise never
+be caught), we check again and re-add ourselves to the poll list.
+
+
+
+--------------------------------------------------------------------
+
+relevant sites:
+==================
+ftp://robur.slu.se/pub/Linux/net-development/NAPI/
+
+
+--------------------------------------------------------------------
+TODO: Write net-skeleton.c driver.
+-------------------------------------------------------------
+
+Authors:
+========
+Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+Jamal Hadi Salim <hadi@cyberus.ca>
+Robert Olsson <Robert.Olsson@data.slu.se>
+
+Acknowledgements:
+================
+People who made this document better:
+
+Lennert Buytenhek <buytenh@gnu.org>
+Andrew Morton <akpm@zip.com.au>
+Manfred Spraul <manfred@colorfullife.com>
+Donald Becker <becker@scyld.com>
+Jeff Garzik <jgarzik@mandrakesoft.com>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 178f6a5a0fe6..32b6db3c7a2c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -206,7 +206,8 @@ enum netdev_state_t
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_SCHED,
- __LINK_STATE_NOCARRIER
+ __LINK_STATE_NOCARRIER,
+ __LINK_STATE_RX_SCHED
};
@@ -330,6 +331,10 @@ struct net_device
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
+ struct list_head poll_list; /* Link to poll list */
+ int quota;
+ int weight;
+
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
struct Qdisc *qdisc_list;
@@ -373,6 +378,7 @@ struct net_device
int (*stop)(struct net_device *dev);
int (*hard_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
+ int (*poll) (struct net_device *dev, int *quota);
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
@@ -492,8 +498,11 @@ struct softnet_data
int cng_level;
int avg_blog;
struct sk_buff_head input_pkt_queue;
+ struct list_head poll_list;
struct net_device *output_queue;
struct sk_buff *completion_queue;
+
+ struct net_device backlog_dev; /* Sorry. 8) */
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
@@ -547,6 +556,7 @@ static inline int netif_running(struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
+
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
*/
@@ -578,6 +588,8 @@ static inline void dev_kfree_skb_any(struct sk_buff *skb)
extern void net_call_rx_atomic(void (*fn)(void));
#define HAVE_NETIF_RX 1
extern int netif_rx(struct sk_buff *skb);
+#define HAVE_NETIF_RECEIVE_SKB 1
+extern int netif_receive_skb(struct sk_buff *skb);
extern int dev_ioctl(unsigned int cmd, void *);
extern int dev_change_flags(struct net_device *, unsigned);
extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
@@ -695,6 +707,78 @@ enum {
#define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA)
+/* Schedule rx intr now? */
+
+static inline int netif_rx_schedule_prep(struct net_device *dev)
+{
+ return netif_running(dev) &&
+ !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
+}
+
+/* Add interface to tail of rx poll list. This assumes that _prep has
+ * already been called and returned 1.
+ */
+
+static inline void __netif_rx_schedule(struct net_device *dev)
+{
+ unsigned long flags;
+ int cpu = smp_processor_id();
+
+ local_irq_save(flags);
+ dev_hold(dev);
+ list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
+ if (dev->quota < 0)
+ dev->quota += dev->weight;
+ else
+ dev->quota = dev->weight;
+ __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+/* Try to reschedule poll. Called by irq handler. */
+
+static inline void netif_rx_schedule(struct net_device *dev)
+{
+ if (netif_rx_schedule_prep(dev))
+ __netif_rx_schedule(dev);
+}
+
+/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().
+ * Do not inline this?
+ */
+static inline int netif_rx_reschedule(struct net_device *dev, int undo)
+{
+ if (netif_rx_schedule_prep(dev)) {
+ unsigned long flags;
+ int cpu = smp_processor_id();
+
+ dev->quota += undo;
+
+ local_irq_save(flags);
+ list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list);
+ __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
+ local_irq_restore(flags);
+ return 1;
+ }
+ return 0;
+}
+
+/* Remove interface from poll list: it must be in the poll list
+ * on current cpu. This primitive is called by dev->poll(), when
+ * it completes the work. The device cannot be out of poll list at this
+ * moment, it is BUG().
+ */
+static inline void netif_rx_complete(struct net_device *dev)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG();
+ list_del(&dev->poll_list);
+ clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
+ local_irq_restore(flags);
+}
+
/* These functions live elsewhere (drivers/net/net_init.c, but related) */
extern void ether_setup(struct net_device *dev);
@@ -719,6 +803,7 @@ extern void dev_mcast_init(void);
extern int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev));
extern void netdev_unregister_fc(int bit);
extern int netdev_max_backlog;
+extern int weight_p;
extern unsigned long netdev_fc_xoff;
extern atomic_t netdev_dropping;
extern int netdev_set_master(struct net_device *dev, struct net_device *master);
diff --git a/include/linux/netfilter_arp.h b/include/linux/netfilter_arp.h
new file mode 100644
index 000000000000..4f460b3b0cba
--- /dev/null
+++ b/include/linux/netfilter_arp.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_ARP_NETFILTER_H
+#define __LINUX_ARP_NETFILTER_H
+
+/* ARP-specific defines for netfilter.
+ * (C)2002 Rusty Russell IBM -- This code is GPL.
+ */
+
+#include <linux/config.h>
+#include <linux/netfilter.h>
+
+/* There is no PF_ARP. */
+#define NF_ARP 0
+
+/* ARP Hooks */
+#define NF_ARP_IN 0
+#define NF_ARP_OUT 1
+#define NF_ARP_NUMHOOKS 2
+
+#endif /* __LINUX_ARP_NETFILTER_H */
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
new file mode 100644
index 000000000000..7b11e236c7c9
--- /dev/null
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -0,0 +1,342 @@
+/*
+ * Format of an ARP firewall descriptor
+ *
+ * src, tgt, src_mask, tgt_mask, arpop, arpop_mask are always stored in
+ * network byte order.
+ * flags are stored in host byte order (of course).
+ */
+
+#ifndef _ARPTABLES_H
+#define _ARPTABLES_H
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#include <linux/types.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#endif
+
+#include <linux/netfilter_arp.h>
+
+#define ARPT_FUNCTION_MAXNAMELEN 30
+#define ARPT_TABLE_MAXNAMELEN 32
+
+#define ARPT_DEV_ADDR_LEN_MAX 16
+
+struct arpt_devaddr_info {
+ char addr[ARPT_DEV_ADDR_LEN_MAX];
+ char mask[ARPT_DEV_ADDR_LEN_MAX];
+};
+
+/* Yes, Virginia, you have to zero the padding. */
+struct arpt_arp {
+ /* Source and target IP addr */
+ struct in_addr src, tgt;
+ /* Mask for src and target IP addr */
+ struct in_addr smsk, tmsk;
+
+ /* Device hw address length, src+target device addresses */
+ u_int8_t arhln, arhln_mask;
+ struct arpt_devaddr_info src_devaddr;
+ struct arpt_devaddr_info tgt_devaddr;
+
+ /* ARP operation code. */
+ u_int16_t arpop, arpop_mask;
+
+ /* ARP hardware address and protocol address format. */
+ u_int16_t arhrd, arhrd_mask;
+ u_int16_t arpro, arpro_mask;
+
+ /* The protocol address length is only accepted if it is 4
+ * so there is no use in offering a way to do filtering on it.
+ */
+
+ char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
+ unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];
+
+ /* Flags word */
+ u_int8_t flags;
+ /* Inverse flags */
+ u_int16_t invflags;
+};
+
+struct arpt_entry_target
+{
+ union {
+ struct {
+ u_int16_t target_size;
+
+ /* Used by userspace */
+ char name[ARPT_FUNCTION_MAXNAMELEN];
+ } user;
+ struct {
+ u_int16_t target_size;
+
+ /* Used inside the kernel */
+ struct arpt_target *target;
+ } kernel;
+
+ /* Total length */
+ u_int16_t target_size;
+ } u;
+
+ unsigned char data[0];
+};
+
+struct arpt_standard_target
+{
+ struct arpt_entry_target target;
+ int verdict;
+};
+
+struct arpt_counters
+{
+ u_int64_t pcnt, bcnt; /* Packet and byte counters */
+};
+
+/* Values for "flag" field in struct arpt_ip (general arp structure).
+ * No flags defined yet.
+ */
+#define ARPT_F_MASK 0x00 /* All possible flag bits mask. */
+
+/* Values for "inv" field in struct arpt_arp. */
+#define ARPT_INV_VIA_IN 0x0001 /* Invert the sense of IN IFACE. */
+#define ARPT_INV_VIA_OUT 0x0002 /* Invert the sense of OUT IFACE */
+#define ARPT_INV_SRCIP 0x0004 /* Invert the sense of SRC IP. */
+#define ARPT_INV_TGTIP 0x0008 /* Invert the sense of TGT IP. */
+#define ARPT_INV_SRCDEVADDR 0x0010 /* Invert the sense of SRC DEV ADDR. */
+#define ARPT_INV_TGTDEVADDR 0x0020 /* Invert the sense of TGT DEV ADDR. */
+#define ARPT_INV_ARPOP 0x0040 /* Invert the sense of ARP OP. */
+#define ARPT_INV_ARPHRD 0x0080 /* Invert the sense of ARP HRD. */
+#define ARPT_INV_ARPPRO 0x0100 /* Invert the sense of ARP PRO. */
+#define ARPT_INV_ARPHLN 0x0200 /* Invert the sense of ARP HLN. */
+#define ARPT_INV_MASK 0x007F /* All possible flag bits mask. */
+
+/* This structure defines each of the firewall rules. Consists of 3
+ parts which are 1) general ARP header stuff 2) match specific
+ stuff 3) the target to perform if the rule matches */
+struct arpt_entry
+{
+ struct arpt_arp arp;
+
+ /* Size of arpt_entry + matches */
+ u_int16_t target_offset;
+ /* Size of arpt_entry + matches + target */
+ u_int16_t next_offset;
+
+ /* Back pointer */
+ unsigned int comefrom;
+
+ /* Packet and byte counters. */
+ struct arpt_counters counters;
+
+ /* The matches (if any), then the target. */
+ unsigned char elems[0];
+};
+
+/*
+ * New IP firewall options for [gs]etsockopt at the RAW IP level.
+ * Unlike BSD Linux inherits IP options so you don't have to use a raw
+ * socket for this. Instead we check rights in the calls.
+ */
+#define ARPT_BASE_CTL 96 /* base for firewall socket options */
+
+#define ARPT_SO_SET_REPLACE (ARPT_BASE_CTL)
+#define ARPT_SO_SET_ADD_COUNTERS (ARPT_BASE_CTL + 1)
+#define ARPT_SO_SET_MAX ARPT_SO_SET_ADD_COUNTERS
+
+#define ARPT_SO_GET_INFO (ARPT_BASE_CTL)
+#define ARPT_SO_GET_ENTRIES (ARPT_BASE_CTL + 1)
+#define ARPT_SO_GET_MAX ARPT_SO_GET_ENTRIES
+
+/* CONTINUE verdict for targets */
+#define ARPT_CONTINUE 0xFFFFFFFF
+
+/* For standard target */
+#define ARPT_RETURN (-NF_MAX_VERDICT - 1)
+
+/* The argument to ARPT_SO_GET_INFO */
+struct arpt_getinfo
+{
+ /* Which table: caller fills this in. */
+ char name[ARPT_TABLE_MAXNAMELEN];
+
+ /* Kernel fills these in. */
+ /* Which hook entry points are valid: bitmask */
+ unsigned int valid_hooks;
+
+ /* Hook entry points: one per netfilter hook. */
+ unsigned int hook_entry[NF_ARP_NUMHOOKS];
+
+ /* Underflow points. */
+ unsigned int underflow[NF_ARP_NUMHOOKS];
+
+ /* Number of entries */
+ unsigned int num_entries;
+
+ /* Size of entries. */
+ unsigned int size;
+};
+
+/* The argument to ARPT_SO_SET_REPLACE. */
+struct arpt_replace
+{
+ /* Which table. */
+ char name[ARPT_TABLE_MAXNAMELEN];
+
+ /* Which hook entry points are valid: bitmask. You can't
+ change this. */
+ unsigned int valid_hooks;
+
+ /* Number of entries */
+ unsigned int num_entries;
+
+ /* Total size of new entries */
+ unsigned int size;
+
+ /* Hook entry points. */
+ unsigned int hook_entry[NF_ARP_NUMHOOKS];
+
+ /* Underflow points. */
+ unsigned int underflow[NF_ARP_NUMHOOKS];
+
+ /* Information about old entries: */
+ /* Number of counters (must be equal to current number of entries). */
+ unsigned int num_counters;
+ /* The old entries' counters. */
+ struct arpt_counters *counters;
+
+ /* The entries (hang off end: not really an array). */
+ struct arpt_entry entries[0];
+};
+
+/* The argument to ARPT_SO_ADD_COUNTERS. */
+struct arpt_counters_info
+{
+ /* Which table. */
+ char name[ARPT_TABLE_MAXNAMELEN];
+
+ unsigned int num_counters;
+
+ /* The counters (actually `number' of these). */
+ struct arpt_counters counters[0];
+};
+
+/* The argument to ARPT_SO_GET_ENTRIES. */
+struct arpt_get_entries
+{
+ /* Which table: user fills this in. */
+ char name[ARPT_TABLE_MAXNAMELEN];
+
+ /* User fills this in: total entry size. */
+ unsigned int size;
+
+ /* The entries. */
+ struct arpt_entry entrytable[0];
+};
+
+/* Standard return verdict, or do jump. */
+#define ARPT_STANDARD_TARGET ""
+/* Error verdict. */
+#define ARPT_ERROR_TARGET "ERROR"
+
+/* Helper functions */
+static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e)
+{
+ return (void *)e + e->target_offset;
+}
+
+/* fn returns 0 to continue iteration */
+#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \
+({ \
+ unsigned int __i; \
+ int __ret = 0; \
+ struct arpt_entry *__entry; \
+ \
+ for (__i = 0; __i < (size); __i += __entry->next_offset) { \
+ __entry = (void *)(entries) + __i; \
+ \
+ __ret = fn(__entry , ## args); \
+ if (__ret != 0) \
+ break; \
+ } \
+ __ret; \
+})
+
+/*
+ * Main firewall chains definitions and global var's definitions.
+ */
+#ifdef __KERNEL__
+
+/* Registration hooks for targets. */
+struct arpt_target
+{
+ struct list_head list;
+
+ const char name[ARPT_FUNCTION_MAXNAMELEN];
+
+ /* Returns verdict. */
+ unsigned int (*target)(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userdata);
+
+ /* Called when user tries to insert an entry of this type:
+ hook_mask is a bitmask of hooks from which it can be
+ called. */
+ /* Should return true or false. */
+ int (*checkentry)(const char *tablename,
+ const struct arpt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask);
+
+ /* Called when entry of this type deleted. */
+ void (*destroy)(void *targinfo, unsigned int targinfosize);
+
+ /* Set this to THIS_MODULE if you are a module, otherwise NULL */
+ struct module *me;
+};
+
+extern int arpt_register_target(struct arpt_target *target);
+extern void arpt_unregister_target(struct arpt_target *target);
+
+/* Furniture shopping... */
+struct arpt_table
+{
+ struct list_head list;
+
+ /* A unique name... */
+ char name[ARPT_TABLE_MAXNAMELEN];
+
+ /* Seed table: copied in register_table */
+ struct arpt_replace *table;
+
+ /* What hooks you will enter on */
+ unsigned int valid_hooks;
+
+ /* Lock for the curtain */
+ rwlock_t lock;
+
+ /* Man behind the curtain... */
+ struct arpt_table_info *private;
+
+ /* Set this to THIS_MODULE if you are a module, otherwise NULL */
+ struct module *me;
+};
+
+extern int arpt_register_table(struct arpt_table *table);
+extern void arpt_unregister_table(struct arpt_table *table);
+extern unsigned int arpt_do_table(struct sk_buff **pskb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct arpt_table *table,
+ void *userdata);
+
+#define ARPT_ALIGN(s) (((s) + (__alignof__(struct arpt_entry)-1)) & ~(__alignof__(struct arpt_entry)-1))
+#endif /*__KERNEL__*/
+#endif /* _ARPTABLES_H */
diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index 3a35b1fafd5d..0cfbfd7201c7 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -11,8 +11,13 @@ enum ip_nat_manip_type
IP_NAT_MANIP_DST
};
+#ifndef CONFIG_IP_NF_NAT_LOCAL
/* SRC manip occurs only on POST_ROUTING */
#define HOOK2MANIP(hooknum) ((hooknum) != NF_IP_POST_ROUTING)
+#else
+/* SRC manip occurs POST_ROUTING or LOCAL_IN */
+#define HOOK2MANIP(hooknum) ((hooknum) != NF_IP_POST_ROUTING && (hooknum) != NF_IP_LOCAL_IN)
+#endif
/* 2.3.19 (I hope) will define this in linux/netfilter_ipv4.h. */
#ifndef SO_ORIGINAL_DST
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 938560387354..01829afb8e41 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -202,7 +202,8 @@ enum
NET_CORE_NO_CONG_THRESH=13,
NET_CORE_NO_CONG=14,
NET_CORE_LO_CONG=15,
- NET_CORE_MOD_CONG=16
+ NET_CORE_MOD_CONG=16,
+ NET_CORE_DEV_WEIGHT=17
};
/* /proc/sys/net/ethernet */
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a510b1a8ea4..8c340f76aa56 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -798,6 +798,19 @@ int dev_close(struct net_device *dev)
clear_bit(__LINK_STATE_START, &dev->state);
+ /* Synchronize to scheduled poll. We cannot touch poll list,
+ * it can be even on different cpu. So just clear netif_running(),
+ * and wait when poll really will happen. Actually, the best place
+ * for this is inside dev->stop() after device stopped its irq
+ * engine, but this requires more changes in devices. */
+
+ smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
+ /* No hurry. */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ }
+
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
@@ -1072,6 +1085,7 @@ int dev_queue_xmit(struct sk_buff *skb)
=======================================================================*/
int netdev_max_backlog = 300;
+int weight_p = 64; /* old backlog weight */
/* These numbers are selected based on intuition and some
* experimentatiom, if you have more scientific way of doing this
* please go ahead and fix things.
@@ -1237,13 +1251,11 @@ int netif_rx(struct sk_buff *skb)
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue,skb);
- /* Runs from irqs or BH's, no need to wake BH */
- cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_restore(flags);
#ifndef OFFLINE_SAMPLE
get_sample_stats(this_cpu);
#endif
- return softnet_data[this_cpu].cng_level;
+ return queue->cng_level;
}
if (queue->throttle) {
@@ -1253,6 +1265,8 @@ enqueue:
netdev_wakeup();
#endif
}
+
+ netif_rx_schedule(&queue->backlog_dev);
goto enqueue;
}
@@ -1308,19 +1322,12 @@ static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int
return ret;
}
-/* Reparent skb to master device. This function is called
- * only from net_rx_action under BR_NETPROTO_LOCK. It is misuse
- * of BR_NETPROTO_LOCK, but it is OK for now.
- */
static __inline__ void skb_bond(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
-
- if (dev->master) {
- dev_hold(dev->master);
+
+ if (dev->master)
skb->dev = dev->master;
- dev_put(dev);
- }
}
static void net_tx_action(struct softirq_action *h)
@@ -1416,121 +1423,138 @@ static inline void handle_diverter(struct sk_buff *skb)
}
#endif /* CONFIG_NET_DIVERT */
-
-static void net_rx_action(struct softirq_action *h)
+int netif_receive_skb(struct sk_buff *skb)
{
- int this_cpu = smp_processor_id();
- struct softnet_data *queue = &softnet_data[this_cpu];
- unsigned long start_time = jiffies;
- int bugdet = netdev_max_backlog;
-
- br_read_lock(BR_NETPROTO_LOCK);
-
- for (;;) {
- struct sk_buff *skb;
- struct net_device *rx_dev;
-
- local_irq_disable();
- skb = __skb_dequeue(&queue->input_pkt_queue);
- local_irq_enable();
+ struct packet_type *ptype, *pt_prev;
+ int ret = NET_RX_DROP;
+ unsigned short type = skb->protocol;
- if (skb == NULL)
- break;
+ if (skb->stamp.tv_sec == 0)
+ do_gettimeofday(&skb->stamp);
- skb_bond(skb);
+ skb_bond(skb);
- rx_dev = skb->dev;
+ netdev_rx_stat[smp_processor_id()].total++;
#ifdef CONFIG_NET_FASTROUTE
- if (skb->pkt_type == PACKET_FASTROUTE) {
- netdev_rx_stat[this_cpu].fastroute_deferred_out++;
- dev_queue_xmit(skb);
- dev_put(rx_dev);
- continue;
- }
+ if (skb->pkt_type == PACKET_FASTROUTE) {
+ netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++;
+ return dev_queue_xmit(skb);
+ }
#endif
- skb->h.raw = skb->nh.raw = skb->data;
- {
- struct packet_type *ptype, *pt_prev;
- unsigned short type = skb->protocol;
-
- pt_prev = NULL;
- for (ptype = ptype_all; ptype; ptype = ptype->next) {
- if (!ptype->dev || ptype->dev == skb->dev) {
- if (pt_prev) {
- if (!pt_prev->data) {
- deliver_to_old_ones(pt_prev, skb, 0);
- } else {
- atomic_inc(&skb->users);
- pt_prev->func(skb,
- skb->dev,
- pt_prev);
- }
- }
- pt_prev = ptype;
+
+ skb->h.raw = skb->nh.raw = skb->data;
+
+ pt_prev = NULL;
+ for (ptype = ptype_all; ptype; ptype = ptype->next) {
+ if (!ptype->dev || ptype->dev == skb->dev) {
+ if (pt_prev) {
+ if (!pt_prev->data) {
+ ret = deliver_to_old_ones(pt_prev, skb, 0);
+ } else {
+ atomic_inc(&skb->users);
+ ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
+ pt_prev = ptype;
+ }
+ }
#ifdef CONFIG_NET_DIVERT
- if (skb->dev->divert && skb->dev->divert->divert)
- handle_diverter(skb);
+ if (skb->dev->divert && skb->dev->divert->divert)
+ ret = handle_diverter(skb);
#endif /* CONFIG_NET_DIVERT */
-
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
- if (skb->dev->br_port != NULL &&
- br_handle_frame_hook != NULL) {
- handle_bridge(skb, pt_prev);
- dev_put(rx_dev);
- continue;
- }
+ if (skb->dev->br_port != NULL &&
+ br_handle_frame_hook != NULL) {
+ return handle_bridge(skb, pt_prev);
+ }
#endif
- for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
- if (ptype->type == type &&
- (!ptype->dev || ptype->dev == skb->dev)) {
- if (pt_prev) {
- if (!pt_prev->data)
- deliver_to_old_ones(pt_prev, skb, 0);
- else {
- atomic_inc(&skb->users);
- pt_prev->func(skb,
- skb->dev,
- pt_prev);
- }
- }
- pt_prev = ptype;
+ for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) {
+ if (ptype->type == type &&
+ (!ptype->dev || ptype->dev == skb->dev)) {
+ if (pt_prev) {
+ if (!pt_prev->data) {
+ ret = deliver_to_old_ones(pt_prev, skb, 0);
+ } else {
+ atomic_inc(&skb->users);
+ ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
+ pt_prev = ptype;
+ }
+ }
- if (pt_prev) {
- if (!pt_prev->data)
- deliver_to_old_ones(pt_prev, skb, 1);
- else
- pt_prev->func(skb, skb->dev, pt_prev);
- } else
- kfree_skb(skb);
+ if (pt_prev) {
+ if (!pt_prev->data) {
+ ret = deliver_to_old_ones(pt_prev, skb, 1);
+ } else {
+ ret = pt_prev->func(skb, skb->dev, pt_prev);
}
+ } else {
+ kfree_skb(skb);
+ /* Jamal, now you will not able to escape explaining
+ * me how you were going to use this. :-)
+ */
+ ret = NET_RX_DROP;
+ }
- dev_put(rx_dev);
+ return ret;
+}
- if (bugdet-- < 0 || jiffies - start_time > 1)
- goto softnet_break;
+static int process_backlog(struct net_device *backlog_dev, int *budget)
+{
+ int work = 0;
+ int quota = min(backlog_dev->quota, *budget);
+ int this_cpu = smp_processor_id();
+ struct softnet_data *queue = &softnet_data[this_cpu];
+ unsigned long start_time = jiffies;
+
+ for (;;) {
+ struct sk_buff *skb;
+ struct net_device *dev;
+
+ local_irq_disable();
+ skb = __skb_dequeue(&queue->input_pkt_queue);
+ if (skb == NULL)
+ goto job_done;
+ local_irq_enable();
+
+ dev = skb->dev;
+
+ netif_receive_skb(skb);
+
+ dev_put(dev);
+
+ work++;
+
+ if (work >= quota || jiffies - start_time > 1)
+ break;
#ifdef CONFIG_NET_HW_FLOWCONTROL
- if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
- if (atomic_dec_and_test(&netdev_dropping)) {
- queue->throttle = 0;
- netdev_wakeup();
- goto softnet_break;
+ if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
+ if (atomic_dec_and_test(&netdev_dropping)) {
+ queue->throttle = 0;
+ netdev_wakeup();
+ break;
+ }
}
- }
#endif
-
}
- br_read_unlock(BR_NETPROTO_LOCK);
- local_irq_disable();
+ backlog_dev->quota -= work;
+ *budget -= work;
+ return -1;
+
+job_done:
+ backlog_dev->quota -= work;
+ *budget -= work;
+
+ list_del(&backlog_dev->poll_list);
+ clear_bit(__LINK_STATE_RX_SCHED, &backlog_dev->state);
+
if (queue->throttle) {
queue->throttle = 0;
#ifdef CONFIG_NET_HW_FLOWCONTROL
@@ -1539,21 +1563,53 @@ static void net_rx_action(struct softirq_action *h)
#endif
}
local_irq_enable();
+ return 0;
+}
- NET_PROFILE_LEAVE(softnet_process);
- return;
+static void net_rx_action(struct softirq_action *h)
+{
+ int this_cpu = smp_processor_id();
+ struct softnet_data *queue = &softnet_data[this_cpu];
+ unsigned long start_time = jiffies;
+ int budget = netdev_max_backlog;
-softnet_break:
+ br_read_lock(BR_NETPROTO_LOCK);
+ local_irq_disable();
+
+ while (!list_empty(&queue->poll_list)) {
+ struct net_device *dev;
+
+ if (budget <= 0 || jiffies - start_time > 1)
+ goto softnet_break;
+
+ local_irq_enable();
+
+ dev = list_entry(queue->poll_list.next, struct net_device, poll_list);
+
+ if (dev->quota <= 0 || dev->poll(dev, &budget)) {
+ local_irq_disable();
+ list_del(&dev->poll_list);
+ list_add_tail(&dev->poll_list, &queue->poll_list);
+ if (dev->quota < 0)
+ dev->quota += dev->weight;
+ else
+ dev->quota = dev->weight;
+ } else {
+ dev_put(dev);
+ local_irq_disable();
+ }
+ }
+
+ local_irq_enable();
br_read_unlock(BR_NETPROTO_LOCK);
+ return;
- local_irq_disable();
+softnet_break:
netdev_rx_stat[this_cpu].time_squeeze++;
- /* This already runs in BH context, no need to wake up BH's */
- cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
- local_irq_enable();
+ __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
- NET_PROFILE_LEAVE(softnet_process);
- return;
+ local_irq_enable();
+ br_read_unlock(BR_NETPROTO_LOCK);
}
static gifconf_func_t * gifconf_list [NPROTO];
@@ -2626,6 +2682,7 @@ int __init net_dev_init(void)
if (!dev_boot_phase)
return 0;
+
#ifdef CONFIG_NET_DIVERT
dv_init();
#endif /* CONFIG_NET_DIVERT */
@@ -2643,8 +2700,13 @@ int __init net_dev_init(void)
queue->cng_level = 0;
queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
+ INIT_LIST_HEAD(&queue->poll_list);
+ set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
+ queue->backlog_dev.weight = weight_p;
+ queue->backlog_dev.poll = process_backlog;
+ atomic_set(&queue->backlog_dev.refcnt, 1);
}
-
+
#ifdef CONFIG_NET_PROFILE
net_profile_init();
NET_PROFILE_REGISTER(dev_queue_xmit);
@@ -2744,7 +2806,6 @@ int __init net_dev_init(void)
#ifdef CONFIG_NET_SCHED
pktsched_init();
#endif
-
/*
* Initialise network devices
*/
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2f6090a2fc9a..2e24556de974 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -12,6 +12,7 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
+extern int weight_p;
extern int no_cong_thresh;
extern int no_cong;
extern int lo_cong;
@@ -47,6 +48,9 @@ ctl_table core_table[] = {
{NET_CORE_RMEM_DEFAULT, "rmem_default",
&sysctl_rmem_default, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_CORE_DEV_WEIGHT, "dev_weight",
+ &weight_p, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
&netdev_max_backlog, sizeof(int), 0644, NULL,
&proc_dointvec},
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59e81c1f0cf8..9e6a18144cbf 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -112,7 +112,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
-
+#include <linux/netfilter_arp.h>
/*
* Interface to generic neighbour cache.
@@ -561,7 +561,8 @@ void arp_send(int type, int ptype, u32 dest_ip,
arp_ptr+=dev->addr_len;
memcpy(arp_ptr, &dest_ip, 4);
- dev_queue_xmit(skb);
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, dev, dev_queue_xmit);
return;
out:
@@ -574,45 +575,31 @@ static void parp_redo(struct sk_buff *skb)
}
/*
- * Receive an arp request by the device layer.
+ * Process an arp request.
*/
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_process(struct sk_buff *skb)
{
- struct arphdr *arp = skb->nh.arph;
- unsigned char *arp_ptr= (unsigned char *)(arp+1);
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha, *tha;
u32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
- struct in_device *in_dev = in_dev_get(dev);
struct neighbour *n;
-/*
- * The hardware length of the packet should match the hardware length
- * of the device. Similarly, the hardware types should match. The
- * device should be ARP-able. Also, if pln is not 4, then the lookup
- * is not from an IP number. We can't currently handle this, so toss
- * it.
- */
- if (in_dev == NULL ||
- arp->ar_hln != dev->addr_len ||
- dev->flags & IFF_NOARP ||
- skb->pkt_type == PACKET_OTHERHOST ||
- skb->pkt_type == PACKET_LOOPBACK ||
- arp->ar_pln != 4)
+ /* arp_rcv below verifies the ARP header, verifies the device
+ * is ARP'able, and linearizes the SKB (if needed).
+ */
+
+ if (in_dev == NULL)
goto out;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto out_of_mem;
-
- if (skb_is_nonlinear(skb)) {
- if (skb_linearize(skb, GFP_ATOMIC) != 0)
- goto freeskb;
- arp = skb->nh.arph;
- arp_ptr= (unsigned char *)(arp+1);
- }
+ arp = skb->nh.arph;
+ arp_ptr= (unsigned char *)(arp+1);
switch (dev_type) {
default:
@@ -827,13 +814,41 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
out:
if (in_dev)
in_dev_put(in_dev);
-freeskb:
kfree_skb(skb);
-out_of_mem:
return 0;
}
+/*
+ * Receive an arp request from the device layer.
+ */
+
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct arphdr *arp = skb->nh.arph;
+
+ if (arp->ar_hln != dev->addr_len ||
+ dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK ||
+ arp->ar_pln != 4)
+ goto freeskb;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out_of_mem;
+
+ if (skb_is_nonlinear(skb)) {
+ if (skb_linearize(skb, GFP_ATOMIC) != 0)
+ goto freeskb;
+ }
+
+ return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+freeskb:
+ kfree_skb(skb);
+out_of_mem:
+ return 0;
+}
/*
* User level interface (ioctl, /proc)
diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in
index 47c703c34ddc..7f250e431318 100644
--- a/net/ipv4/netfilter/Config.in
+++ b/net/ipv4/netfilter/Config.in
@@ -47,6 +47,7 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then
define_bool CONFIG_IP_NF_NAT_NEEDED y
dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT
dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT
+ bool ' NAT of local connections' CONFIG_IP_NF_NAT_LOCAL
if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
dep_tristate ' Basic SNMP-ALG support (EXPERIMENTAL)' CONFIG_IP_NF_NAT_SNMP_BASIC $CONFIG_IP_NF_NAT
fi
@@ -79,6 +80,11 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then
dep_tristate ' TCPMSS target support' CONFIG_IP_NF_TARGET_TCPMSS $CONFIG_IP_NF_IPTABLES
fi
+tristate 'ARP tables support' CONFIG_IP_NF_ARPTABLES
+if [ "$CONFIG_IP_NF_ARPTABLES" != "n" ]; then
+ dep_tristate ' ARP packet filtering' CONFIG_IP_NF_ARPFILTER $CONFIG_IP_NF_ARPTABLES
+fi
+
# Backwards compatibility modules: only if you don't build in the others.
if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then
if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 6e5a8a1cc0b7..7e1bd4511532 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,7 +9,7 @@
O_TARGET := netfilter.o
-export-objs = ip_conntrack_standalone.o ip_conntrack_ftp.o ip_fw_compat.o ip_nat_standalone.o ip_tables.o
+export-objs = ip_conntrack_standalone.o ip_conntrack_ftp.o ip_fw_compat.o ip_nat_standalone.o ip_tables.o arp_tables.o
# Multipart objects.
list-multi := ip_conntrack.o iptable_nat.o ipfwadm.o ipchains.o
@@ -75,6 +75,12 @@ obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+
+# just the filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
# backwards compatibility
obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += ipchains.o
obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000000..38b4356cf5b3
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1313 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/netfilter_arp/arp_tables.h>
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x) \
+do { \
+ if (!(x)) \
+ printk("ARP_NF_ASSERT: %s:%s:%u\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+} while(0)
+#else
+#define ARP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+static DECLARE_MUTEX(arpt_mutex);
+
+#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+struct arpt_table_info {
+ unsigned int size;
+ unsigned int number;
+ unsigned int initial_entries;
+ unsigned int hook_entry[NF_ARP_NUMHOOKS];
+ unsigned int underflow[NF_ARP_NUMHOOKS];
+ char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+};
+
+static LIST_HEAD(arpt_target);
+static LIST_HEAD(arpt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+ char *hdr_addr, int len)
+{
+ int i, ret;
+
+ if (len > ARPT_DEV_ADDR_LEN_MAX)
+ len = ARPT_DEV_ADDR_LEN_MAX;
+
+ ret = 0;
+ for (i = 0; i < len; i++)
+ ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
+
+ return (ret != 0);
+}
+
+/* Returns whether packet matches rule or not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+ struct net_device *dev,
+ const char *indev,
+ const char *outdev,
+ const struct arpt_arp *arpinfo)
+{
+ char *arpptr = (char *)(arphdr + 1);
+ char *src_devaddr, *tgt_devaddr;
+ u32 *src_ipaddr, *tgt_ipaddr;
+ int i, ret;
+
+#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
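+	/* With mismatch m and invert flag i, FWINV(m, i) evaluates to
+	 * m ^ !!(invflags & i): a set ARPT_INV_* bit turns a field
+	 * mismatch into a match and vice versa.
+	 */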
+
+ if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+ ARPT_INV_ARPOP)) {
+ dprintf("ARP operation field mismatch.\n");
+ dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+ arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+ ARPT_INV_ARPHRD)) {
+ dprintf("ARP hardware address format mismatch.\n");
+ dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+ arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+ ARPT_INV_ARPPRO)) {
+ dprintf("ARP protocol address format mismatch.\n");
+ dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+ arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ return 0;
+ }
+
+	if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+		  ARPT_INV_ARPHLN)) {
+		dprintf("ARP hardware address length mismatch.\n");
+		dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+			arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+		return 0;
+	}
+
+ src_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ src_ipaddr = (u32 *) arpptr;
+ arpptr += sizeof(u32);
+ tgt_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ tgt_ipaddr = (u32 *) arpptr;
+
+ if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+ ARPT_INV_SRCDEVADDR) ||
+ FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+ ARPT_INV_TGTDEVADDR)) {
+ dprintf("Source or target device address mismatch.\n");
+
+ return 0;
+ }
+
+ if (FWINV(((*src_ipaddr) & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+ ARPT_INV_SRCIP) ||
+ FWINV((((*tgt_ipaddr) & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+ ARPT_INV_TGTIP)) {
+ dprintf("Source or target IP address mismatch.\n");
+
+ dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(*src_ipaddr),
+ NIPQUAD(arpinfo->smsk.s_addr),
+ NIPQUAD(arpinfo->src.s_addr),
+ arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+ NIPQUAD(*tgt_ipaddr),
+ NIPQUAD(arpinfo->tmsk.s_addr),
+ NIPQUAD(arpinfo->tgt.s_addr),
+ arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Look for ifname matches; this should unroll nicely. */
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+ ret |= (((const unsigned long *)indev)[i]
+ ^ ((const unsigned long *)arpinfo->iniface)[i])
+ & ((const unsigned long *)arpinfo->iniface_mask)[i];
+ }
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, arpinfo->iniface,
+ arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ return 0;
+ }
+
+ for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+ ret |= (((const unsigned long *)outdev)[i]
+ ^ ((const unsigned long *)arpinfo->outiface)[i])
+ & ((const unsigned long *)arpinfo->outiface_mask)[i];
+ }
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, arpinfo->outiface,
+ arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+ if (arp->flags & ~ARPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ arp->flags & ~ARPT_F_MASK);
+ return 0;
+ }
+ if (arp->invflags & ~ARPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ arp->invflags & ~ARPT_INV_MASK);
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned int arpt_error(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ if (net_ratelimit())
+ printk("arp_tables: error: '%s'\n", (char *)targinfo);
+
+ return NF_DROP;
+}
+
+static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+{
+ return (struct arpt_entry *)(base + offset);
+}
+
+unsigned int arpt_do_table(struct sk_buff **pskb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct arpt_table *table,
+ void *userdata)
+{
+ static const char nulldevname[IFNAMSIZ] = { 0 };
+ unsigned int verdict = NF_DROP;
+ struct arphdr *arp = (*pskb)->nh.arph;
+ int hotdrop = 0;
+ struct arpt_entry *e, *back;
+ const char *indev, *outdev;
+ void *table_base;
+
+ indev = in ? in->name : nulldevname;
+ outdev = out ? out->name : nulldevname;
+
+ read_lock_bh(&table->lock);
+ table_base = (void *)table->private->entries
+ + TABLE_OFFSET(table->private,
+ cpu_number_map(smp_processor_id()));
+ e = get_entry(table_base, table->private->hook_entry[hook]);
+ back = get_entry(table_base, table->private->underflow[hook]);
+
+ do {
+ if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
+ struct arpt_entry_target *t;
+ int hdr_len;
+
+ hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
+ (2 * (*pskb)->dev->addr_len);
+ ADD_COUNTER(e->counters, hdr_len, 1);
+
+ t = arpt_get_target(e);
+
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct arpt_standard_target *)t)->verdict;
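+				/* v >= 0 is a jump offset into the
+				 * table; an absolute verdict x is
+				 * encoded as -x - 1 (the built-in
+				 * policy rules use -NF_ACCEPT - 1),
+				 * with ARPT_RETURN outside that range.
+				 */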
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != ARPT_RETURN) {
+ verdict = (unsigned)(-v) - 1;
+ break;
+ }
+ e = back;
+ back = get_entry(table_base,
+ back->comefrom);
+ continue;
+ }
+ if (table_base + v
+ != (void *)e + e->next_offset) {
+ /* Save old back ptr in next entry */
+ struct arpt_entry *next
+ = (void *)e + e->next_offset;
+ next->comefrom =
+ (void *)back - table_base;
+
+ /* set back pointer to next entry */
+ back = next;
+ }
+
+ e = get_entry(table_base, v);
+ } else {
+ /* Targets which reenter must return
+ * abs. verdicts
+ */
+ verdict = t->u.kernel.target->target(pskb,
+ hook,
+ in, out,
+ t->data,
+ userdata);
+
+ /* Target might have changed stuff. */
+ arp = (*pskb)->nh.arph;
+
+ if (verdict == ARPT_CONTINUE)
+ e = (void *)e + e->next_offset;
+ else
+ /* Verdict */
+ break;
+ }
+ } else {
+ e = (void *)e + e->next_offset;
+ }
+ } while (!hotdrop);
+ read_unlock_bh(&table->lock);
+
+ if (hotdrop)
+ return NF_DROP;
+ else
+ return verdict;
+}
+
+static inline void *find_inlist_lock_noload(struct list_head *head,
+ const char *name,
+ int *error,
+ struct semaphore *mutex)
+{
+ void *ret;
+
+ *error = down_interruptible(mutex);
+ if (*error != 0)
+ return NULL;
+
+ ret = list_named_find(head, name);
+ if (!ret) {
+ *error = -ENOENT;
+ up(mutex);
+ }
+ return ret;
+}
+
+#ifndef CONFIG_KMOD
+#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
+#else
+static void *
+find_inlist_lock(struct list_head *head,
+ const char *name,
+ const char *prefix,
+ int *error,
+ struct semaphore *mutex)
+{
+ void *ret;
+
+ ret = find_inlist_lock_noload(head, name, error, mutex);
+ if (!ret) {
+ char modulename[ARPT_FUNCTION_MAXNAMELEN + strlen(prefix) + 1];
+ strcpy(modulename, prefix);
+ strcat(modulename, name);
+ duprintf("find_inlist: loading `%s'.\n", modulename);
+ request_module(modulename);
+ ret = find_inlist_lock_noload(head, name, error, mutex);
+ }
+
+ return ret;
+}
+#endif
+
+static inline struct arpt_table *find_table_lock(const char *name, int *error, struct semaphore *mutex)
+{
+ return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
+}
+
+static inline struct arpt_target *find_target_lock(const char *name, int *error, struct semaphore *mutex)
+{
+ return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
+}
+
+/* All zeroes == unconditional rule. */
+static inline int unconditional(const struct arpt_arp *arp)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
+ if (((__u32 *)arp)[i])
+ return 0;
+
+ return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops. Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ * to 0 as we leave), and comefrom to save source hook bitmask.
+ */
+ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct arpt_entry *e
+ = (struct arpt_entry *)(newinfo->entries + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ struct arpt_standard_target *t
+ = (void *)arpt_get_target(e);
+
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+ printk("arptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom
+ |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if (e->target_offset == sizeof(struct arpt_entry)
+ && (strcmp(t->target.u.user.name,
+ ARPT_STANDARD_TARGET) == 0)
+ && t->verdict < 0
+ && unconditional(&e->arp)) {
+ unsigned int oldpos, size;
+
+ /* Return: backtrack through the last
+ * big jump.
+ */
+ do {
+ e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct arpt_entry *)
+ (newinfo->entries + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct arpt_entry *)
+ (newinfo->entries + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ ARPT_STANDARD_TARGET) == 0
+ && newpos >= 0) {
+				/* This is a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct arpt_entry *)
+ (newinfo->entries + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static inline int standard_check(const struct arpt_entry_target *t,
+ unsigned int max_offset)
+{
+ struct arpt_standard_target *targ = (void *)t;
+
+ /* Check standard info. */
+ if (t->u.target_size
+ != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
+ duprintf("arpt_standard_check: target size %u != %Zu\n",
+ t->u.target_size,
+ ARPT_ALIGN(sizeof(struct arpt_standard_target)));
+ return 0;
+ }
+
+ if (targ->verdict >= 0
+ && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
+ duprintf("arpt_standard_check: bad verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+
+ if (targ->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("arpt_standard_check: bad negative verdict (%i)\n",
+ targ->verdict);
+ return 0;
+ }
+ return 1;
+}
+
+static struct arpt_target arpt_standard_target;
+
+static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ unsigned int *i)
+{
+ struct arpt_entry_target *t;
+ struct arpt_target *target;
+ int ret;
+
+ if (!arp_checkentry(&e->arp)) {
+ duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ t = arpt_get_target(e);
+ target = find_target_lock(t->u.user.name, &ret, &arpt_mutex);
+ if (!target) {
+ duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ goto out;
+ }
+ if (target->me)
+ __MOD_INC_USE_COUNT(target->me);
+ t->u.kernel.target = target;
+ up(&arpt_mutex);
+
+ if (t->u.kernel.target == &arpt_standard_target) {
+ if (!standard_check(t, size)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (t->u.kernel.target->checkentry
+ && !t->u.kernel.target->checkentry(name, e, t->data,
+ t->u.target_size
+ - sizeof(*t),
+ e->comefrom)) {
+ if (t->u.kernel.target->me)
+ __MOD_DEC_USE_COUNT(t->u.kernel.target->me);
+ duprintf("arp_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ (*i)++;
+ return 0;
+
+out:
+ return ret;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+ struct arpt_table_info *newinfo,
+ unsigned char *base,
+ unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int *i)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
+ || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* FIXME: underflows must be unconditional, standard verdicts
+ < 0 (not ARPT_RETURN). --RR */
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct arpt_counters) { 0, 0 });
+ e->comefrom = 0;
+
+ (*i)++;
+ return 0;
+}
+
+static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+{
+ struct arpt_entry_target *t;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ t = arpt_get_target(e);
+ if (t->u.kernel.target->destroy)
+ t->u.kernel.target->destroy(t->data,
+ t->u.target_size - sizeof(*t));
+ if (t->u.kernel.target->me)
+ __MOD_DEC_USE_COUNT(t->u.kernel.target->me);
+
+ return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ */
+static int translate_table(const char *name,
+ unsigned int valid_hooks,
+ struct arpt_table_info *newinfo,
+ unsigned int size,
+ unsigned int number,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows)
+{
+ unsigned int i;
+ int ret;
+
+ newinfo->size = size;
+ newinfo->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+
+ /* Walk through entries, checking offsets. */
+ ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry_size_and_hooks,
+ newinfo,
+ newinfo->entries,
+ newinfo->entries + size,
+ hook_entries, underflows, &i);
+ duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+ if (ret != 0)
+ return ret;
+
+ if (i != number) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, number);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, valid_hooks)) {
+ duprintf("Looping hook\n");
+ return -ELOOP;
+ }
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ check_entry, name, size, &i);
+
+ if (ret != 0) {
+ ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ cleanup_entry, &i);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
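+	/* CPU p's copy sits at entries + SMP_ALIGN(size) * p (see
+	 * TABLE_OFFSET above), keeping per-CPU counters on separate
+	 * cache lines.
+	 */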
+ for (i = 1; i < smp_num_cpus; i++) {
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ newinfo->entries,
+ SMP_ALIGN(newinfo->size));
+ }
+
+ return ret;
+}
+
+static struct arpt_table_info *replace_table(struct arpt_table *table,
+ unsigned int num_counters,
+ struct arpt_table_info *newinfo,
+ int *error)
+{
+ struct arpt_table_info *oldinfo;
+
+ /* Do the substitution. */
+ write_lock_bh(&table->lock);
+ /* Check inside lock: is the old number correct? */
+ if (num_counters != table->private->number) {
+ duprintf("num_counters != table->private->number (%u/%u)\n",
+ num_counters, table->private->number);
+ write_unlock_bh(&table->lock);
+ *error = -EAGAIN;
+ return NULL;
+ }
+ oldinfo = table->private;
+ table->private = newinfo;
+ newinfo->initial_entries = oldinfo->initial_entries;
+ write_unlock_bh(&table->lock);
+
+ return oldinfo;
+}
+
+/* Gets counters. */
+static inline int add_entry_to_counter(const struct arpt_entry *e,
+ struct arpt_counters total[],
+ unsigned int *i)
+{
+ ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static void get_counters(const struct arpt_table_info *t,
+ struct arpt_counters counters[])
+{
+ unsigned int cpu;
+ unsigned int i;
+
+ for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ t->size,
+ add_entry_to_counter,
+ counters,
+ &i);
+ }
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+ struct arpt_table *table,
+ void *userptr)
+{
+ unsigned int off, num, countersize;
+ struct arpt_entry *e;
+ struct arpt_counters *counters;
+ int ret = 0;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ * (other than comefrom, which userspace doesn't care
+ * about).
+ */
+ countersize = sizeof(struct arpt_counters) * table->private->number;
+ counters = vmalloc(countersize);
+
+ if (counters == NULL)
+ return -ENOMEM;
+
+ /* First, sum counters... */
+ memset(counters, 0, countersize);
+ write_lock_bh(&table->lock);
+ get_counters(table->private, counters);
+ write_unlock_bh(&table->lock);
+
+ /* ... then copy entire thing from CPU 0... */
+ if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ struct arpt_entry_target *t;
+
+ e = (struct arpt_entry *)(table->private->entries + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct arpt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ t = arpt_get_target(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct arpt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+static int get_entries(const struct arpt_get_entries *entries,
+ struct arpt_get_entries *uptr)
+{
+ int ret;
+ struct arpt_table *t;
+
+ t = find_table_lock(entries->name, &ret, &arpt_mutex);
+ if (t) {
+ duprintf("t->private->number = %u\n",
+ t->private->number);
+ if (entries->size == t->private->size)
+ ret = copy_entries_to_user(t->private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ t->private->size,
+ entries->size);
+ ret = -EINVAL;
+ }
+ up(&arpt_mutex);
+ } else
+ duprintf("get_entries: Can't find %s!\n",
+ entries->name);
+
+ return ret;
+}
+
+static int do_replace(void *user, unsigned int len)
+{
+ int ret;
+ struct arpt_replace tmp;
+ struct arpt_table *t;
+ struct arpt_table_info *newinfo, *oldinfo;
+ struct arpt_counters *counters;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* Hack: Causes ipchains to give correct error msg --RR */
+ if (len != sizeof(tmp) + tmp.size)
+ return -ENOPROTOOPT;
+
+ /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+ if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
+ return -ENOMEM;
+
+ newinfo = vmalloc(sizeof(struct arpt_table_info)
+ + SMP_ALIGN(tmp.size) * smp_num_cpus);
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
+
+ ret = translate_table(tmp.name, tmp.valid_hooks,
+ newinfo, tmp.size, tmp.num_entries,
+ tmp.hook_entry, tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo_counters;
+
+ duprintf("arp_tables: Translated table\n");
+
+ t = find_table_lock(tmp.name, &ret, &arpt_mutex);
+ if (!t)
+ goto free_newinfo_counters_untrans;
+
+ /* You lied! */
+ if (tmp.valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ tmp.valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto free_newinfo_counters_untrans_unlock;
+ }
+
+ oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto free_newinfo_counters_untrans_unlock;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if (t->me && (oldinfo->number <= oldinfo->initial_entries) &&
+ (newinfo->number > oldinfo->initial_entries))
+ __MOD_INC_USE_COUNT(t->me);
+ else if (t->me && (oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ __MOD_DEC_USE_COUNT(t->me);
+
+ /* Get the old counters. */
+ get_counters(oldinfo, counters);
+ /* Decrease module usage counts and free resource */
+	ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry, NULL);
+ vfree(oldinfo);
+ /* Silent error: too late now. */
+ copy_to_user(tmp.counters, counters,
+ sizeof(struct arpt_counters) * tmp.num_counters);
+ vfree(counters);
+ up(&arpt_mutex);
+ return 0;
+
+ free_newinfo_counters_untrans_unlock:
+ up(&arpt_mutex);
+ free_newinfo_counters_untrans:
+ ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+ free_newinfo_counters:
+ vfree(counters);
+ free_newinfo:
+ vfree(newinfo);
+ return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK.
+ */
+static inline int add_counter_to_entry(struct arpt_entry *e,
+ const struct arpt_counters addme[],
+ unsigned int *i)
+{
+
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+static int do_add_counters(void *user, unsigned int len)
+{
+ unsigned int i;
+ struct arpt_counters_info tmp, *paddc;
+ struct arpt_table *t;
+ int ret;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user, len) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = find_table_lock(tmp.name, &ret, &arpt_mutex);
+ if (!t)
+ goto free;
+
+ write_lock_bh(&t->lock);
+ if (t->private->number != paddc->num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->private->entries,
+ t->private->size,
+ add_counter_to_entry,
+ paddc->counters,
+ &i);
+ unlock_up_free:
+ write_unlock_bh(&t->lock);
+ up(&arpt_mutex);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = do_replace(user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(user, len);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void *user, int *len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO: {
+ char name[ARPT_TABLE_MAXNAMELEN];
+ struct arpt_table *t;
+
+ if (*len != sizeof(struct arpt_getinfo)) {
+ duprintf("length %u != %Zu\n", *len,
+ sizeof(struct arpt_getinfo));
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+ t = find_table_lock(name, &ret, &arpt_mutex);
+ if (t) {
+ struct arpt_getinfo info;
+
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, t->private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, t->private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = t->private->number;
+ info.size = t->private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ up(&arpt_mutex);
+ }
+ }
+ break;
+
+ case ARPT_SO_GET_ENTRIES: {
+ struct arpt_get_entries get;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ ret = -EINVAL;
+ } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+ ret = -EFAULT;
+ } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %Zu\n", *len,
+ sizeof(struct arpt_get_entries) + get.size);
+ ret = -EINVAL;
+ } else
+ ret = get_entries(&get, user);
+ break;
+ }
+
+ default:
+ duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/* Registration hooks for targets. */
+int arpt_register_target(struct arpt_target *target)
+{
+ int ret;
+
+ MOD_INC_USE_COUNT;
+ ret = down_interruptible(&arpt_mutex);
+ if (ret != 0) {
+ MOD_DEC_USE_COUNT;
+ return ret;
+ }
+ if (!list_named_insert(&arpt_target, target)) {
+ duprintf("arpt_register_target: `%s' already in list!\n",
+ target->name);
+ ret = -EINVAL;
+ MOD_DEC_USE_COUNT;
+ }
+ up(&arpt_mutex);
+ return ret;
+}
+
+void arpt_unregister_target(struct arpt_target *target)
+{
+ down(&arpt_mutex);
+ LIST_DELETE(&arpt_target, target);
+ up(&arpt_mutex);
+ MOD_DEC_USE_COUNT;
+}
+
+int arpt_register_table(struct arpt_table *table)
+{
+ int ret;
+ struct arpt_table_info *newinfo;
+ static struct arpt_table_info bootstrap
+ = { 0, 0, 0, { 0 }, { 0 }, { } };
+
+ MOD_INC_USE_COUNT;
+ newinfo = vmalloc(sizeof(struct arpt_table_info)
+ + SMP_ALIGN(table->table->size) * smp_num_cpus);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ MOD_DEC_USE_COUNT;
+ return ret;
+ }
+ memcpy(newinfo->entries, table->table->entries, table->table->size);
+
+ ret = translate_table(table->name, table->valid_hooks,
+ newinfo, table->table->size,
+ table->table->num_entries,
+ table->table->hook_entry,
+ table->table->underflow);
+ duprintf("arpt_register_table: translate table gives %d\n", ret);
+ if (ret != 0) {
+ vfree(newinfo);
+ MOD_DEC_USE_COUNT;
+ return ret;
+ }
+
+ ret = down_interruptible(&arpt_mutex);
+ if (ret != 0) {
+ vfree(newinfo);
+ MOD_DEC_USE_COUNT;
+ return ret;
+ }
+
+ /* Don't autoload: we'd eat our tail... */
+ if (list_named_find(&arpt_tables, table->name)) {
+ ret = -EEXIST;
+ goto free_unlock;
+ }
+
+ /* Simplifies replace_table code. */
+ table->private = &bootstrap;
+ if (!replace_table(table, 0, newinfo, &ret))
+ goto free_unlock;
+
+ duprintf("table->private->number = %u\n",
+ table->private->number);
+
+ /* save number of initial entries */
+ table->private->initial_entries = table->private->number;
+
+ table->lock = RW_LOCK_UNLOCKED;
+ list_prepend(&arpt_tables, table);
+
+ unlock:
+ up(&arpt_mutex);
+ return ret;
+
+ free_unlock:
+ vfree(newinfo);
+ MOD_DEC_USE_COUNT;
+ goto unlock;
+}
+
+void arpt_unregister_table(struct arpt_table *table)
+{
+ down(&arpt_mutex);
+ LIST_DELETE(&arpt_tables, table);
+ up(&arpt_mutex);
+
+ /* Decrease module usage counts and free resources */
+ ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ cleanup_entry, NULL);
+ vfree(table->private);
+ MOD_DEC_USE_COUNT;
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct arpt_target arpt_standard_target
+= { { NULL, NULL }, ARPT_STANDARD_TARGET, NULL, NULL, NULL };
+static struct arpt_target arpt_error_target
+= { { NULL, NULL }, ARPT_ERROR_TARGET, arpt_error, NULL, NULL };
+
+static struct nf_sockopt_ops arpt_sockopts
+= { { NULL, NULL }, PF_INET, ARPT_BASE_CTL, ARPT_SO_SET_MAX+1, do_arpt_set_ctl,
+ ARPT_BASE_CTL, ARPT_SO_GET_MAX+1, do_arpt_get_ctl, 0, NULL };
+
+#ifdef CONFIG_PROC_FS
+static inline int print_name(const struct arpt_table *t,
+ off_t start_offset, char *buffer, int length,
+ off_t *pos, unsigned int *count)
+{
+ if ((*count)++ >= start_offset) {
+ unsigned int namelen;
+
+ namelen = sprintf(buffer + *pos, "%s\n", t->name);
+ if (*pos + namelen > length) {
+ /* Stop iterating */
+ return 1;
+ }
+ *pos += namelen;
+ }
+ return 0;
+}
+
+static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos = 0;
+ unsigned int count = 0;
+
+ if (down_interruptible(&arpt_mutex) != 0)
+ return 0;
+
+ LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
+ offset, buffer, length, &pos, &count);
+
+ up(&arpt_mutex);
+
+ /* `start' hack - see fs/proc/generic.c line ~105 */
+ *start=(char *)((unsigned long)count-offset);
+ return pos;
+}
+#endif /*CONFIG_PROC_FS*/
+
+static int __init init(void)
+{
+ int ret;
+
+	/* No one else will be downing the sem now, so we won't sleep */
+ down(&arpt_mutex);
+ list_append(&arpt_target, &arpt_standard_target);
+ list_append(&arpt_target, &arpt_error_target);
+ up(&arpt_mutex);
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&arpt_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+ return ret;
+ }
+
+#ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *proc;
+
+ proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
+ if (!proc) {
+ nf_unregister_sockopt(&arpt_sockopts);
+ return -ENOMEM;
+ }
+ proc->owner = THIS_MODULE;
+ }
+#endif
+
+ printk("arp_tables: (C) 2002 David S. Miller\n");
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ nf_unregister_sockopt(&arpt_sockopts);
+#ifdef CONFIG_PROC_FS
+ proc_net_remove("arp_tables_names");
+#endif
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+EXPORT_SYMBOL(arpt_register_target);
+EXPORT_SYMBOL(arpt_unregister_target);
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000000..4e11e5b5e006
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,174 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_arp/arp_tables.h>
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT))
+
+/* Standard entry. */
+struct arpt_standard
+{
+ struct arpt_entry entry;
+ struct arpt_standard_target target;
+};
+
+struct arpt_error_target
+{
+ struct arpt_entry_target target;
+ char errorname[ARPT_FUNCTION_MAXNAMELEN];
+};
+
+struct arpt_error
+{
+ struct arpt_entry entry;
+ struct arpt_error_target target;
+};
+
+static struct
+{
+ struct arpt_replace repl;
+ struct arpt_standard entries[2];
+ struct arpt_error term;
+} initial_table __initdata
+= { { "filter", FILTER_VALID_HOOKS, 3,
+ sizeof(struct arpt_standard) * 2 + sizeof(struct arpt_error),
+ { [NF_ARP_IN] 0,
+ [NF_ARP_OUT] sizeof(struct arpt_standard) },
+ { [NF_ARP_IN] 0,
+ [NF_ARP_OUT] sizeof(struct arpt_standard), },
+ 0, NULL, { } },
+ {
+ /* ARP_IN */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_standard),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 }
+ },
+ /* ARP_OUT */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_standard),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+ -NF_ACCEPT - 1 }
+ }
+ },
+ /* ERROR */
+ {
+ {
+ {
+ { 0 }, { 0 }, { 0 }, { 0 },
+ 0, 0,
+ { { 0, }, { 0, } },
+ { { 0, }, { 0, } },
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ "", "", { 0 }, { 0 },
+ 0, 0
+ },
+ sizeof(struct arpt_entry),
+ sizeof(struct arpt_error),
+ 0,
+ { 0, 0 }, { } },
+ { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct arpt_table packet_filter
+= { { NULL, NULL }, "filter", &initial_table.repl,
+ FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL, THIS_MODULE };
+
+/* The work comes in here from netfilter.c */
+static unsigned int arpt_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static struct nf_hook_ops arpt_ops[]
+= { { { NULL, NULL }, arpt_hook, NF_ARP, NF_ARP_IN, 0 },
+ { { NULL, NULL }, arpt_hook, NF_ARP, NF_ARP_OUT, 0 }
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* Register table */
+ ret = arpt_register_table(&packet_filter);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ ret = nf_register_hook(&arpt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&arpt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ return ret;
+
+cleanup_hook0:
+ nf_unregister_hook(&arpt_ops[0]);
+
+cleanup_table:
+ arpt_unregister_table(&packet_filter);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(arpt_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&arpt_ops[i]);
+
+ arpt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 1846640d2837..5fa94340daf9 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -15,6 +15,7 @@
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/version.h>
+#include <linux/brlock.h>
#include <net/checksum.h>
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
@@ -35,6 +36,11 @@
struct module *ip_conntrack_module = THIS_MODULE;
MODULE_LICENSE("GPL");
+static int kill_proto(const struct ip_conntrack *i, void *data)
+{
+ return (i->tuplehash[IP_CT_DIR_ORIGINAL].dst.protonum ==
+ *((u_int8_t *) data));
+}
static unsigned int
print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple,
@@ -304,12 +310,24 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
return ret;
}
-/* FIXME: Implement this --RR */
-#if 0
void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
{
+ WRITE_LOCK(&ip_conntrack_lock);
+
+ /* find_proto() returns proto_generic in case there is no protocol
+ * helper. So this should be enough - HW */
+ LIST_DELETE(&protocol_list, proto);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+	/* Somebody could still be looking at the proto in bh. */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+	/* Remove all conntrack entries for this protocol */
+ ip_ct_selective_cleanup(kill_proto, &proto->proto);
+
+ MOD_DEC_USE_COUNT;
}
-#endif
static int __init init(void)
{
@@ -325,6 +343,7 @@ module_init(init);
module_exit(fini);
EXPORT_SYMBOL(ip_conntrack_protocol_register);
+EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
EXPORT_SYMBOL(invert_tuplepr);
EXPORT_SYMBOL(ip_conntrack_alter_reply);
EXPORT_SYMBOL(ip_conntrack_destroyed);
@@ -335,6 +354,7 @@ EXPORT_SYMBOL(ip_conntrack_helper_unregister);
EXPORT_SYMBOL(ip_ct_selective_cleanup);
EXPORT_SYMBOL(ip_ct_refresh);
EXPORT_SYMBOL(ip_conntrack_expect_related);
+EXPORT_SYMBOL(ip_conntrack_unexpect_related);
EXPORT_SYMBOL(ip_conntrack_tuple_taken);
EXPORT_SYMBOL(ip_ct_gather_frags);
EXPORT_SYMBOL(ip_conntrack_htable_size);
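
Since ip_conntrack_protocol_unregister() now exists (and flushes stale entries
via kill_proto() above), a protocol helper can be a clean module. A minimal
skeleton, with the hypothetical my_proto's handlers filled in elsewhere:

static struct ip_conntrack_protocol my_proto;	/* handlers assigned elsewhere */

static int __init init(void)
{
	return ip_conntrack_protocol_register(&my_proto);
}

static void __exit fini(void)
{
	ip_conntrack_protocol_unregister(&my_proto);
}

module_init(init);
module_exit(fini);
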
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 07ebd39fa115..ebbbde93ead4 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -314,6 +314,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple,
* do_extra_mangle last time. */
*other_ipp = saved_ip;
+#ifdef CONFIG_IP_NF_NAT_LOCAL
if (hooknum == NF_IP_LOCAL_OUT
&& *var_ipp != orig_dstip
&& !do_extra_mangle(*var_ipp, other_ipp)) {
@@ -324,6 +325,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple,
* anyway. */
continue;
}
+#endif
/* Count how many others map onto this. */
score = count_maps(tuple->src.ip, tuple->dst.ip,
@@ -367,11 +369,13 @@ find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
else {
/* Only do extra mangle when required (breaks
socket binding) */
+#ifdef CONFIG_IP_NF_NAT_LOCAL
if (tuple->dst.ip != mr->range[0].min_ip
&& hooknum == NF_IP_LOCAL_OUT
&& !do_extra_mangle(mr->range[0].min_ip,
&tuple->src.ip))
return NULL;
+#endif
tuple->dst.ip = mr->range[0].min_ip;
}
}
@@ -494,7 +498,10 @@ helper_cmp(const struct ip_nat_helper *helper,
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
[NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
- [NF_IP_LOCAL_OUT] = NF_IP_POST_ROUTING
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
+ [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
+#endif
};
unsigned int
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 9a3248e68d64..9eacf45d908c 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -140,8 +140,12 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
+#ifdef CONFIG_IP_NF_NAT_LOCAL
IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
|| hooknum == NF_IP_LOCAL_OUT);
+#else
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING);
+#endif
ct = ip_conntrack_get(*pskb, &ctinfo);
@@ -210,7 +214,7 @@ static int ipt_dnat_checkentry(const char *tablename,
/* Only allow these for NAT. */
if (strcmp(tablename, "nat") != 0) {
- DEBUGP("SNAT: wrong table %s\n", tablename);
+ DEBUGP("DNAT: wrong table %s\n", tablename);
return 0;
}
@@ -218,6 +222,14 @@ static int ipt_dnat_checkentry(const char *tablename,
DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
return 0;
}
+
+#ifndef CONFIG_IP_NF_NAT_LOCAL
+ if (hook_mask & (1 << NF_IP_LOCAL_OUT)) {
+ DEBUGP("DNAT: CONFIG_IP_NF_NAT_LOCAL not enabled\n");
+ return 0;
+ }
+#endif
+
return 1;
}
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index b36339d2bc3f..b0d299703269 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -42,7 +42,8 @@
#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
: ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
: ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
- : "*ERROR*")))
+ : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \
+ : "*ERROR*")))
static unsigned int
ip_nat_fn(unsigned int hooknum,
@@ -95,6 +96,12 @@ ip_nat_fn(unsigned int hooknum,
}
/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
case IP_CT_NEW:
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ /* LOCAL_IN hook doesn't have a chain and thus doesn't care
+ * about new packets -HW */
+ if (hooknum == NF_IP_LOCAL_IN)
+ return NF_ACCEPT;
+#endif
info = &ct->nat.info;
WRITE_LOCK(&ip_nat_lock);
@@ -205,6 +212,11 @@ static struct nf_hook_ops ip_nat_out_ops
static struct nf_hook_ops ip_nat_local_out_ops
= { { NULL, NULL }, ip_nat_local_fn, PF_INET, NF_IP_LOCAL_OUT, NF_IP_PRI_NAT_DST };
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+static struct nf_hook_ops ip_nat_local_in_ops
+= { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_NAT_SRC };
+#endif
+
/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
{
@@ -273,6 +285,13 @@ static int init_or_cleanup(int init)
printk("ip_nat_init: can't register local out hook.\n");
goto cleanup_outops;
}
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ ret = nf_register_hook(&ip_nat_local_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local in hook.\n");
+ goto cleanup_localoutops;
+ }
+#endif
if (ip_conntrack_module)
__MOD_INC_USE_COUNT(ip_conntrack_module);
return ret;
@@ -280,6 +299,10 @@ static int init_or_cleanup(int init)
cleanup:
if (ip_conntrack_module)
__MOD_DEC_USE_COUNT(ip_conntrack_module);
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ nf_unregister_hook(&ip_nat_local_in_ops);
+ cleanup_localoutops:
+#endif
nf_unregister_hook(&ip_nat_local_out_ops);
cleanup_outops:
nf_unregister_hook(&ip_nat_out_ops);
diff --git a/net/netsyms.c b/net/netsyms.c
index b500684bdaec..abf875169c99 100644
--- a/net/netsyms.c
+++ b/net/netsyms.c
@@ -490,6 +490,7 @@ EXPORT_SYMBOL(__kfree_skb);
EXPORT_SYMBOL(skb_clone);
EXPORT_SYMBOL(skb_copy);
EXPORT_SYMBOL(netif_rx);
+EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_get);
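
The netif_receive_skb() export is what lets NAPI drivers deliver packets
synchronously from their dev->poll() method instead of queueing through
netif_rx(). A bare-bones poll method under the 2.4 NAPI interface -- the ring
helpers are hypothetical stand-ins for driver specifics:

static int my_poll(struct net_device *dev, int *budget)
{
	int quota = (*budget < dev->quota) ? *budget : dev->quota;
	int work = 0;

	while (work < quota && my_rx_pending(dev)) {	/* hypothetical */
		struct sk_buff *skb = my_rx_dequeue(dev);	/* hypothetical */

		netif_receive_skb(skb);	/* straight up the stack */
		work++;
	}

	dev->quota -= work;
	*budget -= work;

	if (!my_rx_pending(dev)) {
		netif_rx_complete(dev);	/* leave the poll list... */
		my_enable_rx_irq(dev);	/* ...and rearm interrupts (hypothetical) */
		return 0;
	}
	return 1;	/* more work to do; stay on the poll list */
}
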