Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] net/core/netpoll.c
index cb3a03f..e8e05ce 100644
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>
+#include <linux/if_arp.h>
 #include <linux/inetdevice.h>
 #include <linux/inet.h>
 #include <linux/interrupt.h>
 #include <linux/netpoll.h>
 #include <linux/sched.h>
+#include <linux/delay.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <asm/unaligned.h>
  * message gets out even in extreme OOM situations.
  */
 
-#define MAX_SKBS 32
 #define MAX_UDP_CHUNK 1460
+#define MAX_SKBS 32
+#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000
 
-static spinlock_t skb_list_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(skb_list_lock);
 static int nr_skbs;
 static struct sk_buff *skbs;
 
-static spinlock_t rx_list_lock = SPIN_LOCK_UNLOCKED;
-static LIST_HEAD(rx_list);
+static DEFINE_SPINLOCK(queue_lock);
+static int queue_depth;
+static struct sk_buff *queue_head, *queue_tail;
 
 static atomic_t trapped;
-spinlock_t netpoll_poll_lock = SPIN_LOCK_UNLOCKED;
 
 #define NETPOLL_RX_ENABLED  1
 #define NETPOLL_RX_DROP     2
@@ -50,51 +55,113 @@ spinlock_t netpoll_poll_lock = SPIN_LOCK_UNLOCKED;
 
 static void zap_completion_queue(void);
 
-static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
-                            unsigned short ulen, u32 saddr, u32 daddr)
+static void queue_process(void *p)
 {
-       if (uh->check == 0)
-               return 0;
+       unsigned long flags;
+       struct sk_buff *skb;
 
-       if (skb->ip_summed == CHECKSUM_HW)
-               return csum_tcpudp_magic(
-                       saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+       while (queue_head) {
+               spin_lock_irqsave(&queue_lock, flags);
 
-       skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+               skb = queue_head;
+               queue_head = skb->next;
+               if (skb == queue_tail)
+                       queue_head = NULL;
 
-       return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+               queue_depth--;
+
+               spin_unlock_irqrestore(&queue_lock, flags);
+
+               dev_queue_xmit(skb);
+       }
 }
 
-void netpoll_poll(struct netpoll *np)
+static DECLARE_WORK(send_queue, queue_process, NULL);
+
+void netpoll_queue(struct sk_buff *skb)
 {
-       /*
-        * In cases where there is bi-directional communications, reading
-        * only one message at a time can lead to packets being dropped by
-        * the network adapter, forcing superfluous retries and possibly
-        * timeouts.  Thus, we set our budget to a more reasonable value.
-        */
-       int budget = 16;
        unsigned long flags;
 
-       if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
+       if (queue_depth == MAX_QUEUE_DEPTH) {
+               __kfree_skb(skb);
                return;
+       }
 
-       /* Process pending work on NIC */
-       np->dev->poll_controller(np->dev);
+       spin_lock_irqsave(&queue_lock, flags);
+       if (!queue_head)
+               queue_head = skb;
+       else
+               queue_tail->next = skb;
+       queue_tail = skb;
+       queue_depth++;
+       spin_unlock_irqrestore(&queue_lock, flags);
+
+       schedule_work(&send_queue);
+}
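
The two functions above form the new deferred transmit path: netpoll_queue() links skbs onto a bounded singly-linked FIFO under queue_lock, and queue_process(), run from keventd via schedule_work(), unlinks one skb at a time and calls dev_queue_xmit() with the lock dropped. A minimal userspace sketch of the same head/tail discipline, with a pthread mutex standing in for the spinlock and a hypothetical struct pkt for sk_buff:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_QUEUE_DEPTH 16              /* cf. MAX_SKBS / 2 above */

    struct pkt {
        struct pkt *next;
        int id;
    };

    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
    static int queue_depth;
    static struct pkt *queue_head, *queue_tail;

    /* Enqueue at the tail, dropping when full, as netpoll_queue() does. */
    static void pkt_queue(struct pkt *p)
    {
        pthread_mutex_lock(&queue_lock);
        if (queue_depth == MAX_QUEUE_DEPTH) {
            pthread_mutex_unlock(&queue_lock);
            free(p);                        /* the __kfree_skb() analogue */
            return;
        }
        p->next = NULL;
        if (!queue_head)
            queue_head = p;
        else
            queue_tail->next = p;
        queue_tail = p;
        queue_depth++;
        pthread_mutex_unlock(&queue_lock);
    }

    /* Drain: unlink under the lock, transmit outside it. The tail check
     * mirrors queue_process(), which cannot trust the last skb's ->next. */
    static void pkt_process(void)
    {
        while (queue_head) {
            struct pkt *p;

            pthread_mutex_lock(&queue_lock);
            p = queue_head;
            queue_head = p->next;
            if (p == queue_tail)
                queue_head = NULL;
            queue_depth--;
            pthread_mutex_unlock(&queue_lock);

            printf("xmit pkt %d\n", p->id); /* dev_queue_xmit() stand-in */
            free(p);
        }
    }

One quirk carried over from the original: netpoll_queue() tests queue_depth before taking the lock, so the bound is approximate under contention; the sketch does the test under the lock instead.
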
+
+static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
+                            unsigned short ulen, u32 saddr, u32 daddr)
+{
+       unsigned int psum;
 
-       /* If scheduling is stopped, tickle NAPI bits */
-       spin_lock_irqsave(&netpoll_poll_lock, flags);
-       if (np->dev->poll &&
-           test_bit(__LINK_STATE_RX_SCHED, &np->dev->state)) {
-               np->dev->netpoll_rx |= NETPOLL_RX_DROP;
+       if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
+               return 0;
+
+       psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+       if (skb->ip_summed == CHECKSUM_HW &&
+           !(u16)csum_fold(csum_add(psum, skb->csum)))
+               return 0;
+
+       skb->csum = psum;
+
+       return __skb_checksum_complete(skb);
+}
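
The reworked checksum_udp() now returns 0 for good packets: it accepts checksum-less UDP and CHECKSUM_UNNECESSARY outright, folds the pseudo-header sum into a hardware-computed skb->csum when CHECKSUM_HW is set, and otherwise seeds skb->csum with the pseudo-header sum so __skb_checksum_complete() can finish in software. For reference, a portable sketch of the underlying arithmetic — the RFC 768/1071 one's-complement sum over the IPv4 pseudo header plus the datagram, which must fold to all-ones for a valid packet (the kernel helpers keep unfolded 32-bit partial sums instead):

    #include <stddef.h>
    #include <stdint.h>

    /* One's-complement sum of a buffer, accumulated in 32 bits. */
    static uint32_t csum_add_buf(uint32_t sum, const uint8_t *data, size_t len)
    {
        while (len > 1) {
            sum += (uint32_t)data[0] << 8 | data[1];
            data += 2;
            len -= 2;
        }
        if (len)                        /* odd trailing byte, zero-padded */
            sum += (uint32_t)data[0] << 8;
        return sum;
    }

    /* Fold the 32-bit accumulator down to 16 bits. */
    static uint16_t csum_fold16(uint32_t sum)
    {
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
    }

    /* Validate 'ulen' bytes of UDP header+payload against the pseudo
     * header; saddr/daddr in host byte order. Returns 0 when valid,
     * matching checksum_udp()'s convention. */
    static int udp_check(const uint8_t *udp, uint16_t ulen,
                         uint32_t saddr, uint32_t daddr)
    {
        uint32_t sum = 0;

        sum += saddr >> 16;
        sum += saddr & 0xffff;
        sum += daddr >> 16;
        sum += daddr & 0xffff;
        sum += 17;                      /* IPPROTO_UDP */
        sum += ulen;
        sum = csum_add_buf(sum, udp, ulen);

        return csum_fold16(sum) == 0xffff ? 0 : -1;
    }
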
+
+/*
+ * Check whether delayed processing was scheduled for our NIC. If so,
+ * we attempt to grab the poll lock and use ->poll() to pump the card.
+ * If this fails, either we've recursed in ->poll() or it's already
+ * running on another CPU.
+ *
+ * Note: we don't mask interrupts with this lock because we're using
+ * trylock here and interrupts are already disabled in the softirq
+ * case. Further, we test the poll_owner to avoid recursion on UP
+ * systems where the lock doesn't exist.
+ *
+ * In cases where there is bi-directional communications, reading only
+ * one message at a time can lead to packets being dropped by the
+ * network adapter, forcing superfluous retries and possibly timeouts.
+ * Thus, we set our budget to greater than 1.
+ */
+static void poll_napi(struct netpoll *np)
+{
+       struct netpoll_info *npinfo = np->dev->npinfo;
+       int budget = 16;
+
+       if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
+           npinfo->poll_owner != smp_processor_id() &&
+           spin_trylock(&npinfo->poll_lock)) {
+               npinfo->rx_flags |= NETPOLL_RX_DROP;
                atomic_inc(&trapped);
 
                np->dev->poll(np->dev, &budget);
 
                atomic_dec(&trapped);
-               np->dev->netpoll_rx &= ~NETPOLL_RX_DROP;
+               npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+               spin_unlock(&npinfo->poll_lock);
        }
-       spin_unlock_irqrestore(&netpoll_poll_lock, flags);
+}
+
+void netpoll_poll(struct netpoll *np)
+{
+       if (!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
+               return;
+
+       /* Process pending work on NIC */
+       np->dev->poll_controller(np->dev);
+       if (np->dev->poll)
+               poll_napi(np);
 
        zap_completion_queue();
 }
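
poll_napi() stacks three guards in front of ->poll(): the __LINK_STATE_RX_SCHED bit (is delayed work actually pending?), a poll_owner test that catches same-CPU recursion, where spinning on the lock could never succeed, and a trylock that skips the pass when another CPU is already pumping the device. A compressed C11 sketch of the owner/trylock pattern, using a thread-local address in place of the CPU id and a hypothetical pump() for ->poll():

    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t poll_lock = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic(void *) poll_owner;  /* analogue of npinfo->poll_owner */
    static _Thread_local char self;     /* unique address per thread */

    static void pump(void)
    {
        /* stand-in for dev->poll(dev, &budget) */
    }

    static void poll_once(void)
    {
        /* Recursion guard: if this thread already owns the lock,
         * waiting for it would deadlock, so bail out instead. */
        if (atomic_load(&poll_owner) == &self)
            return;

        /* Never block: someone else is already pumping the device. */
        if (pthread_mutex_trylock(&poll_lock) != 0)
            return;

        atomic_store(&poll_owner, &self);
        pump();
        atomic_store(&poll_owner, NULL);
        pthread_mutex_unlock(&poll_lock);
    }
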
@@ -133,7 +200,10 @@ static void zap_completion_queue(void)
                while (clist != NULL) {
                        struct sk_buff *skb = clist;
                        clist = clist->next;
-                       __kfree_skb(skb);
+                       if (skb->destructor)
+                               dev_kfree_skb_any(skb); /* put this one back */
+                       else
+                               __kfree_skb(skb);
                }
        }
 
@@ -156,10 +226,11 @@ repeat:
        if (!skb) {
                spin_lock_irqsave(&skb_list_lock, flags);
                skb = skbs;
-               if (skb)
+               if (skb) {
                        skbs = skb->next;
-               skb->next = NULL;
-               nr_skbs--;
+                       skb->next = NULL;
+                       nr_skbs--;
+               }
                spin_unlock_irqrestore(&skb_list_lock, flags);
        }
 
@@ -178,40 +249,59 @@ repeat:
        return skb;
 }
 
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
        int status;
+       struct netpoll_info *npinfo;
 
-repeat:
-       if(!np || !np->dev || !netif_running(np->dev)) {
+       if (!np || !np->dev || !netif_running(np->dev)) {
                __kfree_skb(skb);
                return;
        }
 
-       spin_lock(&np->dev->xmit_lock);
-       np->dev->xmit_lock_owner = smp_processor_id();
+       npinfo = np->dev->npinfo;
 
-       /*
-        * network drivers do not expect to be called if the queue is
-        * stopped.
-        */
-       if (netif_queue_stopped(np->dev)) {
+       /* avoid recursion */
+       if (npinfo->poll_owner == smp_processor_id() ||
+           np->dev->xmit_lock_owner == smp_processor_id()) {
+               if (np->drop)
+                       np->drop(skb);
+               else
+                       __kfree_skb(skb);
+               return;
+       }
+
+       do {
+               npinfo->tries--;
+               spin_lock(&np->dev->xmit_lock);
+               np->dev->xmit_lock_owner = smp_processor_id();
+
+               /*
+                * network drivers do not expect to be called if the queue is
+                * stopped.
+                */
+               if (netif_queue_stopped(np->dev)) {
+                       np->dev->xmit_lock_owner = -1;
+                       spin_unlock(&np->dev->xmit_lock);
+                       netpoll_poll(np);
+                       udelay(50);
+                       continue;
+               }
+
+               status = np->dev->hard_start_xmit(skb, np->dev);
                np->dev->xmit_lock_owner = -1;
                spin_unlock(&np->dev->xmit_lock);
 
-               netpoll_poll(np);
-               goto repeat;
-       }
-
-       status = np->dev->hard_start_xmit(skb, np->dev);
-       np->dev->xmit_lock_owner = -1;
-       spin_unlock(&np->dev->xmit_lock);
+               /* success */
+               if (!status) {
+                       npinfo->tries = MAX_RETRIES; /* reset */
+                       return;
+               }
 
-       /* transmit busy */
-       if(status) {
+               /* transmit busy */
                netpoll_poll(np);
-               goto repeat;
-       }
+               udelay(50);
+       } while (npinfo->tries > 0);
 }
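
The rewrite above replaces netpoll_send_skb()'s old unbounded "goto repeat" with a budgeted loop: npinfo->tries is decremented on every pass, refilled to MAX_RETRIES on a successful hard_start_xmit(), and when the budget runs dry the packet is dropped instead of wedging the machine in a hard-IRQ spin. The control flow, reduced to a userspace sketch in which try_xmit() and poll_device() are stand-ins:

    #include <stdbool.h>
    #include <unistd.h>

    #define MAX_RETRIES 20000

    static int tries = MAX_RETRIES;     /* shared budget, cf. npinfo->tries */
    static int fake_busy = 3;           /* pretend the TX ring is full 3 times */

    static bool try_xmit(const char *msg)  /* hard_start_xmit() == 0 stand-in */
    {
        (void)msg;
        return --fake_busy < 0;
    }

    static void poll_device(void)
    {
        /* netpoll_poll() stand-in: reap TX completions so the ring drains */
    }

    static void send_bounded(const char *msg)
    {
        do {
            tries--;
            if (try_xmit(msg)) {
                tries = MAX_RETRIES;    /* success refills the budget */
                return;
            }
            poll_device();
            usleep(50);                 /* the udelay(50) above */
        } while (tries > 0);
        /* budget exhausted: the message is dropped */
    }
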
 
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -224,7 +314,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
        udp_len = len + sizeof(*udph);
        ip_len = eth_len = udp_len + sizeof(*iph);
-       total_len = eth_len + ETH_HLEN;
+       total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
 
        skb = find_skb(np, total_len, total_len - len);
        if (!skb)
@@ -260,30 +350,25 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
        memcpy(eth->h_source, np->local_mac, 6);
        memcpy(eth->h_dest, np->remote_mac, 6);
 
+       skb->dev = np->dev;
+
        netpoll_send_skb(np, skb);
 }
 
 static void arp_reply(struct sk_buff *skb)
 {
+       struct netpoll_info *npinfo = skb->dev->npinfo;
        struct arphdr *arp;
        unsigned char *arp_ptr;
        int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
        u32 sip, tip;
        struct sk_buff *send_skb;
-       unsigned long flags;
-       struct list_head *p;
        struct netpoll *np = NULL;
 
-       spin_lock_irqsave(&rx_list_lock, flags);
-       list_for_each(p, &rx_list) {
-               np = list_entry(p, struct netpoll, rx_list);
-               if ( np->dev == skb->dev )
-                       break;
-               np = NULL;
-       }
-       spin_unlock_irqrestore(&rx_list_lock, flags);
-
-       if (!np) return;
+       if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+               np = npinfo->rx_np;
+       if (!np)
+               return;
 
        /* No arp on this interface */
        if (skb->dev->flags & IFF_NOARP)
@@ -359,15 +444,15 @@ static void arp_reply(struct sk_buff *skb)
        netpoll_send_skb(np, send_skb);
 }
 
-int netpoll_rx(struct sk_buff *skb)
+int __netpoll_rx(struct sk_buff *skb)
 {
        int proto, len, ulen;
        struct iphdr *iph;
        struct udphdr *uh;
-       struct netpoll *np;
-       struct list_head *p;
-       unsigned long flags;
+       struct netpoll *np = skb->dev->npinfo->rx_np;
 
+       if (!np)
+               goto out;
        if (skb->dev->type != ARPHRD_ETHER)
                goto out;
 
@@ -409,34 +494,29 @@ int netpoll_rx(struct sk_buff *skb)
 
        if (ulen != len)
                goto out;
-       if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+       if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
+               goto out;
+       if (np->local_ip && np->local_ip != ntohl(iph->daddr))
+               goto out;
+       if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
+               goto out;
+       if (np->local_port && np->local_port != ntohs(uh->dest))
                goto out;
 
-       spin_lock_irqsave(&rx_list_lock, flags);
-       list_for_each(p, &rx_list) {
-               np = list_entry(p, struct netpoll, rx_list);
-               if (np->dev && np->dev != skb->dev)
-                       continue;
-               if (np->local_ip && np->local_ip != ntohl(iph->daddr))
-                       continue;
-               if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
-                       continue;
-               if (np->local_port && np->local_port != ntohs(uh->dest))
-                       continue;
-
-               spin_unlock_irqrestore(&rx_list_lock, flags);
+       np->rx_hook(np, ntohs(uh->source),
+                   (char *)(uh+1),
+                   ulen - sizeof(struct udphdr));
 
-               if (np->rx_hook)
-                       np->rx_hook(np, ntohs(uh->source),
-                                   (char *)(uh+1),
-                                   ulen - sizeof(struct udphdr));
+       kfree_skb(skb);
+       return 1;
 
+out:
+       if (atomic_read(&trapped)) {
+               kfree_skb(skb);
                return 1;
        }
-       spin_unlock_irqrestore(&rx_list_lock, flags);
 
-out:
-       return atomic_read(&trapped);
+       return 0;
 }
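
With a single rx_np per device, the receive path no longer walks a list; it applies one wildcard filter, where a zero local_ip, remote_ip, or local_port matches anything and every configured field must agree before rx_hook fires. The match, isolated into a hypothetical helper:

    #include <stdint.h>

    struct np_filter {
        uint32_t local_ip, remote_ip;   /* 0 = match any (host order) */
        uint16_t local_port;            /* 0 = match any */
    };

    static int np_match(const struct np_filter *f,
                        uint32_t daddr, uint32_t saddr, uint16_t dport)
    {
        if (f->local_ip && f->local_ip != daddr)
            return 0;
        if (f->remote_ip && f->remote_ip != saddr)
            return 0;
        if (f->local_port && f->local_port != dport)
            return 0;
        return 1;                       /* deliver to rx_hook */
    }

Note the changed contract as well: __netpoll_rx() now consumes and frees the skb, returning 1, both on a match and for anything arriving while trapped; a 0 return hands the packet back to the normal stack.
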
 
 int netpoll_parse_options(struct netpoll *np, char *opt)
@@ -551,6 +631,8 @@ int netpoll_setup(struct netpoll *np)
 {
        struct net_device *ndev = NULL;
        struct in_device *in_dev;
+       struct netpoll_info *npinfo;
+       unsigned long flags;
 
        if (np->dev_name)
                ndev = dev_get_by_name(np->dev_name);
@@ -559,32 +641,45 @@ int netpoll_setup(struct netpoll *np)
                       np->name, np->dev_name);
                return -1;
        }
+
+       np->dev = ndev;
+       if (!ndev->npinfo) {
+               npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+               if (!npinfo)
+                       goto release;
+
+               npinfo->rx_flags = 0;
+               npinfo->rx_np = NULL;
+               spin_lock_init(&npinfo->poll_lock);
+               npinfo->poll_owner = -1;
+               npinfo->tries = MAX_RETRIES;
+               spin_lock_init(&npinfo->rx_lock);
+       } else
+               npinfo = ndev->npinfo;
+
        if (!ndev->poll_controller) {
                printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
                       np->name, np->dev_name);
                goto release;
        }
 
-       if (!(ndev->flags & IFF_UP)) {
-               unsigned short oflags;
+       if (!netif_running(ndev)) {
                unsigned long atmost, atleast;
 
                printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
                       np->name, np->dev_name);
 
-               oflags = ndev->flags;
-
-               rtnl_shlock();
-               if (dev_change_flags(ndev, oflags | IFF_UP) < 0) {
+               rtnl_lock();
+               if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) {
                        printk(KERN_ERR "%s: failed to open %s\n",
                               np->name, np->dev_name);
-                       rtnl_shunlock();
+                       rtnl_unlock();
                        goto release;
                }
-               rtnl_shunlock();
+               rtnl_unlock();
 
                atleast = jiffies + HZ/10;
-               atmost = jiffies + 10*HZ;
+               atmost = jiffies + 4*HZ;
                while (!netif_carrier_ok(ndev)) {
                        if (time_after(jiffies, atmost)) {
                                printk(KERN_NOTICE
@@ -595,23 +690,27 @@ int netpoll_setup(struct netpoll *np)
                        cond_resched();
                }
 
+               /* If carrier appears to come up instantly, we don't
+                * trust it and pause so that we don't pump all our
+                * queued console messages into the bitbucket.
+                */
+
                if (time_before(jiffies, atleast)) {
-                       printk(KERN_NOTICE "%s: carrier detect appears flaky,"
-                              " waiting 10 seconds\n",
+                       printk(KERN_NOTICE "%s: carrier detect appears"
+                              " untrustworthy, waiting 4 seconds\n",
                               np->name);
-                       while (time_before(jiffies, atmost))
-                               cond_resched();
+                       msleep(4000);
                }
        }
 
-       if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr)
+       if (is_zero_ether_addr(np->local_mac) && ndev->dev_addr)
                memcpy(np->local_mac, ndev->dev_addr, 6);
 
        if (!np->local_ip) {
                rcu_read_lock();
-               in_dev = __in_dev_get(ndev);
+               in_dev = __in_dev_get_rcu(ndev);
 
-               if (!in_dev) {
+               if (!in_dev || !in_dev->ifa_list) {
                        rcu_read_unlock();
                        printk(KERN_ERR "%s: no IP address for %s, aborting\n",
                               np->name, np->dev_name);
@@ -624,37 +723,48 @@ int netpoll_setup(struct netpoll *np)
                       np->name, HIPQUAD(np->local_ip));
        }
 
-       np->dev = ndev;
+       if (np->rx_hook) {
+               spin_lock_irqsave(&npinfo->rx_lock, flags);
+               npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+               npinfo->rx_np = np;
+               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+       }
 
-       if(np->rx_hook) {
-               unsigned long flags;
+       /* fill up the skb queue */
+       refill_skbs();
 
-               np->dev->netpoll_rx = NETPOLL_RX_ENABLED;
+       /* last thing to do is link it to the net device structure */
+       ndev->npinfo = npinfo;
 
-               spin_lock_irqsave(&rx_list_lock, flags);
-               list_add(&np->rx_list, &rx_list);
-               spin_unlock_irqrestore(&rx_list_lock, flags);
-       }
+       /* avoid racing with NAPI reading npinfo */
+       synchronize_rcu();
 
        return 0;
+
  release:
+       if (!ndev->npinfo)
+               kfree(npinfo);
+       np->dev = NULL;
        dev_put(ndev);
        return -1;
 }
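
The ordering at the tail of netpoll_setup() is deliberate: npinfo is fully initialized, the skb pool is refilled, and only then does the single store "ndev->npinfo = npinfo" publish the structure, with synchronize_rcu() waiting out any NAPI/IRQ path that sampled the old NULL pointer. A C11 sketch of that publish/read split, using release/acquire ordering in place of the kernel's RCU primitives (the grace-period wait itself has no one-line userspace analogue here):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct npinfo {
        int rx_flags;
        int poll_owner;
        /* ... */
    };

    static _Atomic(struct npinfo *) dev_npinfo;

    /* Writer: initialize everything, then publish with one store. */
    static int publish_npinfo(void)
    {
        struct npinfo *ni = malloc(sizeof(*ni));
        if (!ni)
            return -1;
        ni->rx_flags = 0;
        ni->poll_owner = -1;
        atomic_store_explicit(&dev_npinfo, ni, memory_order_release);
        return 0;
    }

    /* Reader (the softirq analogue): acquire-load and tolerate NULL,
     * just as the netpoll hooks must before npinfo is published. */
    static int reader_poll_owner(void)
    {
        struct npinfo *ni =
            atomic_load_explicit(&dev_npinfo, memory_order_acquire);
        return ni ? ni->poll_owner : -1;
    }
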
 
 void netpoll_cleanup(struct netpoll *np)
 {
-       if (np->rx_hook) {
-               unsigned long flags;
+       struct netpoll_info *npinfo;
+       unsigned long flags;
 
-               spin_lock_irqsave(&rx_list_lock, flags);
-               list_del(&np->rx_list);
-               spin_unlock_irqrestore(&rx_list_lock, flags);
+       if (np->dev) {
+               npinfo = np->dev->npinfo;
+               if (npinfo && npinfo->rx_np == np) {
+                       spin_lock_irqsave(&npinfo->rx_lock, flags);
+                       npinfo->rx_np = NULL;
+                       npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+                       spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+               }
+               dev_put(np->dev);
        }
 
-       if (np->dev)
-               np->dev->netpoll_rx = 0;
-       dev_put(np->dev);
        np->dev = NULL;
 }
 
@@ -676,6 +786,6 @@ EXPORT_SYMBOL(netpoll_trap);
 EXPORT_SYMBOL(netpoll_parse_options);
 EXPORT_SYMBOL(netpoll_setup);
 EXPORT_SYMBOL(netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_send_skb);
 EXPORT_SYMBOL(netpoll_send_udp);
 EXPORT_SYMBOL(netpoll_poll);
+EXPORT_SYMBOL(netpoll_queue);