Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / net / core / netpoll.c
index 4115945..e8e05ce 100644 (file)
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>
+#include <linux/if_arp.h>
 #include <linux/inetdevice.h>
 #include <linux/inet.h>
 #include <linux/interrupt.h>
 #include <linux/netpoll.h>
 #include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 #include <net/tcp.h>
 #include <net/udp.h>
+#include <asm/unaligned.h>
 
 /*
  * We maintain a small pool of fully-sized skbs, to make sure the
  * message gets out even in extreme OOM situations.
  */
 
-#define MAX_SKBS 32
 #define MAX_UDP_CHUNK 1460
+#define MAX_SKBS 32
+#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000
 
-static spinlock_t skb_list_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(skb_list_lock);
 static int nr_skbs;
 static struct sk_buff *skbs;
 
-static spinlock_t rx_list_lock = SPIN_LOCK_UNLOCKED;
-static LIST_HEAD(rx_list);
+static DEFINE_SPINLOCK(queue_lock);
+static int queue_depth;
+static struct sk_buff *queue_head, *queue_tail;
+
+static atomic_t trapped;
 
-static int trapped;
+#define NETPOLL_RX_ENABLED  1
+#define NETPOLL_RX_DROP     2
 
 #define MAX_SKB_SIZE \
                (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
@@ -44,35 +55,114 @@ static int trapped;
 
 static void zap_completion_queue(void);
 
+static void queue_process(void *p)
+{
+       unsigned long flags;
+       struct sk_buff *skb;
+
+       while (queue_head) {
+               spin_lock_irqsave(&queue_lock, flags);
+
+               skb = queue_head;
+               queue_head = skb->next;
+               if (skb == queue_tail)
+                       queue_head = NULL;
+
+               queue_depth--;
+
+               spin_unlock_irqrestore(&queue_lock, flags);
+
+               dev_queue_xmit(skb);
+       }
+}
+
+static DECLARE_WORK(send_queue, queue_process, NULL);
+
+void netpoll_queue(struct sk_buff *skb)
+{
+       unsigned long flags;
+
+       if (queue_depth == MAX_QUEUE_DEPTH) {
+               __kfree_skb(skb);
+               return;
+       }
+
+       spin_lock_irqsave(&queue_lock, flags);
+       if (!queue_head)
+               queue_head = skb;
+       else
+               queue_tail->next = skb;
+       queue_tail = skb;
+       queue_depth++;
+       spin_unlock_irqrestore(&queue_lock, flags);
+
+       schedule_work(&send_queue);
+}
+
 static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
                             unsigned short ulen, u32 saddr, u32 daddr)
 {
-       if (uh->check == 0)
+       unsigned int psum;
+
+       if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
                return 0;
 
-       if (skb->ip_summed == CHECKSUM_HW)
-               return csum_tcpudp_magic(
-                       saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+       psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
 
-       skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+       if (skb->ip_summed == CHECKSUM_HW &&
+           !(u16)csum_fold(csum_add(psum, skb->csum)))
+               return 0;
+
+       skb->csum = psum;
 
-       return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+       return __skb_checksum_complete(skb);
 }
 
-void netpoll_poll(struct netpoll *np)
+/*
+ * Check whether delayed processing was scheduled for our NIC. If so,
+ * we attempt to grab the poll lock and use ->poll() to pump the card.
+ * If this fails, either we've recursed in ->poll() or it's already
+ * running on another CPU.
+ *
+ * Note: we don't mask interrupts with this lock because we're using
+ * trylock here and interrupts are already disabled in the softirq
+ * case. Further, we test the poll_owner to avoid recursion on UP
+ * systems where the lock doesn't exist.
+ *
+ * In cases where there is bi-directional communication, reading only
+ * one message at a time can lead to packets being dropped by the
+ * network adapter, forcing superfluous retries and possibly timeouts.
+ * Thus, we set our budget to greater than 1.
+ */
+static void poll_napi(struct netpoll *np)
 {
-       int budget = 1;
+       struct netpoll_info *npinfo = np->dev->npinfo;
+       int budget = 16;
+
+       if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
+           npinfo->poll_owner != smp_processor_id() &&
+           spin_trylock(&npinfo->poll_lock)) {
+               npinfo->rx_flags |= NETPOLL_RX_DROP;
+               atomic_inc(&trapped);
 
+               np->dev->poll(np->dev, &budget);
+
+               atomic_dec(&trapped);
+               npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+               spin_unlock(&npinfo->poll_lock);
+       }
+}
+
+void netpoll_poll(struct netpoll *np)
+{
        if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
                return;
 
        /* Process pending work on NIC */
        np->dev->poll_controller(np->dev);
+       if (np->dev->poll)
+               poll_napi(np);
 
-       /* If scheduling is stopped, tickle NAPI bits */
-       if(trapped && np->dev->poll &&
-          test_bit(__LINK_STATE_RX_SCHED, &np->dev->state))
-               np->dev->poll(np->dev, &budget);
        zap_completion_queue();
 }
 
@@ -110,7 +200,10 @@ static void zap_completion_queue(void)
                while (clist != NULL) {
                        struct sk_buff *skb = clist;
                        clist = clist->next;
-                       __kfree_skb(skb);
+                       if(skb->destructor)
+                               dev_kfree_skb_any(skb); /* put this one back */
+                       else
+                               __kfree_skb(skb);
                }
        }
 
@@ -133,10 +226,11 @@ repeat:
        if (!skb) {
                spin_lock_irqsave(&skb_list_lock, flags);
                skb = skbs;
-               if (skb)
+               if (skb) {
                        skbs = skb->next;
-               skb->next = NULL;
-               nr_skbs--;
+                       skb->next = NULL;
+                       nr_skbs--;
+               }
                spin_unlock_irqrestore(&skb_list_lock, flags);
        }
 
@@ -155,28 +249,59 @@ repeat:
        return skb;
 }
 
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
        int status;
+       struct netpoll_info *npinfo;
 
-repeat:
-       if(!np || !np->dev || !netif_running(np->dev)) {
+       if (!np || !np->dev || !netif_running(np->dev)) {
                __kfree_skb(skb);
                return;
        }
 
-       spin_lock(&np->dev->xmit_lock);
-       np->dev->xmit_lock_owner = smp_processor_id();
+       npinfo = np->dev->npinfo;
 
-       status = np->dev->hard_start_xmit(skb, np->dev);
-       np->dev->xmit_lock_owner = -1;
-       spin_unlock(&np->dev->xmit_lock);
+       /* avoid recursion */
+       if (npinfo->poll_owner == smp_processor_id() ||
+           np->dev->xmit_lock_owner == smp_processor_id()) {
+               if (np->drop)
+                       np->drop(skb);
+               else
+                       __kfree_skb(skb);
+               return;
+       }
+
+       do {
+               npinfo->tries--;
+               spin_lock(&np->dev->xmit_lock);
+               np->dev->xmit_lock_owner = smp_processor_id();
+
+               /*
+                * network drivers do not expect to be called if the queue is
+                * stopped.
+                */
+               if (netif_queue_stopped(np->dev)) {
+                       np->dev->xmit_lock_owner = -1;
+                       spin_unlock(&np->dev->xmit_lock);
+                       netpoll_poll(np);
+                       udelay(50);
+                       continue;
+               }
 
-       /* transmit busy */
-       if(status) {
+               status = np->dev->hard_start_xmit(skb, np->dev);
+               np->dev->xmit_lock_owner = -1;
+               spin_unlock(&np->dev->xmit_lock);
+
+               /* success */
+               if(!status) {
+                       npinfo->tries = MAX_RETRIES; /* reset */
+                       return;
+               }
+
+               /* transmit busy */
                netpoll_poll(np);
-               goto repeat;
-       }
+               udelay(50);
+       } while (npinfo->tries > 0);
 }
 
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -189,7 +314,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
        udp_len = len + sizeof(*udph);
        ip_len = eth_len = udp_len + sizeof(*iph);
-       total_len = eth_len + ETH_HLEN;
+       total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
 
        skb = find_skb(np, total_len, total_len - len);
        if (!skb)
@@ -206,17 +331,17 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
        iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
 
-       iph->version  = 4;
-       iph->ihl      = 5;
+       /* iph->version = 4; iph->ihl = 5; */
+       put_unaligned(0x45, (unsigned char *)iph);
        iph->tos      = 0;
-       iph->tot_len  = htons(ip_len);
+       put_unaligned(htons(ip_len), &(iph->tot_len));
        iph->id       = 0;
        iph->frag_off = 0;
        iph->ttl      = 64;
        iph->protocol = IPPROTO_UDP;
        iph->check    = 0;
-       iph->saddr    = htonl(np->local_ip);
-       iph->daddr    = htonl(np->remote_ip);
+       put_unaligned(htonl(np->local_ip), &(iph->saddr));
+       put_unaligned(htonl(np->remote_ip), &(iph->daddr));
        iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl);
 
        eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
@@ -225,30 +350,25 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
        memcpy(eth->h_source, np->local_mac, 6);
        memcpy(eth->h_dest, np->remote_mac, 6);
 
+       skb->dev = np->dev;
+
        netpoll_send_skb(np, skb);
 }
 
 static void arp_reply(struct sk_buff *skb)
 {
+       struct netpoll_info *npinfo = skb->dev->npinfo;
        struct arphdr *arp;
        unsigned char *arp_ptr;
        int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
        u32 sip, tip;
        struct sk_buff *send_skb;
-       unsigned long flags;
-       struct list_head *p;
-       struct netpoll *np = 0;
-
-       spin_lock_irqsave(&rx_list_lock, flags);
-       list_for_each(p, &rx_list) {
-               np = list_entry(p, struct netpoll, rx_list);
-               if ( np->dev == skb->dev )
-                       break;
-               np = 0;
-       }
-       spin_unlock_irqrestore(&rx_list_lock, flags);
+       struct netpoll *np = NULL;
 
-       if (!np) return;
+       if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+               np = npinfo->rx_np;
+       if (!np)
+               return;
 
        /* No arp on this interface */
        if (skb->dev->flags & IFF_NOARP)
@@ -324,25 +444,26 @@ static void arp_reply(struct sk_buff *skb)
        netpoll_send_skb(np, send_skb);
 }
 
-int netpoll_rx(struct sk_buff *skb)
+int __netpoll_rx(struct sk_buff *skb)
 {
        int proto, len, ulen;
        struct iphdr *iph;
        struct udphdr *uh;
-       struct netpoll *np;
-       struct list_head *p;
-       unsigned long flags;
+       struct netpoll *np = skb->dev->npinfo->rx_np;
 
+       if (!np)
+               goto out;
        if (skb->dev->type != ARPHRD_ETHER)
                goto out;
 
        /* check if netpoll clients need ARP */
-       if (skb->protocol == __constant_htons(ETH_P_ARP) && trapped) {
+       if (skb->protocol == __constant_htons(ETH_P_ARP) &&
+           atomic_read(&trapped)) {
                arp_reply(skb);
                return 1;
        }
 
-       proto = ntohs(skb->mac.ethernet->h_proto);
+       proto = ntohs(eth_hdr(skb)->h_proto);
        if (proto != ETH_P_IP)
                goto out;
        if (skb->pkt_type == PACKET_OTHERHOST)
@@ -373,34 +494,29 @@ int netpoll_rx(struct sk_buff *skb)
 
        if (ulen != len)
                goto out;
-       if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+       if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
+               goto out;
+       if (np->local_ip && np->local_ip != ntohl(iph->daddr))
+               goto out;
+       if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
+               goto out;
+       if (np->local_port && np->local_port != ntohs(uh->dest))
                goto out;
 
-       spin_lock_irqsave(&rx_list_lock, flags);
-       list_for_each(p, &rx_list) {
-               np = list_entry(p, struct netpoll, rx_list);
-               if (np->dev && np->dev != skb->dev)
-                       continue;
-               if (np->local_ip && np->local_ip != ntohl(iph->daddr))
-                       continue;
-               if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
-                       continue;
-               if (np->local_port && np->local_port != ntohs(uh->dest))
-                       continue;
-
-               spin_unlock_irqrestore(&rx_list_lock, flags);
+       np->rx_hook(np, ntohs(uh->source),
+                   (char *)(uh+1),
+                   ulen - sizeof(struct udphdr));
 
-               if (np->rx_hook)
-                       np->rx_hook(np, ntohs(uh->source),
-                                   (char *)(uh+1),
-                                   ulen - sizeof(struct udphdr));
+       kfree_skb(skb);
+       return 1;
 
+out:
+       if (atomic_read(&trapped)) {
+               kfree_skb(skb);
                return 1;
        }
-       spin_unlock_irqrestore(&rx_list_lock, flags);
 
-out:
-       return trapped;
+       return 0;
 }
 
 int netpoll_parse_options(struct netpoll *np, char *opt)
@@ -411,7 +527,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
                if ((delim = strchr(cur, '@')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->local_port=simple_strtol(cur, 0, 10);
+               np->local_port=simple_strtol(cur, NULL, 10);
                cur=delim;
        }
        cur++;
@@ -446,7 +562,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
                if ((delim = strchr(cur, '@')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_port=simple_strtol(cur, 0, 10);
+               np->remote_port=simple_strtol(cur, NULL, 10);
                cur=delim;
        }
        cur++;
@@ -468,29 +584,29 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
                if ((delim = strchr(cur, ':')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_mac[0]=simple_strtol(cur, 0, 16);
+               np->remote_mac[0]=simple_strtol(cur, NULL, 16);
                cur=delim+1;
                if ((delim = strchr(cur, ':')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_mac[1]=simple_strtol(cur, 0, 16);
+               np->remote_mac[1]=simple_strtol(cur, NULL, 16);
                cur=delim+1;
                if ((delim = strchr(cur, ':')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_mac[2]=simple_strtol(cur, 0, 16);
+               np->remote_mac[2]=simple_strtol(cur, NULL, 16);
                cur=delim+1;
                if ((delim = strchr(cur, ':')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_mac[3]=simple_strtol(cur, 0, 16);
+               np->remote_mac[3]=simple_strtol(cur, NULL, 16);
                cur=delim+1;
                if ((delim = strchr(cur, ':')) == NULL)
                        goto parse_failed;
                *delim=0;
-               np->remote_mac[4]=simple_strtol(cur, 0, 16);
+               np->remote_mac[4]=simple_strtol(cur, NULL, 16);
                cur=delim+1;
-               np->remote_mac[5]=simple_strtol(cur, 0, 16);
+               np->remote_mac[5]=simple_strtol(cur, NULL, 16);
        }
 
        printk(KERN_INFO "%s: remote ethernet address "
@@ -515,6 +631,8 @@ int netpoll_setup(struct netpoll *np)
 {
        struct net_device *ndev = NULL;
        struct in_device *in_dev;
+       struct netpoll_info *npinfo;
+       unsigned long flags;
 
        if (np->dev_name)
                ndev = dev_get_by_name(np->dev_name);
@@ -523,32 +641,45 @@ int netpoll_setup(struct netpoll *np)
                       np->name, np->dev_name);
                return -1;
        }
+
+       np->dev = ndev;
+       if (!ndev->npinfo) {
+               npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+               if (!npinfo)
+                       goto release;
+
+               npinfo->rx_flags = 0;
+               npinfo->rx_np = NULL;
+               spin_lock_init(&npinfo->poll_lock);
+               npinfo->poll_owner = -1;
+               npinfo->tries = MAX_RETRIES;
+               spin_lock_init(&npinfo->rx_lock);
+       } else
+               npinfo = ndev->npinfo;
+
        if (!ndev->poll_controller) {
                printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
                       np->name, np->dev_name);
                goto release;
        }
 
-       if (!(ndev->flags & IFF_UP)) {
-               unsigned short oflags;
+       if (!netif_running(ndev)) {
                unsigned long atmost, atleast;
 
                printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
                       np->name, np->dev_name);
 
-               oflags = ndev->flags;
-
-               rtnl_shlock();
-               if (dev_change_flags(ndev, oflags | IFF_UP) < 0) {
+               rtnl_lock();
+               if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) {
                        printk(KERN_ERR "%s: failed to open %s\n",
                               np->name, np->dev_name);
-                       rtnl_shunlock();
+                       rtnl_unlock();
                        goto release;
                }
-               rtnl_shunlock();
+               rtnl_unlock();
 
                atleast = jiffies + HZ/10;
-               atmost = jiffies + 10*HZ;
+               atmost = jiffies + 4*HZ;
                while (!netif_carrier_ok(ndev)) {
                        if (time_after(jiffies, atmost)) {
                                printk(KERN_NOTICE
@@ -559,78 +690,95 @@ int netpoll_setup(struct netpoll *np)
                        cond_resched();
                }
 
+               /* If carrier appears to come up instantly, we don't
+                * trust it and pause so that we don't pump all our
+                * queued console messages into the bitbucket.
+                */
+
                if (time_before(jiffies, atleast)) {
-                       printk(KERN_NOTICE "%s: carrier detect appears flaky,"
-                              " waiting 10 seconds\n",
+                       printk(KERN_NOTICE "%s: carrier detect appears"
+                              " untrustworthy, waiting 4 seconds\n",
                               np->name);
-                       while (time_before(jiffies, atmost))
-                               cond_resched();
+                       msleep(4000);
                }
        }
 
-       if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr)
+       if (is_zero_ether_addr(np->local_mac) && ndev->dev_addr)
                memcpy(np->local_mac, ndev->dev_addr, 6);
 
        if (!np->local_ip) {
-               in_dev = in_dev_get(ndev);
+               rcu_read_lock();
+               in_dev = __in_dev_get_rcu(ndev);
 
-               if (!in_dev) {
+               if (!in_dev || !in_dev->ifa_list) {
+                       rcu_read_unlock();
                        printk(KERN_ERR "%s: no IP address for %s, aborting\n",
                               np->name, np->dev_name);
                        goto release;
                }
 
                np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
-               in_dev_put(in_dev);
+               rcu_read_unlock();
                printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
                       np->name, HIPQUAD(np->local_ip));
        }
 
-       np->dev = ndev;
+       if (np->rx_hook) {
+               spin_lock_irqsave(&npinfo->rx_lock, flags);
+               npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+               npinfo->rx_np = np;
+               spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+       }
 
-       if(np->rx_hook) {
-               unsigned long flags;
+       /* fill up the skb queue */
+       refill_skbs();
 
-#ifdef CONFIG_NETPOLL_RX
-               np->dev->netpoll_rx = 1;
-#endif
+       /* last thing to do is link it to the net device structure */
+       ndev->npinfo = npinfo;
 
-               spin_lock_irqsave(&rx_list_lock, flags);
-               list_add(&np->rx_list, &rx_list);
-               spin_unlock_irqrestore(&rx_list_lock, flags);
-       }
+       /* avoid racing with NAPI reading npinfo */
+       synchronize_rcu();
 
        return 0;
+
  release:
+       if (!ndev->npinfo)
+               kfree(npinfo);
+       np->dev = NULL;
        dev_put(ndev);
        return -1;
 }
 
 void netpoll_cleanup(struct netpoll *np)
 {
-       if(np->rx_hook) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&rx_list_lock, flags);
-               list_del(&np->rx_list);
-#ifdef CONFIG_NETPOLL_RX
-               np->dev->netpoll_rx = 0;
-#endif
-               spin_unlock_irqrestore(&rx_list_lock, flags);
+       struct netpoll_info *npinfo;
+       unsigned long flags;
+
+       if (np->dev) {
+               npinfo = np->dev->npinfo;
+               if (npinfo && npinfo->rx_np == np) {
+                       spin_lock_irqsave(&npinfo->rx_lock, flags);
+                       npinfo->rx_np = NULL;
+                       npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+                       spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+               }
+               dev_put(np->dev);
        }
 
-       dev_put(np->dev);
-       np->dev = 0;
+       np->dev = NULL;
 }
 
 int netpoll_trap(void)
 {
-       return trapped;
+       return atomic_read(&trapped);
 }
 
 void netpoll_set_trap(int trap)
 {
-       trapped = trap;
+       if (trap)
+               atomic_inc(&trapped);
+       else
+               atomic_dec(&trapped);
 }
 
 EXPORT_SYMBOL(netpoll_set_trap);
@@ -638,6 +786,6 @@ EXPORT_SYMBOL(netpoll_trap);
 EXPORT_SYMBOL(netpoll_parse_options);
 EXPORT_SYMBOL(netpoll_setup);
 EXPORT_SYMBOL(netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_send_skb);
 EXPORT_SYMBOL(netpoll_send_udp);
 EXPORT_SYMBOL(netpoll_poll);
+EXPORT_SYMBOL(netpoll_queue);