datapath: Compute checksum while sending packets to userspace.
[sliver-openvswitch.git] / datapath/datapath.c
index 837f567..491f98a 100644
@@ -32,6 +32,7 @@
 #include <asm/system.h>
 #include <asm/div64.h>
 #include <asm/bug.h>
+#include <asm/highmem.h>
 #include <linux/netfilter_bridge.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/inetdevice.h>
@@ -363,9 +364,9 @@ static int new_dp_port(struct datapath *dp, struct odp_port *odp_port, int port_
                vport_lock();
 
                if (odp_port->flags & ODP_PORT_INTERNAL)
-                       vport = __vport_add(odp_port->devname, "internal", NULL);
+                       vport = vport_add(odp_port->devname, "internal", NULL);
                else
-                       vport = __vport_add(odp_port->devname, "netdev", NULL);
+                       vport = vport_add(odp_port->devname, "netdev", NULL);
 
                vport_unlock();
 
@@ -471,7 +472,7 @@ int dp_detach_port(struct dp_port *p, int may_delete)
 
                if (!strcmp(port_type, "netdev") || !strcmp(port_type, "internal")) {
                        vport_lock();
-                       __vport_del(vport);
+                       vport_del(vport);
                        vport_unlock();
                }
        }
@@ -512,11 +513,12 @@ out:
        return err;
 }
 
-/* Must be called with rcu_read_lock and with bottom-halves disabled. */
+/* Must be called with rcu_read_lock. */
 void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
 {
        struct datapath *dp = p->dp;
        struct dp_stats_percpu *stats;
+       int stats_counter_off;
        struct odp_flow_key key;
        struct tbl_node *flow_node;
 
@@ -525,14 +527,11 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
 
        OVS_CB(skb)->dp_port = p;
 
-       /* BHs are off so we don't have to use get_cpu()/put_cpu() here. */
-       stats = percpu_ptr(dp->stats_percpu, smp_processor_id());
-
        if (flow_extract(skb, p ? p->port_no : ODPP_NONE, &key)) {
                if (dp->drop_frags) {
                        kfree_skb(skb);
-                       stats->n_frags++;
-                       return;
+                       stats_counter_off = offsetof(struct dp_stats_percpu, n_frags);
+                       goto out;
                }
        }
 
@@ -543,11 +542,17 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
                flow_used(flow, skb);
                execute_actions(dp, skb, &key, acts->actions, acts->n_actions,
                                GFP_ATOMIC);
-               stats->n_hit++;
+               stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
        } else {
-               stats->n_missed++;
+               stats_counter_off = offsetof(struct dp_stats_percpu, n_missed);
                dp_output_control(dp, skb, _ODPL_MISS_NR, OVS_CB(skb)->tun_id);
        }
+
+out:
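+       /* The caller is no longer required to run with bottom halves
+        * disabled (see the updated comment above), so turn them off here to
+        * keep the per-CPU pointer and the counter update on this CPU. */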
+       local_bh_disable();
+       stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+       (*(u64 *)((u8 *)stats + stats_counter_off))++;
+       local_bh_enable();
 }
 
 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
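
The receive path above now just records which per-CPU counter to bump, as a
byte offset into struct dp_stats_percpu, and performs the increment once at
the end with bottom halves disabled.  A minimal sketch of that pattern,
assuming only the dp_stats_percpu fields visible in this patch (n_frags,
n_hit, n_missed, n_lost) and using a hypothetical helper name:

    static void dp_bump_counter(struct dp_stats_percpu *stats_percpu,
                                size_t counter_off)
    {
            struct dp_stats_percpu *stats;

            /* Keep smp_processor_id() stable and the non-atomic increment
             * uninterrupted by softirqs on this CPU. */
            local_bh_disable();
            stats = per_cpu_ptr(stats_percpu, smp_processor_id());
            (*(u64 *)((u8 *)stats + counter_off))++;
            local_bh_enable();
    }

    /* e.g.: dp_bump_counter(dp->stats_percpu,
     *                       offsetof(struct dp_stats_percpu, n_hit)); */
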
@@ -739,31 +744,16 @@ queue_control_packets(struct sk_buff *skb, struct sk_buff_head *queue,
                nskb = skb->next;
                skb->next = NULL;
 
-               /* If a checksum-deferred packet is forwarded to the
-                * controller, correct the pointers and checksum.
-                */
-               err = vswitch_skb_checksum_setup(skb);
-               if (err)
-                       goto err_kfree_skbs;
-
-               if (skb->ip_summed == CHECKSUM_PARTIAL) {
-
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
-                       /* Until 2.6.22, the start of the transport header was
-                        * also the start of data to be checksummed.  Linux
-                        * 2.6.22 introduced the csum_start field for this
-                        * purpose, but we should point the transport header to
-                        * it anyway for backward compatibility, as
-                        * dev_queue_xmit() does even in 2.6.28. */
-                       skb_set_transport_header(skb, skb->csum_start -
-                                                skb_headroom(skb));
+               /* Until 2.6.22, the start of the transport header was
+                * also the start of data to be checksummed.  Linux
+                * 2.6.22 introduced the csum_start field for this
+                * purpose, but we should point the transport header to
+                * it anyway for backward compatibility, as
+                * dev_queue_xmit() does even in 2.6.28. */
+               skb_set_transport_header(skb, skb->csum_start - skb_headroom(skb));
 #endif
 
-                       err = skb_checksum_help(skb);
-                       if (err)
-                               goto err_kfree_skbs;
-               }
-
                err = skb_cow(skb, sizeof *header);
                if (err)
                        goto err_kfree_skbs;
@@ -806,10 +796,14 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
 
        forward_ip_summed(skb);
 
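+       /* If a checksum-deferred packet is headed for userspace, correct its
+        * checksum pointers now, before the packet is segmented or queued. */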
+       err = vswitch_skb_checksum_setup(skb);
+       if (err)
+               goto err_kfree_skb;
+
        /* Break apart GSO packets into their component pieces.  Otherwise
         * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
        if (skb_is_gso(skb)) {
-               struct sk_buff *nskb = skb_gso_segment(skb, 0);
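+               /* Segment as if the target "device" supported scatter/gather
+                * and hardware checksumming, so the segments keep
+                * ip_summed == CHECKSUM_PARTIAL and their checksums can be
+                * filled in while copying to userspace. */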
+               struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
                if (nskb) {
                        kfree_skb(skb);
                        skb = nskb;
@@ -830,9 +824,10 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
 err_kfree_skb:
        kfree_skb(skb);
 err:
-       stats = percpu_ptr(dp->stats_percpu, get_cpu());
+       local_bh_disable();
+       stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
        stats->n_lost++;
-       put_cpu();
+       local_bh_enable();
 
        return err;
 }
@@ -1015,7 +1010,6 @@ static int do_put_flow(struct datapath *dp, struct odp_flow_put *uf,
        } else {
                /* We found a matching flow. */
                struct sw_flow_actions *old_acts, *new_acts;
-               unsigned long int flags;
 
                flow = flow_cast(flow_node);
 
@@ -1040,11 +1034,11 @@ static int do_put_flow(struct datapath *dp, struct odp_flow_put *uf,
                }
 
                /* Fetch stats, then clear them if necessary. */
-               spin_lock_irqsave(&flow->lock, flags);
+               spin_lock_bh(&flow->lock);
                get_stats(flow, stats);
                if (uf->flags & ODPPF_ZERO_STATS)
                        clear_stats(flow);
-               spin_unlock_irqrestore(&flow->lock, flags);
+               spin_unlock_bh(&flow->lock);
        }
 
        return 0;
@@ -1084,15 +1078,14 @@ static int do_answer_query(struct sw_flow *flow, u32 query_flags,
 {
        struct sw_flow_actions *sf_acts;
        struct odp_flow_stats stats;
-       unsigned long int flags;
        u32 n_actions;
 
-       spin_lock_irqsave(&flow->lock, flags);
+       spin_lock_bh(&flow->lock);
        get_stats(flow, &stats);
-       if (query_flags & ODPFF_ZERO_TCP_FLAGS) {
+       if (query_flags & ODPFF_ZERO_TCP_FLAGS)
                flow->tcp_flags = 0;
-       }
-       spin_unlock_irqrestore(&flow->lock, flags);
+
+       spin_unlock_bh(&flow->lock);
 
        if (copy_to_user(ustats, &stats, sizeof(struct odp_flow_stats)) ||
            get_user(n_actions, n_actionsp))
@@ -1302,8 +1295,12 @@ static int do_execute(struct datapath *dp, const struct odp_execute *execute)
                skb->protocol = htons(ETH_P_802_2);
 
        flow_extract(skb, execute->in_port, &key);
+
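+       /* execute_actions() dereferences RCU-protected datapath state; take
+        * the read lock explicitly since this runs from process context
+        * rather than from the receive path. */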
+       rcu_read_lock();
        err = execute_actions(dp, skb, &key, actions->actions,
                              actions->n_actions, GFP_KERNEL);
+       rcu_read_unlock();
+
        kfree(actions);
        return err;
 
@@ -1340,7 +1337,7 @@ static int get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
        stats.n_frags = stats.n_hit = stats.n_missed = stats.n_lost = 0;
        for_each_possible_cpu(i) {
                const struct dp_stats_percpu *s;
-               s = percpu_ptr(dp->stats_percpu, i);
+               s = per_cpu_ptr(dp->stats_percpu, i);
                stats.n_frags += s->n_frags;
                stats.n_hit += s->n_hit;
                stats.n_missed += s->n_missed;
@@ -1569,7 +1566,7 @@ static int get_port_group(struct datapath *dp, struct odp_port_group __user *upg
        if (copy_from_user(&pg, upg, sizeof pg))
                return -EFAULT;
 
-       return do_get_port_group(dp, pg.ports, pg.n_ports, pg.group, &pg.n_ports);
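+       /* Report the number of ports actually filled in back to the
+        * user-visible structure, not to the kernel-local copy. */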
+       return do_get_port_group(dp, pg.ports, pg.n_ports, pg.group, &upg->n_ports);
 }
 
 static int get_listen_mask(const struct file *f)
@@ -1612,35 +1609,39 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
                goto exit;
 
        case ODP_VPORT_ADD:
-               err = vport_add((struct odp_vport_add __user *)argp);
+               err = vport_user_add((struct odp_vport_add __user *)argp);
                goto exit;
 
        case ODP_VPORT_MOD:
-               err = vport_mod((struct odp_vport_mod __user *)argp);
+               err = vport_user_mod((struct odp_vport_mod __user *)argp);
                goto exit;
 
        case ODP_VPORT_DEL:
-               err = vport_del((char __user *)argp);
+               err = vport_user_del((char __user *)argp);
                goto exit;
 
        case ODP_VPORT_STATS_GET:
-               err = vport_stats_get((struct odp_vport_stats_req __user *)argp);
+               err = vport_user_stats_get((struct odp_vport_stats_req __user *)argp);
+               goto exit;
+
+       case ODP_VPORT_STATS_SET:
+               err = vport_user_stats_set((struct odp_vport_stats_req __user *)argp);
                goto exit;
 
        case ODP_VPORT_ETHER_GET:
-               err = vport_ether_get((struct odp_vport_ether __user *)argp);
+               err = vport_user_ether_get((struct odp_vport_ether __user *)argp);
                goto exit;
 
        case ODP_VPORT_ETHER_SET:
-               err = vport_ether_set((struct odp_vport_ether __user *)argp);
+               err = vport_user_ether_set((struct odp_vport_ether __user *)argp);
                goto exit;
 
        case ODP_VPORT_MTU_GET:
-               err = vport_mtu_get((struct odp_vport_mtu __user *)argp);
+               err = vport_user_mtu_get((struct odp_vport_mtu __user *)argp);
                goto exit;
 
        case ODP_VPORT_MTU_SET:
-               err = vport_mtu_set((struct odp_vport_mtu __user *)argp);
+               err = vport_user_mtu_set((struct odp_vport_mtu __user *)argp);
                goto exit;
        }
 
@@ -1787,7 +1788,7 @@ static int compat_get_port_group(struct datapath *dp, struct compat_odp_port_gro
                return -EFAULT;
 
        return do_get_port_group(dp, compat_ptr(pg.ports), pg.n_ports,
-                                pg.group, &pg.n_ports);
+                                pg.group, &upg->n_ports);
 }
 
 static int compat_get_flow(struct odp_flow *flow, const struct compat_odp_flow __user *compat)
@@ -1991,6 +1992,7 @@ static long openvswitch_compat_ioctl(struct file *f, unsigned int cmd, unsigned
        case ODP_VPORT_MTU_GET:
        case ODP_VPORT_ETHER_SET:
        case ODP_VPORT_ETHER_GET:
+       case ODP_VPORT_STATS_SET:
        case ODP_VPORT_STATS_GET:
        case ODP_DP_STATS:
        case ODP_GET_DROP_FRAGS:
@@ -2004,10 +2006,10 @@ static long openvswitch_compat_ioctl(struct file *f, unsigned int cmd, unsigned
                return openvswitch_ioctl(f, cmd, (unsigned long)compat_ptr(argp));
 
        case ODP_VPORT_ADD32:
-               return compat_vport_add(compat_ptr(argp));
+               return compat_vport_user_add(compat_ptr(argp));
 
        case ODP_VPORT_MOD32:
-               return compat_vport_mod(compat_ptr(argp));
+               return compat_vport_user_mod(compat_ptr(argp));
        }
 
        dp = get_dp_locked(dp_idx);
@@ -2058,6 +2060,100 @@ exit:
 }
 #endif
 
+/* Unfortunately this function is not exported so this is a verbatim copy
+ * from net/core/datagram.c in 2.6.30. */
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+                                     u8 __user *to, int len,
+                                     __wsum *csump)
+{
+       int start = skb_headlen(skb);
+       int pos = 0;
+       int i, copy = start - offset;
+
+       /* Copy header. */
+       if (copy > 0) {
+               int err = 0;
+               if (copy > len)
+                       copy = len;
+               *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
+                                              *csump, &err);
+               if (err)
+                       goto fault;
+               if ((len -= copy) == 0)
+                       return 0;
+               offset += copy;
+               to += copy;
+               pos = copy;
+       }
+
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+               int end;
+
+               WARN_ON(start > offset + len);
+
+               end = start + skb_shinfo(skb)->frags[i].size;
+               if ((copy = end - offset) > 0) {
+                       __wsum csum2;
+                       int err = 0;
+                       u8  *vaddr;
+                       skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                       struct page *page = frag->page;
+
+                       if (copy > len)
+                               copy = len;
+                       vaddr = kmap(page);
+                       csum2 = csum_and_copy_to_user(vaddr +
+                                                       frag->page_offset +
+                                                       offset - start,
+                                                     to, copy, 0, &err);
+                       kunmap(page);
+                       if (err)
+                               goto fault;
+                       *csump = csum_block_add(*csump, csum2, pos);
+                       if (!(len -= copy))
+                               return 0;
+                       offset += copy;
+                       to += copy;
+                       pos += copy;
+               }
+               start = end;
+       }
+
+       if (skb_shinfo(skb)->frag_list) {
+               struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+               for (; list; list=list->next) {
+                       int end;
+
+                       WARN_ON(start > offset + len);
+
+                       end = start + list->len;
+                       if ((copy = end - offset) > 0) {
+                               __wsum csum2 = 0;
+                               if (copy > len)
+                                       copy = len;
+                               if (skb_copy_and_csum_datagram(list,
+                                                              offset - start,
+                                                              to, copy,
+                                                              &csum2))
+                                       goto fault;
+                               *csump = csum_block_add(*csump, csum2, pos);
+                               if ((len -= copy) == 0)
+                                       return 0;
+                               offset += copy;
+                               to += copy;
+                               pos += copy;
+                       }
+                       start = end;
+               }
+       }
+       if (!len)
+               return 0;
+
+fault:
+       return -EFAULT;
+}
+
 ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
                      loff_t *ppos)
 {
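
The copied helper only accumulates a partial checksum over the bytes it
copies; the caller still has to fold that sum and store the 16-bit result at
csum_start + csum_offset in the destination buffer.  A condensed sketch of
that step, mirroring what openvswitch_read() does below:

    __wsum csum = 0;

    if (!skb_copy_and_csum_datagram(skb, csum_start, buf + csum_start,
                                    copy_bytes - csum_start, &csum))
        put_user(csum_fold(csum),
                 (__sum16 __user *)(buf + csum_start + csum_offset));
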
@@ -2066,8 +2162,7 @@ ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
        int dp_idx = iminor(f->f_dentry->d_inode);
        struct datapath *dp = get_dp(dp_idx);
        struct sk_buff *skb;
-       struct iovec __user iov;
-       size_t copy_bytes;
+       size_t copy_bytes, tot_copy_bytes;
        int retval;
 
        if (!dp)
@@ -2102,12 +2197,44 @@ ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
                }
        }
 success:
-       copy_bytes = min_t(size_t, skb->len, nbytes);
-       iov.iov_base = buf;
-       iov.iov_len = copy_bytes;
-       retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
+       copy_bytes = tot_copy_bytes = min_t(size_t, skb->len, nbytes);
+
+       retval = 0;
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               __wsum csum = 0;
+               int csum_start, csum_offset;
+
+               csum_start = skb_transport_header(skb) - skb->data;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+               csum_offset = skb->csum_offset;
+#else
+               csum_offset = skb->csum;
+#endif
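+               /* csum_start is the offset (from skb->data) at which
+                * checksumming begins; csum_offset locates the 16-bit
+                * checksum field relative to that point.  If the checksum
+                * field falls inside the bytes being copied, copy-and-sum
+                * everything from csum_start onward, write the folded result
+                * into the user buffer, and leave only the bytes before
+                * csum_start for the plain copy below. */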
+               if (csum_start + csum_offset + sizeof(__sum16) <= copy_bytes) {
+                       retval = skb_copy_and_csum_datagram(skb, csum_start, buf + csum_start,
+                                                           copy_bytes - csum_start, &csum);
+
+                       if (!retval) {
+                               __sum16 __user *csump;
+
+                               copy_bytes = csum_start;
+                               csump = (__sum16 __user *)(buf + csum_start + csum_offset);
+                               put_user(csum_fold(csum), csump);
+                       }
+               }
+       }
+
+       if (!retval) {
+               struct iovec __user iov;
+
+               iov.iov_base = buf;
+               iov.iov_len = copy_bytes;
+               retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
+       }
+
        if (!retval)
-               retval = copy_bytes;
+               retval = tot_copy_bytes;
+
        kfree_skb(skb);
 
 error: