#include <linux/compat.h>
#include "openvswitch/datapath-protocol.h"
+#include "checksum.h"
#include "datapath.h"
#include "actions.h"
#include "flow.h"
goto err_destroy_local_port;
rcu_assign_pointer(dps[dp_idx], dp);
+ dp_sysfs_add_dp(dp);
+
mutex_unlock(&dp_mutex);
rtnl_unlock();
- dp_sysfs_add_dp(dp);
-
return 0;
err_destroy_local_port:
{
struct vport_parms parms;
struct vport *vport;
- int err;
parms.name = odp_port->devname;
parms.type = odp_port->type;
if (IS_ERR(vport))
return PTR_ERR(vport);
- err = vport_attach(vport);
- if (err) {
- vport_del(vport);
- return err;
- }
-
rcu_assign_pointer(dp->ports[port_no], vport);
list_add_rcu(&vport->node, &dp->port_list);
dp->n_ports++;
list_del_rcu(&p->node);
rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
- err = vport_detach(p);
- if (err)
- return err;
-
- /* Then wait until no one is still using it, and destroy it. */
- synchronize_rcu();
-
+ /* Then destroy it. */
vport_lock();
- vport_del(p);
+ err = vport_del(p);
vport_unlock();
- return 0;
+ return err;
}
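vport_del() now subsumes the detach step and the RCU grace period that the caller used to sequence by hand, which is why it gains a return value here. A minimal sketch of the assumed consolidation (vport.c is not part of this excerpt, and the internal detach/destroy split shown is an assumption):

	/* Sketch: vport_del() absorbing its caller's old duties. */
	int vport_del(struct vport *vport)
	{
		int err;

		err = vport_detach(vport);	/* stop packet delivery */
		if (err)
			return err;

		/* Wait until no RCU reader still uses the vport... */
		synchronize_rcu();

		/* ...then free it; the callback name is illustrative. */
		return vport->ops->destroy(vport);
	}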
static int detach_port(int dp_idx, int port_no)
/* Execute actions. */
execute_actions(dp, skb, &OVS_CB(skb)->flow->key, acts->actions,
- acts->n_actions);
+ acts->actions_len);
stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
/* Check whether sub-actions looped too much. */
local_bh_enable();
}
-#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
-/* This code is based on skb_checksum_setup() from Xen's net/dev/core.c. We
- * can't call this function directly because it isn't exported in all
- * versions. */
-int vswitch_skb_checksum_setup(struct sk_buff *skb)
-{
- struct iphdr *iph;
- unsigned char *th;
- int err = -EPROTO;
- __u16 csum_start, csum_offset;
-
- if (!skb->proto_csum_blank)
- return 0;
-
- if (skb->protocol != htons(ETH_P_IP))
- goto out;
-
- if (!pskb_may_pull(skb, skb_network_header(skb) + sizeof(struct iphdr) - skb->data))
- goto out;
-
- iph = ip_hdr(skb);
- th = skb_network_header(skb) + 4 * iph->ihl;
-
- csum_start = th - skb->head;
- switch (iph->protocol) {
- case IPPROTO_TCP:
- csum_offset = offsetof(struct tcphdr, check);
- break;
- case IPPROTO_UDP:
- csum_offset = offsetof(struct udphdr, check);
- break;
- default:
- if (net_ratelimit())
- pr_err("Attempting to checksum a non-TCP/UDP packet, "
- "dropping a protocol %d packet",
- iph->protocol);
- goto out;
- }
-
- if (!pskb_may_pull(skb, th + csum_offset + 2 - skb->data))
- goto out;
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->proto_csum_blank = 0;
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
- skb->csum_start = csum_start;
- skb->csum_offset = csum_offset;
-#else
- skb_set_transport_header(skb, csum_start - skb_headroom(skb));
- skb->csum = csum_offset;
-#endif
-
- err = 0;
-
-out:
- return err;
-}
-#endif /* CONFIG_XEN && HAVE_PROTO_DATA_VALID */
-
- /* Types of checksums that we can receive (these all refer to L4 checksums):
- * 1. CHECKSUM_NONE: Device that did not compute checksum, contains full
- * (though not verified) checksum in packet but not in skb->csum. Packets
- * from the bridge local port will also have this type.
- * 2. CHECKSUM_COMPLETE (CHECKSUM_HW): Good device that computes checksums,
- * also the GRE module. This is the same as CHECKSUM_NONE, except it has
- * a valid skb->csum. Importantly, both contain a full checksum (not
- * verified) in the packet itself. The only difference is that if the
- * packet gets to L4 processing on this machine (not in DomU) we won't
- * have to recompute the checksum to verify. Most hardware devices do not
- * produce packets with this type, even if they support receive checksum
- * offloading (they produce type #5).
- * 3. CHECKSUM_PARTIAL (CHECKSUM_HW): Packet without full checksum and needs to
- * be computed if it is sent off box. Unfortunately on earlier kernels,
- * this case is impossible to distinguish from #2, despite having opposite
- * meanings. Xen adds an extra field on earlier kernels (see #4) in order
- * to distinguish the different states.
- * 4. CHECKSUM_UNNECESSARY (with proto_csum_blank true): This packet was
- * generated locally by a Xen DomU and has a partial checksum. If it is
- * handled on this machine (Dom0 or DomU), then the checksum will not be
- * computed. If it goes off box, the checksum in the packet needs to be
- * completed. Calling skb_checksum_setup converts this to CHECKSUM_HW
- * (CHECKSUM_PARTIAL) so that the checksum can be completed. In later
- * kernels, this combination is replaced with CHECKSUM_PARTIAL.
- * 5. CHECKSUM_UNNECESSARY (with proto_csum_blank false): Packet with a correct
- * full checksum or using a protocol without a checksum. skb->csum is
- * undefined. This is common from devices with receive checksum
- * offloading. This is somewhat similar to CHECKSUM_NONE, except that
- * nobody will try to verify the checksum with CHECKSUM_UNNECESSARY.
- *
- * Note that on earlier kernels, CHECKSUM_COMPLETE and CHECKSUM_PARTIAL are
- * both defined as CHECKSUM_HW. Normally the meaning of CHECKSUM_HW is clear
- * based on whether it is on the transmit or receive path. After the datapath
- * it will be intepreted as CHECKSUM_PARTIAL. If the packet already has a
- * checksum, we will panic. Since we can receive packets with checksums, we
- * assume that all CHECKSUM_HW packets have checksums and map them to
- * CHECKSUM_NONE, which has a similar meaning (the it is only different if the
- * packet is processed by the local IP stack, in which case it will need to
- * be reverified). If we receive a packet with CHECKSUM_HW that really means
- * CHECKSUM_PARTIAL, it will be sent with the wrong checksum. However, there
- * shouldn't be any devices that do this with bridging. */
-void compute_ip_summed(struct sk_buff *skb, bool xmit)
-{
- /* For our convenience these defines change repeatedly between kernel
- * versions, so we can't just copy them over... */
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- OVS_CB(skb)->ip_summed = OVS_CSUM_NONE;
- break;
- case CHECKSUM_UNNECESSARY:
- OVS_CB(skb)->ip_summed = OVS_CSUM_UNNECESSARY;
- break;
-#ifdef CHECKSUM_HW
- /* In theory this could be either CHECKSUM_PARTIAL or CHECKSUM_COMPLETE.
- * However, on the receive side we should only get CHECKSUM_PARTIAL
- * packets from Xen, which uses some special fields to represent this
- * (see below). Since we can only make one type work, pick the one
- * that actually happens in practice.
- *
- * On the transmit side (basically after skb_checksum_setup()
- * has been run or on internal dev transmit), packets with
- * CHECKSUM_COMPLETE aren't generated, so assume CHECKSUM_PARTIAL. */
- case CHECKSUM_HW:
- if (!xmit)
- OVS_CB(skb)->ip_summed = OVS_CSUM_COMPLETE;
- else
- OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
-
- break;
-#else
- case CHECKSUM_COMPLETE:
- OVS_CB(skb)->ip_summed = OVS_CSUM_COMPLETE;
- break;
- case CHECKSUM_PARTIAL:
- OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
- break;
-#endif
- default:
- pr_err("unknown checksum type %d\n", skb->ip_summed);
- /* None seems the safest... */
- OVS_CB(skb)->ip_summed = OVS_CSUM_NONE;
- }
-
-#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
- /* Xen has a special way of representing CHECKSUM_PARTIAL on older
- * kernels. It should not be set on the transmit path though. */
- if (skb->proto_csum_blank)
- OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
-
- WARN_ON_ONCE(skb->proto_csum_blank && xmit);
-#endif
-}
-
-/* This function closely resembles skb_forward_csum() used by the bridge. It
- * is slightly different because we are only concerned with bridging and not
- * other types of forwarding and can get away with slightly more optimal
- * behavior.*/
-void forward_ip_summed(struct sk_buff *skb)
-{
-#ifdef CHECKSUM_HW
- if (OVS_CB(skb)->ip_summed == OVS_CSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
-#endif
-}
-
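The checksum machinery removed above is not lost; with the new #include "checksum.h" at the top of the patch it moves into a shared checksum module. A sketch of the interface this file now relies on, inferred from the surviving call sites rather than taken from the actual header:

	/* checksum.h interface as assumed by this file (sketch) */
	enum csum_type {
		OVS_CSUM_NONE,
		OVS_CSUM_UNNECESSARY,
		OVS_CSUM_COMPLETE,
		OVS_CSUM_PARTIAL,
	};

	int vswitch_skb_checksum_setup(struct sk_buff *skb);
	void compute_ip_summed(struct sk_buff *skb, bool xmit);
	void forward_ip_summed(struct sk_buff *skb);

	/* Hides the pre/post-2.6.22 csum_start/csum_offset split that
	 * the openvswitch_read() hunk below stops open-coding. */
	void get_skb_csum_pointers(const struct sk_buff *skb,
				   u16 *csum_start, u16 *csum_offset);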
/* Append each packet in 'skb' list to 'queue'. There will be only one packet
* unless we broke up a GSO packet. */
static int queue_control_packets(struct sk_buff *skb, struct sk_buff_head *queue,
- int queue_no, u32 arg)
+ int queue_no, u64 arg)
{
struct sk_buff *nskb;
int port_no;
header->type = queue_no;
header->length = skb->len;
header->port = port_no;
- header->reserved = 0;
header->arg = arg;
skb_queue_tail(queue, skb);
}
int dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
- u32 arg)
+ u64 arg)
{
struct dp_stats_percpu *stats;
struct sk_buff_head *queue;
* userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
if (skb_is_gso(skb)) {
struct sk_buff *nskb = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
- if (nskb) {
- kfree_skb(skb);
- skb = nskb;
- if (unlikely(IS_ERR(skb))) {
- err = PTR_ERR(skb);
- goto err;
- }
- } else {
- /* XXX This case might not be possible. It's hard to
- * tell from the skb_gso_segment() code and comment. */
+
+ kfree_skb(skb);
+ skb = nskb;
+ if (unlikely(IS_ERR(skb))) {
+ err = PTR_ERR(skb);
+ goto err;
}
}
return 0;
}
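The control-queue cookie widens from u32 to u64 in the two hunks above; the dropped header->reserved store suggests struct odp_msg in datapath-protocol.h was reshuffled to make room for the wider arg. The natural consumer is the CONTROLLER action, whose attribute validate_actions() below sizes at 8 bytes. A hedged sketch of that call site (the wrapper name is illustrative; actions.c is not in this excerpt):

	/* nla_get_u64() matches action_lens[ODPAT_CONTROLLER] == 8. */
	static int do_controller_action(struct datapath *dp, struct sk_buff *skb,
					const struct nlattr *a)
	{
		return dp_output_control(dp, skb, _ODPL_ACTION_NR, nla_get_u64(a));
	}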
-static int validate_actions(const struct sw_flow_actions *actions)
-{
- unsigned int i;
-
- for (i = 0; i < actions->n_actions; i++) {
- const union odp_action *a = &actions->actions[i];
-
- switch (a->type) {
- case ODPAT_CONTROLLER:
- case ODPAT_STRIP_VLAN:
- case ODPAT_SET_DL_SRC:
- case ODPAT_SET_DL_DST:
- case ODPAT_SET_NW_SRC:
- case ODPAT_SET_NW_DST:
- case ODPAT_SET_TP_SRC:
- case ODPAT_SET_TP_DST:
- case ODPAT_SET_TUNNEL:
- case ODPAT_SET_PRIORITY:
- case ODPAT_POP_PRIORITY:
- case ODPAT_DROP_SPOOFED_ARP:
- /* No validation needed. */
- break;
+static int validate_actions(const struct nlattr *actions, u32 actions_len)
+{
+ const struct nlattr *a;
+ int rem;
+
+ nla_for_each_attr(a, actions, actions_len, rem) {
+ static const u32 action_lens[ODPAT_MAX + 1] = {
+ [ODPAT_OUTPUT] = 4,
+ [ODPAT_CONTROLLER] = 8,
+ [ODPAT_SET_DL_TCI] = 2,
+ [ODPAT_STRIP_VLAN] = 0,
+ [ODPAT_SET_DL_SRC] = ETH_ALEN,
+ [ODPAT_SET_DL_DST] = ETH_ALEN,
+ [ODPAT_SET_NW_SRC] = 4,
+ [ODPAT_SET_NW_DST] = 4,
+ [ODPAT_SET_NW_TOS] = 1,
+ [ODPAT_SET_TP_SRC] = 2,
+ [ODPAT_SET_TP_DST] = 2,
+ [ODPAT_SET_TUNNEL] = 8,
+ [ODPAT_SET_PRIORITY] = 4,
+ [ODPAT_POP_PRIORITY] = 0,
+ [ODPAT_DROP_SPOOFED_ARP] = 0,
+ };
+ int type = nla_type(a);
+
+ if (type > ODPAT_MAX || nla_len(a) != action_lens[type])
+ return -EINVAL;
+
+ switch (type) {
+ case ODPAT_UNSPEC:
+ return -EINVAL;
- case ODPAT_OUTPUT:
- if (a->output.port >= DP_MAX_PORTS)
+ case ODPAT_CONTROLLER:
+ case ODPAT_STRIP_VLAN:
+ case ODPAT_SET_DL_SRC:
+ case ODPAT_SET_DL_DST:
+ case ODPAT_SET_NW_SRC:
+ case ODPAT_SET_NW_DST:
+ case ODPAT_SET_TP_SRC:
+ case ODPAT_SET_TP_DST:
+ case ODPAT_SET_TUNNEL:
+ case ODPAT_SET_PRIORITY:
+ case ODPAT_POP_PRIORITY:
+ case ODPAT_DROP_SPOOFED_ARP:
+ /* No validation needed. */
+ break;
+
+ case ODPAT_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+ case ODPAT_SET_DL_TCI:
+ if (nla_get_be16(a) & htons(VLAN_CFI_MASK))
return -EINVAL;
- break;
+ break;
- case ODPAT_SET_DL_TCI:
- if (a->dl_tci.tci & htons(VLAN_CFI_MASK))
- return -EINVAL;
- break;
+ case ODPAT_SET_NW_TOS:
+ if (nla_get_u8(a) & INET_ECN_MASK)
+ return -EINVAL;
+ break;
- case ODPAT_SET_NW_TOS:
- if (a->nw_tos.nw_tos & INET_ECN_MASK)
- return -EINVAL;
- break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
- default:
- return -EOPNOTSUPP;
- }
- }
+ if (rem > 0)
+ return -EINVAL;
- return 0;
+ return 0;
}
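Actions are now a packed run of Netlink attributes rather than an array of fixed-size union odp_action, and actions_len counts bytes. As a concrete sketch, a two-action list -- output to port 1, then strip the VLAN tag -- lays out like this for the validator above (struct nlattr and NLA_HDRLEN are the stock kernel definitions; every field here is naturally 4-byte aligned, so no padding appears):

	#include <linux/netlink.h>	/* struct nlattr, NLA_HDRLEN */

	/* 12 bytes total -- the 'actions_len' a caller would pass. */
	static const struct {
		struct nlattr out_hdr;	 /* header for ODPAT_OUTPUT */
		__u32 out_port;		 /* 4-byte payload, per action_lens[] */
		struct nlattr strip_hdr; /* ODPAT_STRIP_VLAN: no payload */
	} example_actions = {
		.out_hdr   = { .nla_len = NLA_HDRLEN + 4, .nla_type = ODPAT_OUTPUT },
		.out_port  = 1,
		.strip_hdr = { .nla_len = NLA_HDRLEN, .nla_type = ODPAT_STRIP_VLAN },
	};

nla_for_each_attr() walks such a buffer attribute by attribute; the trailing rem > 0 test rejects a buffer whose final attribute is truncated.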
static struct sw_flow_actions *get_actions(const struct odp_flow *flow)
struct sw_flow_actions *actions;
int error;
- actions = flow_actions_alloc(flow->n_actions);
+ actions = flow_actions_alloc(flow->actions_len);
error = PTR_ERR(actions);
if (IS_ERR(actions))
goto error;
error = -EFAULT;
- if (copy_from_user(actions->actions, flow->actions,
- flow->n_actions * sizeof(union odp_action)))
+ if (copy_from_user(actions->actions, flow->actions, flow->actions_len))
goto error_free_actions;
- error = validate_actions(actions);
+ error = validate_actions(actions->actions, actions->actions_len);
if (error)
goto error_free_actions;
if (IS_ERR(new_acts))
goto error;
old_acts = rcu_dereference(flow->sf_acts);
- if (old_acts->n_actions != new_acts->n_actions ||
+ if (old_acts->actions_len != new_acts->actions_len ||
memcmp(old_acts->actions, new_acts->actions,
- sizeof(union odp_action) * old_acts->n_actions)) {
+ old_acts->actions_len)) {
rcu_assign_pointer(flow->sf_acts, new_acts);
flow_deferred_free_acts(old_acts);
} else {
static int do_answer_query(struct sw_flow *flow, u32 query_flags,
struct odp_flow_stats __user *ustats,
- union odp_action __user *actions,
- u32 __user *n_actionsp)
+ struct nlattr __user *actions,
+ u32 __user *actions_lenp)
{
struct sw_flow_actions *sf_acts;
struct odp_flow_stats stats;
- u32 n_actions;
+ u32 actions_len;
spin_lock_bh(&flow->lock);
get_stats(flow, &stats);
spin_unlock_bh(&flow->lock);
if (copy_to_user(ustats, &stats, sizeof(struct odp_flow_stats)) ||
- get_user(n_actions, n_actionsp))
+ get_user(actions_len, actions_lenp))
return -EFAULT;
- if (!n_actions)
+ if (!actions_len)
return 0;
sf_acts = rcu_dereference(flow->sf_acts);
- if (put_user(sf_acts->n_actions, n_actionsp) ||
+ if (put_user(sf_acts->actions_len, actions_lenp) ||
(actions && copy_to_user(actions, sf_acts->actions,
- sizeof(union odp_action) *
- min(sf_acts->n_actions, n_actions))))
+ min(sf_acts->actions_len, actions_len))))
return -EFAULT;
return 0;
static int answer_query(struct sw_flow *flow, u32 query_flags,
struct odp_flow __user *ufp)
{
- union odp_action *actions;
+ struct nlattr *actions;
if (get_user(actions, &ufp->actions))
return -EFAULT;
return do_answer_query(flow, query_flags,
- &ufp->stats, actions, &ufp->n_actions);
+ &ufp->stats, actions, &ufp->actions_len);
}
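Throughout the query path, actions_len is a byte count in both directions: the caller passes in its buffer size, and the kernel writes back the flow's full actions length while copying min(full length, buffer size) bytes. A hedged userspace sketch of that contract (the ioctl name and wrapper are illustrative, not taken from this patch):

	/* Sketch: retrieve a flow's Netlink-encoded actions. */
	static int get_flow_actions(int dp_fd, struct odp_flow *flow)
	{
		/* in:  flow->actions, flow->actions_len = caller's buffer;
		 * out: flow->actions_len = the flow's full length; compare
		 *      it to the buffer size to detect truncation and retry. */
		return ioctl(dp_fd, ODP_FLOW_GET, flow);
	}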
static struct sw_flow *do_del_flow(struct datapath *dp, struct odp_flow_key *key)
if (execute->length < ETH_HLEN || execute->length > 65535)
goto error;
- actions = flow_actions_alloc(execute->n_actions);
+ actions = flow_actions_alloc(execute->actions_len);
if (IS_ERR(actions)) {
err = PTR_ERR(actions);
goto error;
}
err = -EFAULT;
- if (copy_from_user(actions->actions, execute->actions,
- execute->n_actions * sizeof *execute->actions))
+ if (copy_from_user(actions->actions, execute->actions, execute->actions_len))
goto error_free_actions;
- err = validate_actions(actions);
+ err = validate_actions(actions->actions, execute->actions_len);
if (err)
goto error_free_actions;
goto error_free_skb;
rcu_read_lock();
- err = execute_actions(dp, skb, &key, actions->actions, actions->n_actions);
+ err = execute_actions(dp, skb, &key, actions->actions, actions->actions_len);
rcu_read_unlock();
kfree(actions);
__copy_from_user(&flow->stats, &compat->stats, sizeof(struct odp_flow_stats)) ||
__copy_from_user(&flow->key, &compat->key, sizeof(struct odp_flow_key)) ||
__get_user(actions, &compat->actions) ||
- __get_user(flow->n_actions, &compat->n_actions) ||
+ __get_user(flow->actions_len, &compat->actions_len) ||
__get_user(flow->flags, &compat->flags))
return -EFAULT;
return -EFAULT;
return do_answer_query(flow, query_flags, &ufp->stats,
- compat_ptr(actions), &ufp->n_actions);
+ compat_ptr(actions), &ufp->actions_len);
}
static int compat_del_flow(struct datapath *dp, struct compat_odp_flow __user *ufp)
if (!access_ok(VERIFY_READ, uexecute, sizeof(struct compat_odp_execute)) ||
__get_user(actions, &uexecute->actions) ||
- __get_user(execute.n_actions, &uexecute->n_actions) ||
+ __get_user(execute.actions_len, &uexecute->actions_len) ||
__get_user(data, &uexecute->data) ||
__get_user(execute.length, &uexecute->length))
return -EFAULT;
return -EFAULT;
}
-ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
- loff_t *ppos)
+static ssize_t openvswitch_read(struct file *f, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- /* XXX is there sufficient synchronization here? */
int listeners = get_listen_mask(f);
int dp_idx = iminor(f->f_dentry->d_inode);
- struct datapath *dp = get_dp(dp_idx);
+ struct datapath *dp = get_dp_locked(dp_idx);
struct sk_buff *skb;
size_t copy_bytes, tot_copy_bytes;
int retval;
}
}
success:
+ mutex_unlock(&dp->mutex);
+
copy_bytes = tot_copy_bytes = min_t(size_t, skb->len, nbytes);
retval = 0;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (copy_bytes == skb->len) {
__wsum csum = 0;
- unsigned int csum_start, csum_offset;
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
- csum_start = skb->csum_start - skb_headroom(skb);
- csum_offset = skb->csum_offset;
-#else
- csum_start = skb_transport_header(skb) - skb->data;
- csum_offset = skb->csum;
-#endif
+ u16 csum_start, csum_offset;
+
+ get_skb_csum_pointers(skb, &csum_start, &csum_offset);
BUG_ON(csum_start >= skb_headlen(skb));
retval = skb_copy_and_csum_datagram(skb, csum_start, buf + csum_start,
copy_bytes - csum_start, &csum);
retval = tot_copy_bytes;
kfree_skb(skb);
+ return retval;
error:
+ mutex_unlock(&dp->mutex);
return retval;
}
static unsigned int openvswitch_poll(struct file *file, poll_table *wait)
{
- /* XXX is there sufficient synchronization here? */
int dp_idx = iminor(file->f_dentry->d_inode);
- struct datapath *dp = get_dp(dp_idx);
+ struct datapath *dp = get_dp_locked(dp_idx);
unsigned int mask;
if (dp) {
poll_wait(file, &dp->waitqueue, wait);
if (dp_has_packet_of_interest(dp, get_listen_mask(file)))
mask |= POLLIN | POLLRDNORM;
+ mutex_unlock(&dp->mutex);
} else {
mask = POLLIN | POLLRDNORM | POLLHUP;
}
return mask;
}
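Both the read and poll paths trade get_dp() for get_dp_locked(), which is what retires the old "is there sufficient synchronization here?" comments: the datapath comes back with dp->mutex held, and the new mutex_unlock() calls release it on every exit path. A sketch of the helper's assumed behavior (it is defined elsewhere in this file, not in this excerpt):

	static struct datapath *get_dp_locked(int dp_idx)
	{
		struct datapath *dp;

		mutex_lock(&dp_mutex);
		dp = get_dp(dp_idx);
		if (dp)
			mutex_lock(&dp->mutex);	/* pin dp for a sleeping caller */
		mutex_unlock(&dp_mutex);
		return dp;
	}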
-struct file_operations openvswitch_fops = {
- /* XXX .aio_read = openvswitch_aio_read, */
+static struct file_operations openvswitch_fops = {
.read = openvswitch_read,
.poll = openvswitch_poll,
.unlocked_ioctl = openvswitch_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = openvswitch_compat_ioctl,
#endif
- /* XXX .fasync = openvswitch_fasync, */
};
static int major;