--- /dev/null
+commit a57d715bc58005cfae0fdf1626ebf11b11508025
+Author: root <root@rhel6.(none)>
+Date: Thu Apr 29 10:01:21 2010 -0400
+
+ linux-2.6-525-sknid-elevator.patch
+
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index 4267c8b..3f36a91 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1057,6 +1057,7 @@ struct napi_gro_cb {
+ struct packet_type {
+ __be16 type; /* This is really htons(ether_type). */
+ struct net_device *dev; /* NULL is wildcarded here */
++ unsigned char sknid_elevator;
+ int (*func) (struct sk_buff *,
+ struct net_device *,
+ struct packet_type *,
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 8b6b941..651a1c3 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -99,6 +99,8 @@
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+ #include <linux/stat.h>
++#include <linux/ip.h>
++#include <linux/tcp.h>
+ #include <linux/if_bridge.h>
+ #include <linux/if_macvlan.h>
+ #include <net/dst.h>
+@@ -2275,6 +2277,10 @@ void netif_nit_deliver(struct sk_buff *skb)
+ rcu_read_unlock();
+ }
+
++/* The code already makes the assumption that packet handlers run
++ * sequentially on the same CPU. -Sapan */
++DEFINE_PER_CPU(int, sknid_elevator) = 0;
++
+ /**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+@@ -2296,8 +2302,11 @@ int netif_receive_skb(struct sk_buff *skb)
+ struct net_device *orig_dev;
+ struct net_device *null_or_orig;
+ int ret = NET_RX_DROP;
++ int *cur_elevator = &__get_cpu_var(sknid_elevator);
+ __be16 type;
+
++ *cur_elevator = 0;
++
+ if (!skb->tstamp.tv64)
+ net_timestamp(skb);
+
+@@ -2373,7 +2382,27 @@ ncls:
+ }
+
+ if (pt_prev) {
++ /* At this point, cur_elevator may be -2 or a positive value, in
++ * case a previous protocol handler marked it */
++ if (*cur_elevator) {
++ atomic_inc(&skb->users);
++ }
++
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
++
++ if ((*cur_elevator)>0) {
++ skb->skb_tag = *cur_elevator;
++ list_for_each_entry_rcu(ptype, &ptype_all, list) {
++ if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
++ ret = deliver_skb(skb, ptype, orig_dev);
++ }
++ }
++ }
++
++ if (*cur_elevator) {
++ /* Drop the extra reference taken via atomic_inc above */
++ kfree_skb(skb);
++ }
+ } else {
+ kfree_skb(skb);
+ /* Jamal, now you will not able to escape explaining
+@@ -4127,6 +4156,7 @@ unsigned dev_get_flags(const struct net_device *dev)
+ return flags;
+ }
+ EXPORT_SYMBOL(dev_get_flags);
++EXPORT_PER_CPU_SYMBOL(sknid_elevator);
+
+ /**
+ * dev_change_flags - change device settings
+diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
+index 1bd109e..5c2e9ad 100644
+--- a/net/packet/af_packet.c
++++ b/net/packet/af_packet.c
+@@ -78,6 +78,7 @@
+ #include <linux/poll.h>
+ #include <linux/module.h>
+ #include <linux/init.h>
++#include <linux/vs_network.h>
+ #include <linux/mutex.h>
+
+ #ifdef CONFIG_INET
+@@ -337,12 +338,54 @@ static const struct proto_ops packet_ops;
+
+ static const struct proto_ops packet_ops_spkt;
+
++extern DEFINE_PER_CPU(int, sknid_elevator);
++
++static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
++ /* This mechanism is quite involved, and caused us a lot of pain
++ * including crashes and packet loss during the 4.2 rollout. This
++ * function decides if a slice is allowed to see a given packet.
++ * Unfortunately, the first time it is invoked for a packet it does not
++ * have enough information to make this call, since xt_MARK has not had
++ * a chance to tag it with the slice id. There is also no way of
++ * passing state between xt_MARK and this function through a packet --
++ * because the skb gets cloned quite a few times between these two
++ * points. I'd rather not use skb_shared_info because it's treated as
++ * a blob of memory, and so it would be quite hard to maintain.
++ *
++ * What we do is to keep a global variable (per CPU) that transfers the
++ * required state between xt_MARK and af_packet.c. As an optimization,
++ * this state transfer and the step that follows is only executed for
++ * packets that first get dropped here. When we drop a packet, we mark
++ * it for 'elevation' (that's what this trick is called). When xt_MARK
++ * tags the packet with the right slice, it intercepts this mark and
++ * sets the value of sknid_elevator. Next, the packet is sent back here
++ * for a second round, this time with the xid tag set.
++ */
++
++ int *elevator=&__get_cpu_var(sknid_elevator);
++ int tag = skb->skb_tag;
++
++ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
++ if (skb->pkt_type==PACKET_HOST) {
++ *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
++ }
++ return 0;
++ }
++ else if (!sk->sk_nx_info && (*elevator>0)) {
++ /* Root has already seen this packet once, since it has been elevated */
++ return 0;
++ }
++
++ return 1;
++}
++
+ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+ {
+ struct sock *sk;
+ struct sockaddr_pkt *spkt;
+-
++ int tag = skb->skb_tag;
++
+ /*
+ * When we registered the protocol we saved the socket in the data
+ * field for just this event.
+@@ -361,6 +404,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
+ * so that this procedure is noop.
+ */
+
++ /*
++ * (18:05:41) daniel_hozac: where?
++ * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
++ * (18:05:58) er: in packet_rcv_spkt
++ * (18:07:33) daniel_hozac: oh, that's evil.
++ */
++
++ if (!slice_check_and_elevate(skb, sk))
++ return 0;
++
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto out;
+
+@@ -419,6 +472,9 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
+ __be16 proto = 0;
+ int err;
+
++ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
++ return -EPERM;
++
+ /*
+ * Get and verify the address.
+ */
+@@ -509,11 +565,16 @@ out_unlock:
+ return err;
+ }
+
++
++
+ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+ unsigned int res)
+ {
+ struct sk_filter *filter;
+
++ if (!slice_check_and_elevate(skb, sk))
++ return 0;
++
+ rcu_read_lock_bh();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+@@ -1063,6 +1124,9 @@ static int packet_snd(struct socket *sock,
+ unsigned char *addr;
+ int ifindex, err, reserve = 0;
+
++ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
++ return -EPERM;
++
+ /*
+ * Get and verify the address.
+ */
+@@ -1248,6 +1312,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
+
+ po->num = protocol;
+ po->prot_hook.type = protocol;
++ po->prot_hook.sknid_elevator = 1;
+ po->prot_hook.dev = dev;
+
+ po->ifindex = dev ? dev->ifindex : 0;
+@@ -1348,8 +1413,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
+ __be16 proto = (__force __be16)protocol; /* weird, but documented */
+ int err;
+
+- if (!capable(CAP_NET_RAW))
++ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
+ return -EPERM;
++
+ if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+ sock->type != SOCK_PACKET)
+ return -ESOCKTNOSUPPORT;
+@@ -1381,6 +1447,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
+ spin_lock_init(&po->bind_lock);
+ mutex_init(&po->pg_vec_lock);
+ po->prot_hook.func = packet_rcv;
++ po->prot_hook.sknid_elevator = 1;
+
+ if (sock->type == SOCK_PACKET)
+ po->prot_hook.func = packet_rcv_spkt;