-diff -Nurb linux-2.6.22-510/net/core/dev.c linux-2.6.22-520/net/core/dev.c
---- linux-2.6.22-510/net/core/dev.c 2008-06-06 17:07:48.000000000 -0400
-+++ linux-2.6.22-520/net/core/dev.c 2008-06-06 17:07:56.000000000 -0400
-@@ -1803,6 +1803,7 @@
- * the ingress scheduler, you just cant add policies on ingress.
- *
- */
-+
- static int ing_filter(struct sk_buff *skb)
- {
- struct Qdisc *q;
-@@ -1832,13 +1833,20 @@
+Index: linux-2.6.27.y/include/linux/netdevice.h
+===================================================================
+--- linux-2.6.27.y.orig/include/linux/netdevice.h
++++ linux-2.6.27.y/include/linux/netdevice.h
+@@ -857,6 +857,7 @@ static inline void netif_napi_del(struct
+ struct packet_type {
+ __be16 type; /* This is really htons(ether_type). */
+ struct net_device *dev; /* NULL is wildcarded here */
++ unsigned char sknid_elevator;
+ int (*func) (struct sk_buff *,
+ struct net_device *,
+ struct packet_type *,
+Index: linux-2.6.27.y/net/core/dev.c
+===================================================================
+--- linux-2.6.27.y.orig/net/core/dev.c
++++ linux-2.6.27.y/net/core/dev.c
+@@ -99,6 +99,8 @@
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+ #include <linux/stat.h>
++#include <linux/ip.h>
++#include <linux/tcp.h>
+ #include <linux/if_bridge.h>
+ #include <linux/if_macvlan.h>
+ #include <net/dst.h>
+@@ -1318,7 +1320,7 @@ static void dev_queue_xmit_nit(struct sk
+ if ((ptype->dev == dev || !ptype->dev) &&
+ (ptype->af_packet_priv == NULL ||
+ (struct sock *)ptype->af_packet_priv != skb->sk)) {
+- struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
++ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ break;
+
+@@ -2170,6 +2172,10 @@ void netif_nit_deliver(struct sk_buff *s
+ rcu_read_unlock();
}
- #endif
+/* The code already makes the assumption that packet handlers run
+ * sequentially on the same CPU. -Sapan */
-+DEFINE_PER_CPU(int, sknid_elevator);
++DEFINE_PER_CPU(int, sknid_elevator) = 0;
+
- int netif_receive_skb(struct sk_buff *skb)
- {
- struct packet_type *ptype, *pt_prev;
+ /**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+@@ -2191,8 +2197,11 @@ int netif_receive_skb(struct sk_buff *sk
struct net_device *orig_dev;
+ struct net_device *null_or_orig;
int ret = NET_RX_DROP;
-+ int *cur_elevator=&__get_cpu_var(sknid_elevator);
++ int *cur_elevator = &__get_cpu_var(sknid_elevator);
__be16 type;
-+ *cur_elevator = 0;
++ *cur_elevator = 0;
+
- /* if we've gotten here through NAPI, check netpoll */
- if (skb->dev->poll && netpoll_rx(skb))
- return NET_RX_DROP;
-@@ -1873,8 +1881,9 @@
+ if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+ return NET_RX_SUCCESS;
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (!ptype->dev || ptype->dev == skb->dev) {
-- if (pt_prev)
-+ if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
-+ }
- pt_prev = ptype;
- }
- }
-@@ -1912,8 +1921,22 @@
- }
+@@ -2272,7 +2281,27 @@ ncls:
}
-+ /* We don't want the packet handlers to throw the packet away
-+ * if we want the taps to treat it again - Sapan */
-+ if (*cur_elevator) {
-+ atomic_inc(&skb->users);
-+ }
-+
if (pt_prev) {
++ /* At this point, cur_elevator may be -2 or a positive value, in
++ * case a previous protocol handler marked it */
++ if (*cur_elevator) {
++ atomic_inc(&skb->users);
++ }
++
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-+ if (*cur_elevator > 0) {
-+ skb->skb_tag = *cur_elevator;
-+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
-+ if (!ptype->dev || ptype->dev == skb->dev) {
-+ ret = deliver_skb(skb, ptype, orig_dev);
-+ }
++
++ if ((*cur_elevator)>0) {
++ skb->skb_tag = *cur_elevator;
++ list_for_each_entry_rcu(ptype, &ptype_all, list) {
++ if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
++ ret = deliver_skb(skb, ptype, orig_dev);
+ }
+ }
++ }
++
++ if (*cur_elevator) {
++			/* Drop the extra reference taken above so the skb is actually freed */
++ kfree_skb(skb);
++ }
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
-@@ -1922,6 +1945,13 @@
- ret = NET_RX_DROP;
- }
-
-+ if (*cur_elevator) {
-+ /* We have a packet */
-+ kfree_skb(skb);
-+ }
-+
-+ *cur_elevator=0;
-+
- out:
- rcu_read_unlock();
- return ret;
-@@ -3780,6 +3810,7 @@
+@@ -4895,6 +4924,7 @@ EXPORT_SYMBOL(unregister_netdevice_notif
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
-diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/af_packet.c
---- linux-2.6.22-510/net/packet/af_packet.c 2007-07-08 19:32:17.000000000 -0400
-+++ linux-2.6.22-520/net/packet/af_packet.c 2008-06-07 18:30:41.000000000 -0400
-@@ -78,6 +78,7 @@
+Index: linux-2.6.27.y/net/packet/af_packet.c
+===================================================================
+--- linux-2.6.27.y.orig/net/packet/af_packet.c
++++ linux-2.6.27.y/net/packet/af_packet.c
+@@ -77,6 +77,7 @@
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/vs_network.h>
+ #include <linux/mutex.h>
#ifdef CONFIG_INET
- #include <net/inet_common.h>
-@@ -246,10 +247,13 @@
+@@ -278,10 +279,53 @@ static const struct proto_ops packet_ops
static const struct proto_ops packet_ops_spkt;
+extern DEFINE_PER_CPU(int, sknid_elevator);
++
++static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
++ /* This mechanism is quite involved, and caused us a lot of pain
++ * including crashes and packet loss during the 4.2 rollout. This
++ * function decides if a slice is allowed to see a given packet.
++ * Unfortunately, the first time it is invoked for a packet it does not
++ * have enough information to make this call, since xt_MARK has not had
++ * a chance to tag it with the slice id. There is also no way of
++ * passing state between xt_MARK and this function through a packet --
++ * because the skb gets cloned quite a few times between these two
++ * points. I'd rather not use skb_shared_info because it's treated as
++ * a blob of memory, and so it would be quite hard to maintain.
++ *
++ * What we do is to keep a global variable (per CPU) that transfers the
++ * required state between xt_MARK and af_packet.c. As an optimization,
++ * this state transfer and the step that follows is only executed for
++ * packets that first get dropped here. When we drop a packet, we mark
++ * it for 'elevation' (that's what this trick is called). When xt_MARK
++ * tags the packet with the right slice, it intercepts this mark and
++ * sets the value of sknid_elevator. Next, the packet is sent back here
++ * for a second round, this time with the xid tag set.
++ */
++
++ int *elevator=&__get_cpu_var(sknid_elevator);
++ int tag = skb->skb_tag;
++
++ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
++ if (skb->pkt_type==PACKET_HOST) {
++ *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
++ }
++ return 0;
++ }
++ else if (!sk->sk_nx_info && (*elevator>0)) {
++ /* Root has already seen this packet once, since it has been elevated */
++ return 0;
++ }
++
++ return 1;
++}
++
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_pkt *spkt;
+ int tag = skb->skb_tag;
-+ int *elevator=&__get_cpu_var(sknid_elevator);
++
/*
* When we registered the protocol we saved the socket in the data
-@@ -269,6 +273,22 @@
+@@ -301,6 +345,16 @@ static int packet_rcv_spkt(struct sk_buf
* so that this procedure is noop.
*/
+ * (18:07:33) daniel_hozac: oh, that's evil.
+ */
+
-+ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
-+ *elevator=-2;
-+ goto out;
-+ }
-+ else if (!sk->sk_nx_info && *elevator) {
-+ /* Root has already seen this packet */
-+ goto out;
-+ }
++ if (!slice_check_and_elevate(skb, sk))
++ return 0;
+
if (skb->pkt_type == PACKET_LOOPBACK)
goto out;
-@@ -324,6 +344,9 @@
+@@ -359,6 +413,9 @@ static int packet_sendmsg_spkt(struct ki
__be16 proto=0;
int err;
/*
* Get and verify the address.
*/
-@@ -420,6 +443,17 @@
+@@ -451,11 +508,16 @@ out_unlock:
+ return err;
+ }
+
++
++
+ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
unsigned int res)
{
struct sk_filter *filter;
-+ int tag = skb->skb_tag;
-+ int *elevator=&__get_cpu_var(sknid_elevator);
-+
-+ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
-+ *elevator=-2;
-+ return 0;
-+ }
-+ else if (!sk->sk_nx_info && *elevator) {
-+ /* Root has already seen this packet */
-+ return 0;
-+ }
++ if (!slice_check_and_elevate(skb, sk))
++ return 0;
++
rcu_read_lock_bh();
filter = rcu_dereference(sk->sk_filter);
-@@ -711,6 +745,9 @@
+ if (filter != NULL)
+@@ -775,6 +837,9 @@ static int packet_sendmsg(struct kiocb *
unsigned char *addr;
int ifindex, err, reserve = 0;
/*
* Get and verify the address.
*/
-@@ -984,8 +1021,9 @@
+@@ -941,6 +1006,7 @@ static int packet_do_bind(struct sock *s
+
+ po->num = protocol;
+ po->prot_hook.type = protocol;
++ po->prot_hook.sknid_elevator = 1;
+ po->prot_hook.dev = dev;
+
+ po->ifindex = dev ? dev->ifindex : 0;
+@@ -1039,8 +1105,9 @@ static int packet_create(struct net *net
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
sock->type != SOCK_PACKET)
return -ESOCKTNOSUPPORT;
+@@ -1072,6 +1139,7 @@ static int packet_create(struct net *net
+ spin_lock_init(&po->bind_lock);
+ mutex_init(&po->pg_vec_lock);
+ po->prot_hook.func = packet_rcv;
++ po->prot_hook.sknid_elevator = 1;
+
+ if (sock->type == SOCK_PACKET)
+ po->prot_hook.func = packet_rcv_spkt;