1 diff -Nurb linux-2.6.22-524/include/linux/netdevice.h linux-2.6.22-525/include/linux/netdevice.h
2 --- linux-2.6.22-524/include/linux/netdevice.h 2008-07-27 22:06:14.000000000 -0400
3 +++ linux-2.6.22-525/include/linux/netdevice.h 2008-07-27 22:17:30.000000000 -0400
6 __be16 type; /* This is really htons(ether_type). */
7 struct net_device *dev; /* NULL is wildcarded here */
8 + unsigned char sknid_elevator;
9 int (*func) (struct sk_buff *,
12 diff -Nurb linux-2.6.22-524/net/core/dev.c linux-2.6.22-525/net/core/dev.c
13 --- linux-2.6.22-524/net/core/dev.c 2008-07-27 22:06:20.000000000 -0400
14 +++ linux-2.6.22-525/net/core/dev.c 2008-07-27 22:06:27.000000000 -0400
16 #include <linux/proc_fs.h>
17 #include <linux/seq_file.h>
18 #include <linux/stat.h>
19 +#include <linux/ip.h>
20 +#include <linux/tcp.h>
21 #include <linux/if_bridge.h>
23 #include <net/pkt_sched.h>
25 if ((ptype->dev == dev || !ptype->dev) &&
26 (ptype->af_packet_priv == NULL ||
27 (struct sock *)ptype->af_packet_priv != skb->sk)) {
28 - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
29 + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
34 * the ingress scheduler, you just cant add policies on ingress.
38 static int ing_filter(struct sk_buff *skb)
41 @@ -1832,13 +1835,21 @@
45 +/* The code already assumes that the packet handlers for a given
46 + * packet run sequentially on the same CPU. -Sapan */
47 +DEFINE_PER_CPU(int, sknid_elevator) = 0;
49 int netif_receive_skb(struct sk_buff *skb)
51 struct packet_type *ptype, *pt_prev;
52 struct net_device *orig_dev;
53 int ret = NET_RX_DROP;
54 + int *cur_elevator=&__get_cpu_var(sknid_elevator);
55 + struct sk_buff *skb2;
60 /* if we've gotten here through NAPI, check netpoll */
61 if (skb->dev->poll && netpoll_rx(skb))
63 @@ -1871,10 +1882,12 @@
67 + skb2 = skb_clone(skb, GFP_ATOMIC);
68 list_for_each_entry_rcu(ptype, &ptype_all, list) {
69 if (!ptype->dev || ptype->dev == skb->dev) {
72 ret = deliver_skb(skb, pt_prev, orig_dev);
79 if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
85 @@ -1899,8 +1913,17 @@
88 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
95 + /* We don't want the packet handlers to free the packet
96 + * if we need the taps to process it again - Sapan */
102 type = skb->protocol;
103 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
104 @@ -1914,6 +1937,7 @@
107 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
111 /* Jamal, now you will not able to escape explaining
112 @@ -1922,6 +1946,29 @@
116 + if ((*cur_elevator)>0) {
117 + skb2->skb_tag = *cur_elevator;
118 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
119 + if ((!ptype->dev || ptype->dev == skb2->dev) && (ptype->sknid_elevator)) {
120 + ret = deliver_skb(skb2, ptype, orig_dev);
123 + type = skb2->protocol;
124 + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
125 + if (ptype->type == type &&
126 + (!ptype->dev || ptype->dev == skb2->dev) && (ptype->sknid_elevator)) {
127 + ret = deliver_skb(skb2, ptype, orig_dev);
133 + /* We have a packet */
142 @@ -3780,6 +3827,7 @@
143 EXPORT_SYMBOL(net_enable_timestamp);
144 EXPORT_SYMBOL(net_disable_timestamp);
145 EXPORT_SYMBOL(dev_get_flags);
146 +EXPORT_PER_CPU_SYMBOL(sknid_elevator);
148 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
149 EXPORT_SYMBOL(br_handle_frame_hook);
150 diff -Nurb linux-2.6.22-524/net/packet/af_packet.c linux-2.6.22-525/net/packet/af_packet.c
151 --- linux-2.6.22-524/net/packet/af_packet.c 2007-07-08 19:32:17.000000000 -0400
152 +++ linux-2.6.22-525/net/packet/af_packet.c 2008-07-27 22:06:27.000000000 -0400
154 #include <linux/poll.h>
155 #include <linux/module.h>
156 #include <linux/init.h>
157 +#include <linux/vs_network.h>
160 #include <net/inet_common.h>
161 @@ -246,10 +247,53 @@
163 static const struct proto_ops packet_ops_spkt;
165 +extern DEFINE_PER_CPU(int, sknid_elevator);
167 +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
168 + /* This mechanism is quite involved, and caused us a lot of pain
169 + * including crashes and packet loss during the 4.2 rollout. This
170 + * function decides if a slice is allowed to see a given packet.
171 + * Unfortunately, the first time it is invoked for a packet it does not
172 + * have enough information to make this call, since xt_MARK has not had
173 + * a chance to tag it with the slice id. There is also no way of
174 + * passing state between xt_MARK and this function through a packet --
175 + * because the skb gets cloned quite a few times between these two
176 + * points. I'd rather not use skb_shared_info because it's treated as
177 + * a blob of memory, and so it would be quite hard to maintain.
179 + * What we do is to keep a global variable (per CPU) that transfers the
180 + * required state between xt_MARK and af_packet.c. As an optimization,
181 + * this state transfer and the step that follows is only executed for
182 + * packets that first get dropped here. When we drop a packet, we mark
183 + * it for 'elevation' (that's what this trick is called). When xt_MARK
184 + * tags the packet with the right slice, it intercepts this mark and
185 + * sets the value of sknid_elevator. Next, the packet is sent back here
186 + * for a second round, this time with the xid tag set.
189 + int *elevator=&__get_cpu_var(sknid_elevator);
190 + int tag = skb->skb_tag;
192 + if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
193 + if (skb->pkt_type==PACKET_HOST) {
194 + *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
198 + else if (!sk->sk_nx_info && (*elevator>0)) {
199 + /* Root has already seen this packet once; it was elevated for that purpose */
206 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
209 struct sockaddr_pkt *spkt;
210 + int tag = skb->skb_tag;
214 * When we registered the protocol we saved the socket in the data
216 * so that this procedure is noop.
220 + * (18:05:41) daniel_hozac: where?
221 + * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
222 + * (18:05:58) er: in packet_rcv_skpt
223 + * (18:07:33) daniel_hozac: oh, that's evil.
226 + if (!slice_check_and_elevate(skb, sk))
229 if (skb->pkt_type == PACKET_LOOPBACK)
236 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
240 * Get and verify the address.
242 @@ -416,11 +473,16 @@
248 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
251 struct sk_filter *filter;
253 + if (!slice_check_and_elevate(skb, sk))
257 filter = rcu_dereference(sk->sk_filter);
261 int ifindex, err, reserve = 0;
263 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
267 * Get and verify the address.
272 po->prot_hook.type = protocol;
273 + po->prot_hook.sknid_elevator = 1;
274 po->prot_hook.dev = dev;
276 po->ifindex = dev ? dev->ifindex : 0;
278 __be16 proto = (__force __be16)protocol; /* weird, but documented */
281 - if (!capable(CAP_NET_RAW))
282 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
285 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
286 sock->type != SOCK_PACKET)
287 return -ESOCKTNOSUPPORT;
288 @@ -1016,6 +1083,7 @@
290 spin_lock_init(&po->bind_lock);
291 po->prot_hook.func = packet_rcv;
292 + po->prot_hook.sknid_elevator = 1;
294 if (sock->type == SOCK_PACKET)
295 po->prot_hook.func = packet_rcv_spkt;