1 From f5cda6cb8da57f20cb58e5d8e8bf41b9e155c06c Mon Sep 17 00:00:00 2001
2 From: S.Çağlar Onur <caglar@cs.princeton.edu>
3 Date: Tue, 7 Dec 2010 11:07:40 -0500
4 Subject: [PATCH] linux-2.6-525-sknid-elevator.patch
7 include/linux/netdevice.h | 1 +
8 net/core/dev.c | 30 +++++++++++++++++++
9 net/packet/af_packet.c | 71 +++++++++++++++++++++++++++++++++++++++++++-
10 3 files changed, 100 insertions(+), 2 deletions(-)
12 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13 index 17b0c3c..186d772 100644
14 --- a/include/linux/netdevice.h
15 +++ b/include/linux/netdevice.h
16 @@ -1099,6 +1099,7 @@ struct napi_gro_cb {
18 __be16 type; /* This is really htons(ether_type). */
19 struct net_device *dev; /* NULL is wildcarded here */
20 + unsigned char sknid_elevator;
21 int (*func) (struct sk_buff *,
24 diff --git a/net/core/dev.c b/net/core/dev.c
25 index 32a2b7e..7fda319 100644
29 #include <linux/proc_fs.h>
30 #include <linux/seq_file.h>
31 #include <linux/stat.h>
32 +#include <linux/ip.h>
33 +#include <linux/tcp.h>
34 #include <linux/if_bridge.h>
35 #include <linux/if_macvlan.h>
37 @@ -2333,6 +2335,10 @@ void netif_nit_deliver(struct sk_buff *skb)
41 +/* The code already makes the assumption that packet handlers run
42 + * sequentially on the same CPU. -Sapan */
43 +DEFINE_PER_CPU(int, sknid_elevator) = 0;
46 * netif_receive_skb - process receive buffer from network
47 * @skb: buffer to process
48 @@ -2355,8 +2361,11 @@ int netif_receive_skb(struct sk_buff *skb)
49 struct net_device *null_or_orig;
50 struct net_device *null_or_bond;
51 int ret = NET_RX_DROP;
52 + int *cur_elevator = &__get_cpu_var(sknid_elevator);
57 if (!skb->tstamp.tv64)
60 @@ -2456,7 +2465,27 @@ ncls:
64 + /* At this point, cur_elevator may be -2 or a positive value, in
65 + * case a previous protocol handler marked it */
66 + if (*cur_elevator) {
67 + atomic_inc(&skb->users);
70 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
72 + if ((*cur_elevator)>0) {
73 + skb->skb_tag = *cur_elevator;
74 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
75 + if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
76 + ret = deliver_skb(skb, ptype, orig_dev);
81 + if (*cur_elevator) {
82 + /* We have a packet */
87 /* Jamal, now you will not able to escape explaining
88 @@ -4210,6 +4239,7 @@ unsigned dev_get_flags(const struct net_device *dev)
91 EXPORT_SYMBOL(dev_get_flags);
92 +EXPORT_PER_CPU_SYMBOL(sknid_elevator);
95 * dev_change_flags - change device settings
96 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
97 index b4bf950..cbf8d22 100644
98 --- a/net/packet/af_packet.c
99 +++ b/net/packet/af_packet.c
101 #include <linux/poll.h>
102 #include <linux/module.h>
103 #include <linux/init.h>
104 +#include <linux/vs_network.h>
105 #include <linux/mutex.h>
106 #include <linux/if_vlan.h>
108 @@ -338,12 +339,54 @@ static const struct proto_ops packet_ops;
110 static const struct proto_ops packet_ops_spkt;
112 +DECLARE_PER_CPU(int, sknid_elevator);
114 +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
115 + /* This mechanism is quite involved, and caused us a lot of pain
116 + * including crashes and packet loss during the 4.2 rollout. This
117 + * function decides if a slice is allowed to see a given packet.
118 + * Unfortunately, the first time it is invoked for a packet it does not
119 + * have enough information to make this call, since xt_MARK has not had
120 + * a chance to tag it with the slice id. There is also no way of
121 + * passing state between xt_MARK and this function through a packet --
122 + * because the skb gets cloned quite a few times between these two
123 + * points. I'd rather not use skb_shared_info because it's treated as
124 + * a blob of memory, and so it would be quite hard to maintain.
126 + * What we do is to keep a global variable (per CPU) that transfers the
127 + * required state between xt_MARK and af_packet.c. As an optimization,
128 + * this state transfer and the step that follows is only executed for
129 + * packets that first get dropped here. When we drop a packet, we mark
130 + * it for 'elevation' (that's what this trick is called). When xt_MARK
131 + * tags the packet with the right slice, it intercepts this mark and
132 + * sets the value of sknid_elevator. Next, the packet is sent back here
133 + * for a second round, this time with the xid tag set.
136 + int *elevator=&__get_cpu_var(sknid_elevator);
137 + int tag = skb->skb_tag;
139 + if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
140 + if (skb->pkt_type==PACKET_HOST) {
141 + *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
145 + else if (!sk->sk_nx_info && (*elevator>0)) {
146 + /* Root has already seen this packet once, since it has been elevated */
153 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
154 struct packet_type *pt, struct net_device *orig_dev)
157 struct sockaddr_pkt *spkt;
162 * When we registered the protocol we saved the socket in the data
163 * field for just this event.
164 @@ -362,6 +405,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
165 * so that this procedure is noop.
169 + * (18:05:41) daniel_hozac: where?
170 + * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
171 + * (18:05:58) er: in packet_rcv_spkt
172 + * (18:07:33) daniel_hozac: oh, that's evil.
175 + if (!slice_check_and_elevate(skb, sk))
178 if (skb->pkt_type == PACKET_LOOPBACK)
181 @@ -420,6 +473,9 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
185 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
189 * Get and verify the address.
191 @@ -510,11 +566,16 @@ out_unlock:
197 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
200 struct sk_filter *filter;
202 + if (!slice_check_and_elevate(skb, sk))
206 filter = rcu_dereference(sk->sk_filter);
208 @@ -1073,6 +1134,9 @@ static int packet_snd(struct socket *sock,
210 int ifindex, err, reserve = 0;
212 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
216 * Get and verify the address.
218 @@ -1258,6 +1322,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
221 po->prot_hook.type = protocol;
222 + po->prot_hook.sknid_elevator = 1;
223 po->prot_hook.dev = dev;
225 po->ifindex = dev ? dev->ifindex : 0;
226 @@ -1358,8 +1423,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
227 __be16 proto = (__force __be16)protocol; /* weird, but documented */
230 - if (!capable(CAP_NET_RAW))
231 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
234 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
235 sock->type != SOCK_PACKET)
236 return -ESOCKTNOSUPPORT;
237 @@ -1391,6 +1457,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
238 spin_lock_init(&po->bind_lock);
239 mutex_init(&po->pg_vec_lock);
240 po->prot_hook.func = packet_rcv;
241 + po->prot_hook.sknid_elevator = 1;
243 if (sock->type == SOCK_PACKET)
244 po->prot_hook.func = packet_rcv_spkt;