From f5cda6cb8da57f20cb58e5d8e8bf41b9e155c06c Mon Sep 17 00:00:00 2001 From: S.Çağlar Onur Date: Tue, 7 Dec 2010 11:07:40 -0500 Subject: [PATCH] linux-2.6-525-sknid-elevator.patch --- include/linux/netdevice.h | 1 + net/core/dev.c | 30 +++++++++++++++++++ net/packet/af_packet.c | 71 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3af28fb..8730e89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1259,6 +1259,7 @@ struct napi_gro_cb { struct packet_type { __be16 type; /* This is really htons(ether_type). */ struct net_device *dev; /* NULL is wildcarded here */ + unsigned char sknid_elevator; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, diff --git a/net/core/dev.c b/net/core/dev.c index 1226d0d..e8d66b6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -99,6 +99,8 @@ #include #include #include +#include +#include #include #include #include @@ -2620,6 +2622,10 @@ void netif_nit_deliver(struct sk_buff *skb) rcu_read_unlock(); } +/* The code already makes the assumption that packet handlers run + * sequentially on the same CPU. 
-Sapan */ +DEFINE_PER_CPU(int, sknid_elevator) = 0; + int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; @@ -2627,8 +2633,11 @@ int __netif_receive_skb(struct sk_buff *skb) struct net_device *null_or_orig; struct net_device *null_or_bond; int ret = NET_RX_DROP; + int *cur_elevator = &__get_cpu_var(sknid_elevator); __be16 type; + *cur_elevator = 0; + if (!skb->tstamp.tv64) net_timestamp(skb); @@ -2729,7 +2738,27 @@ ncls: } if (pt_prev) { + /* If an earlier handler marked the packet (*cur_elevator is -2 or a + * positive value), hold an extra reference across ->func(), which may + * release the skb and may itself change *cur_elevator. */ + int held = *cur_elevator; + + if (held) + atomic_inc(&skb->users); ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + + if (held && (*cur_elevator)>0) { + skb->skb_tag = *cur_elevator; + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) { + ret = deliver_skb(skb, ptype, orig_dev); + } + } + } + + /* Drop only the reference taken above, never one we do not own */ + if (held) + kfree_skb(skb); } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining @@ -4576,6 +4605,7 @@ unsigned dev_get_flags(const struct net_device *dev) return flags; } EXPORT_SYMBOL(dev_get_flags); +EXPORT_PER_CPU_SYMBOL(sknid_elevator); /** * dev_change_flags - change device settings diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d67f5e4..fcaa094 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -340,12 +341,54 @@ static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; +DECLARE_PER_CPU(int, sknid_elevator); + +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) { + /* This mechanism is quite involved, and caused us a lot of pain + * including crashes and packet loss during the 4.2 rollout. This + * function decides if a slice is allowed to see a given packet. 
+ * Unfortunately, the first time it is invoked for a packet it does not + * have enough information to make this call, since xt_MARK has not had + * a chance to tag it with the slice id. There is also no way of + * passing state between xt_MARK and this function through a packet -- + * because the skb gets cloned quite a few times between these two + * points. I'd rather not use skb_shared_info because it's treated as + * a blob of memory, and so it would be quite hard to maintain. + * + * What we do is to keep a global variable (per CPU) that transfers the + * required state between xt_MARK and af_packet.c. As an optimization, + * this state transfer and the step that follows is only executed for + * packets that first get dropped here. When we drop a packet, we mark + * it for 'elevation' (that's what this trick is called). When xt_MARK + * tags the packet with the right slice, it intercepts this mark and + * sets the value of sknid_elevator. Next, the packet is sent back here + * for a second round, this time with the xid tag set. + */ + + int *elevator=&__get_cpu_var(sknid_elevator); + int tag = skb->skb_tag; + + if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) { + if (skb->pkt_type==PACKET_HOST) { + *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */ + } + return 0; + } + else if (!sk->sk_nx_info && (*elevator>0)) { + /* Root has already seen this packet once, since it has been elevated */ + return 0; + } + + return 1; +} + static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; - + + /* * When we registered the protocol we saved the socket in the data * field for just this event. @@ -364,6 +407,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, * so that this procedure is noop. */ + /* + * (18:05:41) daniel_hozac: where? 
+ * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we? + * (18:05:58) er: in packet_rcv_skpt + * (18:07:33) daniel_hozac: oh, that's evil. + */ + + if (!slice_check_and_elevate(skb, sk)) + goto out; /* rejected: use the same drop path as below so the skb is released */ + if (skb->pkt_type == PACKET_LOOPBACK) goto out; @@ -422,6 +475,9 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, __be16 proto = 0; int err; + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) + return -EPERM; + /* * Get and verify the address. */ @@ -512,11 +568,16 @@ out_unlock: return err; } + + static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, unsigned int res) { struct sk_filter *filter; + if (!slice_check_and_elevate(skb, sk)) + return 0; + rcu_read_lock_bh(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) @@ -1088,6 +1149,9 @@ static int packet_snd(struct socket *sock, unsigned char *addr; int ifindex, err, reserve = 0; + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) + return -EPERM; + /* * Get and verify the address. */ @@ -1273,6 +1337,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc po->num = protocol; po->prot_hook.type = protocol; + po->prot_hook.sknid_elevator = 1; po->prot_hook.dev = dev; po->ifindex = dev ? dev->ifindex : 0; @@ -1373,8 +1438,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; - if (!capable(CAP_NET_RAW)) + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET)) return -EPERM; + if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; @@ -1406,6 +1472,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->prot_hook.func = packet_rcv; + po->prot_hook.sknid_elevator = 1; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt;