X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=linux-2.6-525-sknid-elevator.patch;h=85ceb9f4c3e5b919a687b8b182680e803ed90ee7;hb=refs%2Fheads%2F27;hp=0ff4d8aeed1fac14f3d17e3eb885c7ae7ff94d21;hpb=15a27320917fe4100c0204359a6a9d18349fb31d;p=linux-2.6.git diff --git a/linux-2.6-525-sknid-elevator.patch b/linux-2.6-525-sknid-elevator.patch index 0ff4d8aee..85ceb9f4c 100644 --- a/linux-2.6-525-sknid-elevator.patch +++ b/linux-2.6-525-sknid-elevator.patch @@ -1,84 +1,89 @@ -diff -Nurb linux-2.6.22-510/net/core/dev.c linux-2.6.22-520/net/core/dev.c ---- linux-2.6.22-510/net/core/dev.c 2008-06-06 17:07:48.000000000 -0400 -+++ linux-2.6.22-520/net/core/dev.c 2008-06-06 17:07:56.000000000 -0400 -@@ -1803,6 +1803,7 @@ - * the ingress scheduler, you just cant add policies on ingress. - * - */ -+ - static int ing_filter(struct sk_buff *skb) - { - struct Qdisc *q; -@@ -1832,13 +1833,20 @@ +Index: linux-2.6.27.y/include/linux/netdevice.h +=================================================================== +--- linux-2.6.27.y.orig/include/linux/netdevice.h ++++ linux-2.6.27.y/include/linux/netdevice.h +@@ -857,6 +857,7 @@ static inline void netif_napi_del(struct + struct packet_type { + __be16 type; /* This is really htons(ether_type). */ + struct net_device *dev; /* NULL is wildcarded here */ ++ unsigned char sknid_elevator; + int (*func) (struct sk_buff *, + struct net_device *, + struct packet_type *, +Index: linux-2.6.27.y/net/core/dev.c +=================================================================== +--- linux-2.6.27.y.orig/net/core/dev.c ++++ linux-2.6.27.y/net/core/dev.c +@@ -99,6 +99,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -1318,7 +1320,7 @@ static void dev_queue_xmit_nit(struct sk + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { +- struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); ++ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + +@@ -2170,6 +2172,10 @@ void netif_nit_deliver(struct sk_buff *s + rcu_read_unlock(); } - #endif +/* The code already makes the assumption that packet handlers run + * sequentially on the same CPU. -Sapan */ -+DEFINE_PER_CPU(int, sknid_elevator); ++DEFINE_PER_CPU(int, sknid_elevator) = 0; + - int netif_receive_skb(struct sk_buff *skb) - { - struct packet_type *ptype, *pt_prev; + /** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process +@@ -2191,8 +2197,11 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *orig_dev; + struct net_device *null_or_orig; int ret = NET_RX_DROP; -+ int *cur_elevator=&__get_cpu_var(sknid_elevator); ++ int *cur_elevator = &__get_cpu_var(sknid_elevator); __be16 type; -+ *cur_elevator = 0; ++ *cur_elevator = 0; + - /* if we've gotten here through NAPI, check netpoll */ - if (skb->dev->poll && netpoll_rx(skb)) - return NET_RX_DROP; -@@ -1873,8 +1881,9 @@ + if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + return NET_RX_SUCCESS; - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { -- if (pt_prev) -+ if (pt_prev) { - ret = deliver_skb(skb, pt_prev, orig_dev); -+ } - pt_prev = ptype; - } - } -@@ -1912,8 +1921,22 @@ - } +@@ -2272,7 +2281,27 @@ ncls: } -+ /* We don't want the packet handlers to throw the packet away -+ * if we want the taps to treat it again - Sapan */ -+ if (*cur_elevator) { -+ atomic_inc(&skb->users); -+ } -+ if (pt_prev) { ++ /* At this point, cur_elevator may be -2 or a positive value, in ++ * case a previous protocol handler marked it */ ++ if (*cur_elevator) { ++ atomic_inc(&skb->users); ++ } ++ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -+ if (*cur_elevator > 0) { -+ skb->skb_tag = *cur_elevator; -+ list_for_each_entry_rcu(ptype, &ptype_all, list) { -+ if (!ptype->dev || ptype->dev == skb->dev) { -+ ret = deliver_skb(skb, ptype, orig_dev); -+ } ++ ++ if ((*cur_elevator)>0) { ++ skb->skb_tag = *cur_elevator; ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) { ++ ret = deliver_skb(skb, ptype, orig_dev); + } + } ++ } ++ ++ if (*cur_elevator) { ++ /* We have a packet */ ++ kfree_skb(skb); ++ } } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining -@@ -1922,6 +1945,13 @@ - ret = NET_RX_DROP; - } - -+ if (*cur_elevator) { -+ /* We have a packet */ -+ kfree_skb(skb); -+ } -+ -+ *cur_elevator=0; -+ - out: - rcu_read_unlock(); - return ret; -@@ -3780,6 +3810,7 @@ +@@ -4895,6 +4924,7 @@ EXPORT_SYMBOL(unregister_netdevice_notif EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); @@ -86,32 +91,73 @@ diff -Nurb linux-2.6.22-510/net/core/dev.c linux-2.6.22-520/net/core/dev.c #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); -diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/af_packet.c ---- linux-2.6.22-510/net/packet/af_packet.c 2007-07-08 19:32:17.000000000 -0400 -+++ linux-2.6.22-520/net/packet/af_packet.c 2008-06-07 18:30:41.000000000 -0400 -@@ -78,6 +78,7 @@ +Index: linux-2.6.27.y/net/packet/af_packet.c +=================================================================== +--- linux-2.6.27.y.orig/net/packet/af_packet.c ++++ linux-2.6.27.y/net/packet/af_packet.c +@@ -77,6 +77,7 @@ #include #include #include +#include + #include #ifdef CONFIG_INET - #include -@@ -246,10 +247,13 @@ +@@ -278,10 +279,53 @@ static const struct proto_ops packet_ops static const struct proto_ops packet_ops_spkt; +extern DEFINE_PER_CPU(int, sknid_elevator); ++ ++static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) { ++ /* This mechanism is quite involved, and caused us a lot of pain ++ * including crashes and packet loss during the 4.2 rollout. This ++ * function decides if a slice is allowed to see a given packet. ++ * Unfortunately, the first time it is invoked for a packet it does not ++ * have enough information to make this call, since xt_MARK has not had ++ * a chance to tag it with the slice id. There is also no way of ++ * passing state between xt_MARK and this function through a packet -- ++ * because the skb gets cloned quite a few times between these two ++ * points. I'd rather not use skb_shared_info because it's treated as ++ * a blob of memory, and so it would be quite hard to maintain. ++ * ++ * What we do is to keep a global variable (per CPU) that transfers the ++ * required state between xt_MARK and af_packet.c. As an optimization, ++ * this state transfer and the step that follows is only executed for ++ * packets that first get dropped here. When we drop a packet, we mark ++ * it for 'elevation' (that's what this trick is called). When xt_MARK ++ * tags the packet with the right slice, it intercepts this mark and ++ * sets the value of sknid_elevator. Next, the packet is sent back here ++ * for a second round, this time with the xid tag set. ++ */ ++ ++ int *elevator=&__get_cpu_var(sknid_elevator); ++ int tag = skb->skb_tag; ++ ++ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) { ++ if (skb->pkt_type==PACKET_HOST) { ++ *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */ ++ } ++ return 0; ++ } ++ else if (!sk->sk_nx_info && (*elevator>0)) { ++ /* Root has already seen this packet once, since it has been elevated */ ++ return 0; ++ } ++ ++ return 1; ++} ++ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; + int tag = skb->skb_tag; -+ int *elevator=&__get_cpu_var(sknid_elevator); ++ /* * When we registered the protocol we saved the socket in the data -@@ -269,6 +273,22 @@ +@@ -301,6 +345,16 @@ static int packet_rcv_spkt(struct sk_buf * so that this procedure is noop. */ @@ -122,19 +168,13 @@ diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/a + * (18:07:33) daniel_hozac: oh, that's evil. + */ + -+ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) { -+ *elevator=-2; -+ goto out; -+ } -+ else if (!sk->sk_nx_info && *elevator) { -+ /* Root has already seen this packet */ -+ goto out; -+ } ++ if (!slice_check_and_elevate(skb, sk)) ++ return 0; + if (skb->pkt_type == PACKET_LOOPBACK) goto out; -@@ -324,6 +344,9 @@ +@@ -359,6 +413,9 @@ static int packet_sendmsg_spkt(struct ki __be16 proto=0; int err; @@ -144,25 +184,24 @@ diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/a /* * Get and verify the address. */ -@@ -420,6 +443,17 @@ +@@ -451,11 +508,16 @@ out_unlock: + return err; + } + ++ ++ + static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, unsigned int res) { struct sk_filter *filter; -+ int tag = skb->skb_tag; -+ int *elevator=&__get_cpu_var(sknid_elevator); -+ -+ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) { -+ *elevator=-2; -+ return 0; -+ } -+ else if (!sk->sk_nx_info && *elevator) { -+ /* Root has already seen this packet */ -+ return 0; -+ } ++ if (!slice_check_and_elevate(skb, sk)) ++ return 0; ++ rcu_read_lock_bh(); filter = rcu_dereference(sk->sk_filter); -@@ -711,6 +745,9 @@ + if (filter != NULL) +@@ -775,6 +837,9 @@ static int packet_sendmsg(struct kiocb * unsigned char *addr; int ifindex, err, reserve = 0; @@ -172,7 +211,15 @@ diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/a /* * Get and verify the address. */ -@@ -984,8 +1021,9 @@ +@@ -941,6 +1006,7 @@ static int packet_do_bind(struct sock *s + + po->num = protocol; + po->prot_hook.type = protocol; ++ po->prot_hook.sknid_elevator = 1; + po->prot_hook.dev = dev; + + po->ifindex = dev ? dev->ifindex : 0; +@@ -1039,8 +1105,9 @@ static int packet_create(struct net *net __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; @@ -183,3 +230,11 @@ diff -Nurb linux-2.6.22-510/net/packet/af_packet.c linux-2.6.22-520/net/packet/a if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; +@@ -1072,6 +1139,7 @@ static int packet_create(struct net *net + spin_lock_init(&po->bind_lock); + mutex_init(&po->pg_vec_lock); + po->prot_hook.func = packet_rcv; ++ po->prot_hook.sknid_elevator = 1; + + if (sock->type == SOCK_PACKET) + po->prot_hook.func = packet_rcv_spkt;