linux-2.6-525-sknid-elevator.patch

   1 diff -Nurb linux-2.6.22-524/include/linux/netdevice.h linux-2.6.22-525/include/linux/netdevice.h
   2 --- linux-2.6.22-524/include/linux/netdevice.h  2008-07-27 22:06:14.000000000 -0400
   3 +++ linux-2.6.22-525/include/linux/netdevice.h  2008-07-27 22:17:30.000000000 -0400
   4 @@ -562,6 +562,7 @@
   5  struct packet_type {
   6         __be16                  type;   /* This is really htons(ether_type). */
   7         struct net_device       *dev;   /* NULL is wildcarded here           */
   8 +       unsigned char           sknid_elevator;
   9         int                     (*func) (struct sk_buff *,
  10                                          struct net_device *,
  11                                          struct packet_type *,
  12 diff -Nurb linux-2.6.22-524/net/core/dev.c linux-2.6.22-525/net/core/dev.c
  13 --- linux-2.6.22-524/net/core/dev.c     2008-07-27 22:06:20.000000000 -0400
  14 +++ linux-2.6.22-525/net/core/dev.c     2008-07-28 09:26:45.000000000 -0400
  15 @@ -97,6 +97,8 @@
  16  #include <linux/proc_fs.h>
  17  #include <linux/seq_file.h>
  18  #include <linux/stat.h>
  19 +#include <linux/ip.h>
  20 +#include <linux/tcp.h>
  21  #include <linux/if_bridge.h>
  22  #include <net/dst.h>
  23  #include <net/pkt_sched.h>
  24 @@ -1131,7 +1133,7 @@
  25                 if ((ptype->dev == dev || !ptype->dev) &&
  26                     (ptype->af_packet_priv == NULL ||
  27                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
  28 -                       struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
  29 +                       struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
  30                         if (!skb2)
  31                                 break;
  32
  33 @@ -1803,6 +1805,7 @@
  34   * the ingress scheduler, you just cant add policies on ingress.
  35   *
  36   */
  37 +
  38  static int ing_filter(struct sk_buff *skb)
  39  {
  40         struct Qdisc *q;
  41 @@ -1832,13 +1835,20 @@
  42  }
  43  #endif
  44
  45 +/* Per-CPU elevation mark: 0 = none, -2 = packet rejected by af_packet, >0 = tag
  46 + * for re-delivery. Assumes packet handlers run sequentially on one CPU. -Sapan */
  47 +DEFINE_PER_CPU(int, sknid_elevator) = 0;
  48 +
  49  int netif_receive_skb(struct sk_buff *skb)
  50  {
  51         struct packet_type *ptype, *pt_prev;
  52         struct net_device *orig_dev;
  53         int ret = NET_RX_DROP;
  54 +       int *cur_elevator=&__get_cpu_var(sknid_elevator);
  55         __be16 type;
  56
  57 +       *cur_elevator = 0;
  58 +
  59         /* if we've gotten here through NAPI, check netpoll */
  60         if (skb->dev->poll && netpoll_rx(skb))
  61                 return NET_RX_DROP;
  62 @@ -1873,8 +1883,9 @@
  63
  64         list_for_each_entry_rcu(ptype, &ptype_all, list) {
  65                 if (!ptype->dev || ptype->dev == skb->dev) {
  66 -                       if (pt_prev)
  67 +                       if (pt_prev) {
  68                                 ret = deliver_skb(skb, pt_prev, orig_dev);
  69 +                       }
  70                         pt_prev = ptype;
  71                 }
  72         }
  73 @@ -1913,7 +1924,27 @@
  74         }
  75
  76         if (pt_prev) {
  77 +               /* A previous handler may have marked cur_elevator (-2 or a
  78 +                * positive value). Sample it once: pt_prev->func may change it. */
  79 +               int held = (*cur_elevator != 0);
  80 +               if (held)
  81 +                       atomic_inc(&skb->users); /* keep skb for re-delivery */
  82 +
  83                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
  84 +
  85 +               if (held && (*cur_elevator)>0) {
  86 +                       skb->skb_tag = *cur_elevator;
  87 +                       list_for_each_entry_rcu(ptype, &ptype_all, list) {
  88 +                               if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
  89 +                                       ret = deliver_skb(skb, ptype, orig_dev);
  90 +                               }
  91 +                       }
  92 +               }
  93 +
  94 +               if (held) {
  95 +                       /* Drop the reference taken before the final handler */
  96 +                       kfree_skb(skb);
  97 +               }
  98         } else {
  99                 kfree_skb(skb);
 100                 /* Jamal, now you will not able to escape explaining
 101 @@ -3780,6 +3811,7 @@
 102  EXPORT_SYMBOL(net_enable_timestamp);
 103  EXPORT_SYMBOL(net_disable_timestamp);
 104  EXPORT_SYMBOL(dev_get_flags);
 105 +EXPORT_PER_CPU_SYMBOL(sknid_elevator);
 106
 107  #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 108  EXPORT_SYMBOL(br_handle_frame_hook);
 109 diff -Nurb linux-2.6.22-524/net/packet/af_packet.c linux-2.6.22-525/net/packet/af_packet.c
 110 --- linux-2.6.22-524/net/packet/af_packet.c     2007-07-08 19:32:17.000000000 -0400
 111 +++ linux-2.6.22-525/net/packet/af_packet.c     2008-07-27 22:06:27.000000000 -0400
 112 @@ -78,6 +78,7 @@
 113  #include <linux/poll.h>
 114  #include <linux/module.h>
 115  #include <linux/init.h>
 116 +#include <linux/vs_network.h>
 117
 118  #ifdef CONFIG_INET
 119  #include <net/inet_common.h>
 120 @@ -246,10 +247,53 @@
 121
 122  static const struct proto_ops packet_ops_spkt;
 123
 124 +DECLARE_PER_CPU(int, sknid_elevator); /* defined in net/core/dev.c */
 125 +
 126 +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
 127 +       /* Decide whether the slice owning 'sk' may see 'skb'. Returns 1 to
 128 +        * deliver the packet to the socket and 0 to hide it.
 129 +        *
 130 +        * This mechanism is quite involved, and caused crashes and packet
 131 +        * loss during the 4.2 rollout. The first time this function runs
 132 +        * for a packet it cannot make the call, because xt_MARK has not yet
 133 +        * tagged the packet with its slice id. State cannot travel inside
 134 +        * the packet either: the skb is cloned several times between xt_MARK
 135 +        * and this point, and skb_shared_info is treated as a blob of
 136 +        * memory, which would make it hard to maintain.
 137 +        *
 138 +        * Instead the per-CPU variable sknid_elevator carries the state
 139 +        * between xt_MARK and af_packet.c. As an optimization this only
 140 +        * happens for packets that first get dropped here: a dropped packet
 141 +        * is marked for 'elevation' (-2). When xt_MARK tags the packet with
 142 +        * the right slice it intercepts this mark and sets sknid_elevator.
 143 +        * The packet is then sent back here for a second round, this time
 144 +        * with the xid tag set.
 145 +        */
 146 +
 147 +       int *mark = &__get_cpu_var(sknid_elevator);
 148 +       int pkt_tag = skb->skb_tag;
 149 +
 150 +       /* Root has already seen an elevated packet once: hide the replay */
 151 +       if (!sk->sk_nx_info)
 152 +               return (*mark > 0) ? 0 : 1;
 153 +
 154 +       /* Slice socket: tag 1 or a tag equal to the socket's nid is allowed */
 155 +       if (pkt_tag == 1 || sk->sk_nid == pkt_tag)
 156 +               return 1;
 157 +
 158 +       /* Rejecting this packet. Mark it for elevation in xt_MARK */
 159 +       if (skb->pkt_type == PACKET_HOST)
 160 +               *mark = -2;
 161 +
 162 +       return 0;
 163 +}
 164 +
 165  static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
 166  {
 167         struct sock *sk;
 168         struct sockaddr_pkt *spkt;
 169 +       int tag = skb->skb_tag;
 170 +
 171
 172         /*
 173          *      When we registered the protocol we saved the socket in the data
 174 @@ -269,6 +313,16 @@
 175          *      so that this procedure is noop.
 176          */
 177
 178 +       /*
 179 +        * (18:05:41) daniel_hozac: where?
 180 +        * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
 181 +        * (18:05:58) er: in packet_rcv_skpt
 182 +        * (18:07:33) daniel_hozac: oh, that's evil.
 183 +        */
 184 +
 185 +       if (!slice_check_and_elevate(skb, sk))
 186 +               goto out; /* drop via the same exit as PACKET_LOOPBACK below */
 187 +
 188         if (skb->pkt_type == PACKET_LOOPBACK)
 189                 goto out;
 190
 191 @@ -324,6 +378,9 @@
 192         __be16 proto=0;
 193         int err;
 194
 195 +       if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
 196 +               return -EPERM;
 197 +
 198         /*
 199          *      Get and verify the address.
 200          */
 201 @@ -416,11 +473,16 @@
 202         return err;
 203  }
 204
 205 +
 206 +
 207  static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 208                                       unsigned int res)
 209  {
 210         struct sk_filter *filter;
 211
 212 +       if (!slice_check_and_elevate(skb, sk))
 213 +               return 0;
 214 +
 215         rcu_read_lock_bh();
 216         filter = rcu_dereference(sk->sk_filter);
 217         if (filter != NULL)
 218 @@ -711,6 +773,9 @@
 219         unsigned char *addr;
 220         int ifindex, err, reserve = 0;
 221
 222 +       if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
 223 +               return -EPERM;
 224 +
 225         /*
 226          *      Get and verify the address.
 227          */
 228 @@ -880,6 +945,7 @@
 229
 230         po->num = protocol;
 231         po->prot_hook.type = protocol;
 232 +       po->prot_hook.sknid_elevator = 1;
 233         po->prot_hook.dev = dev;
 234
 235         po->ifindex = dev ? dev->ifindex : 0;
 236 @@ -984,8 +1050,9 @@
 237         __be16 proto = (__force __be16)protocol; /* weird, but documented */
 238         int err;
 239
 240 -       if (!capable(CAP_NET_RAW))
 241 +       if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
 242                 return -EPERM;
 243 +
 244         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
 245             sock->type != SOCK_PACKET)
 246                 return -ESOCKTNOSUPPORT;
 247 @@ -1016,6 +1083,7 @@
 248
 249         spin_lock_init(&po->bind_lock);
 250         po->prot_hook.func = packet_rcv;
 251 +       po->prot_hook.sknid_elevator = 1;
 252
 253         if (sock->type == SOCK_PACKET)
 254                 po->prot_hook.func = packet_rcv_spkt;