1 From f5cda6cb8da57f20cb58e5d8e8bf41b9e155c06c Mon Sep 17 00:00:00 2001
2 From: S.Çağlar Onur <caglar@cs.princeton.edu>
3 Date: Tue, 7 Dec 2010 11:07:40 -0500
4 Subject: [PATCH] linux-2.6-525-sknid-elevator.patch
7 include/linux/netdevice.h | 1 +
8 net/core/dev.c | 30 +++++++++++++++++++
9 net/packet/af_packet.c | 71 +++++++++++++++++++++++++++++++++++++++++++-
10 3 files changed, 100 insertions(+), 2 deletions(-)
12 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13 index 17b0c3c..186d772 100644
14 --- a/include/linux/netdevice.h
15 +++ b/include/linux/netdevice.h
16 @@ -1099,6 +1099,7 @@ struct napi_gro_cb {
18 __be16 type; /* This is really htons(ether_type). */
19 struct net_device *dev; /* NULL is wildcarded here */
20 + unsigned char sknid_elevator;
21 int (*func) (struct sk_buff *,
24 diff --git a/net/core/dev.c b/net/core/dev.c
25 index 32a2b7e..7fda319 100644
29 #include <linux/proc_fs.h>
30 #include <linux/seq_file.h>
31 #include <linux/stat.h>
32 +#include <linux/ip.h>
33 +#include <linux/tcp.h>
34 #include <linux/if_bridge.h>
35 #include <linux/if_macvlan.h>
37 @@ -2333,6 +2335,10 @@ void netif_nit_deliver(struct sk_buff *skb)
41 +/* The code already makes the assumption that packet handlers run
42 + * sequentially on the same CPU. -Sapan */
43 +DEFINE_PER_CPU(int, sknid_elevator) = 0;
46 * netif_receive_skb - process receive buffer from network
47 * @skb: buffer to process
48 @@ -2355,8 +2361,11 @@ int netif_receive_skb(struct sk_buff *skb)
49 struct net_device *null_or_orig;
50 struct net_device *null_or_bond;
51 int ret = NET_RX_DROP;
52 + int *cur_elevator = &__get_cpu_var(sknid_elevator);
57 if (!skb->tstamp.tv64)
60 @@ -2456,7 +2465,27 @@ ncls:
64 + /* At this point, cur_elevator may be -2 or a positive value, in
65 + * case a previous protocol handler marked it */
66 + if (*cur_elevator) {
67 + atomic_inc(&skb->users);
70 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
72 + if ((*cur_elevator)>0) {
73 + skb->skb_tag = *cur_elevator;
74 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
75 + if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
76 + ret = deliver_skb(skb, ptype, orig_dev);
81 + if (*cur_elevator) {
82 + /* We have a packet */
87 /* Jamal, now you will not able to escape explaining
88 @@ -4210,6 +4239,7 @@ unsigned dev_get_flags(const struct net_device *dev)
91 EXPORT_SYMBOL(dev_get_flags);
92 +EXPORT_PER_CPU_SYMBOL(sknid_elevator);
95 * dev_change_flags - change device settings
96 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
97 index b4bf950..cbf8d22 100644
98 --- a/net/packet/af_packet.c
99 +++ b/net/packet/af_packet.c
101 #include <linux/poll.h>
102 #include <linux/module.h>
103 #include <linux/init.h>
104 +#include <linux/vs_network.h>
105 #include <linux/mutex.h>
106 #include <linux/if_vlan.h>
108 @@ -338,12 +339,54 @@ static const struct proto_ops packet_ops;
110 static const struct proto_ops packet_ops_spkt;
112 +DECLARE_PER_CPU(int, sknid_elevator);
114 +static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
115 + /* This mechanism is quite involved, and caused us a lot of pain
116 + * including crashes and packet loss during the 4.2 rollout. This
117 + * function decides if a slice is allowed to see a given packet.
118 + * Unfortunately, the first time it is invoked for a packet it does not
119 + * have enough information to make this call, since xt_MARK has not had
120 + * a chance to tag it with the slice id. There is also no way of
121 + * passing state between xt_MARK and this function through a packet --
122 + * because the skb gets cloned quite a few times between these two
123 + * points. I'd rather not use skb_shared_info because it's treated as
124 + * a blob of memory, and so it would be quite hard to maintain.
126 + * What we do is to keep a global variable (per CPU) that transfers the
127 + * required state between xt_MARK and af_packet.c. As an optimization,
128 + * this state transfer and the step that follows is only executed for
129 + * packets that first get dropped here. When we drop a packet, we mark
130 + * it for 'elevation' (that's what this trick is called). When xt_MARK
131 + * tags the packet with the right slice, it intercepts this mark and
132 + * sets the value of sknid_elevator. Next, the packet is sent back here
133 + * for a second round, this time with the xid tag set.
136 + int *elevator=&__get_cpu_var(sknid_elevator);
137 + int tag = skb->skb_tag;
139 + if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
140 + if (skb->pkt_type==PACKET_HOST) {
141 + *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
145 + else if (!sk->sk_nx_info && (*elevator>0)) {
146 + /* Root has already seen this packet once, since it has been elevated */
153 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
154 struct packet_type *pt, struct net_device *orig_dev)
157 struct sockaddr_pkt *spkt;
162 * When we registered the protocol we saved the socket in the data
163 * field for just this event.
164 @@ -362,6 +405,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
165 * so that this procedure is noop.
169 + * (18:05:41) daniel_hozac: where?
170 + * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
171 + * (18:05:58) er: in packet_rcv_spkt
172 + * (18:07:33) daniel_hozac: oh, that's evil.
175 + if (!slice_check_and_elevate(skb, sk))
178 if (skb->pkt_type == PACKET_LOOPBACK)
181 @@ -420,6 +473,9 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
185 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
189 * Get and verify the address.
191 @@ -510,11 +566,16 @@ out_unlock:
197 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
200 struct sk_filter *filter;
202 + if (!slice_check_and_elevate(skb, sk))
206 filter = rcu_dereference(sk->sk_filter);
208 @@ -1073,6 +1134,9 @@ static int packet_snd(struct socket *sock,
210 int ifindex, err, reserve = 0;
212 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
216 * Get and verify the address.
218 @@ -1258,6 +1322,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
221 po->prot_hook.type = protocol;
222 + po->prot_hook.sknid_elevator = 1;
223 po->prot_hook.dev = dev;
225 po->ifindex = dev ? dev->ifindex : 0;
226 @@ -1358,8 +1423,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
227 __be16 proto = (__force __be16)protocol; /* weird, but documented */
230 - if (!capable(CAP_NET_RAW))
231 + if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
234 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
235 sock->type != SOCK_PACKET)
236 return -ESOCKTNOSUPPORT;
237 @@ -1391,6 +1457,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
238 spin_lock_init(&po->bind_lock);
239 mutex_init(&po->pg_vec_lock);
240 po->prot_hook.func = packet_rcv;
241 + po->prot_hook.sknid_elevator = 1;
243 if (sock->type == SOCK_PACKET)
244 po->prot_hook.func = packet_rcv_spkt;