---- linux-2.6.22-590/kernel/vserver/space.c.orig 2008-02-29 09:01:28.000000000 -0500
-+++ linux-2.6.22-590/kernel/vserver/space.c 2008-03-06 15:47:26.000000000 -0500
+diff -Nurb linux-2.6.22-594/include/linux/vserver/network.h.orig.orig linux-2.6.22-595/include/linux/vserver/network.h.orig.orig
+--- linux-2.6.22-594/include/linux/vserver/network.h.orig.orig 2008-03-20 00:04:54.000000000 -0400
++++ linux-2.6.22-595/include/linux/vserver/network.h.orig.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,143 +0,0 @@
+-#ifndef _VX_NETWORK_H
+-#define _VX_NETWORK_H
+-
+-#include <linux/types.h>
+-
+-
+-#define MAX_N_CONTEXT 65535 /* Arbitrary limit */
+-
+-
+-/* network flags */
+-
+-#define NXF_INFO_PRIVATE 0x00000008
+-
+-#define NXF_SINGLE_IP 0x00000100
+-#define NXF_LBACK_REMAP 0x00000200
+-
+-#define NXF_HIDE_NETIF 0x02000000
+-#define NXF_HIDE_LBACK 0x04000000
+-
+-#define NXF_STATE_SETUP (1ULL << 32)
+-#define NXF_STATE_ADMIN (1ULL << 34)
+-
+-#define NXF_SC_HELPER (1ULL << 36)
+-#define NXF_PERSISTENT (1ULL << 38)
+-
+-#define NXF_ONE_TIME (0x0005ULL << 32)
+-
+-
+-#define NXF_INIT_SET (__nxf_init_set())
+-
+-static inline uint64_t __nxf_init_set(void) {
+- return NXF_STATE_ADMIN
+-#ifdef CONFIG_VSERVER_AUTO_LBACK
+- | NXF_LBACK_REMAP
+- | NXF_HIDE_LBACK
+-#endif
+-#ifdef CONFIG_VSERVER_AUTO_SINGLE
+- | NXF_SINGLE_IP
+-#endif
+- | NXF_HIDE_NETIF;
+-}
+-
+-
+-/* network caps */
+-
+-#define NXC_RAW_ICMP 0x00000100
+-
+-
+-/* address types */
+-
+-#define NXA_TYPE_IPV4 0x0001
+-#define NXA_TYPE_IPV6 0x0002
+-
+-#define NXA_TYPE_NONE 0x0000
+-#define NXA_TYPE_ANY 0x00FF
+-
+-#define NXA_TYPE_ADDR 0x0010
+-#define NXA_TYPE_MASK 0x0020
+-#define NXA_TYPE_RANGE 0x0040
+-
+-#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE)
+-
+-#define NXA_MOD_BCAST 0x0100
+-#define NXA_MOD_LBACK 0x0200
+-
+-#define NXA_LOOPBACK 0x1000
+-
+-#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK)
+-#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK)
+-
+-#ifdef __KERNEL__
+-
+-#include <linux/list.h>
+-#include <linux/spinlock.h>
+-#include <linux/rcupdate.h>
+-#include <linux/in.h>
+-#include <linux/in6.h>
+-#include <asm/atomic.h>
+-
+-struct nx_addr_v4 {
+- struct nx_addr_v4 *next;
+- struct in_addr ip[2];
+- struct in_addr mask;
+- uint16_t type;
+- uint16_t flags;
+-};
+-
+-struct nx_addr_v6 {
+- struct nx_addr_v6 *next;
+- struct in6_addr ip;
+- struct in6_addr mask;
+- uint32_t prefix;
+- uint16_t type;
+- uint16_t flags;
+-};
+-
+-struct nx_info {
+- struct hlist_node nx_hlist; /* linked list of nxinfos */
+- nid_t nx_id; /* vnet id */
+- atomic_t nx_usecnt; /* usage count */
+- atomic_t nx_tasks; /* tasks count */
+- int nx_state; /* context state */
+-
+- uint64_t nx_flags; /* network flag word */
+- uint64_t nx_ncaps; /* network capabilities */
+-
+- struct in_addr v4_lback; /* Loopback address */
+- struct in_addr v4_bcast; /* Broadcast address */
+- struct nx_addr_v4 v4; /* First/Single ipv4 address */
+-#ifdef CONFIG_IPV6
+- struct nx_addr_v6 v6; /* First/Single ipv6 address */
+-#endif
+- char nx_name[65]; /* network context name */
+-};
+-
+-
+-/* status flags */
+-
+-#define NXS_HASHED 0x0001
+-#define NXS_SHUTDOWN 0x0100
+-#define NXS_RELEASED 0x8000
+-
+-extern struct nx_info *lookup_nx_info(int);
+-
+-extern int get_nid_list(int, unsigned int *, int);
+-extern int nid_is_hashed(nid_t);
+-
+-extern int nx_migrate_task(struct task_struct *, struct nx_info *);
+-
+-extern long vs_net_change(struct nx_info *, unsigned int);
+-
+-struct sock;
+-
+-
+-#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE)
+-#ifdef CONFIG_IPV6
+-#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE)
+-#else
+-#define NX_IPV6(n) (0)
+-#endif
+-
+-#endif /* __KERNEL__ */
+-#endif /* _VX_NETWORK_H */
+diff -Nurb linux-2.6.22-594/kernel/nsproxy.c.orig linux-2.6.22-595/kernel/nsproxy.c.orig
+--- linux-2.6.22-594/kernel/nsproxy.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/kernel/nsproxy.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,264 +0,0 @@
+-/*
+- * Copyright (C) 2006 IBM Corporation
+- *
+- * Author: Serge Hallyn <serue@us.ibm.com>
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License as
+- * published by the Free Software Foundation, version 2 of the
+- * License.
+- *
+- * Jun 2006 - namespaces support
+- * OpenVZ, SWsoft Inc.
+- * Pavel Emelianov <xemul@openvz.org>
+- */
+-
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/nsproxy.h>
+-#include <linux/init_task.h>
+-#include <linux/mnt_namespace.h>
+-#include <linux/utsname.h>
+-#include <net/net_namespace.h>
+-#include <linux/pid_namespace.h>
+-#include <linux/vserver/global.h>
+-#include <linux/vserver/debug.h>
+-
+-static struct kmem_cache *nsproxy_cachep;
+-
+-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+-
+-void get_task_namespaces(struct task_struct *tsk)
+-{
+- struct nsproxy *ns = tsk->nsproxy;
+- if (ns) {
+- get_nsproxy(ns);
+- }
+-}
+-
+-/*
+- * creates a copy of "orig" with refcount 1.
+- */
+-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+-{
+- struct nsproxy *ns;
+-
+- ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
+- if (ns)
+- atomic_set(&ns->count, 1);
+- vxdprintk(VXD_CBIT(space, 2), "clone_nsproxy(%p[%u] = %p[1]",
+- orig, atomic_read(&orig->count), ns);
+- atomic_inc(&vs_global_nsproxy);
+- return ns;
+-}
+-
+-/*
+- * Create new nsproxy and all of its the associated namespaces.
+- * Return the newly created nsproxy. Do not attach this to the task,
+- * leave it to the caller to do proper locking and attach it to task.
+- */
+-static struct nsproxy *unshare_namespaces(int flags, struct nsproxy *orig,
+- struct fs_struct *new_fs)
+-{
+- struct nsproxy *new_nsp;
+- int err = -ENOMEM;
+-
+- vxdprintk(VXD_CBIT(space, 4),
+- "unshare_namespaces(0x%08x,%p,%p)",
+- flags, orig, new_fs);
+-
+- new_nsp = clone_nsproxy(orig);
+- if (!new_nsp)
+- return ERR_PTR(-ENOMEM);
+-
+- new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs);
+- if (IS_ERR(new_nsp->mnt_ns))
+- goto out_ns;
+-
+- new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns);
+- if (IS_ERR(new_nsp->uts_ns))
+- goto out_uts;
+-
+- new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns);
+- if (IS_ERR(new_nsp->ipc_ns))
+- goto out_ipc;
+-
+- new_nsp->pid_ns = copy_pid_ns(flags, orig->pid_ns);
+- if (IS_ERR(new_nsp->pid_ns))
+- goto out_pid;
+-
+- new_nsp->user_ns = copy_user_ns(flags, orig->user_ns);
+- if (IS_ERR(new_nsp->user_ns))
+- goto out_user;
+-
+- new_nsp->net_ns = copy_net_ns(flags, orig->net_ns);
+- if (IS_ERR(new_nsp->net_ns))
+- goto out_net;
+-
+- return new_nsp;
+-
+-out_net:
+- if (new_nsp->user_ns)
+- put_user_ns(new_nsp->user_ns);
+- if (new_nsp->net_ns)
+- put_net(new_nsp->net_ns);
+-out_user:
+- if (new_nsp->pid_ns)
+- put_pid_ns(new_nsp->pid_ns);
+-out_pid:
+- if (new_nsp->ipc_ns)
+- put_ipc_ns(new_nsp->ipc_ns);
+-out_ipc:
+- if (new_nsp->uts_ns)
+- put_uts_ns(new_nsp->uts_ns);
+-out_uts:
+- if (new_nsp->mnt_ns)
+- put_mnt_ns(new_nsp->mnt_ns);
+-out_ns:
+- kmem_cache_free(nsproxy_cachep, new_nsp);
+- return ERR_PTR(err);
+-}
+-
+-static struct nsproxy *create_new_namespaces(unsigned long flags, struct task_struct *tsk,
+- struct fs_struct *new_fs)
+-{
+- return unshare_namespaces(flags, tsk->nsproxy, new_fs);
+-}
+-
+-/*
+- * copies the nsproxy, setting refcount to 1, and grabbing a
+- * reference to all contained namespaces.
+- */
+-struct nsproxy *copy_nsproxy(struct nsproxy *orig)
+-{
+- struct nsproxy *ns = clone_nsproxy(orig);
+-
+- if (ns) {
+- if (ns->mnt_ns)
+- get_mnt_ns(ns->mnt_ns);
+- if (ns->uts_ns)
+- get_uts_ns(ns->uts_ns);
+- if (ns->ipc_ns)
+- get_ipc_ns(ns->ipc_ns);
+- if (ns->pid_ns)
+- get_pid_ns(ns->pid_ns);
+- }
+- return ns;
+-}
+-
+-/*
+- * called from clone. This now handles copy for nsproxy and all
+- * namespaces therein.
+- */
+-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+-{
+- struct nsproxy *old_ns = tsk->nsproxy;
+- struct nsproxy *new_ns = NULL;
+- int err = 0;
+-
+- vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08x,%p[%p])",
+- flags, tsk, old_ns);
+-
+- if (!old_ns)
+- return 0;
+-
+- get_nsproxy(old_ns);
+- return 0;
+-
+- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
+- return 0;
+-
+- #ifndef CONFIG_NET_NS
+- if (unshare_flags & CLONE_NEWNET)
+- return -EINVAL;
+- #endif
+-
+-
+- if (!capable(CAP_SYS_ADMIN)) {
+- err = -EPERM;
+- goto out;
+- }
+-
+- new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+- if (IS_ERR(new_ns)) {
+- err = PTR_ERR(new_ns);
+- goto out;
+- }
+-
+- err = ns_container_clone(tsk);
+- if (err) {
+- put_nsproxy(new_ns);
+- goto out;
+- }
+-
+- tsk->nsproxy = new_ns;
+-
+-out:
+- put_nsproxy(old_ns);
+- vxdprintk(VXD_CBIT(space, 3),
+- "copy_namespaces(0x%08x,%p[%p]) = %d [%p]",
+- flags, tsk, old_ns, err, new_ns);
+- return err;
+-}
+-
+-void free_nsproxy(struct nsproxy *ns)
+-{
+- if (ns->mnt_ns)
+- put_mnt_ns(ns->mnt_ns);
+- if (ns->uts_ns)
+- put_uts_ns(ns->uts_ns);
+- if (ns->ipc_ns)
+- put_ipc_ns(ns->ipc_ns);
+- if (ns->pid_ns)
+- put_pid_ns(ns->pid_ns);
+- atomic_dec(&vs_global_nsproxy);
+- kfree(ns);
+-}
+-
+-/*
+- * Called from unshare. Unshare all the namespaces part of nsproxy.
+- * On success, returns the new nsproxy.
+- */
+-int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+- struct nsproxy **new_nsp, struct fs_struct *new_fs)
+-{
+- int err = 0;
+-
+- vxdprintk(VXD_CBIT(space, 4),
+- "unshare_nsproxy_namespaces(0x%08lx,[%p])",
+- unshare_flags, current->nsproxy);
+-
+- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+- CLONE_NEWUSER | CLONE_NEWNET)))
+- return 0;
+-
+-#ifndef CONFIG_NET_NS
+- if (unshare_flags & CLONE_NEWNET)
+- return -EINVAL;
+-#endif
+- if (!capable(CAP_SYS_ADMIN))
+- return -EPERM;
+-
+- *new_nsp = create_new_namespaces(unshare_flags, current,
+- new_fs ? new_fs : current->fs);
+- if (IS_ERR(*new_nsp)) {
+- err = PTR_ERR(*new_nsp);
+- goto out;
+- }
+-
+- err = ns_container_clone(current);
+- if (err)
+- put_nsproxy(*new_nsp);
+-
+-out:
+- return err;
+-}
+-
+-static int __init nsproxy_cache_init(void)
+-{
+- nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
+- 0, SLAB_PANIC, NULL, NULL);
+- return 0;
+-}
+-
+-module_init(nsproxy_cache_init);
+diff -Nurb linux-2.6.22-594/kernel/user.c.orig linux-2.6.22-595/kernel/user.c.orig
+--- linux-2.6.22-594/kernel/user.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/kernel/user.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,227 +0,0 @@
+-/*
+- * The "user cache".
+- *
+- * (C) Copyright 1991-2000 Linus Torvalds
+- *
+- * We have a per-user structure to keep track of how many
+- * processes, files etc the user has claimed, in order to be
+- * able to have per-user limits for system resources.
+- */
+-
+-#include <linux/init.h>
+-#include <linux/sched.h>
+-#include <linux/slab.h>
+-#include <linux/bitops.h>
+-#include <linux/key.h>
+-#include <linux/interrupt.h>
+-#include <linux/module.h>
+-#include <linux/user_namespace.h>
+-
+-/*
+- * UID task count cache, to get fast user lookup in "alloc_uid"
+- * when changing user ID's (ie setuid() and friends).
+- */
+-
+-#define UIDHASH_MASK (UIDHASH_SZ - 1)
+-#define __uidhashfn(xid,uid) ((((uid) >> UIDHASH_BITS) + ((uid)^(xid))) & UIDHASH_MASK)
+-#define uidhashentry(ns, xid, uid) ((ns)->uidhash_table + __uidhashfn(xid, uid))
+-
+-static struct kmem_cache *uid_cachep;
+-static struct list_head uidhash_table[UIDHASH_SZ];
+-
+-/*
+- * The uidhash_lock is mostly taken from process context, but it is
+- * occasionally also taken from softirq/tasklet context, when
+- * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+- * But free_uid() is also called with local interrupts disabled, and running
+- * local_bh_enable() with local interrupts disabled is an error - we'll run
+- * softirq callbacks, and they can unconditionally enable interrupts, and
+- * the caller of free_uid() didn't expect that..
+- */
+-static DEFINE_SPINLOCK(uidhash_lock);
+-
+-struct user_struct root_user = {
+- .__count = ATOMIC_INIT(1),
+- .processes = ATOMIC_INIT(1),
+- .files = ATOMIC_INIT(0),
+- .sigpending = ATOMIC_INIT(0),
+- .mq_bytes = 0,
+- .locked_shm = 0,
+-#ifdef CONFIG_KEYS
+- .uid_keyring = &root_user_keyring,
+- .session_keyring = &root_session_keyring,
+-#endif
+-};
+-
+-/*
+- * These routines must be called with the uidhash spinlock held!
+- */
+-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+-{
+- list_add(&up->uidhash_list, hashent);
+-}
+-
+-static inline void uid_hash_remove(struct user_struct *up)
+-{
+- list_del(&up->uidhash_list);
+-}
+-
+-static inline struct user_struct *uid_hash_find(xid_t xid, uid_t uid, struct list_head *hashent)
+-{
+- struct list_head *up;
+-
+- list_for_each(up, hashent) {
+- struct user_struct *user;
+-
+- user = list_entry(up, struct user_struct, uidhash_list);
+-
+- if(user->uid == uid && user->xid == xid) {
+- atomic_inc(&user->__count);
+- return user;
+- }
+- }
+-
+- return NULL;
+-}
+-
+-/*
+- * Locate the user_struct for the passed UID. If found, take a ref on it. The
+- * caller must undo that ref with free_uid().
+- *
+- * If the user_struct could not be found, return NULL.
+- */
+-struct user_struct *find_user(xid_t xid, uid_t uid)
+-{
+- struct user_struct *ret;
+- unsigned long flags;
+- struct user_namespace *ns = current->nsproxy->user_ns;
+-
+- spin_lock_irqsave(&uidhash_lock, flags);
+- ret = uid_hash_find(xid, uid, uidhashentry(ns, xid, uid));
+- spin_unlock_irqrestore(&uidhash_lock, flags);
+- return ret;
+-}
+-
+-void free_uid(struct user_struct *up)
+-{
+- unsigned long flags;
+-
+- if (!up)
+- return;
+-
+- local_irq_save(flags);
+- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+- uid_hash_remove(up);
+- spin_unlock_irqrestore(&uidhash_lock, flags);
+- key_put(up->uid_keyring);
+- key_put(up->session_keyring);
+- kmem_cache_free(uid_cachep, up);
+- } else {
+- local_irq_restore(flags);
+- }
+-}
+-
+-struct user_struct * alloc_uid(xid_t xid, uid_t uid)
+-{
+- struct user_namespace *ns = current->nsproxy->user_ns;
+- struct list_head *hashent = uidhashentry(ns,xid, uid);
+- struct user_struct *up;
+-
+- spin_lock_irq(&uidhash_lock);
+- up = uid_hash_find(xid, uid, hashent);
+- spin_unlock_irq(&uidhash_lock);
+-
+- if (!up) {
+- struct user_struct *new;
+-
+- new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
+- if (!new)
+- return NULL;
+- new->uid = uid;
+- new->xid = xid;
+- atomic_set(&new->__count, 1);
+- atomic_set(&new->processes, 0);
+- atomic_set(&new->files, 0);
+- atomic_set(&new->sigpending, 0);
+-#ifdef CONFIG_INOTIFY_USER
+- atomic_set(&new->inotify_watches, 0);
+- atomic_set(&new->inotify_devs, 0);
+-#endif
+-
+- new->mq_bytes = 0;
+- new->locked_shm = 0;
+-
+- if (alloc_uid_keyring(new, current) < 0) {
+- kmem_cache_free(uid_cachep, new);
+- return NULL;
+- }
+-
+- /*
+- * Before adding this, check whether we raced
+- * on adding the same user already..
+- */
+- spin_lock_irq(&uidhash_lock);
+- up = uid_hash_find(xid, uid, hashent);
+- if (up) {
+- key_put(new->uid_keyring);
+- key_put(new->session_keyring);
+- kmem_cache_free(uid_cachep, new);
+- } else {
+- uid_hash_insert(new, hashent);
+- up = new;
+- }
+- spin_unlock_irq(&uidhash_lock);
+-
+- }
+- return up;
+-}
+-
+-void switch_uid(struct user_struct *new_user)
+-{
+- struct user_struct *old_user;
+-
+- /* What if a process setreuid()'s and this brings the
+- * new uid over his NPROC rlimit? We can check this now
+- * cheaply with the new uid cache, so if it matters
+- * we should be checking for it. -DaveM
+- */
+- old_user = current->user;
+- atomic_inc(&new_user->processes);
+- atomic_dec(&old_user->processes);
+- switch_uid_keyring(new_user);
+- current->user = new_user;
+-
+- /*
+- * We need to synchronize with __sigqueue_alloc()
+- * doing a get_uid(p->user).. If that saw the old
+- * user value, we need to wait until it has exited
+- * its critical region before we can free the old
+- * structure.
+- */
+- smp_mb();
+- spin_unlock_wait(¤t->sighand->siglock);
+-
+- free_uid(old_user);
+- suid_keys(current);
+-}
+-
+-
+-static int __init uid_cache_init(void)
+-{
+- int n;
+-
+- uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+-
+- for(n = 0; n < UIDHASH_SZ; ++n)
+- INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
+-
+- /* Insert the root user immediately (init already runs as root) */
+- spin_lock_irq(&uidhash_lock);
+- uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0, 0));
+- spin_unlock_irq(&uidhash_lock);
+-
+- return 0;
+-}
+-
+-module_init(uid_cache_init);
+diff -Nurb linux-2.6.22-594/kernel/vserver/context.c linux-2.6.22-595/kernel/vserver/context.c
+--- linux-2.6.22-594/kernel/vserver/context.c 2008-03-20 00:04:46.000000000 -0400
++++ linux-2.6.22-595/kernel/vserver/context.c 2008-03-20 00:13:22.000000000 -0400
+@@ -589,13 +589,13 @@
+ struct nsproxy *old_nsp, *new_nsp;
+
+ ret = unshare_nsproxy_namespaces(
+- CLONE_NEWUTS | CLONE_NEWIPC,
++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET,
+ &new_nsp, NULL);
+ if (ret)
+ goto out;
+
+ old_nsp = xchg(&p->nsproxy, new_nsp);
+- vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC);
++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+ put_nsproxy(old_nsp);
+ }
+ }
+@@ -781,7 +781,7 @@
+ if (vs_state_change(new_vxi, VSC_STARTUP))
+ goto out;
+
+- ret = vx_migrate_task(current, new_vxi, (!data));
++ ret = vx_migrate_task(current, new_vxi, 1 /*(!data) Hack no. 1 - Sapan*/);
+ if (ret)
+ goto out;
+
+diff -Nurb linux-2.6.22-594/kernel/vserver/context.c.orig linux-2.6.22-595/kernel/vserver/context.c.orig
+--- linux-2.6.22-594/kernel/vserver/context.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/kernel/vserver/context.c.orig 2008-03-20 00:04:46.000000000 -0400
+@@ -0,0 +1,966 @@
++/*
++ * linux/kernel/vserver/context.c
++ *
++ * Virtual Server: Context Support
++ *
++ * Copyright (C) 2003-2007 Herbert Pötzl
++ *
++ * V0.01 context helper
++ * V0.02 vx_ctx_kill syscall command
++ * V0.03 replaced context_info calls
++ * V0.04 redesign of struct (de)alloc
++ * V0.05 rlimit basic implementation
++ * V0.06 task_xid and info commands
++ * V0.07 context flags and caps
++ * V0.08 switch to RCU based hash
++ * V0.09 revert to non RCU for now
++ * V0.10 and back to working RCU hash
++ * V0.11 and back to locking again
++ * V0.12 referenced context store
++ * V0.13 separate per cpu data
++ * V0.14 changed vcmds to vxi arg
++ * V0.15 added context stat
++ * V0.16 have __create claim() the vxi
++ * V0.17 removed older and legacy stuff
++ *
++ */
++
++#include <linux/slab.h>
++#include <linux/types.h>
++#include <linux/pid_namespace.h>
++
++#include <linux/vserver/context.h>
++#include <linux/vserver/network.h>
++#include <linux/vserver/debug.h>
++#include <linux/vserver/limit.h>
++#include <linux/vserver/limit_int.h>
++#include <linux/vserver/space.h>
++
++#include <linux/vs_context.h>
++#include <linux/vs_limit.h>
++#include <linux/vserver/context_cmd.h>
++
++#include "cvirt_init.h"
++#include "cacct_init.h"
++#include "limit_init.h"
++#include "sched_init.h"
++
++
++atomic_t vx_global_ctotal = ATOMIC_INIT(0);
++atomic_t vx_global_cactive = ATOMIC_INIT(0);
++
++
++/* now inactive context structures */
++
++static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT;
++
++static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED;
++
++
++/* __alloc_vx_info()
++
++ * allocate an initialized vx_info struct
++ * doesn't make it visible (hash) */
++
++static struct vx_info *__alloc_vx_info(xid_t xid)
++{
++ struct vx_info *new = NULL;
++ int cpu;
++
++ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
++
++ /* would this benefit from a slab cache? */
++ new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
++ if (!new)
++ return 0;
++
++ memset(new, 0, sizeof(struct vx_info));
++#ifdef CONFIG_SMP
++ new->ptr_pc = alloc_percpu(struct _vx_info_pc);
++ if (!new->ptr_pc)
++ goto error;
++#endif
++ new->vx_id = xid;
++ INIT_HLIST_NODE(&new->vx_hlist);
++ atomic_set(&new->vx_usecnt, 0);
++ atomic_set(&new->vx_tasks, 0);
++ new->vx_parent = NULL;
++ new->vx_state = 0;
++ init_waitqueue_head(&new->vx_wait);
++
++ /* prepare reaper */
++ get_task_struct(init_pid_ns.child_reaper);
++ new->vx_reaper = init_pid_ns.child_reaper;
++ new->vx_badness_bias = 0;
++
++ /* rest of init goes here */
++ vx_info_init_limit(&new->limit);
++ vx_info_init_sched(&new->sched);
++ vx_info_init_cvirt(&new->cvirt);
++ vx_info_init_cacct(&new->cacct);
++
++ /* per cpu data structures */
++ for_each_possible_cpu(cpu) {
++ vx_info_init_sched_pc(
++ &vx_per_cpu(new, sched_pc, cpu), cpu);
++ vx_info_init_cvirt_pc(
++ &vx_per_cpu(new, cvirt_pc, cpu), cpu);
++ }
++
++ new->vx_flags = VXF_INIT_SET;
++ new->vx_bcaps = CAP_INIT_EFF_SET;
++ new->vx_ccaps = 0;
++ new->vx_cap_bset = cap_bset;
++
++ new->reboot_cmd = 0;
++ new->exit_code = 0;
++
++ new->vx_nsproxy = copy_nsproxy(current->nsproxy);
++
++ vxdprintk(VXD_CBIT(xid, 0),
++ "alloc_vx_info(%d) = %p", xid, new);
++ vxh_alloc_vx_info(new);
++ atomic_inc(&vx_global_ctotal);
++ return new;
++#ifdef CONFIG_SMP
++error:
++ kfree(new);
++ return 0;
++#endif
++}
++
++/* __dealloc_vx_info()
++
++ * final disposal of vx_info */
++
++static void __dealloc_vx_info(struct vx_info *vxi)
++{
++ int cpu;
++
++ vxdprintk(VXD_CBIT(xid, 0),
++ "dealloc_vx_info(%p)", vxi);
++ vxh_dealloc_vx_info(vxi);
++
++ vxi->vx_id = -1;
++
++ vx_info_exit_limit(&vxi->limit);
++ vx_info_exit_sched(&vxi->sched);
++ vx_info_exit_cvirt(&vxi->cvirt);
++ vx_info_exit_cacct(&vxi->cacct);
++
++ for_each_possible_cpu(cpu) {
++ vx_info_exit_sched_pc(
++ &vx_per_cpu(vxi, sched_pc, cpu), cpu);
++ vx_info_exit_cvirt_pc(
++ &vx_per_cpu(vxi, cvirt_pc, cpu), cpu);
++ }
++
++ vxi->vx_state |= VXS_RELEASED;
++
++#ifdef CONFIG_SMP
++ free_percpu(vxi->ptr_pc);
++#endif
++ kfree(vxi);
++ atomic_dec(&vx_global_ctotal);
++}
++
++static void __shutdown_vx_info(struct vx_info *vxi)
++{
++ struct nsproxy *nsproxy;
++ struct fs_struct *fs;
++
++ might_sleep();
++
++ vxi->vx_state |= VXS_SHUTDOWN;
++ vs_state_change(vxi, VSC_SHUTDOWN);
++
++ nsproxy = xchg(&vxi->vx_nsproxy, NULL);
++ fs = xchg(&vxi->vx_fs, NULL);
++
++ if (nsproxy)
++ put_nsproxy(nsproxy);
++ if (fs)
++ put_fs_struct(fs);
++}
++
++/* exported stuff */
++
++void free_vx_info(struct vx_info *vxi)
++{
++ unsigned long flags;
++
++ /* check for reference counts first */
++ BUG_ON(atomic_read(&vxi->vx_usecnt));
++ BUG_ON(atomic_read(&vxi->vx_tasks));
++
++ /* context must not be hashed */
++ BUG_ON(vx_info_state(vxi, VXS_HASHED));
++
++ /* context shutdown is mandatory */
++ BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));
++
++ BUG_ON(vxi->vx_nsproxy);
++ BUG_ON(vxi->vx_fs);
++
++ spin_lock_irqsave(&vx_info_inactive_lock, flags);
++ hlist_del(&vxi->vx_hlist);
++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
++
++ __dealloc_vx_info(vxi);
++}
++
++
++/* hash table for vx_info hash */
++
++#define VX_HASH_SIZE 13
++
++static struct hlist_head vx_info_hash[VX_HASH_SIZE] =
++ { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT };
++
++static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
++
++
++static inline unsigned int __hashval(xid_t xid)
++{
++ return (xid % VX_HASH_SIZE);
++}
++
++
++
++/* __hash_vx_info()
++
++ * add the vxi to the global hash table
++ * requires the hash_lock to be held */
++
++static inline void __hash_vx_info(struct vx_info *vxi)
++{
++ struct hlist_head *head;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ vxdprintk(VXD_CBIT(xid, 4),
++ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
++ vxh_hash_vx_info(vxi);
++
++ /* context must not be hashed */
++ BUG_ON(vx_info_state(vxi, VXS_HASHED));
++
++ vxi->vx_state |= VXS_HASHED;
++ head = &vx_info_hash[__hashval(vxi->vx_id)];
++ hlist_add_head(&vxi->vx_hlist, head);
++ atomic_inc(&vx_global_cactive);
++}
++
++/* __unhash_vx_info()
++
++ * remove the vxi from the global hash table
++ * requires the hash_lock to be held */
++
++static inline void __unhash_vx_info(struct vx_info *vxi)
++{
++ unsigned long flags;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ vxdprintk(VXD_CBIT(xid, 4),
++ "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id,
++ atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks));
++ vxh_unhash_vx_info(vxi);
++
++ /* context must be hashed */
++ BUG_ON(!vx_info_state(vxi, VXS_HASHED));
++ /* but without tasks */
++ BUG_ON(atomic_read(&vxi->vx_tasks));
++
++ vxi->vx_state &= ~VXS_HASHED;
++ hlist_del_init(&vxi->vx_hlist);
++ spin_lock_irqsave(&vx_info_inactive_lock, flags);
++ hlist_add_head(&vxi->vx_hlist, &vx_info_inactive);
++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
++ atomic_dec(&vx_global_cactive);
++}
++
++
++/* __lookup_vx_info()
++
++ * requires the hash_lock to be held
++ * doesn't increment the vx_refcnt */
++
++static inline struct vx_info *__lookup_vx_info(xid_t xid)
++{
++ struct hlist_head *head = &vx_info_hash[__hashval(xid)];
++ struct hlist_node *pos;
++ struct vx_info *vxi;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ hlist_for_each(pos, head) {
++ vxi = hlist_entry(pos, struct vx_info, vx_hlist);
++
++ if (vxi->vx_id == xid)
++ goto found;
++ }
++ vxi = NULL;
++found:
++ vxdprintk(VXD_CBIT(xid, 0),
++ "__lookup_vx_info(#%u): %p[#%u]",
++ xid, vxi, vxi ? vxi->vx_id : 0);
++ vxh_lookup_vx_info(vxi, xid);
++ return vxi;
++}
++
++
++/* __create_vx_info()
++
++ * create the requested context
++ * get(), claim() and hash it */
++
++static struct vx_info *__create_vx_info(int id)
++{
++ struct vx_info *new, *vxi = NULL;
++
++ vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);
++
++ if (!(new = __alloc_vx_info(id)))
++ return ERR_PTR(-ENOMEM);
++
++ /* required to make dynamic xids unique */
++ spin_lock(&vx_info_hash_lock);
++
++ /* static context requested */
++ if ((vxi = __lookup_vx_info(id))) {
++ vxdprintk(VXD_CBIT(xid, 0),
++ "create_vx_info(%d) = %p (already there)", id, vxi);
++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
++ vxi = ERR_PTR(-EBUSY);
++ else
++ vxi = ERR_PTR(-EEXIST);
++ goto out_unlock;
++ }
++ /* new context */
++ vxdprintk(VXD_CBIT(xid, 0),
++ "create_vx_info(%d) = %p (new)", id, new);
++ claim_vx_info(new, NULL);
++ __hash_vx_info(get_vx_info(new));
++ vxi = new, new = NULL;
++
++out_unlock:
++ spin_unlock(&vx_info_hash_lock);
++ vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id);
++ if (new)
++ __dealloc_vx_info(new);
++ return vxi;
++}
++
++
++/* exported stuff */
++
++
++void unhash_vx_info(struct vx_info *vxi)
++{
++ __shutdown_vx_info(vxi);
++ spin_lock(&vx_info_hash_lock);
++ __unhash_vx_info(vxi);
++ spin_unlock(&vx_info_hash_lock);
++ __wakeup_vx_info(vxi);
++}
++
++
++/* lookup_vx_info()
++
++ * search for a vx_info and get() it
++ * negative id means current */
++
++struct vx_info *lookup_vx_info(int id)
++{
++ struct vx_info *vxi = NULL;
++
++ if (id < 0) {
++ vxi = get_vx_info(current->vx_info);
++ } else if (id > 1) {
++ spin_lock(&vx_info_hash_lock);
++ vxi = get_vx_info(__lookup_vx_info(id));
++ spin_unlock(&vx_info_hash_lock);
++ }
++ return vxi;
++}
++
++/* xid_is_hashed()
++
++ * verify that xid is still hashed */
++
++int xid_is_hashed(xid_t xid)
++{
++ int hashed;
++
++ spin_lock(&vx_info_hash_lock);
++ hashed = (__lookup_vx_info(xid) != NULL);
++ spin_unlock(&vx_info_hash_lock);
++ return hashed;
++}
++
++#ifdef CONFIG_PROC_FS
++
++/* get_xid_list()
++
++ * get a subset of hashed xids for proc
++ * assumes size is at least one */
++
++int get_xid_list(int index, unsigned int *xids, int size)
++{
++ int hindex, nr_xids = 0;
++
++ /* only show current and children */
++ if (!vx_check(0, VS_ADMIN | VS_WATCH)) {
++ if (index > 0)
++ return 0;
++ xids[nr_xids] = vx_current_xid();
++ return 1;
++ }
++
++ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
++ struct hlist_head *head = &vx_info_hash[hindex];
++ struct hlist_node *pos;
++
++ spin_lock(&vx_info_hash_lock);
++ hlist_for_each(pos, head) {
++ struct vx_info *vxi;
++
++ if (--index > 0)
++ continue;
++
++ vxi = hlist_entry(pos, struct vx_info, vx_hlist);
++ xids[nr_xids] = vxi->vx_id;
++ if (++nr_xids >= size) {
++ spin_unlock(&vx_info_hash_lock);
++ goto out;
++ }
++ }
++ /* keep the lock time short */
++ spin_unlock(&vx_info_hash_lock);
++ }
++out:
++ return nr_xids;
++}
++#endif
++
++#ifdef CONFIG_VSERVER_DEBUG
++
++void dump_vx_info_inactive(int level)
++{
++ struct hlist_node *entry, *next;
++
++ hlist_for_each_safe(entry, next, &vx_info_inactive) {
++ struct vx_info *vxi =
++ list_entry(entry, struct vx_info, vx_hlist);
++
++ dump_vx_info(vxi, level);
++ }
++}
++
++#endif
++
++int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
++{
++ struct user_struct *new_user, *old_user;
++
++ if (!p || !vxi)
++ BUG();
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
++ return -EACCES;
++
++ new_user = alloc_uid(vxi->vx_id, p->uid);
++ if (!new_user)
++ return -ENOMEM;
++
++ old_user = p->user;
++ if (new_user != old_user) {
++ atomic_inc(&new_user->processes);
++ atomic_dec(&old_user->processes);
++ p->user = new_user;
++ }
++ free_uid(old_user);
++ return 0;
++}
++
++void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p)
++{
++ p->cap_effective &= vxi->vx_cap_bset;
++ p->cap_inheritable &= vxi->vx_cap_bset;
++ p->cap_permitted &= vxi->vx_cap_bset;
++}
++
++
++#include <linux/file.h>
++
++static int vx_openfd_task(struct task_struct *tsk)
++{
++ struct files_struct *files = tsk->files;
++ struct fdtable *fdt;
++ const unsigned long *bptr;
++ int count, total;
++
++ /* no rcu_read_lock() because of spin_lock() */
++ spin_lock(&files->file_lock);
++ fdt = files_fdtable(files);
++ bptr = fdt->open_fds->fds_bits;
++ count = fdt->max_fds / (sizeof(unsigned long) * 8);
++ for (total = 0; count > 0; count--) {
++ if (*bptr)
++ total += hweight_long(*bptr);
++ bptr++;
++ }
++ spin_unlock(&files->file_lock);
++ return total;
++}
++
++
++/* for *space compatibility */
++
++asmlinkage long sys_unshare(unsigned long);
++
++/*
++ * migrate task to new context
++ * gets vxi, puts old_vxi on change
++ * optionally unshares namespaces (hack)
++ */
++
++int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare)
++{
++ struct vx_info *old_vxi;
++ int ret = 0;
++
++ if (!p || !vxi)
++ BUG();
++
++ vxdprintk(VXD_CBIT(xid, 5),
++ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
++ vxi->vx_id, atomic_read(&vxi->vx_usecnt));
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) &&
++ !vx_info_flags(vxi, VXF_STATE_SETUP, 0))
++ return -EACCES;
++
++ if (vx_info_state(vxi, VXS_SHUTDOWN))
++ return -EFAULT;
++
++ old_vxi = task_get_vx_info(p);
++ if (old_vxi == vxi)
++ goto out;
++
++ if (!(ret = vx_migrate_user(p, vxi))) {
++ int openfd;
++
++ task_lock(p);
++ openfd = vx_openfd_task(p);
++
++ if (old_vxi) {
++ atomic_dec(&old_vxi->cvirt.nr_threads);
++ atomic_dec(&old_vxi->cvirt.nr_running);
++ __rlim_dec(&old_vxi->limit, RLIMIT_NPROC);
++ /* FIXME: what about the struct files here? */
++ __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd);
++ /* account for the executable */
++ __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY);
++ }
++ atomic_inc(&vxi->cvirt.nr_threads);
++ atomic_inc(&vxi->cvirt.nr_running);
++ __rlim_inc(&vxi->limit, RLIMIT_NPROC);
++ /* FIXME: what about the struct files here? */
++ __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd);
++ /* account for the executable */
++ __rlim_inc(&vxi->limit, VLIMIT_DENTRY);
++
++ if (old_vxi) {
++ release_vx_info(old_vxi, p);
++ clr_vx_info(&p->vx_info);
++ }
++ claim_vx_info(vxi, p);
++ set_vx_info(&p->vx_info, vxi);
++ p->xid = vxi->vx_id;
++
++ vxdprintk(VXD_CBIT(xid, 5),
++ "moved task %p into vxi:%p[#%d]",
++ p, vxi, vxi->vx_id);
++
++ vx_mask_cap_bset(vxi, p);
++ task_unlock(p);
++
++ /* hack for *spaces to provide compatibility */
++ if (unshare) {
++ struct nsproxy *old_nsp, *new_nsp;
++
++ ret = unshare_nsproxy_namespaces(
++ CLONE_NEWUTS | CLONE_NEWIPC,
++ &new_nsp, NULL);
++ if (ret)
++ goto out;
++
++ old_nsp = xchg(&p->nsproxy, new_nsp);
++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC);
++ put_nsproxy(old_nsp);
++ }
++ }
++out:
++ put_vx_info(old_vxi);
++ return ret;
++}
++
++int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
++{
++ struct task_struct *old_reaper;
++
++ if (!vxi)
++ return -EINVAL;
++
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_reaper(%p[#%d],%p[#%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid);
++
++ old_reaper = vxi->vx_reaper;
++ if (old_reaper == p)
++ return 0;
++
++ /* set new child reaper */
++ get_task_struct(p);
++ vxi->vx_reaper = p;
++ put_task_struct(old_reaper);
++ return 0;
++}
++
++int vx_set_init(struct vx_info *vxi, struct task_struct *p)
++{
++ if (!vxi)
++ return -EINVAL;
++
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
++
++ vxi->vx_flags &= ~VXF_STATE_INIT;
++ vxi->vx_initpid = p->tgid;
++ return 0;
++}
++
++void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
++
++ vxi->exit_code = code;
++ vxi->vx_initpid = 0;
++}
++
++
++void vx_set_persistent(struct vx_info *vxi)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);
++
++ get_vx_info(vxi);
++ claim_vx_info(vxi, NULL);
++}
++
++void vx_clear_persistent(struct vx_info *vxi)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);
++
++ release_vx_info(vxi, NULL);
++ put_vx_info(vxi);
++}
++
++void vx_update_persistent(struct vx_info *vxi)
++{
++ if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
++ vx_set_persistent(vxi);
++ else
++ vx_clear_persistent(vxi);
++}
++
++
++/* task must be current or locked */
++
++void exit_vx_info(struct task_struct *p, int code)
++{
++ struct vx_info *vxi = p->vx_info;
++
++ if (vxi) {
++ atomic_dec(&vxi->cvirt.nr_threads);
++ vx_nproc_dec(p);
++
++ vxi->exit_code = code;
++ release_vx_info(vxi, p);
++ }
++}
++
++void exit_vx_info_early(struct task_struct *p, int code)
++{
++ struct vx_info *vxi = p->vx_info;
++
++ if (vxi) {
++ if (vxi->vx_initpid == p->tgid)
++ vx_exit_init(vxi, p, code);
++ if (vxi->vx_reaper == p)
++ vx_set_reaper(vxi, init_pid_ns.child_reaper);
++ }
++}
++
++
++/* vserver syscall commands below here */
++
++/* taks xid and vx_info functions */
++
++#include <asm/uaccess.h>
++
++
++int vc_task_xid(uint32_t id)
++{
++ xid_t xid;
++
++ if (id) {
++ struct task_struct *tsk;
++
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_real_pid(id);
++ xid = (tsk) ? tsk->xid : -ESRCH;
++ read_unlock(&tasklist_lock);
++ } else
++ xid = vx_current_xid();
++ return xid;
++}
++
++
++int vc_vx_info(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_vx_info_v0 vc_data;
++
++ vc_data.xid = vxi->vx_id;
++ vc_data.initpid = vxi->vx_initpid;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++
++int vc_ctx_stat(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_stat_v0 vc_data;
++
++ vc_data.usecnt = atomic_read(&vxi->vx_usecnt);
++ vc_data.tasks = atomic_read(&vxi->vx_tasks);
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++
++/* context functions */
++
++int vc_ctx_create(uint32_t xid, void __user *data)
++{
++ struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
++ struct vx_info *new_vxi;
++ int ret;
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ if ((xid > MAX_S_CONTEXT) || (xid < 2))
++ return -EINVAL;
++
++ new_vxi = __create_vx_info(xid);
++ if (IS_ERR(new_vxi))
++ return PTR_ERR(new_vxi);
++
++ /* initial flags */
++ new_vxi->vx_flags = vc_data.flagword;
++
++ ret = -ENOEXEC;
++ if (vs_state_change(new_vxi, VSC_STARTUP))
++ goto out;
++
++ ret = vx_migrate_task(current, new_vxi, (!data));
++ if (ret)
++ goto out;
++
++ /* return context id on success */
++ ret = new_vxi->vx_id;
++
++ /* get a reference for persistent contexts */
++ if ((vc_data.flagword & VXF_PERSISTENT))
++ vx_set_persistent(new_vxi);
++out:
++ release_vx_info(new_vxi, NULL);
++ put_vx_info(new_vxi);
++ return ret;
++}
++
++
++int vc_ctx_migrate(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
++ int ret;
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ ret = vx_migrate_task(current, vxi, 0);
++ if (ret)
++ return ret;
++ if (vc_data.flagword & VXM_SET_INIT)
++ ret = vx_set_init(vxi, current);
++ if (ret)
++ return ret;
++ if (vc_data.flagword & VXM_SET_REAPER)
++ ret = vx_set_reaper(vxi, current);
++ return ret;
++}
++
++
++int vc_get_cflags(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_flags_v0 vc_data;
++
++ vc_data.flagword = vxi->vx_flags;
++
++ /* special STATE flag handling */
++ vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME);
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_cflags(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_flags_v0 vc_data;
++ uint64_t mask, trigger;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ /* special STATE flag handling */
++ mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
++ trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
++
++ if (vxi == current->vx_info) {
++ if (trigger & VXF_STATE_SETUP)
++ vx_mask_cap_bset(vxi, current);
++ if (trigger & VXF_STATE_INIT) {
++ int ret;
++
++ ret = vx_set_init(vxi, current);
++ if (ret)
++ return ret;
++ ret = vx_set_reaper(vxi, current);
++ if (ret)
++ return ret;
++ }
++ }
++
++ vxi->vx_flags = vs_mask_flags(vxi->vx_flags,
++ vc_data.flagword, mask);
++ if (trigger & VXF_PERSISTENT)
++ vx_update_persistent(vxi);
++
++ return 0;
++}
++
++static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps)
++{
++ if (bcaps)
++ *bcaps = vxi->vx_bcaps;
++ if (ccaps)
++ *ccaps = vxi->vx_ccaps;
++
++ return 0;
++}
++
++int vc_get_ccaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_caps_v1 vc_data;
++ int ret;
++
++ ret = do_get_caps(vxi, NULL, &vc_data.ccaps);
++ if (ret)
++ return ret;
++ vc_data.cmask = ~0ULL;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++static int do_set_caps(struct vx_info *vxi,
++ uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask)
++{
++ vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask);
++ vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask);
++
++ return 0;
++}
++
++int vc_set_ccaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_caps_v1 vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask);
++}
++
++int vc_get_bcaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_bcaps vc_data;
++ int ret;
++
++ ret = do_get_caps(vxi, &vc_data.bcaps, NULL);
++ if (ret)
++ return ret;
++ vc_data.bmask = ~0ULL;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_bcaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_bcaps vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0);
++}
++
++
++int vc_get_badness(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_badness_v0 vc_data;
++
++ vc_data.bias = vxi->vx_badness_bias;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_badness(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_badness_v0 vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ vxi->vx_badness_bias = vc_data.bias;
++ return 0;
++}
++
++#include <linux/module.h>
++
++EXPORT_SYMBOL_GPL(free_vx_info);
++
+diff -Nurb linux-2.6.22-594/kernel/vserver/space.c linux-2.6.22-595/kernel/vserver/space.c
+--- linux-2.6.22-594/kernel/vserver/space.c 2008-03-20 00:05:21.000000000 -0400
++++ linux-2.6.22-595/kernel/vserver/space.c 2008-03-20 00:08:28.000000000 -0400
@@ -15,6 +15,7 @@
#include <linux/utsname.h>
#include <linux/nsproxy.h>
#include <asm/uaccess.h>
#include <linux/vs_context.h>
-@@ -54,6 +55,7 @@
+@@ -55,6 +56,7 @@
struct mnt_namespace *old_ns;
struct uts_namespace *old_uts;
struct ipc_namespace *old_ipc;
struct nsproxy *nsproxy;
nsproxy = copy_nsproxy(old_nsproxy);
-@@ -83,6 +85,17 @@
- get_ipc_ns(nsproxy->ipc_ns);
+@@ -85,12 +87,26 @@
} else
old_ipc = NULL;
-+
+
+ if (mask & CLONE_NEWNET) {
+ old_net = nsproxy->net_ns;
+ nsproxy->net_ns = new_nsproxy->net_ns;
+ } else
+ old_net = NULL;
+
-
++
if (old_ns)
put_mnt_ns(old_ns);
-@@ -90,6 +101,9 @@
+ if (old_uts)
put_uts_ns(old_uts);
if (old_ipc)
put_ipc_ns(old_ipc);
out:
return nsproxy;
}
-@@ -250,7 +264,8 @@
+@@ -251,6 +267,7 @@
int vc_enter_space(struct vx_info *vxi, void __user *data)
{
-- struct vcmd_space_mask vc_data = { .mask = 0 };
+ /* Ask dhozac how to pass this flag from user space - Sapan */
-+ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET };
+ struct vcmd_space_mask vc_data = { .mask = 0 };
if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
- return -EFAULT;
+diff -Nurb linux-2.6.22-594/kernel/vserver/space.c.orig linux-2.6.22-595/kernel/vserver/space.c.orig
+--- linux-2.6.22-594/kernel/vserver/space.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/kernel/vserver/space.c.orig 2008-03-20 00:05:28.000000000 -0400
+@@ -0,0 +1,295 @@
++/*
++ * linux/kernel/vserver/space.c
++ *
++ * Virtual Server: Context Space Support
++ *
++ * Copyright (C) 2003-2007 Herbert Pötzl
++ *
++ * V0.01 broken out from context.c 0.07
++ * V0.02 added task locking for namespace
++ * V0.03 broken out vx_enter_namespace
++ * V0.04 added *space support and commands
++ *
++ */
++
++#include <linux/utsname.h>
++#include <linux/nsproxy.h>
++#include <linux/err.h>
++#include <net/net_namespace.h>
++#include <asm/uaccess.h>
++
++#include <linux/vs_context.h>
++#include <linux/vserver/space.h>
++#include <linux/vserver/space_cmd.h>
++
++
++atomic_t vs_global_nsproxy = ATOMIC_INIT(0);
++atomic_t vs_global_fs = ATOMIC_INIT(0);
++atomic_t vs_global_mnt_ns = ATOMIC_INIT(0);
++atomic_t vs_global_uts_ns = ATOMIC_INIT(0);
++atomic_t vs_global_ipc_ns = ATOMIC_INIT(0);
++
++
++/* namespace functions */
++
++#include <linux/mnt_namespace.h>
++
++const struct vcmd_space_mask space_mask = {
++ .mask = CLONE_NEWNS |
++ CLONE_NEWUTS |
++ CLONE_NEWIPC |
++ CLONE_FS |
++ CLONE_NEWNET
++};
++
++
++/*
++ * build a new nsproxy mix
++ * assumes that both proxies are 'const'
++ * does not touch nsproxy refcounts
++ * will hold a reference on the result.
++ */
++
++struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy,
++ struct nsproxy *new_nsproxy, unsigned long mask)
++{
++ struct mnt_namespace *old_ns;
++ struct uts_namespace *old_uts;
++ struct ipc_namespace *old_ipc;
++ struct net *old_net;
++ struct nsproxy *nsproxy;
++
++ nsproxy = copy_nsproxy(old_nsproxy);
++ if (!nsproxy)
++ goto out;
++
++ if (mask & CLONE_NEWNS) {
++ old_ns = nsproxy->mnt_ns;
++ nsproxy->mnt_ns = new_nsproxy->mnt_ns;
++ if (nsproxy->mnt_ns)
++ get_mnt_ns(nsproxy->mnt_ns);
++ } else
++ old_ns = NULL;
++
++ if (mask & CLONE_NEWUTS) {
++ old_uts = nsproxy->uts_ns;
++ nsproxy->uts_ns = new_nsproxy->uts_ns;
++ if (nsproxy->uts_ns)
++ get_uts_ns(nsproxy->uts_ns);
++ } else
++ old_uts = NULL;
++
++ if (mask & CLONE_NEWIPC) {
++ old_ipc = nsproxy->ipc_ns;
++ nsproxy->ipc_ns = new_nsproxy->ipc_ns;
++ if (nsproxy->ipc_ns)
++ get_ipc_ns(nsproxy->ipc_ns);
++ } else
++ old_ipc = NULL;
++
++ if (mask & CLONE_NEWNET) {
++ old_net = nsproxy->net_ns;
++ nsproxy->net_ns = new_nsproxy->net_ns;
++ if (nsproxy->net_ns) {
++ get_net(nsproxy->net_ns);
++ printk(KERN_ALERT "Cloning network namespace\n");
++ }
++ } else
++ old_net = NULL;
++
++
++ if (old_ns)
++ put_mnt_ns(old_ns);
++ if (old_uts)
++ put_uts_ns(old_uts);
++ if (old_ipc)
++ put_ipc_ns(old_ipc);
++ if (old_net)
++ put_net(old_net);
++
++out:
++ return nsproxy;
++}
++
++
++/*
++ * merge two nsproxy structs into a new one.
++ * will hold a reference on the result.
++ */
++
++static inline
++struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old,
++ struct nsproxy *proxy, unsigned long mask)
++{
++ struct nsproxy null_proxy = { .mnt_ns = NULL };
++
++ if (!proxy)
++ return NULL;
++
++ if (mask) {
++ /* vs_mix_nsproxy returns with reference */
++ return vs_mix_nsproxy(old ? old : &null_proxy,
++ proxy, mask);
++ }
++ get_nsproxy(proxy);
++ return proxy;
++}
++
++/*
++ * merge two fs structs into a new one.
++ * will take a reference on the result.
++ */
++
++static inline
++struct fs_struct *__vs_merge_fs(struct fs_struct *old,
++ struct fs_struct *fs, unsigned long mask)
++{
++ if (!(mask & CLONE_FS)) {
++ if (old)
++ atomic_inc(&old->count);
++ return old;
++ }
++
++ if (!fs)
++ return NULL;
++
++ return copy_fs_struct(fs);
++}
++
++
++int vx_enter_space(struct vx_info *vxi, unsigned long mask)
++{
++ struct nsproxy *proxy, *proxy_cur, *proxy_new;
++ struct fs_struct *fs, *fs_cur, *fs_new;
++ int ret;
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
++ return -EACCES;
++
++ if (!mask)
++ mask = vxi->vx_nsmask;
++
++ if ((mask & vxi->vx_nsmask) != mask)
++ return -EINVAL;
++
++ proxy = vxi->vx_nsproxy;
++ fs = vxi->vx_fs;
++
++ task_lock(current);
++ fs_cur = current->fs;
++ atomic_inc(&fs_cur->count);
++ proxy_cur = current->nsproxy;
++ get_nsproxy(proxy_cur);
++ task_unlock(current);
++
++ fs_new = __vs_merge_fs(fs_cur, fs, mask);
++ if (IS_ERR(fs_new)) {
++ ret = PTR_ERR(fs_new);
++ goto out_put;
++ }
++
++ proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask);
++ if (IS_ERR(proxy_new)) {
++ ret = PTR_ERR(proxy_new);
++ goto out_put_fs;
++ }
++
++ fs_new = xchg(¤t->fs, fs_new);
++ proxy_new = xchg(¤t->nsproxy, proxy_new);
++ ret = 0;
++
++ if (proxy_new)
++ put_nsproxy(proxy_new);
++out_put_fs:
++ if (fs_new)
++ put_fs_struct(fs_new);
++out_put:
++ if (proxy_cur)
++ put_nsproxy(proxy_cur);
++ if (fs_cur)
++ put_fs_struct(fs_cur);
++ return ret;
++}
++
++
++int vx_set_space(struct vx_info *vxi, unsigned long mask)
++{
++ struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new;
++ struct fs_struct *fs_vxi, *fs_cur, *fs_new;
++ int ret;
++
++ if (!mask)
++ mask = space_mask.mask;
++
++ if ((mask & space_mask.mask) != mask)
++ return -EINVAL;
++
++ proxy_vxi = vxi->vx_nsproxy;
++ fs_vxi = vxi->vx_fs;
++
++ task_lock(current);
++ fs_cur = current->fs;
++ atomic_inc(&fs_cur->count);
++ proxy_cur = current->nsproxy;
++ get_nsproxy(proxy_cur);
++ task_unlock(current);
++
++ fs_new = __vs_merge_fs(fs_vxi, fs_cur, mask);
++ if (IS_ERR(fs_new)) {
++ ret = PTR_ERR(fs_new);
++ goto out_put;
++ }
++
++ proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask);
++ if (IS_ERR(proxy_new)) {
++ ret = PTR_ERR(proxy_new);
++ goto out_put_fs;
++ }
++
++ fs_new = xchg(&vxi->vx_fs, fs_new);
++ proxy_new = xchg(&vxi->vx_nsproxy, proxy_new);
++ vxi->vx_nsmask |= mask;
++ ret = 0;
++
++ if (proxy_new)
++ put_nsproxy(proxy_new);
++out_put_fs:
++ if (fs_new)
++ put_fs_struct(fs_new);
++out_put:
++ if (proxy_cur)
++ put_nsproxy(proxy_cur);
++ if (fs_cur)
++ put_fs_struct(fs_cur);
++ return ret;
++}
++
++
++int vc_enter_space(struct vx_info *vxi, void __user *data)
++{
++ /* Ask dhozac how to pass this flag from user space - Sapan*/
++ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET };
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return vx_enter_space(vxi, vc_data.mask);
++}
++
++int vc_set_space(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_space_mask vc_data = { .mask = 0 };
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return vx_set_space(vxi, vc_data.mask);
++}
++
++int vc_get_space_mask(struct vx_info *vxi, void __user *data)
++{
++ if (copy_to_user(data, &space_mask, sizeof(space_mask)))
++ return -EFAULT;
++ return 0;
++}
++
+diff -Nurb linux-2.6.22-594/net/core/net_namespace.c linux-2.6.22-595/net/core/net_namespace.c
+--- linux-2.6.22-594/net/core/net_namespace.c 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/net/core/net_namespace.c 2008-03-20 00:14:56.000000000 -0400
+@@ -112,10 +112,12 @@
+ ops = list_entry(ptr, struct pernet_operations, list);
+ if (ops->init) {
+ error = ops->init(net);
+- if (error < 0)
++ if (error < 0) {
++ printk(KERN_ALERT "Error setting up netns: %p\n", ops->init);
+ goto out_undo;
+ }
+ }
++ }
+ out:
+ return error;
+ out_undo:
+diff -Nurb linux-2.6.22-594/net/core/net_namespace.c.orig linux-2.6.22-595/net/core/net_namespace.c.orig
+--- linux-2.6.22-594/net/core/net_namespace.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/net/core/net_namespace.c.orig 2008-03-20 00:05:18.000000000 -0400
+@@ -0,0 +1,332 @@
++#include <linux/workqueue.h>
++#include <linux/rtnetlink.h>
++#include <linux/cache.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <net/net_namespace.h>
++
++/*
++ * Our network namespace constructor/destructor lists
++ */
++
++static LIST_HEAD(pernet_list);
++static struct list_head *first_device = &pernet_list;
++static DEFINE_MUTEX(net_mutex);
++
++static DEFINE_MUTEX(net_list_mutex);
++LIST_HEAD(net_namespace_list);
++
++static struct kmem_cache *net_cachep;
++
++struct net init_net;
++EXPORT_SYMBOL_GPL(init_net);
++
++void net_lock(void)
++{
++ mutex_lock(&net_list_mutex);
++}
++
++void net_unlock(void)
++{
++ mutex_unlock(&net_list_mutex);
++}
++
++static struct net *net_alloc(void)
++{
++ return kmem_cache_alloc(net_cachep, GFP_KERNEL);
++}
++
++static void net_free(struct net *net)
++{
++ if (!net)
++ return;
++
++ if (unlikely(atomic_read(&net->use_count) != 0)) {
++ printk(KERN_EMERG "network namespace not free! Usage: %d\n",
++ atomic_read(&net->use_count));
++ return;
++ }
++
++ kmem_cache_free(net_cachep, net);
++}
++
++static void cleanup_net(struct work_struct *work)
++{
++ struct pernet_operations *ops;
++ struct list_head *ptr;
++ struct net *net;
++
++ net = container_of(work, struct net, work);
++
++ mutex_lock(&net_mutex);
++
++ /* Don't let anyone else find us. */
++ net_lock();
++ list_del(&net->list);
++ net_unlock();
++
++ /* Run all of the network namespace exit methods */
++ list_for_each_prev(ptr, &pernet_list) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->exit)
++ ops->exit(net);
++ }
++
++ mutex_unlock(&net_mutex);
++
++ /* Ensure there are no outstanding rcu callbacks using this
++ * network namespace.
++ */
++ rcu_barrier();
++
++ /* Finally it is safe to free my network namespace structure */
++ net_free(net);
++}
++
++
++void __put_net(struct net *net)
++{
++ /* Cleanup the network namespace in process context */
++ INIT_WORK(&net->work, cleanup_net);
++ schedule_work(&net->work);
++}
++EXPORT_SYMBOL_GPL(__put_net);
++
++/*
++ * setup_net runs the initializers for the network namespace object.
++ */
++static int setup_net(struct net *net)
++{
++ /* Must be called with net_mutex held */
++ struct pernet_operations *ops;
++ struct list_head *ptr;
++ int error;
++
++ memset(net, 0, sizeof(struct net));
++ atomic_set(&net->count, 1);
++ atomic_set(&net->use_count, 0);
++
++ error = 0;
++ list_for_each(ptr, &pernet_list) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->init) {
++ error = ops->init(net);
++ if (error < 0)
++ goto out_undo;
++ }
++ }
++out:
++ return error;
++out_undo:
++ /* Walk through the list backwards calling the exit functions
++ * for the pernet modules whose init functions did not fail.
++ */
++ for (ptr = ptr->prev; ptr != &pernet_list; ptr = ptr->prev) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->exit)
++ ops->exit(net);
++ }
++ goto out;
++}
++
++struct net *copy_net_ns(unsigned long flags, struct net *old_net)
++{
++ struct net *new_net = NULL;
++ int err;
++
++ get_net(old_net);
++
++ if (!(flags & CLONE_NEWNET))
++ return old_net;
++
++ err = -EPERM;
++ if (!capable(CAP_SYS_ADMIN))
++ goto out;
++
++ err = -ENOMEM;
++ new_net = net_alloc();
++ if (!new_net)
++ goto out;
++
++ mutex_lock(&net_mutex);
++ err = setup_net(new_net);
++ if (err)
++ goto out_unlock;
++
++ net_lock();
++ list_add_tail(&new_net->list, &net_namespace_list);
++ net_unlock();
++
++
++out_unlock:
++ mutex_unlock(&net_mutex);
++out:
++ put_net(old_net);
++ if (err) {
++ net_free(new_net);
++ new_net = ERR_PTR(err);
++ }
++ return new_net;
++}
++
++static int __init net_ns_init(void)
++{
++ int err;
++
++ printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
++ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
++ SMP_CACHE_BYTES,
++ SLAB_PANIC, NULL, NULL);
++ mutex_lock(&net_mutex);
++ err = setup_net(&init_net);
++
++ net_lock();
++ list_add_tail(&init_net.list, &net_namespace_list);
++ net_unlock();
++
++ mutex_unlock(&net_mutex);
++ if (err)
++ panic("Could not setup the initial network namespace");
++
++ return 0;
++}
++
++pure_initcall(net_ns_init);
++
++static int register_pernet_operations(struct list_head *list,
++ struct pernet_operations *ops)
++{
++ struct net *net, *undo_net;
++ int error;
++
++ error = 0;
++ list_add_tail(&ops->list, list);
++ for_each_net(net) {
++ if (ops->init) {
++ error = ops->init(net);
++ if (error)
++ goto out_undo;
++ }
++ }
++out:
++ return error;
++
++out_undo:
++ /* If I have an error cleanup all namespaces I initialized */
++ list_del(&ops->list);
++ for_each_net(undo_net) {
++ if (undo_net == net)
++ goto undone;
++ if (ops->exit)
++ ops->exit(undo_net);
++ }
++undone:
++ goto out;
++}
++
++static void unregister_pernet_operations(struct pernet_operations *ops)
++{
++ struct net *net;
++
++ list_del(&ops->list);
++ for_each_net(net)
++ if (ops->exit)
++ ops->exit(net);
++}
++
++/**
++ * register_pernet_subsys - register a network namespace subsystem
++ * @ops: pernet operations structure for the subsystem
++ *
++ * Register a subsystem which has init and exit functions
++ * that are called when network namespaces are created and
++ * destroyed respectively.
++ *
++ * When registered all network namespace init functions are
++ * called for every existing network namespace. Allowing kernel
++ * modules to have a race free view of the set of network namespaces.
++ *
++ * When a new network namespace is created all of the init
++ * methods are called in the order in which they were registered.
++ *
++ * When a network namespace is destroyed all of the exit methods
++ * are called in the reverse of the order with which they were
++ * registered.
++ */
++int register_pernet_subsys(struct pernet_operations *ops)
++{
++ int error;
++ mutex_lock(&net_mutex);
++ error = register_pernet_operations(first_device, ops);
++ mutex_unlock(&net_mutex);
++ return error;
++}
++EXPORT_SYMBOL_GPL(register_pernet_subsys);
++
++/**
++ * unregister_pernet_subsys - unregister a network namespace subsystem
++ * @ops: pernet operations structure to manipulate
++ *
++ * Remove the pernet operations structure from the list to be
++ * used when network namespaces are created or destoryed. In
++ * addition run the exit method for all existing network
++ * namespaces.
++ */
++void unregister_pernet_subsys(struct pernet_operations *module)
++{
++ mutex_lock(&net_mutex);
++ unregister_pernet_operations(module);
++ mutex_unlock(&net_mutex);
++}
++EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
++
++/**
++ * register_pernet_device - register a network namespace device
++ * @ops: pernet operations structure for the subsystem
++ *
++ * Register a device which has init and exit functions
++ * that are called when network namespaces are created and
++ * destroyed respectively.
++ *
++ * When registered all network namespace init functions are
++ * called for every existing network namespace. Allowing kernel
++ * modules to have a race free view of the set of network namespaces.
++ *
++ * When a new network namespace is created all of the init
++ * methods are called in the order in which they were registered.
++ *
++ * When a network namespace is destroyed all of the exit methods
++ * are called in the reverse of the order with which they were
++ * registered.
++ */
++int register_pernet_device(struct pernet_operations *ops)
++{
++ int error;
++ mutex_lock(&net_mutex);
++ error = register_pernet_operations(&pernet_list, ops);
++ if (!error && (first_device == &pernet_list))
++ first_device = &ops->list;
++ mutex_unlock(&net_mutex);
++ return error;
++}
++EXPORT_SYMBOL_GPL(register_pernet_device);
++
++/**
++ * unregister_pernet_device - unregister a network namespace netdevice
++ * @ops: pernet operations structure to manipulate
++ *
++ * Remove the pernet operations structure from the list to be
++ * used when network namespaces are created or destoryed. In
++ * addition run the exit method for all existing network
++ * namespaces.
++ */
++void unregister_pernet_device(struct pernet_operations *ops)
++{
++ mutex_lock(&net_mutex);
++ if (&ops->list == first_device)
++ first_device = first_device->next;
++ unregister_pernet_operations(ops);
++ mutex_unlock(&net_mutex);
++}
++EXPORT_SYMBOL_GPL(unregister_pernet_device);
+diff -Nurb linux-2.6.22-594/net/ipv4/af_inet.c.orig linux-2.6.22-595/net/ipv4/af_inet.c.orig
+--- linux-2.6.22-594/net/ipv4/af_inet.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,1522 +0,0 @@
+-/*
+- * INET An implementation of the TCP/IP protocol suite for the LINUX
+- * operating system. INET is implemented using the BSD Socket
+- * interface as the means of communication with the user level.
+- *
+- * PF_INET protocol family socket handler.
+- *
+- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
+- *
+- * Authors: Ross Biro
+- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+- * Florian La Roche, <flla@stud.uni-sb.de>
+- * Alan Cox, <A.Cox@swansea.ac.uk>
+- *
+- * Changes (see also sock.c)
+- *
+- * piggy,
+- * Karl Knutson : Socket protocol table
+- * A.N.Kuznetsov : Socket death error in accept().
+- * John Richardson : Fix non blocking error in connect()
+- * so sockets that fail to connect
+- * don't return -EINPROGRESS.
+- * Alan Cox : Asynchronous I/O support
+- * Alan Cox : Keep correct socket pointer on sock
+- * structures
+- * when accept() ed
+- * Alan Cox : Semantics of SO_LINGER aren't state
+- * moved to close when you look carefully.
+- * With this fixed and the accept bug fixed
+- * some RPC stuff seems happier.
+- * Niibe Yutaka : 4.4BSD style write async I/O
+- * Alan Cox,
+- * Tony Gale : Fixed reuse semantics.
+- * Alan Cox : bind() shouldn't abort existing but dead
+- * sockets. Stops FTP netin:.. I hope.
+- * Alan Cox : bind() works correctly for RAW sockets.
+- * Note that FreeBSD at least was broken
+- * in this respect so be careful with
+- * compatibility tests...
+- * Alan Cox : routing cache support
+- * Alan Cox : memzero the socket structure for
+- * compactness.
+- * Matt Day : nonblock connect error handler
+- * Alan Cox : Allow large numbers of pending sockets
+- * (eg for big web sites), but only if
+- * specifically application requested.
+- * Alan Cox : New buffering throughout IP. Used
+- * dumbly.
+- * Alan Cox : New buffering now used smartly.
+- * Alan Cox : BSD rather than common sense
+- * interpretation of listen.
+- * Germano Caronni : Assorted small races.
+- * Alan Cox : sendmsg/recvmsg basic support.
+- * Alan Cox : Only sendmsg/recvmsg now supported.
+- * Alan Cox : Locked down bind (see security list).
+- * Alan Cox : Loosened bind a little.
+- * Mike McLagan : ADD/DEL DLCI Ioctls
+- * Willy Konynenberg : Transparent proxying support.
+- * David S. Miller : New socket lookup architecture.
+- * Some other random speedups.
+- * Cyrus Durgin : Cleaned up file for kmod hacks.
+- * Andi Kleen : Fix inet_stream_connect TCP race.
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- */
+-
+-#include <linux/err.h>
+-#include <linux/errno.h>
+-#include <linux/types.h>
+-#include <linux/socket.h>
+-#include <linux/in.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-#include <linux/sched.h>
+-#include <linux/timer.h>
+-#include <linux/string.h>
+-#include <linux/sockios.h>
+-#include <linux/net.h>
+-#include <linux/capability.h>
+-#include <linux/fcntl.h>
+-#include <linux/mm.h>
+-#include <linux/interrupt.h>
+-#include <linux/stat.h>
+-#include <linux/init.h>
+-#include <linux/poll.h>
+-#include <linux/netfilter_ipv4.h>
+-#include <linux/random.h>
+-
+-#include <asm/uaccess.h>
+-#include <asm/system.h>
+-
+-#include <linux/inet.h>
+-#include <linux/igmp.h>
+-#include <linux/inetdevice.h>
+-#include <linux/netdevice.h>
+-#include <net/ip.h>
+-#include <net/protocol.h>
+-#include <net/arp.h>
+-#include <net/route.h>
+-#include <net/ip_fib.h>
+-#include <net/inet_connection_sock.h>
+-#include <net/tcp.h>
+-#include <net/udp.h>
+-#include <net/udplite.h>
+-#include <linux/skbuff.h>
+-#include <net/sock.h>
+-#include <net/raw.h>
+-#include <net/icmp.h>
+-#include <net/ipip.h>
+-#include <net/inet_common.h>
+-#include <net/xfrm.h>
+-#ifdef CONFIG_IP_MROUTE
+-#include <linux/mroute.h>
+-#endif
+-#include <linux/vs_limit.h>
+-
+-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
+-
+-extern void ip_mc_drop_socket(struct sock *sk);
+-
+-/* The inetsw table contains everything that inet_create needs to
+- * build a new socket.
+- */
+-static struct list_head inetsw[SOCK_MAX];
+-static DEFINE_SPINLOCK(inetsw_lock);
+-
+-/* New destruction routine */
+-
+-void inet_sock_destruct(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+-
+- __skb_queue_purge(&sk->sk_receive_queue);
+- __skb_queue_purge(&sk->sk_error_queue);
+-
+- if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+- printk("Attempt to release TCP socket in state %d %p\n",
+- sk->sk_state, sk);
+- return;
+- }
+- if (!sock_flag(sk, SOCK_DEAD)) {
+- printk("Attempt to release alive inet socket %p\n", sk);
+- return;
+- }
+-
+- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+- BUG_TRAP(!sk->sk_wmem_queued);
+- BUG_TRAP(!sk->sk_forward_alloc);
+-
+- kfree(inet->opt);
+- dst_release(sk->sk_dst_cache);
+- sk_refcnt_debug_dec(sk);
+-}
+-
+-/*
+- * The routines beyond this point handle the behaviour of an AF_INET
+- * socket object. Mostly it punts to the subprotocols of IP to do
+- * the work.
+- */
+-
+-/*
+- * Automatically bind an unbound socket.
+- */
+-
+-static int inet_autobind(struct sock *sk)
+-{
+- struct inet_sock *inet;
+- /* We may need to bind the socket. */
+- lock_sock(sk);
+- inet = inet_sk(sk);
+- if (!inet->num) {
+- if (sk->sk_prot->get_port(sk, 0)) {
+- release_sock(sk);
+- return -EAGAIN;
+- }
+- inet->sport = htons(inet->num);
+- sk->sk_xid = vx_current_xid();
+- sk->sk_nid = nx_current_nid();
+- }
+- release_sock(sk);
+- return 0;
+-}
+-
+-/*
+- * Move a socket into listening state.
+- */
+-int inet_listen(struct socket *sock, int backlog)
+-{
+- struct sock *sk = sock->sk;
+- unsigned char old_state;
+- int err;
+-
+- lock_sock(sk);
+-
+- err = -EINVAL;
+- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+- goto out;
+-
+- old_state = sk->sk_state;
+- if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+- goto out;
+-
+- /* Really, if the socket is already in listen state
+- * we can only allow the backlog to be adjusted.
+- */
+- if (old_state != TCP_LISTEN) {
+- err = inet_csk_listen_start(sk, backlog);
+- if (err)
+- goto out;
+- }
+- sk->sk_max_ack_backlog = backlog;
+- err = 0;
+-
+-out:
+- release_sock(sk);
+- return err;
+-}
+-
+-u32 inet_ehash_secret __read_mostly;
+-EXPORT_SYMBOL(inet_ehash_secret);
+-
+-/*
+- * inet_ehash_secret must be set exactly once
+- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
+- */
+-void build_ehash_secret(void)
+-{
+- u32 rnd;
+- do {
+- get_random_bytes(&rnd, sizeof(rnd));
+- } while (rnd == 0);
+- spin_lock_bh(&inetsw_lock);
+- if (!inet_ehash_secret)
+- inet_ehash_secret = rnd;
+- spin_unlock_bh(&inetsw_lock);
+-}
+-EXPORT_SYMBOL(build_ehash_secret);
+-
+-/*
+- * Create an inet socket.
+- */
+-
+-static int inet_create(struct socket *sock, int protocol)
+-{
+- struct sock *sk;
+- struct list_head *p;
+- struct inet_protosw *answer;
+- struct inet_sock *inet;
+- struct proto *answer_prot;
+- unsigned char answer_flags;
+- char answer_no_check;
+- int try_loading_module = 0;
+- int err;
+-
+- if (sock->type != SOCK_RAW &&
+- sock->type != SOCK_DGRAM &&
+- !inet_ehash_secret)
+- build_ehash_secret();
+-
+- sock->state = SS_UNCONNECTED;
+-
+- /* Look for the requested type/protocol pair. */
+- answer = NULL;
+-lookup_protocol:
+- err = -ESOCKTNOSUPPORT;
+- rcu_read_lock();
+- list_for_each_rcu(p, &inetsw[sock->type]) {
+- answer = list_entry(p, struct inet_protosw, list);
+-
+- /* Check the non-wild match. */
+- if (protocol == answer->protocol) {
+- if (protocol != IPPROTO_IP)
+- break;
+- } else {
+- /* Check for the two wild cases. */
+- if (IPPROTO_IP == protocol) {
+- protocol = answer->protocol;
+- break;
+- }
+- if (IPPROTO_IP == answer->protocol)
+- break;
+- }
+- err = -EPROTONOSUPPORT;
+- answer = NULL;
+- }
+-
+- if (unlikely(answer == NULL)) {
+- if (try_loading_module < 2) {
+- rcu_read_unlock();
+- /*
+- * Be more specific, e.g. net-pf-2-proto-132-type-1
+- * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+- */
+- if (++try_loading_module == 1)
+- request_module("net-pf-%d-proto-%d-type-%d",
+- PF_INET, protocol, sock->type);
+- /*
+- * Fall back to generic, e.g. net-pf-2-proto-132
+- * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+- */
+- else
+- request_module("net-pf-%d-proto-%d",
+- PF_INET, protocol);
+- goto lookup_protocol;
+- } else
+- goto out_rcu_unlock;
+- }
+-
+- err = -EPERM;
+- if ((protocol == IPPROTO_ICMP) &&
+- nx_capable(answer->capability, NXC_RAW_ICMP))
+- goto override;
+- if (sock->type == SOCK_RAW &&
+- nx_capable(answer->capability, NXC_RAW_SOCKET))
+- goto override;
+- if (answer->capability > 0 && !capable(answer->capability))
+- goto out_rcu_unlock;
+-override:
+- sock->ops = answer->ops;
+- answer_prot = answer->prot;
+- answer_no_check = answer->no_check;
+- answer_flags = answer->flags;
+- rcu_read_unlock();
+-
+- BUG_TRAP(answer_prot->slab != NULL);
+-
+- err = -ENOBUFS;
+- sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+- if (sk == NULL)
+- goto out;
+-
+- err = 0;
+- sk->sk_no_check = answer_no_check;
+- if (INET_PROTOSW_REUSE & answer_flags)
+- sk->sk_reuse = 1;
+-
+- inet = inet_sk(sk);
+- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+-
+- if (SOCK_RAW == sock->type) {
+- inet->num = protocol;
+- if (IPPROTO_RAW == protocol)
+- inet->hdrincl = 1;
+- }
+-
+- if (ipv4_config.no_pmtu_disc)
+- inet->pmtudisc = IP_PMTUDISC_DONT;
+- else
+- inet->pmtudisc = IP_PMTUDISC_WANT;
+-
+- inet->id = 0;
+-
+- sock_init_data(sock, sk);
+-
+- sk->sk_destruct = inet_sock_destruct;
+- sk->sk_family = PF_INET;
+- sk->sk_protocol = protocol;
+- sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+-
+- inet->uc_ttl = -1;
+- inet->mc_loop = 1;
+- inet->mc_ttl = 1;
+- inet->mc_index = 0;
+- inet->mc_list = NULL;
+-
+- sk_refcnt_debug_inc(sk);
+-
+- if (inet->num) {
+- /* It assumes that any protocol which allows
+- * the user to assign a number at socket
+- * creation time automatically
+- * shares.
+- */
+- inet->sport = htons(inet->num);
+- /* Add to protocol hash chains. */
+- sk->sk_prot->hash(sk);
+- }
+-
+- if (sk->sk_prot->init) {
+- err = sk->sk_prot->init(sk);
+- if (err)
+- sk_common_release(sk);
+- }
+-out:
+- return err;
+-out_rcu_unlock:
+- rcu_read_unlock();
+- goto out;
+-}
+-
+-
+-/*
+- * The peer socket should always be NULL (or else). When we call this
+- * function we are destroying the object and from then on nobody
+- * should refer to it.
+- */
+-int inet_release(struct socket *sock)
+-{
+- struct sock *sk = sock->sk;
+-
+- if (sk) {
+- long timeout;
+-
+- /* Applications forget to leave groups before exiting */
+- ip_mc_drop_socket(sk);
+-
+- /* If linger is set, we don't return until the close
+- * is complete. Otherwise we return immediately. The
+- * actually closing is done the same either way.
+- *
+- * If the close is due to the process exiting, we never
+- * linger..
+- */
+- timeout = 0;
+- if (sock_flag(sk, SOCK_LINGER) &&
+- !(current->flags & PF_EXITING))
+- timeout = sk->sk_lingertime;
+- sock->sk = NULL;
+- sk->sk_prot->close(sk, timeout);
+- }
+- return 0;
+-}
+-
+-/* It is off by default, see below. */
+-int sysctl_ip_nonlocal_bind __read_mostly;
+-
+-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+- struct sock *sk = sock->sk;
+- struct inet_sock *inet = inet_sk(sk);
+- struct nx_v4_sock_addr nsa;
+- unsigned short snum;
+- int chk_addr_ret;
+- int err;
+-
+- /* If the socket has its own bind function then use it. (RAW) */
+- if (sk->sk_prot->bind) {
+- err = sk->sk_prot->bind(sk, uaddr, addr_len);
+- goto out;
+- }
+- err = -EINVAL;
+- if (addr_len < sizeof(struct sockaddr_in))
+- goto out;
+-
+- err = v4_map_sock_addr(inet, addr, &nsa);
+- if (err)
+- goto out;
+-
+- chk_addr_ret = inet_addr_type(nsa.saddr);
+-
+- /* Not specified by any standard per-se, however it breaks too
+- * many applications when removed. It is unfortunate since
+- * allowing applications to make a non-local bind solves
+- * several problems with systems using dynamic addressing.
+- * (ie. your servers still start up even if your ISDN link
+- * is temporarily down)
+- */
+- err = -EADDRNOTAVAIL;
+- if (!sysctl_ip_nonlocal_bind &&
+- !inet->freebind &&
+- nsa.saddr != INADDR_ANY &&
+- chk_addr_ret != RTN_LOCAL &&
+- chk_addr_ret != RTN_MULTICAST &&
+- chk_addr_ret != RTN_BROADCAST)
+- goto out;
+-
+- snum = ntohs(addr->sin_port);
+- err = -EACCES;
+- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+- goto out;
+-
+- /* We keep a pair of addresses. rcv_saddr is the one
+- * used by hash lookups, and saddr is used for transmit.
+- *
+- * In the BSD API these are the same except where it
+- * would be illegal to use them (multicast/broadcast) in
+- * which case the sending device address is used.
+- */
+- lock_sock(sk);
+-
+- /* Check these errors (active socket, double bind). */
+- err = -EINVAL;
+- if (sk->sk_state != TCP_CLOSE || inet->num)
+- goto out_release_sock;
+-
+- v4_set_sock_addr(inet, &nsa);
+- if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+- inet->saddr = 0; /* Use device */
+-
+- /* Make sure we are allowed to bind here. */
+- if (sk->sk_prot->get_port(sk, snum)) {
+- inet->saddr = inet->rcv_saddr = 0;
+- err = -EADDRINUSE;
+- goto out_release_sock;
+- }
+-
+- if (inet->rcv_saddr)
+- sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+- if (snum)
+- sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+- inet->sport = htons(inet->num);
+- inet->daddr = 0;
+- inet->dport = 0;
+- sk_dst_reset(sk);
+- err = 0;
+-out_release_sock:
+- release_sock(sk);
+-out:
+- return err;
+-}
+-
+-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+- int addr_len, int flags)
+-{
+- struct sock *sk = sock->sk;
+-
+- if (uaddr->sa_family == AF_UNSPEC)
+- return sk->sk_prot->disconnect(sk, flags);
+-
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+-}
+-
+-static long inet_wait_for_connect(struct sock *sk, long timeo)
+-{
+- DEFINE_WAIT(wait);
+-
+- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+-
+- /* Basic assumption: if someone sets sk->sk_err, he _must_
+- * change state of the socket from TCP_SYN_*.
+- * Connect() does not allow to get error notifications
+- * without closing the socket.
+- */
+- while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+- release_sock(sk);
+- timeo = schedule_timeout(timeo);
+- lock_sock(sk);
+- if (signal_pending(current) || !timeo)
+- break;
+- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+- }
+- finish_wait(sk->sk_sleep, &wait);
+- return timeo;
+-}
+-
+-/*
+- * Connect to a remote host. There is regrettably still a little
+- * TCP 'magic' in here.
+- */
+-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+- int addr_len, int flags)
+-{
+- struct sock *sk = sock->sk;
+- int err;
+- long timeo;
+-
+- lock_sock(sk);
+-
+- if (uaddr->sa_family == AF_UNSPEC) {
+- err = sk->sk_prot->disconnect(sk, flags);
+- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+- goto out;
+- }
+-
+- switch (sock->state) {
+- default:
+- err = -EINVAL;
+- goto out;
+- case SS_CONNECTED:
+- err = -EISCONN;
+- goto out;
+- case SS_CONNECTING:
+- err = -EALREADY;
+- /* Fall out of switch with err, set for this state */
+- break;
+- case SS_UNCONNECTED:
+- err = -EISCONN;
+- if (sk->sk_state != TCP_CLOSE)
+- goto out;
+-
+- err = sk->sk_prot->connect(sk, uaddr, addr_len);
+- if (err < 0)
+- goto out;
+-
+- sock->state = SS_CONNECTING;
+-
+- /* Just entered SS_CONNECTING state; the only
+- * difference is that return value in non-blocking
+- * case is EINPROGRESS, rather than EALREADY.
+- */
+- err = -EINPROGRESS;
+- break;
+- }
+-
+- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+-
+- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+- /* Error code is set above */
+- if (!timeo || !inet_wait_for_connect(sk, timeo))
+- goto out;
+-
+- err = sock_intr_errno(timeo);
+- if (signal_pending(current))
+- goto out;
+- }
+-
+- /* Connection was closed by RST, timeout, ICMP error
+- * or another process disconnected us.
+- */
+- if (sk->sk_state == TCP_CLOSE)
+- goto sock_error;
+-
+- /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+- * and error was received after socket entered established state.
+- * Hence, it is handled normally after connect() return successfully.
+- */
+-
+- sock->state = SS_CONNECTED;
+- err = 0;
+-out:
+- release_sock(sk);
+- return err;
+-
+-sock_error:
+- err = sock_error(sk) ? : -ECONNABORTED;
+- sock->state = SS_UNCONNECTED;
+- if (sk->sk_prot->disconnect(sk, flags))
+- sock->state = SS_DISCONNECTING;
+- goto out;
+-}
+-
+-/*
+- * Accept a pending connection. The TCP layer now gives BSD semantics.
+- */
+-
+-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+-{
+- struct sock *sk1 = sock->sk;
+- int err = -EINVAL;
+- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+-
+- if (!sk2)
+- goto do_err;
+-
+- lock_sock(sk2);
+-
+- BUG_TRAP((1 << sk2->sk_state) &
+- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+-
+- sock_graft(sk2, newsock);
+-
+- newsock->state = SS_CONNECTED;
+- err = 0;
+- release_sock(sk2);
+-do_err:
+- return err;
+-}
+-
+-
+-/*
+- * This does both peername and sockname.
+- */
+-int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct sock *sk = sock->sk;
+- struct inet_sock *inet = inet_sk(sk);
+- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+-
+- sin->sin_family = AF_INET;
+- if (peer) {
+- if (!inet->dport ||
+- (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+- peer == 1))
+- return -ENOTCONN;
+- sin->sin_port = inet->dport;
+- sin->sin_addr.s_addr =
+- nx_map_sock_lback(sk->sk_nx_info, inet->daddr);
+- } else {
+- __be32 addr = inet->rcv_saddr;
+- if (!addr)
+- addr = inet->saddr;
+- addr = nx_map_sock_lback(sk->sk_nx_info, addr);
+- sin->sin_port = inet->sport;
+- sin->sin_addr.s_addr = addr;
+- }
+- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+- *uaddr_len = sizeof(*sin);
+- return 0;
+-}
+-
+-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+- size_t size)
+-{
+- struct sock *sk = sock->sk;
+-
+- /* We may need to bind the socket. */
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+-
+- return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+-}
+-
+-
+-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+-{
+- struct sock *sk = sock->sk;
+-
+- /* We may need to bind the socket. */
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+-
+- if (sk->sk_prot->sendpage)
+- return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+- return sock_no_sendpage(sock, page, offset, size, flags);
+-}
+-
+-
+-int inet_shutdown(struct socket *sock, int how)
+-{
+- struct sock *sk = sock->sk;
+- int err = 0;
+-
+- /* This should really check to make sure
+- * the socket is a TCP socket. (WHY AC...)
+- */
+- how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+- 1->2 bit 2 snds.
+- 2->3 */
+- if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+- return -EINVAL;
+-
+- lock_sock(sk);
+- if (sock->state == SS_CONNECTING) {
+- if ((1 << sk->sk_state) &
+- (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+- sock->state = SS_DISCONNECTING;
+- else
+- sock->state = SS_CONNECTED;
+- }
+-
+- switch (sk->sk_state) {
+- case TCP_CLOSE:
+- err = -ENOTCONN;
+- /* Hack to wake up other listeners, who can poll for
+- POLLHUP, even on eg. unconnected UDP sockets -- RR */
+- default:
+- sk->sk_shutdown |= how;
+- if (sk->sk_prot->shutdown)
+- sk->sk_prot->shutdown(sk, how);
+- break;
+-
+- /* Remaining two branches are temporary solution for missing
+- * close() in multithreaded environment. It is _not_ a good idea,
+- * but we have no choice until close() is repaired at VFS level.
+- */
+- case TCP_LISTEN:
+- if (!(how & RCV_SHUTDOWN))
+- break;
+- /* Fall through */
+- case TCP_SYN_SENT:
+- err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+- break;
+- }
+-
+- /* Wake up anyone sleeping in poll. */
+- sk->sk_state_change(sk);
+- release_sock(sk);
+- return err;
+-}
+-
+-/*
+- * ioctl() calls you can issue on an INET socket. Most of these are
+- * device configuration and stuff and very rarely used. Some ioctls
+- * pass on to the socket itself.
+- *
+- * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+- * loads the devconfigure module does its configuring and unloads it.
+- * There's a good 20K of config code hanging around the kernel.
+- */
+-
+-int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+-{
+- struct sock *sk = sock->sk;
+- int err = 0;
+-
+- switch (cmd) {
+- case SIOCGSTAMP:
+- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+- break;
+- case SIOCGSTAMPNS:
+- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+- break;
+- case SIOCADDRT:
+- case SIOCDELRT:
+- case SIOCRTMSG:
+- err = ip_rt_ioctl(cmd, (void __user *)arg);
+- break;
+- case SIOCDARP:
+- case SIOCGARP:
+- case SIOCSARP:
+- err = arp_ioctl(cmd, (void __user *)arg);
+- break;
+- case SIOCGIFADDR:
+- case SIOCSIFADDR:
+- case SIOCGIFBRDADDR:
+- case SIOCSIFBRDADDR:
+- case SIOCGIFNETMASK:
+- case SIOCSIFNETMASK:
+- case SIOCGIFDSTADDR:
+- case SIOCSIFDSTADDR:
+- case SIOCSIFPFLAGS:
+- case SIOCGIFPFLAGS:
+- case SIOCSIFFLAGS:
+- err = devinet_ioctl(cmd, (void __user *)arg);
+- break;
+- default:
+- if (sk->sk_prot->ioctl)
+- err = sk->sk_prot->ioctl(sk, cmd, arg);
+- else
+- err = -ENOIOCTLCMD;
+- break;
+- }
+- return err;
+-}
+-
+-const struct proto_ops inet_stream_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_stream_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = inet_accept,
+- .getname = inet_getname,
+- .poll = tcp_poll,
+- .ioctl = inet_ioctl,
+- .listen = inet_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = tcp_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = tcp_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-const struct proto_ops inet_dgram_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_dgram_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = inet_getname,
+- .poll = udp_poll,
+- .ioctl = inet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = inet_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = inet_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-/*
+- * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+- * udp_poll
+- */
+-static const struct proto_ops inet_sockraw_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_dgram_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = inet_getname,
+- .poll = datagram_poll,
+- .ioctl = inet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = inet_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = inet_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-static struct net_proto_family inet_family_ops = {
+- .family = PF_INET,
+- .create = inet_create,
+- .owner = THIS_MODULE,
+-};
+-
+-/* Upon startup we insert all the elements in inetsw_array[] into
+- * the linked list inetsw.
+- */
+-static struct inet_protosw inetsw_array[] =
+-{
+- {
+- .type = SOCK_STREAM,
+- .protocol = IPPROTO_TCP,
+- .prot = &tcp_prot,
+- .ops = &inet_stream_ops,
+- .capability = -1,
+- .no_check = 0,
+- .flags = INET_PROTOSW_PERMANENT |
+- INET_PROTOSW_ICSK,
+- },
+-
+- {
+- .type = SOCK_DGRAM,
+- .protocol = IPPROTO_UDP,
+- .prot = &udp_prot,
+- .ops = &inet_dgram_ops,
+- .capability = -1,
+- .no_check = UDP_CSUM_DEFAULT,
+- .flags = INET_PROTOSW_PERMANENT,
+- },
+-
+-
+- {
+- .type = SOCK_RAW,
+- .protocol = IPPROTO_IP, /* wild card */
+- .prot = &raw_prot,
+- .ops = &inet_sockraw_ops,
+- .capability = CAP_NET_RAW,
+- .no_check = UDP_CSUM_DEFAULT,
+- .flags = INET_PROTOSW_REUSE,
+- }
+-};
+-
+-#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+-
+-void inet_register_protosw(struct inet_protosw *p)
+-{
+- struct list_head *lh;
+- struct inet_protosw *answer;
+- int protocol = p->protocol;
+- struct list_head *last_perm;
+-
+- spin_lock_bh(&inetsw_lock);
+-
+- if (p->type >= SOCK_MAX)
+- goto out_illegal;
+-
+- /* If we are trying to override a permanent protocol, bail. */
+- answer = NULL;
+- last_perm = &inetsw[p->type];
+- list_for_each(lh, &inetsw[p->type]) {
+- answer = list_entry(lh, struct inet_protosw, list);
+-
+- /* Check only the non-wild match. */
+- if (INET_PROTOSW_PERMANENT & answer->flags) {
+- if (protocol == answer->protocol)
+- break;
+- last_perm = lh;
+- }
+-
+- answer = NULL;
+- }
+- if (answer)
+- goto out_permanent;
+-
+- /* Add the new entry after the last permanent entry if any, so that
+- * the new entry does not override a permanent entry when matched with
+- * a wild-card protocol. But it is allowed to override any existing
+- * non-permanent entry. This means that when we remove this entry, the
+- * system automatically returns to the old behavior.
+- */
+- list_add_rcu(&p->list, last_perm);
+-out:
+- spin_unlock_bh(&inetsw_lock);
+-
+- synchronize_net();
+-
+- return;
+-
+-out_permanent:
+- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+- protocol);
+- goto out;
+-
+-out_illegal:
+- printk(KERN_ERR
+- "Ignoring attempt to register invalid socket type %d.\n",
+- p->type);
+- goto out;
+-}
+-
+-void inet_unregister_protosw(struct inet_protosw *p)
+-{
+- if (INET_PROTOSW_PERMANENT & p->flags) {
+- printk(KERN_ERR
+- "Attempt to unregister permanent protocol %d.\n",
+- p->protocol);
+- } else {
+- spin_lock_bh(&inetsw_lock);
+- list_del_rcu(&p->list);
+- spin_unlock_bh(&inetsw_lock);
+-
+- synchronize_net();
+- }
+-}
+-
+-/*
+- * Shall we try to damage output packets if routing dev changes?
+- */
+-
+-int sysctl_ip_dynaddr __read_mostly;
+-
+-static int inet_sk_reselect_saddr(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+- int err;
+- struct rtable *rt;
+- __be32 old_saddr = inet->saddr;
+- __be32 new_saddr;
+- __be32 daddr = inet->daddr;
+-
+- if (inet->opt && inet->opt->srr)
+- daddr = inet->opt->faddr;
+-
+- /* Query new route. */
+- err = ip_route_connect(&rt, daddr, 0,
+- RT_CONN_FLAGS(sk),
+- sk->sk_bound_dev_if,
+- sk->sk_protocol,
+- inet->sport, inet->dport, sk, 0);
+- if (err)
+- return err;
+-
+- sk_setup_caps(sk, &rt->u.dst);
+-
+- new_saddr = rt->rt_src;
+-
+- if (new_saddr == old_saddr)
+- return 0;
+-
+- if (sysctl_ip_dynaddr > 1) {
+- printk(KERN_INFO "%s(): shifting inet->"
+- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+- __FUNCTION__,
+- NIPQUAD(old_saddr),
+- NIPQUAD(new_saddr));
+- }
+-
+- inet->saddr = inet->rcv_saddr = new_saddr;
+-
+- /*
+- * XXX The only one ugly spot where we need to
+- * XXX really change the sockets identity after
+- * XXX it has entered the hashes. -DaveM
+- *
+- * Besides that, it does not check for connection
+- * uniqueness. Wait for troubles.
+- */
+- __sk_prot_rehash(sk);
+- return 0;
+-}
+-
+-int inet_sk_rebuild_header(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+- __be32 daddr;
+- int err;
+-
+- /* Route is OK, nothing to do. */
+- if (rt)
+- return 0;
+-
+- /* Reroute. */
+- daddr = inet->daddr;
+- if (inet->opt && inet->opt->srr)
+- daddr = inet->opt->faddr;
+-{
+- struct flowi fl = {
+- .oif = sk->sk_bound_dev_if,
+- .nl_u = {
+- .ip4_u = {
+- .daddr = daddr,
+- .saddr = inet->saddr,
+- .tos = RT_CONN_FLAGS(sk),
+- },
+- },
+- .proto = sk->sk_protocol,
+- .uli_u = {
+- .ports = {
+- .sport = inet->sport,
+- .dport = inet->dport,
+- },
+- },
+- };
+-
+- security_sk_classify_flow(sk, &fl);
+- err = ip_route_output_flow(&rt, &fl, sk, 0);
+-}
+- if (!err)
+- sk_setup_caps(sk, &rt->u.dst);
+- else {
+- /* Routing failed... */
+- sk->sk_route_caps = 0;
+- /*
+- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+- */
+- if (!sysctl_ip_dynaddr ||
+- sk->sk_state != TCP_SYN_SENT ||
+- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+- (err = inet_sk_reselect_saddr(sk)) != 0)
+- sk->sk_err_soft = -err;
+- }
+-
+- return err;
+-}
+-
+-EXPORT_SYMBOL(inet_sk_rebuild_header);
+-
+-static int inet_gso_send_check(struct sk_buff *skb)
+-{
+- struct iphdr *iph;
+- struct net_protocol *ops;
+- int proto;
+- int ihl;
+- int err = -EINVAL;
+-
+- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+- goto out;
+-
+- iph = ip_hdr(skb);
+- ihl = iph->ihl * 4;
+- if (ihl < sizeof(*iph))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, ihl)))
+- goto out;
+-
+- __skb_pull(skb, ihl);
+- skb_reset_transport_header(skb);
+- iph = ip_hdr(skb);
+- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+- err = -EPROTONOSUPPORT;
+-
+- rcu_read_lock();
+- ops = rcu_dereference(inet_protos[proto]);
+- if (likely(ops && ops->gso_send_check))
+- err = ops->gso_send_check(skb);
+- rcu_read_unlock();
+-
+-out:
+- return err;
+-}
+-
+-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+-{
+- struct sk_buff *segs = ERR_PTR(-EINVAL);
+- struct iphdr *iph;
+- struct net_protocol *ops;
+- int proto;
+- int ihl;
+- int id;
+-
+- if (unlikely(skb_shinfo(skb)->gso_type &
+- ~(SKB_GSO_TCPV4 |
+- SKB_GSO_UDP |
+- SKB_GSO_DODGY |
+- SKB_GSO_TCP_ECN |
+- 0)))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+- goto out;
+-
+- iph = ip_hdr(skb);
+- ihl = iph->ihl * 4;
+- if (ihl < sizeof(*iph))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, ihl)))
+- goto out;
+-
+- __skb_pull(skb, ihl);
+- skb_reset_transport_header(skb);
+- iph = ip_hdr(skb);
+- id = ntohs(iph->id);
+- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+- segs = ERR_PTR(-EPROTONOSUPPORT);
+-
+- rcu_read_lock();
+- ops = rcu_dereference(inet_protos[proto]);
+- if (likely(ops && ops->gso_segment))
+- segs = ops->gso_segment(skb, features);
+- rcu_read_unlock();
+-
+- if (!segs || unlikely(IS_ERR(segs)))
+- goto out;
+-
+- skb = segs;
+- do {
+- iph = ip_hdr(skb);
+- iph->id = htons(id++);
+- iph->tot_len = htons(skb->len - skb->mac_len);
+- iph->check = 0;
+- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+- } while ((skb = skb->next));
+-
+-out:
+- return segs;
+-}
+-
+-unsigned long snmp_fold_field(void *mib[], int offt)
+-{
+- unsigned long res = 0;
+- int i;
+-
+- for_each_possible_cpu(i) {
+- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+- }
+- return res;
+-}
+-EXPORT_SYMBOL_GPL(snmp_fold_field);
+-
+-int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
+-{
+- BUG_ON(ptr == NULL);
+- ptr[0] = __alloc_percpu(mibsize);
+- if (!ptr[0])
+- goto err0;
+- ptr[1] = __alloc_percpu(mibsize);
+- if (!ptr[1])
+- goto err1;
+- return 0;
+-err1:
+- free_percpu(ptr[0]);
+- ptr[0] = NULL;
+-err0:
+- return -ENOMEM;
+-}
+-EXPORT_SYMBOL_GPL(snmp_mib_init);
+-
+-void snmp_mib_free(void *ptr[2])
+-{
+- BUG_ON(ptr == NULL);
+- free_percpu(ptr[0]);
+- free_percpu(ptr[1]);
+- ptr[0] = ptr[1] = NULL;
+-}
+-EXPORT_SYMBOL_GPL(snmp_mib_free);
+-
+-#ifdef CONFIG_IP_MULTICAST
+-static struct net_protocol igmp_protocol = {
+- .handler = igmp_rcv,
+-};
+-#endif
+-
+-static struct net_protocol tcp_protocol = {
+- .handler = tcp_v4_rcv,
+- .err_handler = tcp_v4_err,
+- .gso_send_check = tcp_v4_gso_send_check,
+- .gso_segment = tcp_tso_segment,
+- .no_policy = 1,
+-};
+-
+-static struct net_protocol udp_protocol = {
+- .handler = udp_rcv,
+- .err_handler = udp_err,
+- .no_policy = 1,
+-};
+-
+-static struct net_protocol icmp_protocol = {
+- .handler = icmp_rcv,
+-};
+-
+-static int __init init_ipv4_mibs(void)
+-{
+- if (snmp_mib_init((void **)net_statistics,
+- sizeof(struct linux_mib),
+- __alignof__(struct linux_mib)) < 0)
+- goto err_net_mib;
+- if (snmp_mib_init((void **)ip_statistics,
+- sizeof(struct ipstats_mib),
+- __alignof__(struct ipstats_mib)) < 0)
+- goto err_ip_mib;
+- if (snmp_mib_init((void **)icmp_statistics,
+- sizeof(struct icmp_mib),
+- __alignof__(struct icmp_mib)) < 0)
+- goto err_icmp_mib;
+- if (snmp_mib_init((void **)tcp_statistics,
+- sizeof(struct tcp_mib),
+- __alignof__(struct tcp_mib)) < 0)
+- goto err_tcp_mib;
+- if (snmp_mib_init((void **)udp_statistics,
+- sizeof(struct udp_mib),
+- __alignof__(struct udp_mib)) < 0)
+- goto err_udp_mib;
+- if (snmp_mib_init((void **)udplite_statistics,
+- sizeof(struct udp_mib),
+- __alignof__(struct udp_mib)) < 0)
+- goto err_udplite_mib;
+-
+- tcp_mib_init();
+-
+- return 0;
+-
+-err_udplite_mib:
+- snmp_mib_free((void **)udp_statistics);
+-err_udp_mib:
+- snmp_mib_free((void **)tcp_statistics);
+-err_tcp_mib:
+- snmp_mib_free((void **)icmp_statistics);
+-err_icmp_mib:
+- snmp_mib_free((void **)ip_statistics);
+-err_ip_mib:
+- snmp_mib_free((void **)net_statistics);
+-err_net_mib:
+- return -ENOMEM;
+-}
+-
+-static int ipv4_proc_init(void);
+-
+-/*
+- * IP protocol layer initialiser
+- */
+-
+-static struct packet_type ip_packet_type = {
+- .type = __constant_htons(ETH_P_IP),
+- .func = ip_rcv,
+- .gso_send_check = inet_gso_send_check,
+- .gso_segment = inet_gso_segment,
+-};
+-
+-static int __init inet_init(void)
+-{
+- struct sk_buff *dummy_skb;
+- struct inet_protosw *q;
+- struct list_head *r;
+- int rc = -EINVAL;
+-
+- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+-
+- rc = proto_register(&tcp_prot, 1);
+- if (rc)
+- goto out;
+-
+- rc = proto_register(&udp_prot, 1);
+- if (rc)
+- goto out_unregister_tcp_proto;
+-
+- rc = proto_register(&raw_prot, 1);
+- if (rc)
+- goto out_unregister_udp_proto;
+-
+- /*
+- * Tell SOCKET that we are alive...
+- */
+-
+- (void)sock_register(&inet_family_ops);
+-
+- /*
+- * Add all the base protocols.
+- */
+-
+- if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+-#ifdef CONFIG_IP_MULTICAST
+- if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+-#endif
+-
+- /* Register the socket-side information for inet_create. */
+- for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+- INIT_LIST_HEAD(r);
+-
+- for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+- inet_register_protosw(q);
+-
+- /*
+- * Set the ARP module up
+- */
+-
+- arp_init();
+-
+- /*
+- * Set the IP module up
+- */
+-
+- ip_init();
+-
+- tcp_v4_init(&inet_family_ops);
+-
+- /* Setup TCP slab cache for open requests. */
+- tcp_init();
+-
+- /* Add UDP-Lite (RFC 3828) */
+- udplite4_register();
+-
+- /*
+- * Set the ICMP layer up
+- */
+-
+- icmp_init(&inet_family_ops);
+-
+- /*
+- * Initialise the multicast router
+- */
+-#if defined(CONFIG_IP_MROUTE)
+- ip_mr_init();
+-#endif
+- /*
+- * Initialise per-cpu ipv4 mibs
+- */
+-
+- if (init_ipv4_mibs())
+- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+-
+- ipv4_proc_init();
+-
+- ipfrag_init();
+-
+- dev_add_pack(&ip_packet_type);
+-
+- rc = 0;
+-out:
+- return rc;
+-out_unregister_udp_proto:
+- proto_unregister(&udp_prot);
+-out_unregister_tcp_proto:
+- proto_unregister(&tcp_prot);
+- goto out;
+-}
+-
+-fs_initcall(inet_init);
+-
+-/* ------------------------------------------------------------------------ */
+-
+-#ifdef CONFIG_PROC_FS
+-static int __init ipv4_proc_init(void)
+-{
+- int rc = 0;
+-
+- if (raw_proc_init())
+- goto out_raw;
+- if (tcp4_proc_init())
+- goto out_tcp;
+- if (udp4_proc_init())
+- goto out_udp;
+- if (fib_proc_init())
+- goto out_fib;
+- if (ip_misc_proc_init())
+- goto out_misc;
+-out:
+- return rc;
+-out_misc:
+- fib_proc_exit();
+-out_fib:
+- udp4_proc_exit();
+-out_udp:
+- tcp4_proc_exit();
+-out_tcp:
+- raw_proc_exit();
+-out_raw:
+- rc = -ENOMEM;
+- goto out;
+-}
+-
+-#else /* CONFIG_PROC_FS */
+-static int __init ipv4_proc_init(void)
+-{
+- return 0;
+-}
+-#endif /* CONFIG_PROC_FS */
+-
+-MODULE_ALIAS_NETPROTO(PF_INET);
+-
+-EXPORT_SYMBOL(inet_accept);
+-EXPORT_SYMBOL(inet_bind);
+-EXPORT_SYMBOL(inet_dgram_connect);
+-EXPORT_SYMBOL(inet_dgram_ops);
+-EXPORT_SYMBOL(inet_getname);
+-EXPORT_SYMBOL(inet_ioctl);
+-EXPORT_SYMBOL(inet_listen);
+-EXPORT_SYMBOL(inet_register_protosw);
+-EXPORT_SYMBOL(inet_release);
+-EXPORT_SYMBOL(inet_sendmsg);
+-EXPORT_SYMBOL(inet_shutdown);
+-EXPORT_SYMBOL(inet_sock_destruct);
+-EXPORT_SYMBOL(inet_stream_connect);
+-EXPORT_SYMBOL(inet_stream_ops);
+-EXPORT_SYMBOL(inet_unregister_protosw);
+-EXPORT_SYMBOL(net_statistics);
+-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
+diff -Nurb linux-2.6.22-594/net/netfilter/xt_MARK.c.orig linux-2.6.22-595/net/netfilter/xt_MARK.c.orig
+--- linux-2.6.22-594/net/netfilter/xt_MARK.c.orig 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/netfilter/xt_MARK.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,283 +0,0 @@
+-/* This is a module which is used for setting the NFMARK field of an skb. */
+-
+-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- */
+-
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/skbuff.h>
+-#include <linux/ip.h>
+-#include <net/checksum.h>
+-#include <net/route.h>
+-#include <net/inet_hashtables.h>
+-
+-#include <net/netfilter/nf_conntrack.h>
+-#include <linux/netfilter/x_tables.h>
+-#include <linux/netfilter/xt_MARK.h>
+-
+-MODULE_LICENSE("GPL");
+-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+-MODULE_DESCRIPTION("ip[6]tables MARK modification module");
+-MODULE_ALIAS("ipt_MARK");
+-MODULE_ALIAS("ip6t_MARK");
+-
+-static inline u_int16_t
+-get_dst_port(struct nf_conntrack_tuple *tuple)
+-{
+- switch (tuple->dst.protonum) {
+- case IPPROTO_GRE:
+- /* XXX Truncate 32-bit GRE key to 16 bits */
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
+- return tuple->dst.u.gre.key;
+-#else
+- return htons(ntohl(tuple->dst.u.gre.key));
+-#endif
+- case IPPROTO_ICMP:
+- /* Bind on ICMP echo ID */
+- return tuple->src.u.icmp.id;
+- case IPPROTO_TCP:
+- return tuple->dst.u.tcp.port;
+- case IPPROTO_UDP:
+- return tuple->dst.u.udp.port;
+- default:
+- return tuple->dst.u.all;
+- }
+-}
+-
+-static inline u_int16_t
+-get_src_port(struct nf_conntrack_tuple *tuple)
+-{
+- switch (tuple->dst.protonum) {
+- case IPPROTO_GRE:
+- /* XXX Truncate 32-bit GRE key to 16 bits */
+- return htons(ntohl(tuple->src.u.gre.key));
+- case IPPROTO_ICMP:
+- /* Bind on ICMP echo ID */
+- return tuple->src.u.icmp.id;
+- case IPPROTO_TCP:
+- return tuple->src.u.tcp.port;
+- case IPPROTO_UDP:
+- return tuple->src.u.udp.port;
+- default:
+- return tuple->src.u.all;
+- }
+-}
+-
+-static unsigned int
+-target_v0(struct sk_buff **pskb,
+- const struct net_device *in,
+- const struct net_device *out,
+- unsigned int hooknum,
+- const struct xt_target *target,
+- const void *targinfo)
+-{
+- const struct xt_mark_target_info *markinfo = targinfo;
+-
+- (*pskb)->mark = markinfo->mark;
+- return XT_CONTINUE;
+-}
+-
+-static unsigned int
+-target_v1(struct sk_buff **pskb,
+- const struct net_device *in,
+- const struct net_device *out,
+- unsigned int hooknum,
+- const struct xt_target *target,
+- const void *targinfo)
+-{
+- const struct xt_mark_target_info_v1 *markinfo = targinfo;
+- int mark = -1;
+-
+- switch (markinfo->mode) {
+- case XT_MARK_SET:
+- mark = markinfo->mark;
+- break;
+-
+- case XT_MARK_AND:
+- mark = (*pskb)->mark & markinfo->mark;
+- break;
+-
+- case XT_MARK_OR:
+- mark = (*pskb)->mark | markinfo->mark;
+- break;
+-
+- case XT_MARK_COPYXID: {
+- enum ip_conntrack_info ctinfo;
+- struct sock *connection_sk=NULL;
+- int dif;
+-
+- struct nf_conn *ct = nf_ct_get((*pskb), &ctinfo);
+- extern struct inet_hashinfo tcp_hashinfo;
+- enum ip_conntrack_dir dir;
+- if (!ct)
+- break;
+-
+- dir = CTINFO2DIR(ctinfo);
+- u_int32_t src_ip = ct->tuplehash[dir].tuple.src.u3.ip;
+- u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);
+- u_int16_t proto = ct->tuplehash[dir].tuple.dst.protonum;
+-
+- u_int32_t ip;
+- u_int16_t port;
+-
+- dif = ((struct rtable *)(*pskb)->dst)->rt_iif;
+- ip = ct->tuplehash[dir].tuple.dst.u3.ip;
+- port = get_dst_port(&ct->tuplehash[dir].tuple);
+-
+- if (proto == 1 || proto == 17) {
+- if (((*pskb)->mark!=-1) && (*pskb)->mark)
+- ct->xid[0]=(*pskb)->mark;
+- if (ct->xid[0])
+- mark = ct->xid[0];
+-
+- }
+- else if (proto == 6) {
+- if ((*pskb)->sk)
+- connection_sk = (*pskb)->sk;
+- else {
+- connection_sk = inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
+- }
+-
+- if (connection_sk) {
+- connection_sk->sk_peercred.gid = connection_sk->sk_peercred.uid = ct->xid[dir];
+- ct->xid[!dir]=connection_sk->sk_xid;
+- if (connection_sk->sk_xid != 0)
+- mark = connection_sk->sk_xid;
+- if (connection_sk != (*pskb)->sk)
+- sock_put(connection_sk);
+- }
+- break;
+- }
+- }
+- }
+-
+- if (mark != -1)
+- (*pskb)->mark = mark;
+- return XT_CONTINUE;
+-}
+-
+-
+-static int
+-checkentry_v0(const char *tablename,
+- const void *entry,
+- const struct xt_target *target,
+- void *targinfo,
+- unsigned int hook_mask)
+-{
+- struct xt_mark_target_info *markinfo = targinfo;
+-
+- if (markinfo->mark > 0xffffffff) {
+- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+- return 0;
+- }
+- return 1;
+-}
+-
+-static int
+-checkentry_v1(const char *tablename,
+- const void *entry,
+- const struct xt_target *target,
+- void *targinfo,
+- unsigned int hook_mask)
+-{
+- struct xt_mark_target_info_v1 *markinfo = targinfo;
+-
+- if (markinfo->mode != XT_MARK_SET
+- && markinfo->mode != XT_MARK_AND
+- && markinfo->mode != XT_MARK_OR
+- && markinfo->mode != XT_MARK_COPYXID) {
+- printk(KERN_WARNING "MARK: unknown mode %u\n",
+- markinfo->mode);
+- return 0;
+- }
+- if (markinfo->mark > 0xffffffff) {
+- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+- return 0;
+- }
+- return 1;
+-}
+-
+-#ifdef CONFIG_COMPAT
+-struct compat_xt_mark_target_info_v1 {
+- compat_ulong_t mark;
+- u_int8_t mode;
+- u_int8_t __pad1;
+- u_int16_t __pad2;
+-};
+-
+-static void compat_from_user_v1(void *dst, void *src)
+-{
+- struct compat_xt_mark_target_info_v1 *cm = src;
+- struct xt_mark_target_info_v1 m = {
+- .mark = cm->mark,
+- .mode = cm->mode,
+- };
+- memcpy(dst, &m, sizeof(m));
+-}
+-
+-static int compat_to_user_v1(void __user *dst, void *src)
+-{
+- struct xt_mark_target_info_v1 *m = src;
+- struct compat_xt_mark_target_info_v1 cm = {
+- .mark = m->mark,
+- .mode = m->mode,
+- };
+- return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+-}
+-#endif /* CONFIG_COMPAT */
+-
+-static struct xt_target xt_mark_target[] = {
+- {
+- .name = "MARK",
+- .family = AF_INET,
+- .revision = 0,
+- .checkentry = checkentry_v0,
+- .target = target_v0,
+- .targetsize = sizeof(struct xt_mark_target_info),
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+- {
+- .name = "MARK",
+- .family = AF_INET,
+- .revision = 1,
+- .checkentry = checkentry_v1,
+- .target = target_v1,
+- .targetsize = sizeof(struct xt_mark_target_info_v1),
+-#ifdef CONFIG_COMPAT
+- .compatsize = sizeof(struct compat_xt_mark_target_info_v1),
+- .compat_from_user = compat_from_user_v1,
+- .compat_to_user = compat_to_user_v1,
+-#endif
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+- {
+- .name = "MARK",
+- .family = AF_INET6,
+- .revision = 0,
+- .checkentry = checkentry_v0,
+- .target = target_v0,
+- .targetsize = sizeof(struct xt_mark_target_info),
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+-};
+-
+-static int __init xt_mark_init(void)
+-{
+- return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target));
+-}
+-
+-static void __exit xt_mark_fini(void)
+-{
+- xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target));
+-}
+-
+-module_init(xt_mark_init);
+-module_exit(xt_mark_fini);
+diff -Nurb linux-2.6.22-594/net/packet/af_packet.c.orig linux-2.6.22-595/net/packet/af_packet.c.orig
+--- linux-2.6.22-594/net/packet/af_packet.c.orig 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/packet/af_packet.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,1989 +0,0 @@
+-/*
+- * INET An implementation of the TCP/IP protocol suite for the LINUX
+- * operating system. INET is implemented using the BSD Socket
+- * interface as the means of communication with the user level.
+- *
+- * PACKET - implements raw packet sockets.
+- *
+- * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
+- *
+- * Authors: Ross Biro
+- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+- * Alan Cox, <gw4pts@gw4pts.ampr.org>
+- *
+- * Fixes:
+- * Alan Cox : verify_area() now used correctly
+- * Alan Cox : new skbuff lists, look ma no backlogs!
+- * Alan Cox : tidied skbuff lists.
+- * Alan Cox : Now uses generic datagram routines I
+- * added. Also fixed the peek/read crash
+- * from all old Linux datagram code.
+- * Alan Cox : Uses the improved datagram code.
+- * Alan Cox : Added NULL's for socket options.
+- * Alan Cox : Re-commented the code.
+- * Alan Cox : Use new kernel side addressing
+- * Rob Janssen : Correct MTU usage.
+- * Dave Platt : Counter leaks caused by incorrect
+- * interrupt locking and some slightly
+- * dubious gcc output. Can you read
+- * compiler: it said _VOLATILE_
+- * Richard Kooijman : Timestamp fixes.
+- * Alan Cox : New buffers. Use sk->mac.raw.
+- * Alan Cox : sendmsg/recvmsg support.
+- * Alan Cox : Protocol setting support
+- * Alexey Kuznetsov : Untied from IPv4 stack.
+- * Cyrus Durgin : Fixed kerneld for kmod.
+- * Michal Ostrowski : Module initialization cleanup.
+- * Ulises Alonso : Frame number limit removal and
+- * packet_set_ring memory leak.
+- * Eric Biederman : Allow for > 8 byte hardware addresses.
+- * The convention is that longer addresses
+- * will simply extend the hardware address
+- * byte arrays at the end of sockaddr_ll
+- * and packet_mreq.
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- *
+- */
+-
+-#include <linux/types.h>
+-#include <linux/mm.h>
+-#include <linux/capability.h>
+-#include <linux/fcntl.h>
+-#include <linux/socket.h>
+-#include <linux/in.h>
+-#include <linux/inet.h>
+-#include <linux/netdevice.h>
+-#include <linux/if_packet.h>
+-#include <linux/wireless.h>
+-#include <linux/kernel.h>
+-#include <linux/kmod.h>
+-#include <net/ip.h>
+-#include <net/protocol.h>
+-#include <linux/skbuff.h>
+-#include <net/sock.h>
+-#include <linux/errno.h>
+-#include <linux/timer.h>
+-#include <asm/system.h>
+-#include <asm/uaccess.h>
+-#include <asm/ioctls.h>
+-#include <asm/page.h>
+-#include <asm/cacheflush.h>
+-#include <asm/io.h>
+-#include <linux/proc_fs.h>
+-#include <linux/seq_file.h>
+-#include <linux/poll.h>
+-#include <linux/module.h>
+-#include <linux/init.h>
+-#include <linux/vs_network.h>
+-
+-#ifdef CONFIG_INET
+-#include <net/inet_common.h>
+-#endif
+-
+-/*
+- Assumptions:
+- - if device has no dev->hard_header routine, it adds and removes ll header
+- inside itself. In this case ll header is invisible outside of device,
+- but higher levels still should reserve dev->hard_header_len.
+- Some devices are enough clever to reallocate skb, when header
+- will not fit to reserved space (tunnel), another ones are silly
+- (PPP).
+- - packet socket receives packets with pulled ll header,
+- so that SOCK_RAW should push it back.
+-
+-On receive:
+------------
+-
+-Incoming, dev->hard_header!=NULL
+- mac_header -> ll header
+- data -> data
+-
+-Outgoing, dev->hard_header!=NULL
+- mac_header -> ll header
+- data -> ll header
+-
+-Incoming, dev->hard_header==NULL
+- mac_header -> UNKNOWN position. It is very likely, that it points to ll
+- header. PPP makes it, that is wrong, because introduce
+- assymetry between rx and tx paths.
+- data -> data
+-
+-Outgoing, dev->hard_header==NULL
+- mac_header -> data. ll header is still not built!
+- data -> data
+-
+-Resume
+- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+-
+-
+-On transmit:
+-------------
+-
+-dev->hard_header != NULL
+- mac_header -> ll header
+- data -> ll header
+-
+-dev->hard_header == NULL (ll header is added by device, we cannot control it)
+- mac_header -> data
+- data -> data
+-
+- We should set nh.raw on output to correct posistion,
+- packet classifier depends on it.
+- */
+-
+-/* List of all packet sockets. */
+-static HLIST_HEAD(packet_sklist);
+-static DEFINE_RWLOCK(packet_sklist_lock);
+-
+-static atomic_t packet_socks_nr;
+-
+-
+-/* Private packet socket structures. */
+-
+-struct packet_mclist
+-{
+- struct packet_mclist *next;
+- int ifindex;
+- int count;
+- unsigned short type;
+- unsigned short alen;
+- unsigned char addr[MAX_ADDR_LEN];
+-};
+-/* identical to struct packet_mreq except it has
+- * a longer address field.
+- */
+-struct packet_mreq_max
+-{
+- int mr_ifindex;
+- unsigned short mr_type;
+- unsigned short mr_alen;
+- unsigned char mr_address[MAX_ADDR_LEN];
+-};
+-
+-#ifdef CONFIG_PACKET_MMAP
+-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
+-#endif
+-
+-static void packet_flush_mclist(struct sock *sk);
+-
+-struct packet_sock {
+- /* struct sock has to be the first member of packet_sock */
+- struct sock sk;
+- struct tpacket_stats stats;
+-#ifdef CONFIG_PACKET_MMAP
+- char * *pg_vec;
+- unsigned int head;
+- unsigned int frames_per_block;
+- unsigned int frame_size;
+- unsigned int frame_max;
+- int copy_thresh;
+-#endif
+- struct packet_type prot_hook;
+- spinlock_t bind_lock;
+- unsigned int running:1, /* prot_hook is attached*/
+- auxdata:1,
+- origdev:1;
+- int ifindex; /* bound device */
+- __be16 num;
+- struct packet_mclist *mclist;
+-#ifdef CONFIG_PACKET_MMAP
+- atomic_t mapped;
+- unsigned int pg_vec_order;
+- unsigned int pg_vec_pages;
+- unsigned int pg_vec_len;
+-#endif
+-};
+-
+-struct packet_skb_cb {
+- unsigned int origlen;
+- union {
+- struct sockaddr_pkt pkt;
+- struct sockaddr_ll ll;
+- } sa;
+-};
+-
+-#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
+-
+-#ifdef CONFIG_PACKET_MMAP
+-
+-static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
+-{
+- unsigned int pg_vec_pos, frame_offset;
+-
+- pg_vec_pos = position / po->frames_per_block;
+- frame_offset = position % po->frames_per_block;
+-
+- return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
+-}
+-#endif
+-
+-static inline struct packet_sock *pkt_sk(struct sock *sk)
+-{
+- return (struct packet_sock *)sk;
+-}
+-
+-static void packet_sock_destruct(struct sock *sk)
+-{
+- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+-
+- if (!sock_flag(sk, SOCK_DEAD)) {
+- printk("Attempt to release alive packet socket: %p\n", sk);
+- return;
+- }
+-
+- atomic_dec(&packet_socks_nr);
+-#ifdef PACKET_REFCNT_DEBUG
+- printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
+-#endif
+-}
+-
+-
+-static const struct proto_ops packet_ops;
+-
+-static const struct proto_ops packet_ops_spkt;
+-
+-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct sockaddr_pkt *spkt;
+-
+- /*
+- * When we registered the protocol we saved the socket in the data
+- * field for just this event.
+- */
+-
+- sk = pt->af_packet_priv;
+-
+- /*
+- * Yank back the headers [hope the device set this
+- * right or kerboom...]
+- *
+- * Incoming packets have ll header pulled,
+- * push it back.
+- *
+- * For outgoing ones skb->data == skb_mac_header(skb)
+- * so that this procedure is noop.
+- */
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto out;
+-
+- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+- goto oom;
+-
+- /* drop any routing info */
+- dst_release(skb->dst);
+- skb->dst = NULL;
+-
+- /* drop conntrack reference */
+- nf_reset(skb);
+-
+- spkt = &PACKET_SKB_CB(skb)->sa.pkt;
+-
+- skb_push(skb, skb->data - skb_mac_header(skb));
+-
+- /*
+- * The SOCK_PACKET socket receives _all_ frames.
+- */
+-
+- spkt->spkt_family = dev->type;
+- strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
+- spkt->spkt_protocol = skb->protocol;
+-
+- /*
+- * Charge the memory to the socket. This is done specifically
+- * to prevent sockets using all the memory up.
+- */
+-
+- if (sock_queue_rcv_skb(sk,skb) == 0)
+- return 0;
+-
+-out:
+- kfree_skb(skb);
+-oom:
+- return 0;
+-}
+-
+-
+-/*
+- * Output a raw packet to a device layer. This bypasses all the other
+- * protocol layers and you must therefore supply it with a complete frame
+- */
+-
+-static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len)
+-{
+- struct sock *sk = sock->sk;
+- struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
+- struct sk_buff *skb;
+- struct net_device *dev;
+- __be16 proto=0;
+- int err;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+- return -EPERM;
+-
+- /*
+- * Get and verify the address.
+- */
+-
+- if (saddr)
+- {
+- if (msg->msg_namelen < sizeof(struct sockaddr))
+- return(-EINVAL);
+- if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
+- proto=saddr->spkt_protocol;
+- }
+- else
+- return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
+-
+- /*
+- * Find the device first to size check it
+- */
+-
+- saddr->spkt_device[13] = 0;
+- dev = dev_get_by_name(saddr->spkt_device);
+- err = -ENODEV;
+- if (dev == NULL)
+- goto out_unlock;
+-
+- err = -ENETDOWN;
+- if (!(dev->flags & IFF_UP))
+- goto out_unlock;
+-
+- /*
+- * You may not queue a frame bigger than the mtu. This is the lowest level
+- * raw protocol and you must do your own fragmentation at this level.
+- */
+-
+- err = -EMSGSIZE;
+- if (len > dev->mtu + dev->hard_header_len)
+- goto out_unlock;
+-
+- err = -ENOBUFS;
+- skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
+-
+- /*
+- * If the write buffer is full, then tough. At this level the user gets to
+- * deal with the problem - do your own algorithmic backoffs. That's far
+- * more flexible.
+- */
+-
+- if (skb == NULL)
+- goto out_unlock;
+-
+- /*
+- * Fill it in
+- */
+-
+- /* FIXME: Save some space for broken drivers that write a
+- * hard header at transmission time by themselves. PPP is the
+- * notable one here. This should really be fixed at the driver level.
+- */
+- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+- skb_reset_network_header(skb);
+-
+- /* Try to align data part correctly */
+- if (dev->hard_header) {
+- skb->data -= dev->hard_header_len;
+- skb->tail -= dev->hard_header_len;
+- if (len < dev->hard_header_len)
+- skb_reset_network_header(skb);
+- }
+-
+- /* Returns -EFAULT on error */
+- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+- skb->protocol = proto;
+- skb->dev = dev;
+- skb->priority = sk->sk_priority;
+- if (err)
+- goto out_free;
+-
+- /*
+- * Now send it
+- */
+-
+- dev_queue_xmit(skb);
+- dev_put(dev);
+- return(len);
+-
+-out_free:
+- kfree_skb(skb);
+-out_unlock:
+- if (dev)
+- dev_put(dev);
+- return err;
+-}
+-
+-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+- unsigned int res)
+-{
+- struct sk_filter *filter;
+- int tag = skb->skb_tag;
+-
+- if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag))
+- return 0;
+-
+- rcu_read_lock_bh();
+- filter = rcu_dereference(sk->sk_filter);
+- if (filter != NULL)
+- res = sk_run_filter(skb, filter->insns, filter->len);
+- rcu_read_unlock_bh();
+-
+- return res;
+-}
+-
+-/*
+- This function makes lazy skb cloning in hope that most of packets
+- are discarded by BPF.
+-
+- Note tricky part: we DO mangle shared skb! skb->data, skb->len
+- and skb->cb are mangled. It works because (and until) packets
+- falling here are owned by current CPU. Output packets are cloned
+- by dev_queue_xmit_nit(), input packets are processed by net_bh
+- sequencially, so that if we return skb to original state on exit,
+- we will not harm anyone.
+- */
+-
+-static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct sockaddr_ll *sll;
+- struct packet_sock *po;
+- u8 * skb_head = skb->data;
+- int skb_len = skb->len;
+- unsigned int snaplen, res;
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto drop;
+-
+- sk = pt->af_packet_priv;
+- po = pkt_sk(sk);
+-
+- skb->dev = dev;
+-
+- if (dev->hard_header) {
+- /* The device has an explicit notion of ll header,
+- exported to higher levels.
+-
+- Otherwise, the device hides datails of it frame
+- structure, so that corresponding packet head
+- never delivered to user.
+- */
+- if (sk->sk_type != SOCK_DGRAM)
+- skb_push(skb, skb->data - skb_mac_header(skb));
+- else if (skb->pkt_type == PACKET_OUTGOING) {
+- /* Special case: outgoing packets have ll header at head */
+- skb_pull(skb, skb_network_offset(skb));
+- }
+- }
+-
+- snaplen = skb->len;
+-
+- res = run_filter(skb, sk, snaplen);
+- if (!res)
+- goto drop_n_restore;
+- if (snaplen > res)
+- snaplen = res;
+-
+- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+- (unsigned)sk->sk_rcvbuf)
+- goto drop_n_acct;
+-
+- if (skb_shared(skb)) {
+- struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+- if (nskb == NULL)
+- goto drop_n_acct;
+-
+- if (skb_head != skb->data) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+- kfree_skb(skb);
+- skb = nskb;
+- }
+-
+- BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
+- sizeof(skb->cb));
+-
+- sll = &PACKET_SKB_CB(skb)->sa.ll;
+- sll->sll_family = AF_PACKET;
+- sll->sll_hatype = dev->type;
+- sll->sll_protocol = skb->protocol;
+- sll->sll_pkttype = skb->pkt_type;
+- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
+- sll->sll_ifindex = orig_dev->ifindex;
+- else
+- sll->sll_ifindex = dev->ifindex;
+- sll->sll_halen = 0;
+-
+- if (dev->hard_header_parse)
+- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+-
+- PACKET_SKB_CB(skb)->origlen = skb->len;
+-
+- if (pskb_trim(skb, snaplen))
+- goto drop_n_acct;
+-
+- skb_set_owner_r(skb, sk);
+- skb->dev = NULL;
+- dst_release(skb->dst);
+- skb->dst = NULL;
+-
+- /* drop conntrack reference */
+- nf_reset(skb);
+-
+- spin_lock(&sk->sk_receive_queue.lock);
+- po->stats.tp_packets++;
+- __skb_queue_tail(&sk->sk_receive_queue, skb);
+- spin_unlock(&sk->sk_receive_queue.lock);
+- sk->sk_data_ready(sk, skb->len);
+- return 0;
+-
+-drop_n_acct:
+- spin_lock(&sk->sk_receive_queue.lock);
+- po->stats.tp_drops++;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+-drop_n_restore:
+- if (skb_head != skb->data && skb_shared(skb)) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+-drop:
+- kfree_skb(skb);
+- return 0;
+-}
+-
+-#ifdef CONFIG_PACKET_MMAP
+-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct packet_sock *po;
+- struct sockaddr_ll *sll;
+- struct tpacket_hdr *h;
+- u8 * skb_head = skb->data;
+- int skb_len = skb->len;
+- unsigned int snaplen, res;
+- unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+- unsigned short macoff, netoff;
+- struct sk_buff *copy_skb = NULL;
+- struct timeval tv;
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto drop;
+-
+- sk = pt->af_packet_priv;
+- po = pkt_sk(sk);
+-
+- if (dev->hard_header) {
+- if (sk->sk_type != SOCK_DGRAM)
+- skb_push(skb, skb->data - skb_mac_header(skb));
+- else if (skb->pkt_type == PACKET_OUTGOING) {
+- /* Special case: outgoing packets have ll header at head */
+- skb_pull(skb, skb_network_offset(skb));
+- }
+- }
+-
+- if (skb->ip_summed == CHECKSUM_PARTIAL)
+- status |= TP_STATUS_CSUMNOTREADY;
+-
+- snaplen = skb->len;
+-
+- res = run_filter(skb, sk, snaplen);
+- if (!res)
+- goto drop_n_restore;
+- if (snaplen > res)
+- snaplen = res;
+-
+- if (sk->sk_type == SOCK_DGRAM) {
+- macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
+- } else {
+- unsigned maclen = skb_network_offset(skb);
+- netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
+- macoff = netoff - maclen;
+- }
+-
+- if (macoff + snaplen > po->frame_size) {
+- if (po->copy_thresh &&
+- atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
+- (unsigned)sk->sk_rcvbuf) {
+- if (skb_shared(skb)) {
+- copy_skb = skb_clone(skb, GFP_ATOMIC);
+- } else {
+- copy_skb = skb_get(skb);
+- skb_head = skb->data;
+- }
+- if (copy_skb)
+- skb_set_owner_r(copy_skb, sk);
+- }
+- snaplen = po->frame_size - macoff;
+- if ((int)snaplen < 0)
+- snaplen = 0;
+- }
+-
+- spin_lock(&sk->sk_receive_queue.lock);
+- h = packet_lookup_frame(po, po->head);
+-
+- if (h->tp_status)
+- goto ring_is_full;
+- po->head = po->head != po->frame_max ? po->head+1 : 0;
+- po->stats.tp_packets++;
+- if (copy_skb) {
+- status |= TP_STATUS_COPY;
+- __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
+- }
+- if (!po->stats.tp_drops)
+- status &= ~TP_STATUS_LOSING;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+- skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
+-
+- h->tp_len = skb->len;
+- h->tp_snaplen = snaplen;
+- h->tp_mac = macoff;
+- h->tp_net = netoff;
+- if (skb->tstamp.tv64 == 0) {
+- __net_timestamp(skb);
+- sock_enable_timestamp(sk);
+- }
+- tv = ktime_to_timeval(skb->tstamp);
+- h->tp_sec = tv.tv_sec;
+- h->tp_usec = tv.tv_usec;
+-
+- sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
+- sll->sll_halen = 0;
+- if (dev->hard_header_parse)
+- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+- sll->sll_family = AF_PACKET;
+- sll->sll_hatype = dev->type;
+- sll->sll_protocol = skb->protocol;
+- sll->sll_pkttype = skb->pkt_type;
+- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
+- sll->sll_ifindex = orig_dev->ifindex;
+- else
+- sll->sll_ifindex = dev->ifindex;
+-
+- h->tp_status = status;
+- smp_mb();
+-
+- {
+- struct page *p_start, *p_end;
+- u8 *h_end = (u8 *)h + macoff + snaplen - 1;
+-
+- p_start = virt_to_page(h);
+- p_end = virt_to_page(h_end);
+- while (p_start <= p_end) {
+- flush_dcache_page(p_start);
+- p_start++;
+- }
+- }
+-
+- sk->sk_data_ready(sk, 0);
+-
+-drop_n_restore:
+- if (skb_head != skb->data && skb_shared(skb)) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+-drop:
+- kfree_skb(skb);
+- return 0;
+-
+-ring_is_full:
+- po->stats.tp_drops++;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+- sk->sk_data_ready(sk, 0);
+- if (copy_skb)
+- kfree_skb(copy_skb);
+- goto drop_n_restore;
+-}
+-
+-#endif
+-
+-
+-static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len)
+-{
+- struct sock *sk = sock->sk;
+- struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
+- struct sk_buff *skb;
+- struct net_device *dev;
+- __be16 proto;
+- unsigned char *addr;
+- int ifindex, err, reserve = 0;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+- return -EPERM;
+-
+- /*
+- * Get and verify the address.
+- */
+-
+- if (saddr == NULL) {
+- struct packet_sock *po = pkt_sk(sk);
+-
+- ifindex = po->ifindex;
+- proto = po->num;
+- addr = NULL;
+- } else {
+- err = -EINVAL;
+- if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+- goto out;
+- if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
+- goto out;
+- ifindex = saddr->sll_ifindex;
+- proto = saddr->sll_protocol;
+- addr = saddr->sll_addr;
+- }
+-
+-
+- dev = dev_get_by_index(ifindex);
+- err = -ENXIO;
+- if (dev == NULL)
+- goto out_unlock;
+- if (sock->type == SOCK_RAW)
+- reserve = dev->hard_header_len;
+-
+- err = -ENETDOWN;
+- if (!(dev->flags & IFF_UP))
+- goto out_unlock;
+-
+- err = -EMSGSIZE;
+- if (len > dev->mtu+reserve)
+- goto out_unlock;
+-
+- skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
+- msg->msg_flags & MSG_DONTWAIT, &err);
+- if (skb==NULL)
+- goto out_unlock;
+-
+- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+- skb_reset_network_header(skb);
+-
+- if (dev->hard_header) {
+- int res;
+- err = -EINVAL;
+- res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
+- if (sock->type != SOCK_DGRAM) {
+- skb_reset_tail_pointer(skb);
+- skb->len = 0;
+- } else if (res < 0)
+- goto out_free;
+- }
+-
+- /* Returns -EFAULT on error */
+- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+- if (err)
+- goto out_free;
+-
+- skb->protocol = proto;
+- skb->dev = dev;
+- skb->priority = sk->sk_priority;
+-
+- /*
+- * Now send it
+- */
+-
+- err = dev_queue_xmit(skb);
+- if (err > 0 && (err = net_xmit_errno(err)) != 0)
+- goto out_unlock;
+-
+- dev_put(dev);
+-
+- return(len);
+-
+-out_free:
+- kfree_skb(skb);
+-out_unlock:
+- if (dev)
+- dev_put(dev);
+-out:
+- return err;
+-}
+-
+-/*
+- * Close a PACKET socket. This is fairly simple. We immediately go
+- * to 'closed' state and remove our protocol entry in the device list.
+- */
+-
+-static int packet_release(struct socket *sock)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po;
+-
+- if (!sk)
+- return 0;
+-
+- po = pkt_sk(sk);
+-
+- write_lock_bh(&packet_sklist_lock);
+- sk_del_node_init(sk);
+- write_unlock_bh(&packet_sklist_lock);
+-
+- /*
+- * Unhook packet receive handler.
+- */
+-
+- if (po->running) {
+- /*
+- * Remove the protocol hook
+- */
+- dev_remove_pack(&po->prot_hook);
+- po->running = 0;
+- po->num = 0;
+- __sock_put(sk);
+- }
+-
+- packet_flush_mclist(sk);
+-
+-#ifdef CONFIG_PACKET_MMAP
+- if (po->pg_vec) {
+- struct tpacket_req req;
+- memset(&req, 0, sizeof(req));
+- packet_set_ring(sk, &req, 1);
+- }
+-#endif
+-
+- /*
+- * Now the socket is dead. No more input will appear.
+- */
+-
+- sock_orphan(sk);
+- sock->sk = NULL;
+-
+- /* Purge queues */
+-
+- skb_queue_purge(&sk->sk_receive_queue);
+-
+- sock_put(sk);
+- return 0;
+-}
+-
+-/*
+- * Attach a packet hook.
+- */
+-
+-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- /*
+- * Detach an existing hook if present.
+- */
+-
+- lock_sock(sk);
+-
+- spin_lock(&po->bind_lock);
+- if (po->running) {
+- __sock_put(sk);
+- po->running = 0;
+- po->num = 0;
+- spin_unlock(&po->bind_lock);
+- dev_remove_pack(&po->prot_hook);
+- spin_lock(&po->bind_lock);
+- }
+-
+- po->num = protocol;
+- po->prot_hook.type = protocol;
+- po->prot_hook.dev = dev;
+-
+- po->ifindex = dev ? dev->ifindex : 0;
+-
+- if (protocol == 0)
+- goto out_unlock;
+-
+- if (dev) {
+- if (dev->flags&IFF_UP) {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- } else {
+- sk->sk_err = ENETDOWN;
+- if (!sock_flag(sk, SOCK_DEAD))
+- sk->sk_error_report(sk);
+- }
+- } else {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+-
+-out_unlock:
+- spin_unlock(&po->bind_lock);
+- release_sock(sk);
+- return 0;
+-}
+-
+-/*
+- * Bind a packet socket to a device
+- */
+-
+-static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sock *sk=sock->sk;
+- char name[15];
+- struct net_device *dev;
+- int err = -ENODEV;
+-
+- /*
+- * Check legality
+- */
+-
+- if (addr_len != sizeof(struct sockaddr))
+- return -EINVAL;
+- strlcpy(name,uaddr->sa_data,sizeof(name));
+-
+- dev = dev_get_by_name(name);
+- if (dev) {
+- err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
+- dev_put(dev);
+- }
+- return err;
+-}
+-
+-static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
+- struct sock *sk=sock->sk;
+- struct net_device *dev = NULL;
+- int err;
+-
+-
+- /*
+- * Check legality
+- */
+-
+- if (addr_len < sizeof(struct sockaddr_ll))
+- return -EINVAL;
+- if (sll->sll_family != AF_PACKET)
+- return -EINVAL;
+-
+- if (sll->sll_ifindex) {
+- err = -ENODEV;
+- dev = dev_get_by_index(sll->sll_ifindex);
+- if (dev == NULL)
+- goto out;
+- }
+- err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
+- if (dev)
+- dev_put(dev);
+-
+-out:
+- return err;
+-}
+-
+-static struct proto packet_proto = {
+- .name = "PACKET",
+- .owner = THIS_MODULE,
+- .obj_size = sizeof(struct packet_sock),
+-};
+-
+-/*
+- * Create a packet of type SOCK_PACKET.
+- */
+-
+-static int packet_create(struct socket *sock, int protocol)
+-{
+- struct sock *sk;
+- struct packet_sock *po;
+- __be16 proto = (__force __be16)protocol; /* weird, but documented */
+- int err;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
+- return -EPERM;
+- if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+- sock->type != SOCK_PACKET)
+- return -ESOCKTNOSUPPORT;
+-
+- sock->state = SS_UNCONNECTED;
+-
+- err = -ENOBUFS;
+- sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
+- if (sk == NULL)
+- goto out;
+-
+- sock->ops = &packet_ops;
+- if (sock->type == SOCK_PACKET)
+- sock->ops = &packet_ops_spkt;
+-
+- sock_init_data(sock, sk);
+-
+- po = pkt_sk(sk);
+- sk->sk_family = PF_PACKET;
+- po->num = proto;
+-
+- sk->sk_destruct = packet_sock_destruct;
+- atomic_inc(&packet_socks_nr);
+-
+- /*
+- * Attach a protocol block
+- */
+-
+- spin_lock_init(&po->bind_lock);
+- po->prot_hook.func = packet_rcv;
+-
+- if (sock->type == SOCK_PACKET)
+- po->prot_hook.func = packet_rcv_spkt;
+-
+- po->prot_hook.af_packet_priv = sk;
+-
+- if (proto) {
+- po->prot_hook.type = proto;
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+-
+- write_lock_bh(&packet_sklist_lock);
+- sk_add_node(sk, &packet_sklist);
+- write_unlock_bh(&packet_sklist_lock);
+- return(0);
+-out:
+- return err;
+-}
+-
+-/*
+- * Pull a packet from our receive queue and hand it to the user.
+- * If necessary we block.
+- */
+-
+-static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len, int flags)
+-{
+- struct sock *sk = sock->sk;
+- struct sk_buff *skb;
+- int copied, err;
+- struct sockaddr_ll *sll;
+-
+- err = -EINVAL;
+- if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
+- goto out;
+-
+-#if 0
+- /* What error should we return now? EUNATTACH? */
+- if (pkt_sk(sk)->ifindex < 0)
+- return -ENODEV;
+-#endif
+-
+- /*
+- * Call the generic datagram receiver. This handles all sorts
+- * of horrible races and re-entrancy so we can forget about it
+- * in the protocol layers.
+- *
+- * Now it will return ENETDOWN, if device have just gone down,
+- * but then it will block.
+- */
+-
+- skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
+-
+- /*
+- * An error occurred so return it. Because skb_recv_datagram()
+- * handles the blocking we don't see and worry about blocking
+- * retries.
+- */
+-
+- if (skb == NULL)
+- goto out;
+-
+- /*
+- * If the address length field is there to be filled in, we fill
+- * it in now.
+- */
+-
+- sll = &PACKET_SKB_CB(skb)->sa.ll;
+- if (sock->type == SOCK_PACKET)
+- msg->msg_namelen = sizeof(struct sockaddr_pkt);
+- else
+- msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
+-
+- /*
+- * You lose any data beyond the buffer you gave. If it worries a
+- * user program they can ask the device for its MTU anyway.
+- */
+-
+- copied = skb->len;
+- if (copied > len)
+- {
+- copied=len;
+- msg->msg_flags|=MSG_TRUNC;
+- }
+-
+- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+- if (err)
+- goto out_free;
+-
+- sock_recv_timestamp(msg, sk, skb);
+-
+- if (msg->msg_name)
+- memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
+- msg->msg_namelen);
+-
+- if (pkt_sk(sk)->auxdata) {
+- struct tpacket_auxdata aux;
+-
+- aux.tp_status = TP_STATUS_USER;
+- if (skb->ip_summed == CHECKSUM_PARTIAL)
+- aux.tp_status |= TP_STATUS_CSUMNOTREADY;
+- aux.tp_len = PACKET_SKB_CB(skb)->origlen;
+- aux.tp_snaplen = skb->len;
+- aux.tp_mac = 0;
+- aux.tp_net = skb_network_offset(skb);
+-
+- put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
+- }
+-
+- /*
+- * Free or return the buffer as appropriate. Again this
+- * hides all the races and re-entrancy issues from us.
+- */
+- err = (flags&MSG_TRUNC) ? skb->len : copied;
+-
+-out_free:
+- skb_free_datagram(sk, skb);
+-out:
+- return err;
+-}
+-
+-static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct net_device *dev;
+- struct sock *sk = sock->sk;
+-
+- if (peer)
+- return -EOPNOTSUPP;
+-
+- uaddr->sa_family = AF_PACKET;
+- dev = dev_get_by_index(pkt_sk(sk)->ifindex);
+- if (dev) {
+- strlcpy(uaddr->sa_data, dev->name, 15);
+- dev_put(dev);
+- } else
+- memset(uaddr->sa_data, 0, 14);
+- *uaddr_len = sizeof(*uaddr);
+-
+- return 0;
+-}
+-
+-static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct net_device *dev;
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
+-
+- if (peer)
+- return -EOPNOTSUPP;
+-
+- sll->sll_family = AF_PACKET;
+- sll->sll_ifindex = po->ifindex;
+- sll->sll_protocol = po->num;
+- dev = dev_get_by_index(po->ifindex);
+- if (dev) {
+- sll->sll_hatype = dev->type;
+- sll->sll_halen = dev->addr_len;
+- memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+- dev_put(dev);
+- } else {
+- sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
+- sll->sll_halen = 0;
+- }
+- *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
+-
+- return 0;
+-}
+-
+-static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
+-{
+- switch (i->type) {
+- case PACKET_MR_MULTICAST:
+- if (what > 0)
+- dev_mc_add(dev, i->addr, i->alen, 0);
+- else
+- dev_mc_delete(dev, i->addr, i->alen, 0);
+- break;
+- case PACKET_MR_PROMISC:
+- dev_set_promiscuity(dev, what);
+- break;
+- case PACKET_MR_ALLMULTI:
+- dev_set_allmulti(dev, what);
+- break;
+- default:;
+- }
+-}
+-
+-static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
+-{
+- for ( ; i; i=i->next) {
+- if (i->ifindex == dev->ifindex)
+- packet_dev_mc(dev, i, what);
+- }
+-}
+-
+-static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- struct packet_mclist *ml, *i;
+- struct net_device *dev;
+- int err;
+-
+- rtnl_lock();
+-
+- err = -ENODEV;
+- dev = __dev_get_by_index(mreq->mr_ifindex);
+- if (!dev)
+- goto done;
+-
+- err = -EINVAL;
+- if (mreq->mr_alen > dev->addr_len)
+- goto done;
+-
+- err = -ENOBUFS;
+- i = kmalloc(sizeof(*i), GFP_KERNEL);
+- if (i == NULL)
+- goto done;
+-
+- err = 0;
+- for (ml = po->mclist; ml; ml = ml->next) {
+- if (ml->ifindex == mreq->mr_ifindex &&
+- ml->type == mreq->mr_type &&
+- ml->alen == mreq->mr_alen &&
+- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+- ml->count++;
+- /* Free the new element ... */
+- kfree(i);
+- goto done;
+- }
+- }
+-
+- i->type = mreq->mr_type;
+- i->ifindex = mreq->mr_ifindex;
+- i->alen = mreq->mr_alen;
+- memcpy(i->addr, mreq->mr_address, i->alen);
+- i->count = 1;
+- i->next = po->mclist;
+- po->mclist = i;
+- packet_dev_mc(dev, i, +1);
+-
+-done:
+- rtnl_unlock();
+- return err;
+-}
+-
+-static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
+-{
+- struct packet_mclist *ml, **mlp;
+-
+- rtnl_lock();
+-
+- for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
+- if (ml->ifindex == mreq->mr_ifindex &&
+- ml->type == mreq->mr_type &&
+- ml->alen == mreq->mr_alen &&
+- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+- if (--ml->count == 0) {
+- struct net_device *dev;
+- *mlp = ml->next;
+- dev = dev_get_by_index(ml->ifindex);
+- if (dev) {
+- packet_dev_mc(dev, ml, -1);
+- dev_put(dev);
+- }
+- kfree(ml);
+- }
+- rtnl_unlock();
+- return 0;
+- }
+- }
+- rtnl_unlock();
+- return -EADDRNOTAVAIL;
+-}
+-
+-static void packet_flush_mclist(struct sock *sk)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- struct packet_mclist *ml;
+-
+- if (!po->mclist)
+- return;
+-
+- rtnl_lock();
+- while ((ml = po->mclist) != NULL) {
+- struct net_device *dev;
+-
+- po->mclist = ml->next;
+- if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
+- packet_dev_mc(dev, ml, -1);
+- dev_put(dev);
+- }
+- kfree(ml);
+- }
+- rtnl_unlock();
+-}
+-
+-static int
+-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- int ret;
+-
+- if (level != SOL_PACKET)
+- return -ENOPROTOOPT;
+-
+- switch(optname) {
+- case PACKET_ADD_MEMBERSHIP:
+- case PACKET_DROP_MEMBERSHIP:
+- {
+- struct packet_mreq_max mreq;
+- int len = optlen;
+- memset(&mreq, 0, sizeof(mreq));
+- if (len < sizeof(struct packet_mreq))
+- return -EINVAL;
+- if (len > sizeof(mreq))
+- len = sizeof(mreq);
+- if (copy_from_user(&mreq,optval,len))
+- return -EFAULT;
+- if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
+- return -EINVAL;
+- if (optname == PACKET_ADD_MEMBERSHIP)
+- ret = packet_mc_add(sk, &mreq);
+- else
+- ret = packet_mc_drop(sk, &mreq);
+- return ret;
+- }
+-
+-#ifdef CONFIG_PACKET_MMAP
+- case PACKET_RX_RING:
+- {
+- struct tpacket_req req;
+-
+- if (optlen<sizeof(req))
+- return -EINVAL;
+- if (copy_from_user(&req,optval,sizeof(req)))
+- return -EFAULT;
+- return packet_set_ring(sk, &req, 0);
+- }
+- case PACKET_COPY_THRESH:
+- {
+- int val;
+-
+- if (optlen!=sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val,optval,sizeof(val)))
+- return -EFAULT;
+-
+- pkt_sk(sk)->copy_thresh = val;
+- return 0;
+- }
+-#endif
+- case PACKET_AUXDATA:
+- {
+- int val;
+-
+- if (optlen < sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val, optval, sizeof(val)))
+- return -EFAULT;
+-
+- po->auxdata = !!val;
+- return 0;
+- }
+- case PACKET_ORIGDEV:
+- {
+- int val;
+-
+- if (optlen < sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val, optval, sizeof(val)))
+- return -EFAULT;
+-
+- po->origdev = !!val;
+- return 0;
+- }
+- default:
+- return -ENOPROTOOPT;
+- }
+-}
+-
+-static int packet_getsockopt(struct socket *sock, int level, int optname,
+- char __user *optval, int __user *optlen)
+-{
+- int len;
+- int val;
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- void *data;
+- struct tpacket_stats st;
+-
+- if (level != SOL_PACKET)
+- return -ENOPROTOOPT;
+-
+- if (get_user(len, optlen))
+- return -EFAULT;
+-
+- if (len < 0)
+- return -EINVAL;
+-
+- switch(optname) {
+- case PACKET_STATISTICS:
+- if (len > sizeof(struct tpacket_stats))
+- len = sizeof(struct tpacket_stats);
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- st = po->stats;
+- memset(&po->stats, 0, sizeof(st));
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- st.tp_packets += st.tp_drops;
+-
+- data = &st;
+- break;
+- case PACKET_AUXDATA:
+- if (len > sizeof(int))
+- len = sizeof(int);
+- val = po->auxdata;
+-
+- data = &val;
+- break;
+- case PACKET_ORIGDEV:
+- if (len > sizeof(int))
+- len = sizeof(int);
+- val = po->origdev;
+-
+- data = &val;
+- break;
+- default:
+- return -ENOPROTOOPT;
+- }
+-
+- if (put_user(len, optlen))
+- return -EFAULT;
+- if (copy_to_user(optval, data, len))
+- return -EFAULT;
+- return 0;
+-}
+-
+-
+-static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
+-{
+- struct sock *sk;
+- struct hlist_node *node;
+- struct net_device *dev = data;
+-
+- read_lock(&packet_sklist_lock);
+- sk_for_each(sk, node, &packet_sklist) {
+- struct packet_sock *po = pkt_sk(sk);
+-
+- switch (msg) {
+- case NETDEV_UNREGISTER:
+- if (po->mclist)
+- packet_dev_mclist(dev, po->mclist, -1);
+- /* fallthrough */
+-
+- case NETDEV_DOWN:
+- if (dev->ifindex == po->ifindex) {
+- spin_lock(&po->bind_lock);
+- if (po->running) {
+- __dev_remove_pack(&po->prot_hook);
+- __sock_put(sk);
+- po->running = 0;
+- sk->sk_err = ENETDOWN;
+- if (!sock_flag(sk, SOCK_DEAD))
+- sk->sk_error_report(sk);
+- }
+- if (msg == NETDEV_UNREGISTER) {
+- po->ifindex = -1;
+- po->prot_hook.dev = NULL;
+- }
+- spin_unlock(&po->bind_lock);
+- }
+- break;
+- case NETDEV_UP:
+- spin_lock(&po->bind_lock);
+- if (dev->ifindex == po->ifindex && po->num &&
+- !po->running) {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+- spin_unlock(&po->bind_lock);
+- break;
+- }
+- }
+- read_unlock(&packet_sklist_lock);
+- return NOTIFY_DONE;
+-}
+-
+-
+-static int packet_ioctl(struct socket *sock, unsigned int cmd,
+- unsigned long arg)
+-{
+- struct sock *sk = sock->sk;
+-
+- switch(cmd) {
+- case SIOCOUTQ:
+- {
+- int amount = atomic_read(&sk->sk_wmem_alloc);
+- return put_user(amount, (int __user *)arg);
+- }
+- case SIOCINQ:
+- {
+- struct sk_buff *skb;
+- int amount = 0;
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- skb = skb_peek(&sk->sk_receive_queue);
+- if (skb)
+- amount = skb->len;
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- return put_user(amount, (int __user *)arg);
+- }
+- case SIOCGSTAMP:
+- return sock_get_timestamp(sk, (struct timeval __user *)arg);
+- case SIOCGSTAMPNS:
+- return sock_get_timestampns(sk, (struct timespec __user *)arg);
+-
+-#ifdef CONFIG_INET
+- case SIOCADDRT:
+- case SIOCDELRT:
+- case SIOCDARP:
+- case SIOCGARP:
+- case SIOCSARP:
+- case SIOCGIFADDR:
+- case SIOCSIFADDR:
+- case SIOCGIFBRDADDR:
+- case SIOCSIFBRDADDR:
+- case SIOCGIFNETMASK:
+- case SIOCSIFNETMASK:
+- case SIOCGIFDSTADDR:
+- case SIOCSIFDSTADDR:
+- case SIOCSIFFLAGS:
+- return inet_dgram_ops.ioctl(sock, cmd, arg);
+-#endif
+-
+- default:
+- return -ENOIOCTLCMD;
+- }
+- return 0;
+-}
+-
+-#ifndef CONFIG_PACKET_MMAP
+-#define packet_mmap sock_no_mmap
+-#define packet_poll datagram_poll
+-#else
+-
+-static unsigned int packet_poll(struct file * file, struct socket *sock,
+- poll_table *wait)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- unsigned int mask = datagram_poll(file, sock, wait);
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- if (po->pg_vec) {
+- unsigned last = po->head ? po->head-1 : po->frame_max;
+- struct tpacket_hdr *h;
+-
+- h = packet_lookup_frame(po, last);
+-
+- if (h->tp_status)
+- mask |= POLLIN | POLLRDNORM;
+- }
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- return mask;
+-}
+-
+-
+-/* Dirty? Well, I still did not learn better way to account
+- * for user mmaps.
+- */
+-
+-static void packet_mm_open(struct vm_area_struct *vma)
+-{
+- struct file *file = vma->vm_file;
+- struct socket * sock = file->private_data;
+- struct sock *sk = sock->sk;
+-
+- if (sk)
+- atomic_inc(&pkt_sk(sk)->mapped);
+-}
+-
+-static void packet_mm_close(struct vm_area_struct *vma)
+-{
+- struct file *file = vma->vm_file;
+- struct socket * sock = file->private_data;
+- struct sock *sk = sock->sk;
+-
+- if (sk)
+- atomic_dec(&pkt_sk(sk)->mapped);
+-}
+-
+-static struct vm_operations_struct packet_mmap_ops = {
+- .open = packet_mm_open,
+- .close =packet_mm_close,
+-};
+-
+-static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
+-{
+- return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
+-}
+-
+-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+-{
+- int i;
+-
+- for (i = 0; i < len; i++) {
+- if (likely(pg_vec[i]))
+- free_pages((unsigned long) pg_vec[i], order);
+- }
+- kfree(pg_vec);
+-}
+-
+-static inline char *alloc_one_pg_vec_page(unsigned long order)
+-{
+- return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+- order);
+-}
+-
+-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+-{
+- unsigned int block_nr = req->tp_block_nr;
+- char **pg_vec;
+- int i;
+-
+- pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+- if (unlikely(!pg_vec))
+- goto out;
+-
+- for (i = 0; i < block_nr; i++) {
+- pg_vec[i] = alloc_one_pg_vec_page(order);
+- if (unlikely(!pg_vec[i]))
+- goto out_free_pgvec;
+- }
+-
+-out:
+- return pg_vec;
+-
+-out_free_pgvec:
+- free_pg_vec(pg_vec, order, block_nr);
+- pg_vec = NULL;
+- goto out;
+-}
+-
+-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
+-{
+- char **pg_vec = NULL;
+- struct packet_sock *po = pkt_sk(sk);
+- int was_running, order = 0;
+- __be16 num;
+- int err = 0;
+-
+- if (req->tp_block_nr) {
+- int i, l;
+-
+- /* Sanity tests and some calculations */
+-
+- if (unlikely(po->pg_vec))
+- return -EBUSY;
+-
+- if (unlikely((int)req->tp_block_size <= 0))
+- return -EINVAL;
+- if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+- return -EINVAL;
+- if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
+- return -EINVAL;
+- if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
+- return -EINVAL;
+-
+- po->frames_per_block = req->tp_block_size/req->tp_frame_size;
+- if (unlikely(po->frames_per_block <= 0))
+- return -EINVAL;
+- if (unlikely((po->frames_per_block * req->tp_block_nr) !=
+- req->tp_frame_nr))
+- return -EINVAL;
+-
+- err = -ENOMEM;
+- order = get_order(req->tp_block_size);
+- pg_vec = alloc_pg_vec(req, order);
+- if (unlikely(!pg_vec))
+- goto out;
+-
+- l = 0;
+- for (i = 0; i < req->tp_block_nr; i++) {
+- char *ptr = pg_vec[i];
+- struct tpacket_hdr *header;
+- int k;
+-
+- for (k = 0; k < po->frames_per_block; k++) {
+- header = (struct tpacket_hdr *) ptr;
+- header->tp_status = TP_STATUS_KERNEL;
+- ptr += req->tp_frame_size;
+- }
+- }
+- /* Done */
+- } else {
+- if (unlikely(req->tp_frame_nr))
+- return -EINVAL;
+- }
+-
+- lock_sock(sk);
+-
+- /* Detach socket from network */
+- spin_lock(&po->bind_lock);
+- was_running = po->running;
+- num = po->num;
+- if (was_running) {
+- __dev_remove_pack(&po->prot_hook);
+- po->num = 0;
+- po->running = 0;
+- __sock_put(sk);
+- }
+- spin_unlock(&po->bind_lock);
+-
+- synchronize_net();
+-
+- err = -EBUSY;
+- if (closing || atomic_read(&po->mapped) == 0) {
+- err = 0;
+-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- pg_vec = XC(po->pg_vec, pg_vec);
+- po->frame_max = (req->tp_frame_nr - 1);
+- po->head = 0;
+- po->frame_size = req->tp_frame_size;
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+-
+- order = XC(po->pg_vec_order, order);
+- req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
+-
+- po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+- po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
+- skb_queue_purge(&sk->sk_receive_queue);
+-#undef XC
+- if (atomic_read(&po->mapped))
+- printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
+- }
+-
+- spin_lock(&po->bind_lock);
+- if (was_running && !po->running) {
+- sock_hold(sk);
+- po->running = 1;
+- po->num = num;
+- dev_add_pack(&po->prot_hook);
+- }
+- spin_unlock(&po->bind_lock);
+-
+- release_sock(sk);
+-
+- if (pg_vec)
+- free_pg_vec(pg_vec, order, req->tp_block_nr);
+-out:
+- return err;
+-}
+-
+-static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- unsigned long size;
+- unsigned long start;
+- int err = -EINVAL;
+- int i;
+-
+- if (vma->vm_pgoff)
+- return -EINVAL;
+-
+- size = vma->vm_end - vma->vm_start;
+-
+- lock_sock(sk);
+- if (po->pg_vec == NULL)
+- goto out;
+- if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
+- goto out;
+-
+- start = vma->vm_start;
+- for (i = 0; i < po->pg_vec_len; i++) {
+- struct page *page = virt_to_page(po->pg_vec[i]);
+- int pg_num;
+-
+- for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
+- err = vm_insert_page(vma, start, page);
+- if (unlikely(err))
+- goto out;
+- start += PAGE_SIZE;
+- }
+- }
+- atomic_inc(&po->mapped);
+- vma->vm_ops = &packet_mmap_ops;
+- err = 0;
+-
+-out:
+- release_sock(sk);
+- return err;
+-}
+-#endif
+-
+-
+-static const struct proto_ops packet_ops_spkt = {
+- .family = PF_PACKET,
+- .owner = THIS_MODULE,
+- .release = packet_release,
+- .bind = packet_bind_spkt,
+- .connect = sock_no_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = packet_getname_spkt,
+- .poll = datagram_poll,
+- .ioctl = packet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = sock_no_shutdown,
+- .setsockopt = sock_no_setsockopt,
+- .getsockopt = sock_no_getsockopt,
+- .sendmsg = packet_sendmsg_spkt,
+- .recvmsg = packet_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = sock_no_sendpage,
+-};
+-
+-static const struct proto_ops packet_ops = {
+- .family = PF_PACKET,
+- .owner = THIS_MODULE,
+- .release = packet_release,
+- .bind = packet_bind,
+- .connect = sock_no_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = packet_getname,
+- .poll = packet_poll,
+- .ioctl = packet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = sock_no_shutdown,
+- .setsockopt = packet_setsockopt,
+- .getsockopt = packet_getsockopt,
+- .sendmsg = packet_sendmsg,
+- .recvmsg = packet_recvmsg,
+- .mmap = packet_mmap,
+- .sendpage = sock_no_sendpage,
+-};
+-
+-static struct net_proto_family packet_family_ops = {
+- .family = PF_PACKET,
+- .create = packet_create,
+- .owner = THIS_MODULE,
+-};
+-
+-static struct notifier_block packet_netdev_notifier = {
+- .notifier_call =packet_notifier,
+-};
+-
+-#ifdef CONFIG_PROC_FS
+-static inline struct sock *packet_seq_idx(loff_t off)
+-{
+- struct sock *s;
+- struct hlist_node *node;
+-
+- sk_for_each(s, node, &packet_sklist) {
+- if (!off--)
+- return s;
+- }
+- return NULL;
+-}
+-
+-static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
+-{
+- read_lock(&packet_sklist_lock);
+- return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+-}
+-
+-static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+-{
+- ++*pos;
+- return (v == SEQ_START_TOKEN)
+- ? sk_head(&packet_sklist)
+- : sk_next((struct sock*)v) ;
+-}
+-
+-static void packet_seq_stop(struct seq_file *seq, void *v)
+-{
+- read_unlock(&packet_sklist_lock);
+-}
+-
+-static int packet_seq_show(struct seq_file *seq, void *v)
+-{
+- if (v == SEQ_START_TOKEN)
+- seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
+- else {
+- struct sock *s = v;
+- const struct packet_sock *po = pkt_sk(s);
+-
+- seq_printf(seq,
+- "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
+- s,
+- atomic_read(&s->sk_refcnt),
+- s->sk_type,
+- ntohs(po->num),
+- po->ifindex,
+- po->running,
+- atomic_read(&s->sk_rmem_alloc),
+- sock_i_uid(s),
+- sock_i_ino(s) );
+- }
+-
+- return 0;
+-}
+-
+-static struct seq_operations packet_seq_ops = {
+- .start = packet_seq_start,
+- .next = packet_seq_next,
+- .stop = packet_seq_stop,
+- .show = packet_seq_show,
+-};
+-
+-static int packet_seq_open(struct inode *inode, struct file *file)
+-{
+- return seq_open(file, &packet_seq_ops);
+-}
+-
+-static const struct file_operations packet_seq_fops = {
+- .owner = THIS_MODULE,
+- .open = packet_seq_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = seq_release,
+-};
+-
+-#endif
+-
+-static void __exit packet_exit(void)
+-{
+- proc_net_remove("packet");
+- unregister_netdevice_notifier(&packet_netdev_notifier);
+- sock_unregister(PF_PACKET);
+- proto_unregister(&packet_proto);
+-}
+-
+-static int __init packet_init(void)
+-{
+- int rc = proto_register(&packet_proto, 0);
+-
+- if (rc != 0)
+- goto out;
+-
+- sock_register(&packet_family_ops);
+- register_netdevice_notifier(&packet_netdev_notifier);
+- proc_net_fops_create("packet", 0, &packet_seq_fops);
+-out:
+- return rc;
+-}
+-
+-module_init(packet_init);
+-module_exit(packet_exit);
+-MODULE_LICENSE("GPL");
+-MODULE_ALIAS_NETPROTO(PF_PACKET);
+diff -Nurb linux-2.6.22-594/net/socket.c linux-2.6.22-595/net/socket.c
+--- linux-2.6.22-594/net/socket.c 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/socket.c 2008-03-20 00:14:03.000000000 -0400
+@@ -1122,12 +1122,17 @@
+ if (type < 0 || type >= SOCK_MAX)
+ return -EINVAL;
+
++ /*
++ * Hack no. 2 - Sapan
++ * Clean this up later
++ *
+ if (!nx_check(0, VS_ADMIN)) {
+ if (family == PF_INET && !current_nx_info_has_v4())
+ return -EAFNOSUPPORT;
+ if (family == PF_INET6 && !current_nx_info_has_v6())
+ return -EAFNOSUPPORT;
+ }
++ */
+
+ /* Compatibility.
+
+diff -Nurb linux-2.6.22-594/net/socket.c.orig linux-2.6.22-595/net/socket.c.orig
+--- linux-2.6.22-594/net/socket.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/net/socket.c.orig 2008-03-20 00:05:19.000000000 -0400
+@@ -0,0 +1,2400 @@
++/*
++ * NET An implementation of the SOCKET network access protocol.
++ *
++ * Version: @(#)socket.c 1.1.93 18/02/95
++ *
++ * Authors: Orest Zborowski, <obz@Kodak.COM>
++ * Ross Biro
++ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
++ *
++ * Fixes:
++ * Anonymous : NOTSOCK/BADF cleanup. Error fix in
++ * shutdown()
++ * Alan Cox : verify_area() fixes
++ * Alan Cox : Removed DDI
++ * Jonathan Kamens : SOCK_DGRAM reconnect bug
++ * Alan Cox : Moved a load of checks to the very
++ * top level.
++ * Alan Cox : Move address structures to/from user
++ * mode above the protocol layers.
++ * Rob Janssen : Allow 0 length sends.
++ * Alan Cox : Asynchronous I/O support (cribbed from the
++ * tty drivers).
++ * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style)
++ * Jeff Uphoff : Made max number of sockets command-line
++ * configurable.
++ * Matti Aarnio : Made the number of sockets dynamic,
++ * to be allocated when needed, and mr.
++ * Uphoff's max is used as max to be
++ * allowed to allocate.
++ * Linus : Argh. removed all the socket allocation
++ * altogether: it's in the inode now.
++ * Alan Cox : Made sock_alloc()/sock_release() public
++ * for NetROM and future kernel nfsd type
++ * stuff.
++ * Alan Cox : sendmsg/recvmsg basics.
++ * Tom Dyas : Export net symbols.
++ * Marcin Dalecki : Fixed problems with CONFIG_NET="n".
++ * Alan Cox : Added thread locking to sys_* calls
++ * for sockets. May have errors at the
++ * moment.
++ * Kevin Buhr : Fixed the dumb errors in the above.
++ * Andi Kleen : Some small cleanups, optimizations,
++ * and fixed a copy_from_user() bug.
++ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0)
++ * Tigran Aivazian : Made listen(2) backlog sanity checks
++ * protocol-independent
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ *
++ * This module is effectively the top level interface to the BSD socket
++ * paradigm.
++ *
++ * Based upon Swansea University Computer Society NET3.039
++ */
++
++#include <linux/mm.h>
++#include <linux/socket.h>
++#include <linux/file.h>
++#include <linux/net.h>
++#include <linux/interrupt.h>
++#include <linux/rcupdate.h>
++#include <linux/netdevice.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/mutex.h>
++#include <linux/wanrouter.h>
++#include <linux/if_bridge.h>
++#include <linux/if_frad.h>
++#include <linux/if_vlan.h>
++#include <linux/init.h>
++#include <linux/poll.h>
++#include <linux/cache.h>
++#include <linux/module.h>
++#include <linux/highmem.h>
++#include <linux/mount.h>
++#include <linux/security.h>
++#include <linux/syscalls.h>
++#include <linux/compat.h>
++#include <linux/kmod.h>
++#include <linux/audit.h>
++#include <linux/wireless.h>
++#include <linux/nsproxy.h>
++
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++
++#include <net/compat.h>
++
++#include <net/sock.h>
++#include <linux/netfilter.h>
++#include <linux/vs_base.h>
++#include <linux/vs_socket.h>
++#include <linux/vs_inet.h>
++#include <linux/vs_inet6.h>
++
++static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos);
++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos);
++static int sock_mmap(struct file *file, struct vm_area_struct *vma);
++
++static int sock_close(struct inode *inode, struct file *file);
++static unsigned int sock_poll(struct file *file,
++ struct poll_table_struct *wait);
++static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
++#ifdef CONFIG_COMPAT
++static long compat_sock_ioctl(struct file *file,
++ unsigned int cmd, unsigned long arg);
++#endif
++static int sock_fasync(int fd, struct file *filp, int on);
++static ssize_t sock_sendpage(struct file *file, struct page *page,
++ int offset, size_t size, loff_t *ppos, int more);
++
++/*
++ * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
++ * in the operation structures but are done directly via the socketcall() multiplexor.
++ */
++
++static const struct file_operations socket_file_ops = {
++ .owner = THIS_MODULE,
++ .llseek = no_llseek,
++ .aio_read = sock_aio_read,
++ .aio_write = sock_aio_write,
++ .poll = sock_poll,
++ .unlocked_ioctl = sock_ioctl,
++#ifdef CONFIG_COMPAT
++ .compat_ioctl = compat_sock_ioctl,
++#endif
++ .mmap = sock_mmap,
++ .open = sock_no_open, /* special open code to disallow open via /proc */
++ .release = sock_close,
++ .fasync = sock_fasync,
++ .sendpage = sock_sendpage,
++ .splice_write = generic_splice_sendpage,
++};
++
++/*
++ * The protocol list. Each protocol is registered in here.
++ */
++
++static DEFINE_SPINLOCK(net_family_lock);
++static const struct net_proto_family *net_families[NPROTO] __read_mostly;
++
++/*
++ * Statistics counters of the socket lists
++ */
++
++static DEFINE_PER_CPU(int, sockets_in_use) = 0;
++
++/*
++ * Support routines.
++ * Move socket addresses back and forth across the kernel/user
++ * divide and look after the messy bits.
++ */
++
++#define MAX_SOCK_ADDR 128 /* 108 for Unix domain -
++ 16 for IP, 16 for IPX,
++ 24 for IPv6,
++ about 80 for AX.25
++ must be at least one bigger than
++ the AF_UNIX size (see net/unix/af_unix.c
++ :unix_mkname()).
++ */
++
++/**
++ * move_addr_to_kernel - copy a socket address into kernel space
++ * @uaddr: Address in user space
++ * @kaddr: Address in kernel space
++ * @ulen: Length in user space
++ *
++ * The address is copied into kernel space. If the provided address is
++ * too long an error code of -EINVAL is returned. If the copy gives
++ * invalid addresses -EFAULT is returned. On a success 0 is returned.
++ */
++
++int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
++{
++ if (ulen < 0 || ulen > MAX_SOCK_ADDR)
++ return -EINVAL;
++ if (ulen == 0)
++ return 0;
++ if (copy_from_user(kaddr, uaddr, ulen))
++ return -EFAULT;
++ return audit_sockaddr(ulen, kaddr);
++}
++
++/**
++ * move_addr_to_user - copy an address to user space
++ * @kaddr: kernel space address
++ * @klen: length of address in kernel
++ * @uaddr: user space address
++ * @ulen: pointer to user length field
++ *
++ * The value pointed to by ulen on entry is the buffer length available.
++ * This is overwritten with the buffer space used. -EINVAL is returned
++ * if an overlong buffer is specified or a negative buffer size. -EFAULT
++ * is returned if either the buffer or the length field are not
++ * accessible.
++ * After copying the data up to the limit the user specifies, the true
++ * length of the data is written over the length limit the user
++ * specified. Zero is returned for a success.
++ */
++
++int move_addr_to_user(void *kaddr, int klen, void __user *uaddr,
++ int __user *ulen)
++{
++ int err;
++ int len;
++
++ err = get_user(len, ulen);
++ if (err)
++ return err;
++ if (len > klen)
++ len = klen;
++ if (len < 0 || len > MAX_SOCK_ADDR)
++ return -EINVAL;
++ if (len) {
++ if (audit_sockaddr(klen, kaddr))
++ return -ENOMEM;
++ if (copy_to_user(uaddr, kaddr, len))
++ return -EFAULT;
++ }
++ /*
++ * "fromlen shall refer to the value before truncation.."
++ * 1003.1g
++ */
++ return __put_user(klen, ulen);
++}
++
++#define SOCKFS_MAGIC 0x534F434B
++
++static struct kmem_cache *sock_inode_cachep __read_mostly;
++
++static struct inode *sock_alloc_inode(struct super_block *sb)
++{
++ struct socket_alloc *ei;
++
++ ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
++ if (!ei)
++ return NULL;
++ init_waitqueue_head(&ei->socket.wait);
++
++ ei->socket.fasync_list = NULL;
++ ei->socket.state = SS_UNCONNECTED;
++ ei->socket.flags = 0;
++ ei->socket.ops = NULL;
++ ei->socket.sk = NULL;
++ ei->socket.file = NULL;
++
++ return &ei->vfs_inode;
++}
++
++static void sock_destroy_inode(struct inode *inode)
++{
++ kmem_cache_free(sock_inode_cachep,
++ container_of(inode, struct socket_alloc, vfs_inode));
++}
++
++static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
++{
++ struct socket_alloc *ei = (struct socket_alloc *)foo;
++
++ inode_init_once(&ei->vfs_inode);
++}
++
++static int init_inodecache(void)
++{
++ sock_inode_cachep = kmem_cache_create("sock_inode_cache",
++ sizeof(struct socket_alloc),
++ 0,
++ (SLAB_HWCACHE_ALIGN |
++ SLAB_RECLAIM_ACCOUNT |
++ SLAB_MEM_SPREAD),
++ init_once,
++ NULL);
++ if (sock_inode_cachep == NULL)
++ return -ENOMEM;
++ return 0;
++}
++
++static struct super_operations sockfs_ops = {
++ .alloc_inode = sock_alloc_inode,
++ .destroy_inode =sock_destroy_inode,
++ .statfs = simple_statfs,
++};
++
/*
 * Mount callback for sockfs: a pseudo filesystem with a single anonymous
 * superblock, never user-mountable, used only to give sockets inodes.
 */
static int sockfs_get_sb(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data,
			 struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
			     mnt);
}
++
++static struct vfsmount *sock_mnt __read_mostly;
++
/* The sockfs filesystem type; superblocks are anonymous (kill_anon_super). */
static struct file_system_type sock_fs_type = {
	.name = "sockfs",
	.get_sb = sockfs_get_sb,
	.kill_sb = kill_anon_super,
};
++
/*
 * d_delete hook for sockfs dentries.  Returns 0 so the dentry is not
 * retained in the dcache after the last reference is dropped.
 */
static int sockfs_delete_dentry(struct dentry *dentry)
{
	/*
	 * At creation time, we pretended this dentry was hashed
	 * (by clearing DCACHE_UNHASHED bit in d_flags)
	 * At delete time, we restore the truth : not hashed.
	 * (so that dput() can proceed correctly)
	 */
	dentry->d_flags |= DCACHE_UNHASHED;
	return 0;
}
++
++/*
++ * sockfs_dname() is called from d_path().
++ */
++static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
++{
++ return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
++ dentry->d_inode->i_ino);
++}
++
/* Dentry operations for sockfs: immediate deletion, synthetic d_path name. */
static struct dentry_operations sockfs_dentry_operations = {
	.d_delete = sockfs_delete_dentry,
	.d_dname = sockfs_dname,
};
++
++/*
++ * Obtains the first available file descriptor and sets it up for use.
++ *
++ * These functions create file structures and maps them to fd space
++ * of the current process. On success it returns file descriptor
++ * and file struct implicitly stored in sock->file.
++ * Note that another thread may close file descriptor before we return
++ * from this function. We use the fact that now we do not refer
++ * to socket after mapping. If one day we will need it, this
++ * function will increment ref. count on file by 1.
++ *
++ * In any case returned fd MAY BE not valid!
++ * This race condition is unavoidable
++ * with shared fd spaces, we cannot solve it inside kernel,
++ * but we take care of internal coherence yet.
++ */
++
++static int sock_alloc_fd(struct file **filep)
++{
++ int fd;
++
++ fd = get_unused_fd();
++ if (likely(fd >= 0)) {
++ struct file *file = get_empty_filp();
++
++ *filep = file;
++ if (unlikely(!file)) {
++ put_unused_fd(fd);
++ return -ENFILE;
++ }
++ } else
++ *filep = NULL;
++ return fd;
++}
++
++static int sock_attach_fd(struct socket *sock, struct file *file)
++{
++ struct qstr name = { .name = "" };
++
++ file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
++ if (unlikely(!file->f_path.dentry))
++ return -ENOMEM;
++
++ file->f_path.dentry->d_op = &sockfs_dentry_operations;
++ /*
++ * We dont want to push this dentry into global dentry hash table.
++ * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
++ * This permits a working /proc/$pid/fd/XXX on sockets
++ */
++ file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED;
++ d_instantiate(file->f_path.dentry, SOCK_INODE(sock));
++ file->f_path.mnt = mntget(sock_mnt);
++ file->f_mapping = file->f_path.dentry->d_inode->i_mapping;
++
++ sock->file = file;
++ file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
++ file->f_mode = FMODE_READ | FMODE_WRITE;
++ file->f_flags = O_RDWR;
++ file->f_pos = 0;
++ file->private_data = sock;
++
++ return 0;
++}
++
/*
 * Map a socket into the caller's fd space.  Returns the new descriptor,
 * or a negative errno; on success the file is also stored in sock->file.
 */
int sock_map_fd(struct socket *sock)
{
	struct file *newfile;
	int err;
	int fd = sock_alloc_fd(&newfile);

	if (fd < 0)
		return fd;

	err = sock_attach_fd(sock, newfile);
	if (err < 0) {
		/* Undo both reservations; the socket itself stays alive. */
		put_filp(newfile);
		put_unused_fd(fd);
		return err;
	}

	fd_install(fd, newfile);
	return fd;
}
++
++static struct socket *sock_from_file(struct file *file, int *err)
++{
++ if (file->f_op == &socket_file_ops)
++ return file->private_data; /* set in sock_map_fd */
++
++ *err = -ENOTSOCK;
++ return NULL;
++}
++
++/**
++ * sockfd_lookup - Go from a file number to its socket slot
++ * @fd: file handle
++ * @err: pointer to an error code return
++ *
++ * The file handle passed in is locked and the socket it is bound
++ * too is returned. If an error occurs the err pointer is overwritten
++ * with a negative errno code and NULL is returned. The function checks
++ * for both invalid handles and passing a handle which is not a socket.
++ *
++ * On a success the socket object pointer is returned.
++ */
++
++struct socket *sockfd_lookup(int fd, int *err)
++{
++ struct file *file;
++ struct socket *sock;
++
++ file = fget(fd);
++ if (!file) {
++ *err = -EBADF;
++ return NULL;
++ }
++
++ sock = sock_from_file(file, err);
++ if (!sock)
++ fput(file);
++ return sock;
++}
++
++static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
++{
++ struct file *file;
++ struct socket *sock;
++
++ *err = -EBADF;
++ file = fget_light(fd, fput_needed);
++ if (file) {
++ sock = sock_from_file(file, err);
++ if (sock)
++ return sock;
++ fput_light(file, *fput_needed);
++ }
++ return NULL;
++}
++
/**
 *	sock_alloc	-	allocate a socket
 *
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
 *	NULL is returned.
 */
static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/* new_inode() calls back into sock_alloc_inode() via sockfs_ops. */
	inode = new_inode(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;

	/* The socket is embedded alongside the inode in socket_alloc. */
	sock = SOCKET_I(inode);

	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	/* Per-CPU bookkeeping of live sockets (summed for /proc). */
	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}
++
/*
 * In theory you can't get an open on this inode, but /proc provides
 * a back door. Remember to keep it shut otherwise you'll let the
 * creepy crawlies in.
 */
static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
	return -ENXIO;
}
++
/* File operations rejecting any attempt to re-open a socket inode. */
const struct file_operations bad_sock_fops = {
	.owner = THIS_MODULE,
	.open = sock_no_open,
};
++
/**
 *	sock_release	-	close a socket
 *	@sock: socket to close
 *
 *	The socket is released from the protocol stack if it has a release
 *	callback, and the inode is then released if the socket is bound to
 *	an inode not a file.
 */
void sock_release(struct socket *sock)
{
	if (sock->ops) {
		/* Save the owner: ->release() clears our handle on it. */
		struct module *owner = sock->ops->owner;

		sock->ops->release(sock);
		sock->ops = NULL;
		/* Drop the refcount taken at socket creation time. */
		module_put(owner);
	}

	if (sock->fasync_list)
		printk(KERN_ERR "sock_release: fasync list not empty!\n");

	get_cpu_var(sockets_in_use)--;
	put_cpu_var(sockets_in_use);
	/* Without an attached file we own the inode and must drop it here;
	 * otherwise the final fput() of sock->file releases it. */
	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
	sock->file = NULL;
}
++
/*
 * Core sendmsg path: fill in the per-request sock_iocb, run the LSM hook,
 * hand off to the protocol, and account the result to the vserver context.
 * Returns bytes sent or a negative errno (possibly -EIOCBQUEUED for AIO).
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	struct sock_iocb *si = kiocb_to_siocb(iocb);
	int err, len;

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;

	err = security_socket_sendmsg(sock, msg, size);
	if (err)
		return err;

	len = sock->ops->sendmsg(iocb, sock, msg, size);
	/* vserver accounting: a full send counts as success, anything
	 * else (short send or error) is recorded as a failure. */
	if (sock->sk) {
		if (len == size)
			vx_sock_send(sock->sk, size);
		else
			vx_sock_fail(sock->sk, size);
	}
	vxdprintk(VXD_CBIT(net, 7),
		"__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%d",
		sock, sock->sk,
		(sock->sk)?sock->sk->sk_nx_info:0,
		(sock->sk)?sock->sk->sk_vx_info:0,
		(sock->sk)?sock->sk->sk_xid:0,
		(sock->sk)?sock->sk->sk_nid:0,
		(unsigned int)size, len);
	return len;
}
++
++int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
++{
++ struct kiocb iocb;
++ struct sock_iocb siocb;
++ int ret;
++
++ init_sync_kiocb(&iocb, NULL);
++ iocb.private = &siocb;
++ ret = __sock_sendmsg(&iocb, sock, msg, size);
++ if (-EIOCBQUEUED == ret)
++ ret = wait_on_sync_kiocb(&iocb);
++ return ret;
++}
++
/*
 * Send a message from kernel space: temporarily widen the address limit
 * so the kvec (kernel buffers) passes the user-copy checks, then restore it.
 */
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size)
{
	mm_segment_t oldfs = get_fs();
	int result;

	set_fs(KERNEL_DS);
	/*
	 * the following is safe, since for compiler definitions of kvec and
	 * iovec are identical, yielding the same in-core layout and alignment
	 */
	msg->msg_iov = (struct iovec *)vec;
	msg->msg_iovlen = num;
	result = sock_sendmsg(sock, msg, size);
	set_fs(oldfs);
	return result;
}
++
++/*
++ * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
++ */
++void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
++ struct sk_buff *skb)
++{
++ ktime_t kt = skb->tstamp;
++
++ if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
++ struct timeval tv;
++ /* Race occurred between timestamp enabling and packet
++ receiving. Fill in the current time for now. */
++ if (kt.tv64 == 0)
++ kt = ktime_get_real();
++ skb->tstamp = kt;
++ tv = ktime_to_timeval(kt);
++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv);
++ } else {
++ struct timespec ts;
++ /* Race occurred between timestamp enabling and packet
++ receiving. Fill in the current time for now. */
++ if (kt.tv64 == 0)
++ kt = ktime_get_real();
++ skb->tstamp = kt;
++ ts = ktime_to_timespec(kt);
++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts);
++ }
++}
++
++EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
++
/*
 * Core recvmsg path: fill in the per-request sock_iocb, run the LSM hook,
 * hand off to the protocol, and account received bytes to the vserver
 * context.  Returns bytes received or a negative errno.
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size, int flags)
{
	int err, len;
	struct sock_iocb *si = kiocb_to_siocb(iocb);

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;
	si->flags = flags;

	err = security_socket_recvmsg(sock, msg, size, flags);
	if (err)
		return err;

	len = sock->ops->recvmsg(iocb, sock, msg, size, flags);
	/* Only successful receives are accounted. */
	if ((len >= 0) && sock->sk)
		vx_sock_recv(sock->sk, len);
	vxdprintk(VXD_CBIT(net, 7),
		"__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d",
		sock, sock->sk,
		(sock->sk)?sock->sk->sk_nx_info:0,
		(sock->sk)?sock->sk->sk_vx_info:0,
		(sock->sk)?sock->sk->sk_xid:0,
		(sock->sk)?sock->sk->sk_nid:0,
		(unsigned int)size, len);
	return len;
}
++
++int sock_recvmsg(struct socket *sock, struct msghdr *msg,
++ size_t size, int flags)
++{
++ struct kiocb iocb;
++ struct sock_iocb siocb;
++ int ret;
++
++ init_sync_kiocb(&iocb, NULL);
++ iocb.private = &siocb;
++ ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
++ if (-EIOCBQUEUED == ret)
++ ret = wait_on_sync_kiocb(&iocb);
++ return ret;
++}
++
++int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
++ struct kvec *vec, size_t num, size_t size, int flags)
++{
++ mm_segment_t oldfs = get_fs();
++ int result;
++
++ set_fs(KERNEL_DS);
++ /*
++ * the following is safe, since for compiler definitions of kvec and
++ * iovec are identical, yielding the same in-core layout and alignment
++ */
++ msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
++ result = sock_recvmsg(sock, msg, size, flags);
++ set_fs(oldfs);
++ return result;
++}
++
/* kiocb destructor: free the heap-allocated sock_iocb for async requests. */
static void sock_aio_dtor(struct kiocb *iocb)
{
	kfree(iocb->private);
}
++
++static ssize_t sock_sendpage(struct file *file, struct page *page,
++ int offset, size_t size, loff_t *ppos, int more)
++{
++ struct socket *sock;
++ int flags;
++
++ sock = file->private_data;
++
++ flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
++ if (more)
++ flags |= MSG_MORE;
++
++ return sock->ops->sendpage(sock, page, offset, size, flags);
++}
++
/*
 * Pick the sock_iocb for a request: the caller's stack copy for
 * synchronous kiocbs, a kmalloc'd one (freed by sock_aio_dtor) for
 * asynchronous ones.  Returns NULL on allocation failure.
 */
static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
					 struct sock_iocb *siocb)
{
	if (!is_sync_kiocb(iocb)) {
		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
		if (!siocb)
			return NULL;
		iocb->ki_dtor = sock_aio_dtor;
	}

	siocb->kiocb = iocb;
	iocb->private = siocb;
	return siocb;
}
++
++static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
++ struct file *file, const struct iovec *iov,
++ unsigned long nr_segs)
++{
++ struct socket *sock = file->private_data;
++ size_t size = 0;
++ int i;
++
++ for (i = 0; i < nr_segs; i++)
++ size += iov[i].iov_len;
++
++ msg->msg_name = NULL;
++ msg->msg_namelen = 0;
++ msg->msg_control = NULL;
++ msg->msg_controllen = 0;
++ msg->msg_iov = (struct iovec *)iov;
++ msg->msg_iovlen = nr_segs;
++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
++
++ return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
++}
++
++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct sock_iocb siocb, *x;
++
++ if (pos != 0)
++ return -ESPIPE;
++
++ if (iocb->ki_left == 0) /* Match SYS5 behaviour */
++ return 0;
++
++
++ x = alloc_sock_iocb(iocb, &siocb);
++ if (!x)
++ return -ENOMEM;
++ return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
++}
++
++static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
++ struct file *file, const struct iovec *iov,
++ unsigned long nr_segs)
++{
++ struct socket *sock = file->private_data;
++ size_t size = 0;
++ int i;
++
++ for (i = 0; i < nr_segs; i++)
++ size += iov[i].iov_len;
++
++ msg->msg_name = NULL;
++ msg->msg_namelen = 0;
++ msg->msg_control = NULL;
++ msg->msg_controllen = 0;
++ msg->msg_iov = (struct iovec *)iov;
++ msg->msg_iovlen = nr_segs;
++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
++ if (sock->type == SOCK_SEQPACKET)
++ msg->msg_flags |= MSG_EOR;
++
++ return __sock_sendmsg(iocb, sock, msg, size);
++}
++
++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct sock_iocb siocb, *x;
++
++ if (pos != 0)
++ return -ESPIPE;
++
++ x = alloc_sock_iocb(iocb, &siocb);
++ if (!x)
++ return -ENOMEM;
++
++ return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
++}
++
/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

static DEFINE_MUTEX(br_ioctl_mutex);
/* Installed by the bridge module; NULL while it is not loaded. */
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL;

/* Register (or clear, with NULL) the bridge ioctl handler. */
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
	mutex_lock(&br_ioctl_mutex);
	br_ioctl_hook = hook;
	mutex_unlock(&br_ioctl_mutex);
}

EXPORT_SYMBOL(brioctl_set);
++
static DEFINE_MUTEX(vlan_ioctl_mutex);
/* Installed by the 8021q module; NULL while it is not loaded. */
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/* Register (or clear, with NULL) the VLAN ioctl handler. */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
	mutex_lock(&vlan_ioctl_mutex);
	vlan_ioctl_hook = hook;
	mutex_unlock(&vlan_ioctl_mutex);
}

EXPORT_SYMBOL(vlan_ioctl_set);
++
static DEFINE_MUTEX(dlci_ioctl_mutex);
/* Installed by the dlci module; NULL while it is not loaded. */
static int (*dlci_ioctl_hook) (unsigned int, void __user *);

/* Register (or clear, with NULL) the DLCI ioctl handler. */
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
{
	mutex_lock(&dlci_ioctl_mutex);
	dlci_ioctl_hook = hook;
	mutex_unlock(&dlci_ioctl_mutex);
}

EXPORT_SYMBOL(dlci_ioctl_set);
++
/*
 * With an ioctl, arg may well be a user mode pointer, but we don't know
 * what to do with it - that's up to the protocol still.
 */
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct socket *sock;
	struct sock *sk;
	void __user *argp = (void __user *)arg;
	int pid, err;
	struct net *net;

	sock = file->private_data;
	sk = sock->sk;
	net = sk->sk_net;
	/* Device-private ioctls go straight to the network device layer. */
	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
		err = dev_ioctl(net, cmd, argp);
	} else
#ifdef CONFIG_WIRELESS_EXT
	/* So do wireless-extension ioctls. */
	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
		err = dev_ioctl(net, cmd, argp);
	} else
#endif				/* CONFIG_WIRELESS_EXT */
		switch (cmd) {
		case FIOSETOWN:
		case SIOCSPGRP:
			err = -EFAULT;
			if (get_user(pid, (int __user *)argp))
				break;
			err = f_setown(sock->file, pid, 1);
			break;
		case FIOGETOWN:
		case SIOCGPGRP:
			err = put_user(f_getown(sock->file),
				       (int __user *)argp);
			break;
		case SIOCGIFBR:
		case SIOCSIFBR:
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			err = -ENOPKG;
			if (!br_ioctl_hook)
				request_module("bridge");

			/* The mutex keeps the hook stable against unload
			 * while we call it. */
			mutex_lock(&br_ioctl_mutex);
			if (br_ioctl_hook)
				err = br_ioctl_hook(net, cmd, argp);
			mutex_unlock(&br_ioctl_mutex);
			break;
		case SIOCGIFVLAN:
		case SIOCSIFVLAN:
			err = -ENOPKG;
			if (!vlan_ioctl_hook)
				request_module("8021q");

			mutex_lock(&vlan_ioctl_mutex);
			if (vlan_ioctl_hook)
				err = vlan_ioctl_hook(net, argp);
			mutex_unlock(&vlan_ioctl_mutex);
			break;
		case SIOCADDDLCI:
		case SIOCDELDLCI:
			err = -ENOPKG;
			if (!dlci_ioctl_hook)
				request_module("dlci");

			if (dlci_ioctl_hook) {
				mutex_lock(&dlci_ioctl_mutex);
				err = dlci_ioctl_hook(cmd, argp);
				mutex_unlock(&dlci_ioctl_mutex);
			}
			break;
		default:
			/* Let the protocol try first... */
			err = sock->ops->ioctl(sock, cmd, arg);

			/*
			 * If this ioctl is unknown try to hand it down
			 * to the NIC driver.
			 */
			if (err == -ENOIOCTLCMD)
				err = dev_ioctl(net, cmd, argp);
			break;
		}
	return err;
}
++
++int sock_create_lite(int family, int type, int protocol, struct socket **res)
++{
++ int err;
++ struct socket *sock = NULL;
++
++ err = security_socket_create(family, type, protocol, 1);
++ if (err)
++ goto out;
++
++ sock = sock_alloc();
++ if (!sock) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ sock->type = type;
++ err = security_socket_post_create(sock, family, type, protocol, 1);
++ if (err)
++ goto out_release;
++
++out:
++ *res = sock;
++ return err;
++out_release:
++ sock_release(sock);
++ sock = NULL;
++ goto out;
++}
++
++/* No kernel lock held - perfect */
++static unsigned int sock_poll(struct file *file, poll_table *wait)
++{
++ struct socket *sock;
++
++ /*
++ * We can't return errors to poll, so it's either yes or no.
++ */
++ sock = file->private_data;
++ return sock->ops->poll(file, sock, wait);
++}
++
/* mmap on a socket file: delegated entirely to the protocol. */
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct socket *sock = file->private_data;

	return sock->ops->mmap(file, sock, vma);
}
++
/* release() for socket files: tear down fasync state and the socket. */
static int sock_close(struct inode *inode, struct file *filp)
{
	/*
	 * It was possible the inode is NULL we were
	 * closing an unfinished socket.
	 */

	if (!inode) {
		printk(KERN_DEBUG "sock_close: NULL inode\n");
		return 0;
	}
	/* fd -1 / on 0 removes this file from the fasync list. */
	sock_fasync(-1, filp, 0);
	sock_release(SOCKET_I(inode));
	return 0;
}
++
/*
 * Update the socket async list
 *
 * Fasync_list locking strategy.
 *
 * 1. fasync_list is modified only under process context socket lock
 * i.e. under semaphore.
 * 2. fasync_list is used under read_lock(&sk->sk_callback_lock)
 * or under socket lock.
 * 3. fasync_list can be used from softirq context, so that
 * modification under socket lock have to be enhanced with
 * write_lock_bh(&sk->sk_callback_lock).
 * --ANK (990710)
 */
static int sock_fasync(int fd, struct file *filp, int on)
{
	struct fasync_struct *fa, *fna = NULL, **prev;
	struct socket *sock;
	struct sock *sk;

	/* Allocate up front, before taking the socket lock. */
	if (on) {
		fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
		if (fna == NULL)
			return -ENOMEM;
	}

	sock = filp->private_data;

	sk = sock->sk;
	if (sk == NULL) {
		kfree(fna);
		return -EINVAL;
	}

	lock_sock(sk);

	prev = &(sock->fasync_list);

	/* Find an existing entry for this file, keeping its back link. */
	for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev)
		if (fa->fa_file == filp)
			break;

	if (on) {
		if (fa != NULL) {
			/* Already registered: just refresh the fd. */
			write_lock_bh(&sk->sk_callback_lock);
			fa->fa_fd = fd;
			write_unlock_bh(&sk->sk_callback_lock);

			kfree(fna);
			goto out;
		}
		fna->fa_file = filp;
		fna->fa_fd = fd;
		fna->magic = FASYNC_MAGIC;
		fna->fa_next = sock->fasync_list;
		/* Publish the new list head under the callback lock (rule 3). */
		write_lock_bh(&sk->sk_callback_lock);
		sock->fasync_list = fna;
		write_unlock_bh(&sk->sk_callback_lock);
	} else {
		if (fa != NULL) {
			/* Unlink and free the entry for this file. */
			write_lock_bh(&sk->sk_callback_lock);
			*prev = fa->fa_next;
			write_unlock_bh(&sk->sk_callback_lock);
			kfree(fa);
		}
	}

out:
	release_sock(sock->sk);
	return 0;
}
++
/* This function may be called only under socket lock or callback_lock */

/*
 * Signal waiting async readers/writers.  @how selects the event:
 *   0 - unconditional SIGIO, 1 - SIGIO unless a reader is waiting for
 *   data, 2 - SIGIO only if write space just became available,
 *   3 - SIGURG (out-of-band data).  Returns -1 if there is nothing to
 *   notify, 0 otherwise.
 */
int sock_wake_async(struct socket *sock, int how, int band)
{
	if (!sock || !sock->fasync_list)
		return -1;
	switch (how) {
	case 1:

		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
			break;
		goto call_kill;
	case 2:
		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
			break;
		/* fall through */
	case 0:
call_kill:
		__kill_fasync(sock->fasync_list, SIGIO, band);
		break;
	case 3:
		__kill_fasync(sock->fasync_list, SIGURG, band);
	}
	return 0;
}
++
/*
 * Common socket creation: validate family/type, enforce vserver network
 * context restrictions, allocate the socket, and let the protocol family
 * (loading its module on demand) initialise it.  @kern distinguishes
 * kernel-internal from user-requested sockets for the LSM hooks.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 * Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* vserver: guests may only create sockets for address families
	 * their network context actually provides. */
	if (!nx_check(0, VS_ADMIN)) {
		if (family == PF_INET && !current_nx_info_has_v4())
			return -EAFNOSUPPORT;
		if (family == PF_INET6 && !current_nx_info_has_v6())
			return -EAFNOSUPPORT;
	}

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 */
	sock = sock_alloc();
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	/* Clear ops so sock_release() does not module_put() again. */
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}
++
/* Create a user socket in the caller's network namespace. */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
++
/* Create a kernel-internal socket, always in the initial net namespace. */
int sock_create_kern(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(&init_net, family, type, protocol, res, 1);
}
++
++asmlinkage long sys_socket(int family, int type, int protocol)
++{
++ int retval;
++ struct socket *sock;
++
++ retval = sock_create(family, type, protocol, &sock);
++ if (retval < 0)
++ goto out;
++
++ set_bit(SOCK_USER_SOCKET, &sock->flags);
++ retval = sock_map_fd(sock);
++ if (retval < 0)
++ goto out_release;
++
++out:
++ /* It may be already another descriptor 8) Not kernel problem. */
++ return retval;
++
++out_release:
++ sock_release(sock);
++ return retval;
++}
++
/*
 * Create a pair of connected sockets.
 */
asmlinkage long sys_socketpair(int family, int type, int protocol,
			       int __user *usockvec)
{
	struct socket *sock1, *sock2;
	int fd1, fd2, err;
	struct file *newfile1, *newfile2;

	/*
	 * Obtain the first socket and check if the underlying protocol
	 * supports the socketpair call.
	 */

	err = sock_create(family, type, protocol, &sock1);
	if (err < 0)
		goto out;
	set_bit(SOCK_USER_SOCKET, &sock1->flags);

	err = sock_create(family, type, protocol, &sock2);
	if (err < 0)
		goto out_release_1;
	set_bit(SOCK_USER_SOCKET, &sock2->flags);

	err = sock1->ops->socketpair(sock1, sock2);
	if (err < 0)
		goto out_release_both;

	fd1 = sock_alloc_fd(&newfile1);
	if (unlikely(fd1 < 0)) {
		err = fd1;
		goto out_release_both;
	}

	fd2 = sock_alloc_fd(&newfile2);
	if (unlikely(fd2 < 0)) {
		err = fd2;
		put_filp(newfile1);
		put_unused_fd(fd1);
		goto out_release_both;
	}

	err = sock_attach_fd(sock1, newfile1);
	if (unlikely(err < 0)) {
		goto out_fd2;
	}

	err = sock_attach_fd(sock2, newfile2);
	if (unlikely(err < 0)) {
		/* newfile1 now owns sock1; fput releases both. */
		fput(newfile1);
		goto out_fd1;
	}

	err = audit_fd_pair(fd1, fd2);
	if (err < 0) {
		/* Both files own their sockets now; fput releases them. */
		fput(newfile1);
		fput(newfile2);
		goto out_fd;
	}

	fd_install(fd1, newfile1);
	fd_install(fd2, newfile2);
	/* fd1 and fd2 may be already another descriptors.
	 * Not kernel problem.
	 */

	err = put_user(fd1, &usockvec[0]);
	if (!err)
		err = put_user(fd2, &usockvec[1]);
	if (!err)
		return 0;

	/* Couldn't report the fds to userspace: close them again. */
	sys_close(fd2);
	sys_close(fd1);
	return err;

out_release_both:
	sock_release(sock2);
out_release_1:
	sock_release(sock1);
out:
	return err;

/* NB: the three labels below deliberately fall through into each other,
 * unwinding progressively less state. */
out_fd2:
	put_filp(newfile1);
	sock_release(sock1);
out_fd1:
	put_filp(newfile2);
	sock_release(sock2);
out_fd:
	put_unused_fd(fd1);
	put_unused_fd(fd2);
	goto out;
}
++
++/*
++ * Bind a name to a socket. Nothing much to do here since it's
++ * the protocol's responsibility to handle the local address.
++ *
++ * We move the socket address to kernel space before we call
++ * the protocol layer (having also checked the address is ok).
++ */
++
++asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock) {
++ err = move_addr_to_kernel(umyaddr, addrlen, address);
++ if (err >= 0) {
++ err = security_socket_bind(sock,
++ (struct sockaddr *)address,
++ addrlen);
++ if (!err)
++ err = sock->ops->bind(sock,
++ (struct sockaddr *)
++ address, addrlen);
++ }
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Perform a listen. Basically, we allow the protocol to do anything
++ * necessary for a listen, and if that works, we mark the socket as
++ * ready for listening.
++ */
++
++asmlinkage long sys_listen(int fd, int backlog)
++{
++ struct socket *sock;
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock) {
++ struct net *net = sock->sk->sk_net;
++ if ((unsigned)backlog > net->sysctl_somaxconn)
++ backlog = net->sysctl_somaxconn;
++
++ err = security_socket_listen(sock, backlog);
++ if (!err)
++ err = sock->ops->listen(sock, backlog);
++
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
/*
 * For accept, we attempt to create a new socket, set up the link
 * with the client, wake up the client, then return the new
 * connected fd. We collect the address of the connector in kernel
 * space and move it to user at the very end. This is unclean because
 * we open the socket then return an error.
 *
 * 1003.1g adds the ability to recvmsg() to query connection pending
 * status to recvmsg. We need to add that support in a way thats
 * clean when we restucture accept also.
 */
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
			   int __user *upeer_addrlen)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	char address[MAX_SOCK_ADDR];

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	if (!(newsock = sock_alloc()))
		goto out_put;

	/* The new socket inherits type and ops from the listener. */
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

	newfd = sock_alloc_fd(&newfile);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}

	err = sock_attach_fd(newsock, newfile);
	if (err < 0)
		goto out_fd_simple;

	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;

	/* Blocks (unless O_NONBLOCK) until a connection is pending. */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	if (upeer_sockaddr) {
		/* getname mode 2 = peer address of the accepted connection. */
		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr,
					upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	fd_install(newfd, newfile);
	err = newfd;

	security_socket_post_accept(sock, newsock);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd_simple:
	/* attach failed: file never owned the socket, release separately */
	sock_release(newsock);
	put_filp(newfile);
	put_unused_fd(newfd);
	goto out_put;
out_fd:
	/* file owns the socket now: fput releases both */
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}
++
++/*
++ * Attempt to connect to a socket with the server address. The address
++ * is in user space so we verify it is OK and move it to kernel space.
++ *
++ * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
++ * break bindings
++ *
++ * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
++ * other SEQPACKET protocols that take time to connect() as it doesn't
++ * include the -EINPROGRESS status for such sockets.
++ */
++
++asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
++ int addrlen)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++ err = move_addr_to_kernel(uservaddr, addrlen, address);
++ if (err < 0)
++ goto out_put;
++
++ err =
++ security_socket_connect(sock, (struct sockaddr *)address, addrlen);
++ if (err)
++ goto out_put;
++
++ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
++ sock->file->f_flags);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Get the local address ('name') of a socket object. Move the obtained
++ * name to user space.
++ */
++
++asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr,
++ int __user *usockaddr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int len, err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ err = security_socket_getsockname(sock);
++ if (err)
++ goto out_put;
++
++ err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0);
++ if (err)
++ goto out_put;
++ err = move_addr_to_user(address, len, usockaddr, usockaddr_len);
++
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Get the remote address ('name') of a socket object. Move the obtained
++ * name to user space.
++ */
++
++asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr,
++ int __user *usockaddr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int len, err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_getpeername(sock);
++ if (err) {
++ fput_light(sock->file, fput_needed);
++ return err;
++ }
++
++ err =
++ sock->ops->getname(sock, (struct sockaddr *)address, &len,
++ 1);
++ if (!err)
++ err = move_addr_to_user(address, len, usockaddr,
++ usockaddr_len);
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Send a datagram to a given address. We move the address into kernel
++ * space and check the user space data area is readable before invoking
++ * the protocol.
++ */
++
++asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
++ unsigned flags, struct sockaddr __user *addr,
++ int addr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err;
++ struct msghdr msg;
++ struct iovec iov;
++ int fput_needed;
++ struct file *sock_file;
++
++ sock_file = fget_light(fd, &fput_needed);
++ err = -EBADF;
++ if (!sock_file)
++ goto out;
++
++ sock = sock_from_file(sock_file, &err);
++ if (!sock)
++ goto out_put;
++ iov.iov_base = buff;
++ iov.iov_len = len;
++ msg.msg_name = NULL;
++ msg.msg_iov = &iov;
++ msg.msg_iovlen = 1;
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_namelen = 0;
++ if (addr) {
++ err = move_addr_to_kernel(addr, addr_len, address);
++ if (err < 0)
++ goto out_put;
++ msg.msg_name = address;
++ msg.msg_namelen = addr_len;
++ }
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ msg.msg_flags = flags;
++ err = sock_sendmsg(sock, &msg, len);
++
++out_put:
++ fput_light(sock_file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Send a datagram down a socket.
++ */
++
++asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags)
++{
++ return sys_sendto(fd, buff, len, flags, NULL, 0);
++}
++
++/*
++ * Receive a frame from the socket and optionally record the address of the
++ * sender. We verify the buffers are writable and if needed move the
++ * sender address from kernel to user space.
++ */
++
++asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size,
++ unsigned flags, struct sockaddr __user *addr,
++ int __user *addr_len)
++{
++ struct socket *sock;
++ struct iovec iov;
++ struct msghdr msg;
++ char address[MAX_SOCK_ADDR];
++ int err, err2;
++ struct file *sock_file;
++ int fput_needed;
++
++ sock_file = fget_light(fd, &fput_needed);
++ err = -EBADF;
++ if (!sock_file)
++ goto out;
++
++ sock = sock_from_file(sock_file, &err);
++ if (!sock)
++ goto out_put;
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ iov.iov_len = size;
++ iov.iov_base = ubuf;
++ msg.msg_name = address;
++ msg.msg_namelen = MAX_SOCK_ADDR;
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ err = sock_recvmsg(sock, &msg, size, flags);
++
++ if (err >= 0 && addr != NULL) {
++ err2 = move_addr_to_user(address, msg.msg_namelen, addr, addr_len);
++ if (err2 < 0)
++ err = err2;
++ }
++out_put:
++ fput_light(sock_file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Receive a datagram from a socket.
++ */
++
++asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
++ unsigned flags)
++{
++ return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
++}
++
++/*
++ * Set a socket option. Because we don't know the option lengths we have
++ * to pass the user mode parameter for the protocols to sort out.
++ */
++
++asmlinkage long sys_setsockopt(int fd, int level, int optname,
++ char __user *optval, int optlen)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ if (optlen < 0)
++ return -EINVAL;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_setsockopt(sock, level, optname);
++ if (err)
++ goto out_put;
++
++ if (level == SOL_SOCKET)
++ err =
++ sock_setsockopt(sock, level, optname, optval,
++ optlen);
++ else
++ err =
++ sock->ops->setsockopt(sock, level, optname, optval,
++ optlen);
++out_put:
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Get a socket option. Because we don't know the option lengths we have
++ * to pass a user mode parameter for the protocols to sort out.
++ */
++
++asmlinkage long sys_getsockopt(int fd, int level, int optname,
++ char __user *optval, int __user *optlen)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_getsockopt(sock, level, optname);
++ if (err)
++ goto out_put;
++
++ if (level == SOL_SOCKET)
++ err =
++ sock_getsockopt(sock, level, optname, optval,
++ optlen);
++ else
++ err =
++ sock->ops->getsockopt(sock, level, optname, optval,
++ optlen);
++out_put:
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Shutdown a socket.
++ */
++
++asmlinkage long sys_shutdown(int fd, int how)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_shutdown(sock, how);
++ if (!err)
++ err = sock->ops->shutdown(sock, how);
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/* A couple of helpful macros for getting the address of the 32/64 bit
++ * fields which are the same type (int / unsigned) on our platforms.
++ */
++#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
++#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen)
++#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags)
++
++/*
++ * BSD sendmsg interface
++ */
++
++asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
++{
++ struct compat_msghdr __user *msg_compat =
++ (struct compat_msghdr __user *)msg;
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
++ unsigned char ctl[sizeof(struct cmsghdr) + 20]
++ __attribute__ ((aligned(sizeof(__kernel_size_t))));
++ /* 20 is size of ipv6_pktinfo */
++ unsigned char *ctl_buf = ctl;
++ struct msghdr msg_sys;
++ int err, ctl_len, iov_size, total_len;
++ int fput_needed;
++
++ err = -EFAULT;
++ if (MSG_CMSG_COMPAT & flags) {
++ if (get_compat_msghdr(&msg_sys, msg_compat))
++ return -EFAULT;
++ }
++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
++ return -EFAULT;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ /* do not move before msg_sys is valid */
++ err = -EMSGSIZE;
++ if (msg_sys.msg_iovlen > UIO_MAXIOV)
++ goto out_put;
++
++ /* Check whether to allocate the iovec area */
++ err = -ENOMEM;
++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
++ if (msg_sys.msg_iovlen > UIO_FASTIOV) {
++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
++ if (!iov)
++ goto out_put;
++ }
++
++ /* This will also move the address data into kernel space */
++ if (MSG_CMSG_COMPAT & flags) {
++ err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ);
++ } else
++ err = verify_iovec(&msg_sys, iov, address, VERIFY_READ);
++ if (err < 0)
++ goto out_freeiov;
++ total_len = err;
++
++ err = -ENOBUFS;
++
++ if (msg_sys.msg_controllen > INT_MAX)
++ goto out_freeiov;
++ ctl_len = msg_sys.msg_controllen;
++ if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
++ err =
++ cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl,
++ sizeof(ctl));
++ if (err)
++ goto out_freeiov;
++ ctl_buf = msg_sys.msg_control;
++ ctl_len = msg_sys.msg_controllen;
++ } else if (ctl_len) {
++ if (ctl_len > sizeof(ctl)) {
++ ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
++ if (ctl_buf == NULL)
++ goto out_freeiov;
++ }
++ err = -EFAULT;
++ /*
++ * Careful! Before this, msg_sys.msg_control contains a user pointer.
++ * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
++ * checking falls down on this.
++ */
++ if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
++ ctl_len))
++ goto out_freectl;
++ msg_sys.msg_control = ctl_buf;
++ }
++ msg_sys.msg_flags = flags;
++
++ if (sock->file->f_flags & O_NONBLOCK)
++ msg_sys.msg_flags |= MSG_DONTWAIT;
++ err = sock_sendmsg(sock, &msg_sys, total_len);
++
++out_freectl:
++ if (ctl_buf != ctl)
++ sock_kfree_s(sock->sk, ctl_buf, ctl_len);
++out_freeiov:
++ if (iov != iovstack)
++ sock_kfree_s(sock->sk, iov, iov_size);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * BSD recvmsg interface
++ */
++
++asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg,
++ unsigned int flags)
++{
++ struct compat_msghdr __user *msg_compat =
++ (struct compat_msghdr __user *)msg;
++ struct socket *sock;
++ struct iovec iovstack[UIO_FASTIOV];
++ struct iovec *iov = iovstack;
++ struct msghdr msg_sys;
++ unsigned long cmsg_ptr;
++ int err, iov_size, total_len, len;
++ int fput_needed;
++
++ /* kernel mode address */
++ char addr[MAX_SOCK_ADDR];
++
++ /* user mode address pointers */
++ struct sockaddr __user *uaddr;
++ int __user *uaddr_len;
++
++ if (MSG_CMSG_COMPAT & flags) {
++ if (get_compat_msghdr(&msg_sys, msg_compat))
++ return -EFAULT;
++ }
++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
++ return -EFAULT;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ err = -EMSGSIZE;
++ if (msg_sys.msg_iovlen > UIO_MAXIOV)
++ goto out_put;
++
++ /* Check whether to allocate the iovec area */
++ err = -ENOMEM;
++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
++ if (msg_sys.msg_iovlen > UIO_FASTIOV) {
++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
++ if (!iov)
++ goto out_put;
++ }
++
++ /*
++ * Save the user-mode address (verify_iovec will change the
++ * kernel msghdr to use the kernel address space)
++ */
++
++ uaddr = (void __user *)msg_sys.msg_name;
++ uaddr_len = COMPAT_NAMELEN(msg);
++ if (MSG_CMSG_COMPAT & flags) {
++ err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
++ } else
++ err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
++ if (err < 0)
++ goto out_freeiov;
++ total_len = err;
++
++ cmsg_ptr = (unsigned long)msg_sys.msg_control;
++ msg_sys.msg_flags = 0;
++ if (MSG_CMSG_COMPAT & flags)
++ msg_sys.msg_flags = MSG_CMSG_COMPAT;
++
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ err = sock_recvmsg(sock, &msg_sys, total_len, flags);
++ if (err < 0)
++ goto out_freeiov;
++ len = err;
++
++ if (uaddr != NULL) {
++ err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr,
++ uaddr_len);
++ if (err < 0)
++ goto out_freeiov;
++ }
++ err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
++ COMPAT_FLAGS(msg));
++ if (err)
++ goto out_freeiov;
++ if (MSG_CMSG_COMPAT & flags)
++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
++ &msg_compat->msg_controllen);
++ else
++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
++ &msg->msg_controllen);
++ if (err)
++ goto out_freeiov;
++ err = len;
++
++out_freeiov:
++ if (iov != iovstack)
++ sock_kfree_s(sock->sk, iov, iov_size);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++#ifdef __ARCH_WANT_SYS_SOCKETCALL
++
++/* Argument list sizes for sys_socketcall */
++#define AL(x) ((x) * sizeof(unsigned long))
++static const unsigned char nargs[18]={
++ AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
++ AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
++ AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
++};
++
++#undef AL
++
++/*
++ * System call vectors.
++ *
++ * Argument checking cleaned up. Saved 20% in size.
++ * This function doesn't need to set the kernel lock because
++ * it is set by the callees.
++ */
++
++asmlinkage long sys_socketcall(int call, unsigned long __user *args)
++{
++ unsigned long a[6];
++ unsigned long a0, a1;
++ int err;
++
++ if (call < 1 || call > SYS_RECVMSG)
++ return -EINVAL;
++
++ /* copy_from_user should be SMP safe. */
++ if (copy_from_user(a, args, nargs[call]))
++ return -EFAULT;
++
++ err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
++ if (err)
++ return err;
++
++ a0 = a[0];
++ a1 = a[1];
++
++ switch (call) {
++ case SYS_SOCKET:
++ err = sys_socket(a0, a1, a[2]);
++ break;
++ case SYS_BIND:
++ err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
++ break;
++ case SYS_CONNECT:
++ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
++ break;
++ case SYS_LISTEN:
++ err = sys_listen(a0, a1);
++ break;
++ case SYS_ACCEPT:
++ err =
++ sys_accept(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_GETSOCKNAME:
++ err =
++ sys_getsockname(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_GETPEERNAME:
++ err =
++ sys_getpeername(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_SOCKETPAIR:
++ err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
++ break;
++ case SYS_SEND:
++ err = sys_send(a0, (void __user *)a1, a[2], a[3]);
++ break;
++ case SYS_SENDTO:
++ err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
++ (struct sockaddr __user *)a[4], a[5]);
++ break;
++ case SYS_RECV:
++ err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
++ break;
++ case SYS_RECVFROM:
++ err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
++ (struct sockaddr __user *)a[4],
++ (int __user *)a[5]);
++ break;
++ case SYS_SHUTDOWN:
++ err = sys_shutdown(a0, a1);
++ break;
++ case SYS_SETSOCKOPT:
++ err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
++ break;
++ case SYS_GETSOCKOPT:
++ err =
++ sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
++ (int __user *)a[4]);
++ break;
++ case SYS_SENDMSG:
++ err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
++ break;
++ case SYS_RECVMSG:
++ err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
++ break;
++ default:
++ err = -EINVAL;
++ break;
++ }
++ return err;
++}
++
++#endif /* __ARCH_WANT_SYS_SOCKETCALL */
++
++/**
++ * sock_register - add a socket protocol handler
++ * @ops: description of protocol
++ *
++ * This function is called by a protocol handler that wants to
++ * advertise its address family, and have it linked into the
++ * socket interface. The value ops->family corresponds to the
++ * socket system call protocol family.
++ */
++int sock_register(const struct net_proto_family *ops)
++{
++ int err;
++
++ if (ops->family >= NPROTO) {
++ printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
++ NPROTO);
++ return -ENOBUFS;
++ }
++
++ spin_lock(&net_family_lock);
++ if (net_families[ops->family])
++ err = -EEXIST;
++ else {
++ net_families[ops->family] = ops;
++ err = 0;
++ }
++ spin_unlock(&net_family_lock);
++
++ printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
++ return err;
++}
++
++/**
++ * sock_unregister - remove a protocol handler
++ * @family: protocol family to remove
++ *
++ * This function is called by a protocol handler that wants to
++ * remove its address family, and have it unlinked from the
++ * new socket creation.
++ *
++ * If protocol handler is a module, then it can use module reference
++ * counts to protect against new references. If protocol handler is not
++ * a module then it needs to provide its own protection in
++ * the ops->create routine.
++ */
++void sock_unregister(int family)
++{
++ BUG_ON(family < 0 || family >= NPROTO);
++
++ spin_lock(&net_family_lock);
++ net_families[family] = NULL;
++ spin_unlock(&net_family_lock);
++
++ synchronize_rcu();
++
++ printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
++}
++
++static int sock_pernet_init(struct net *net)
++{
++ net->sysctl_somaxconn = SOMAXCONN;
++ return 0;
++}
++
++static struct pernet_operations sock_net_ops = {
++ .init = sock_pernet_init,
++};
++
++static int __init sock_init(void)
++{
++ /*
++ * Initialize sock SLAB cache.
++ */
++
++ sk_init();
++
++ /*
++ * Initialize skbuff SLAB cache
++ */
++ skb_init();
++
++ /*
++ * Initialize the protocols module.
++ */
++
++ init_inodecache();
++ register_filesystem(&sock_fs_type);
++ sock_mnt = kern_mount(&sock_fs_type);
++
++ /* The real protocol initialization is performed in later initcalls.
++ */
++
++#ifdef CONFIG_NETFILTER
++ netfilter_init();
++#endif
++
++ register_pernet_subsys(&sock_net_ops);
++
++ return 0;
++}
++
++core_initcall(sock_init); /* early initcall */
++
++#ifdef CONFIG_PROC_FS
++void socket_seq_show(struct seq_file *seq)
++{
++ int cpu;
++ int counter = 0;
++
++ for_each_possible_cpu(cpu)
++ counter += per_cpu(sockets_in_use, cpu);
++
++ /* It can be negative, by the way. 8) */
++ if (counter < 0)
++ counter = 0;
++
++ seq_printf(seq, "sockets: used %d\n", counter);
++}
++#endif /* CONFIG_PROC_FS */
++
++#ifdef CONFIG_COMPAT
++static long compat_sock_ioctl(struct file *file, unsigned cmd,
++ unsigned long arg)
++{
++ struct socket *sock = file->private_data;
++ int ret = -ENOIOCTLCMD;
++
++ if (sock->ops->compat_ioctl)
++ ret = sock->ops->compat_ioctl(sock, cmd, arg);
++
++ return ret;
++}
++#endif
++
++int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
++{
++ return sock->ops->bind(sock, addr, addrlen);
++}
++
++int kernel_listen(struct socket *sock, int backlog)
++{
++ return sock->ops->listen(sock, backlog);
++}
++
++int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
++ newsock);
++ if (err < 0)
++ goto done;
++
++ err = sock->ops->accept(sock, *newsock, flags);
++ if (err < 0) {
++ sock_release(*newsock);
++ goto done;
++ }
++
++ (*newsock)->ops = sock->ops;
++
++done:
++ return err;
++}
++
++int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
++ int flags)
++{
++ return sock->ops->connect(sock, addr, addrlen, flags);
++}
++
++int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
++ int *addrlen)
++{
++ return sock->ops->getname(sock, addr, addrlen, 0);
++}
++
++int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
++ int *addrlen)
++{
++ return sock->ops->getname(sock, addr, addrlen, 1);
++}
++
++int kernel_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ if (level == SOL_SOCKET)
++ err = sock_getsockopt(sock, level, optname, optval, optlen);
++ else
++ err = sock->ops->getsockopt(sock, level, optname, optval,
++ optlen);
++ set_fs(oldfs);
++ return err;
++}
++
++int kernel_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ if (level == SOL_SOCKET)
++ err = sock_setsockopt(sock, level, optname, optval, optlen);
++ else
++ err = sock->ops->setsockopt(sock, level, optname, optval,
++ optlen);
++ set_fs(oldfs);
++ return err;
++}
++
++int kernel_sendpage(struct socket *sock, struct page *page, int offset,
++ size_t size, int flags)
++{
++ if (sock->ops->sendpage)
++ return sock->ops->sendpage(sock, page, offset, size, flags);
++
++ return sock_no_sendpage(sock, page, offset, size, flags);
++}
++
++int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ err = sock->ops->ioctl(sock, cmd, arg);
++ set_fs(oldfs);
++
++ return err;
++}
++
++/* ABI emulation layers need these two */
++EXPORT_SYMBOL(move_addr_to_kernel);
++EXPORT_SYMBOL(move_addr_to_user);
++EXPORT_SYMBOL(sock_create);
++EXPORT_SYMBOL(sock_create_kern);
++EXPORT_SYMBOL(sock_create_lite);
++EXPORT_SYMBOL(sock_map_fd);
++EXPORT_SYMBOL(sock_recvmsg);
++EXPORT_SYMBOL(sock_register);
++EXPORT_SYMBOL(sock_release);
++EXPORT_SYMBOL(sock_sendmsg);
++EXPORT_SYMBOL(sock_unregister);
++EXPORT_SYMBOL(sock_wake_async);
++EXPORT_SYMBOL(sockfd_lookup);
++EXPORT_SYMBOL(kernel_sendmsg);
++EXPORT_SYMBOL(kernel_recvmsg);
++EXPORT_SYMBOL(kernel_bind);
++EXPORT_SYMBOL(kernel_listen);
++EXPORT_SYMBOL(kernel_accept);
++EXPORT_SYMBOL(kernel_connect);
++EXPORT_SYMBOL(kernel_getsockname);
++EXPORT_SYMBOL(kernel_getpeername);
++EXPORT_SYMBOL(kernel_getsockopt);
++EXPORT_SYMBOL(kernel_setsockopt);
++EXPORT_SYMBOL(kernel_sendpage);
++EXPORT_SYMBOL(kernel_sock_ioctl);