---- linux-2.6.22-590/kernel/vserver/space.c.orig 2008-02-29 09:01:28.000000000 -0500
-+++ linux-2.6.22-590/kernel/vserver/space.c 2008-03-06 15:47:26.000000000 -0500
+diff -Nurb linux-2.6.22-594/include/linux/vserver/network.h.orig.orig linux-2.6.22-595/include/linux/vserver/network.h.orig.orig
+--- linux-2.6.22-594/include/linux/vserver/network.h.orig.orig 2008-03-20 00:04:54.000000000 -0400
++++ linux-2.6.22-595/include/linux/vserver/network.h.orig.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,143 +0,0 @@
+-#ifndef _VX_NETWORK_H
+-#define _VX_NETWORK_H
+-
+-#include <linux/types.h>
+-
+-
+-#define MAX_N_CONTEXT 65535 /* Arbitrary limit */
+-
+-
+-/* network flags */
+-
+-#define NXF_INFO_PRIVATE 0x00000008
+-
+-#define NXF_SINGLE_IP 0x00000100
+-#define NXF_LBACK_REMAP 0x00000200
+-
+-#define NXF_HIDE_NETIF 0x02000000
+-#define NXF_HIDE_LBACK 0x04000000
+-
+-#define NXF_STATE_SETUP (1ULL << 32)
+-#define NXF_STATE_ADMIN (1ULL << 34)
+-
+-#define NXF_SC_HELPER (1ULL << 36)
+-#define NXF_PERSISTENT (1ULL << 38)
+-
+-#define NXF_ONE_TIME (0x0005ULL << 32)
+-
+-
+-#define NXF_INIT_SET (__nxf_init_set())
+-
+-static inline uint64_t __nxf_init_set(void) {
+- return NXF_STATE_ADMIN
+-#ifdef CONFIG_VSERVER_AUTO_LBACK
+- | NXF_LBACK_REMAP
+- | NXF_HIDE_LBACK
+-#endif
+-#ifdef CONFIG_VSERVER_AUTO_SINGLE
+- | NXF_SINGLE_IP
+-#endif
+- | NXF_HIDE_NETIF;
+-}
+-
+-
+-/* network caps */
+-
+-#define NXC_RAW_ICMP 0x00000100
+-
+-
+-/* address types */
+-
+-#define NXA_TYPE_IPV4 0x0001
+-#define NXA_TYPE_IPV6 0x0002
+-
+-#define NXA_TYPE_NONE 0x0000
+-#define NXA_TYPE_ANY 0x00FF
+-
+-#define NXA_TYPE_ADDR 0x0010
+-#define NXA_TYPE_MASK 0x0020
+-#define NXA_TYPE_RANGE 0x0040
+-
+-#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE)
+-
+-#define NXA_MOD_BCAST 0x0100
+-#define NXA_MOD_LBACK 0x0200
+-
+-#define NXA_LOOPBACK 0x1000
+-
+-#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK)
+-#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK)
+-
+-#ifdef __KERNEL__
+-
+-#include <linux/list.h>
+-#include <linux/spinlock.h>
+-#include <linux/rcupdate.h>
+-#include <linux/in.h>
+-#include <linux/in6.h>
+-#include <asm/atomic.h>
+-
+-struct nx_addr_v4 {
+- struct nx_addr_v4 *next;
+- struct in_addr ip[2];
+- struct in_addr mask;
+- uint16_t type;
+- uint16_t flags;
+-};
+-
+-struct nx_addr_v6 {
+- struct nx_addr_v6 *next;
+- struct in6_addr ip;
+- struct in6_addr mask;
+- uint32_t prefix;
+- uint16_t type;
+- uint16_t flags;
+-};
+-
+-struct nx_info {
+- struct hlist_node nx_hlist; /* linked list of nxinfos */
+- nid_t nx_id; /* vnet id */
+- atomic_t nx_usecnt; /* usage count */
+- atomic_t nx_tasks; /* tasks count */
+- int nx_state; /* context state */
+-
+- uint64_t nx_flags; /* network flag word */
+- uint64_t nx_ncaps; /* network capabilities */
+-
+- struct in_addr v4_lback; /* Loopback address */
+- struct in_addr v4_bcast; /* Broadcast address */
+- struct nx_addr_v4 v4; /* First/Single ipv4 address */
+-#ifdef CONFIG_IPV6
+- struct nx_addr_v6 v6; /* First/Single ipv6 address */
+-#endif
+- char nx_name[65]; /* network context name */
+-};
+-
+-
+-/* status flags */
+-
+-#define NXS_HASHED 0x0001
+-#define NXS_SHUTDOWN 0x0100
+-#define NXS_RELEASED 0x8000
+-
+-extern struct nx_info *lookup_nx_info(int);
+-
+-extern int get_nid_list(int, unsigned int *, int);
+-extern int nid_is_hashed(nid_t);
+-
+-extern int nx_migrate_task(struct task_struct *, struct nx_info *);
+-
+-extern long vs_net_change(struct nx_info *, unsigned int);
+-
+-struct sock;
+-
+-
+-#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE)
+-#ifdef CONFIG_IPV6
+-#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE)
+-#else
+-#define NX_IPV6(n) (0)
+-#endif
+-
+-#endif /* __KERNEL__ */
+-#endif /* _VX_NETWORK_H */
+diff -Nurb linux-2.6.22-594/kernel/nsproxy.c.orig linux-2.6.22-595/kernel/nsproxy.c.orig
+--- linux-2.6.22-594/kernel/nsproxy.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/kernel/nsproxy.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,264 +0,0 @@
+-/*
+- * Copyright (C) 2006 IBM Corporation
+- *
+- * Author: Serge Hallyn <serue@us.ibm.com>
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License as
+- * published by the Free Software Foundation, version 2 of the
+- * License.
+- *
+- * Jun 2006 - namespaces support
+- * OpenVZ, SWsoft Inc.
+- * Pavel Emelianov <xemul@openvz.org>
+- */
+-
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/nsproxy.h>
+-#include <linux/init_task.h>
+-#include <linux/mnt_namespace.h>
+-#include <linux/utsname.h>
+-#include <net/net_namespace.h>
+-#include <linux/pid_namespace.h>
+-#include <linux/vserver/global.h>
+-#include <linux/vserver/debug.h>
+-
+-static struct kmem_cache *nsproxy_cachep;
+-
+-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+-
+-void get_task_namespaces(struct task_struct *tsk)
+-{
+- struct nsproxy *ns = tsk->nsproxy;
+- if (ns) {
+- get_nsproxy(ns);
+- }
+-}
+-
+-/*
+- * creates a copy of "orig" with refcount 1.
+- */
+-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+-{
+- struct nsproxy *ns;
+-
+- ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
+- if (ns)
+- atomic_set(&ns->count, 1);
+- vxdprintk(VXD_CBIT(space, 2), "clone_nsproxy(%p[%u] = %p[1]",
+- orig, atomic_read(&orig->count), ns);
+- atomic_inc(&vs_global_nsproxy);
+- return ns;
+-}
+-
+-/*
+- * Create new nsproxy and all of its the associated namespaces.
+- * Return the newly created nsproxy. Do not attach this to the task,
+- * leave it to the caller to do proper locking and attach it to task.
+- */
+-static struct nsproxy *unshare_namespaces(int flags, struct nsproxy *orig,
+- struct fs_struct *new_fs)
+-{
+- struct nsproxy *new_nsp;
+- int err = -ENOMEM;
+-
+- vxdprintk(VXD_CBIT(space, 4),
+- "unshare_namespaces(0x%08x,%p,%p)",
+- flags, orig, new_fs);
+-
+- new_nsp = clone_nsproxy(orig);
+- if (!new_nsp)
+- return ERR_PTR(-ENOMEM);
+-
+- new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs);
+- if (IS_ERR(new_nsp->mnt_ns))
+- goto out_ns;
+-
+- new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns);
+- if (IS_ERR(new_nsp->uts_ns))
+- goto out_uts;
+-
+- new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns);
+- if (IS_ERR(new_nsp->ipc_ns))
+- goto out_ipc;
+-
+- new_nsp->pid_ns = copy_pid_ns(flags, orig->pid_ns);
+- if (IS_ERR(new_nsp->pid_ns))
+- goto out_pid;
+-
+- new_nsp->user_ns = copy_user_ns(flags, orig->user_ns);
+- if (IS_ERR(new_nsp->user_ns))
+- goto out_user;
+-
+- new_nsp->net_ns = copy_net_ns(flags, orig->net_ns);
+- if (IS_ERR(new_nsp->net_ns))
+- goto out_net;
+-
+- return new_nsp;
+-
+-out_net:
+- if (new_nsp->user_ns)
+- put_user_ns(new_nsp->user_ns);
+- if (new_nsp->net_ns)
+- put_net(new_nsp->net_ns);
+-out_user:
+- if (new_nsp->pid_ns)
+- put_pid_ns(new_nsp->pid_ns);
+-out_pid:
+- if (new_nsp->ipc_ns)
+- put_ipc_ns(new_nsp->ipc_ns);
+-out_ipc:
+- if (new_nsp->uts_ns)
+- put_uts_ns(new_nsp->uts_ns);
+-out_uts:
+- if (new_nsp->mnt_ns)
+- put_mnt_ns(new_nsp->mnt_ns);
+-out_ns:
+- kmem_cache_free(nsproxy_cachep, new_nsp);
+- return ERR_PTR(err);
+-}
+-
+-static struct nsproxy *create_new_namespaces(unsigned long flags, struct task_struct *tsk,
+- struct fs_struct *new_fs)
+-{
+- return unshare_namespaces(flags, tsk->nsproxy, new_fs);
+-}
+-
+-/*
+- * copies the nsproxy, setting refcount to 1, and grabbing a
+- * reference to all contained namespaces.
+- */
+-struct nsproxy *copy_nsproxy(struct nsproxy *orig)
+-{
+- struct nsproxy *ns = clone_nsproxy(orig);
+-
+- if (ns) {
+- if (ns->mnt_ns)
+- get_mnt_ns(ns->mnt_ns);
+- if (ns->uts_ns)
+- get_uts_ns(ns->uts_ns);
+- if (ns->ipc_ns)
+- get_ipc_ns(ns->ipc_ns);
+- if (ns->pid_ns)
+- get_pid_ns(ns->pid_ns);
+- }
+- return ns;
+-}
+-
+-/*
+- * called from clone. This now handles copy for nsproxy and all
+- * namespaces therein.
+- */
+-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+-{
+- struct nsproxy *old_ns = tsk->nsproxy;
+- struct nsproxy *new_ns = NULL;
+- int err = 0;
+-
+- vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08x,%p[%p])",
+- flags, tsk, old_ns);
+-
+- if (!old_ns)
+- return 0;
+-
+- get_nsproxy(old_ns);
+- return 0;
+-
+- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
+- return 0;
+-
+- #ifndef CONFIG_NET_NS
+- if (unshare_flags & CLONE_NEWNET)
+- return -EINVAL;
+- #endif
+-
+-
+- if (!capable(CAP_SYS_ADMIN)) {
+- err = -EPERM;
+- goto out;
+- }
+-
+- new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+- if (IS_ERR(new_ns)) {
+- err = PTR_ERR(new_ns);
+- goto out;
+- }
+-
+- err = ns_container_clone(tsk);
+- if (err) {
+- put_nsproxy(new_ns);
+- goto out;
+- }
+-
+- tsk->nsproxy = new_ns;
+-
+-out:
+- put_nsproxy(old_ns);
+- vxdprintk(VXD_CBIT(space, 3),
+- "copy_namespaces(0x%08x,%p[%p]) = %d [%p]",
+- flags, tsk, old_ns, err, new_ns);
+- return err;
+-}
+-
+-void free_nsproxy(struct nsproxy *ns)
+-{
+- if (ns->mnt_ns)
+- put_mnt_ns(ns->mnt_ns);
+- if (ns->uts_ns)
+- put_uts_ns(ns->uts_ns);
+- if (ns->ipc_ns)
+- put_ipc_ns(ns->ipc_ns);
+- if (ns->pid_ns)
+- put_pid_ns(ns->pid_ns);
+- atomic_dec(&vs_global_nsproxy);
+- kfree(ns);
+-}
+-
+-/*
+- * Called from unshare. Unshare all the namespaces part of nsproxy.
+- * On success, returns the new nsproxy.
+- */
+-int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+- struct nsproxy **new_nsp, struct fs_struct *new_fs)
+-{
+- int err = 0;
+-
+- vxdprintk(VXD_CBIT(space, 4),
+- "unshare_nsproxy_namespaces(0x%08lx,[%p])",
+- unshare_flags, current->nsproxy);
+-
+- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+- CLONE_NEWUSER | CLONE_NEWNET)))
+- return 0;
+-
+-#ifndef CONFIG_NET_NS
+- if (unshare_flags & CLONE_NEWNET)
+- return -EINVAL;
+-#endif
+- if (!capable(CAP_SYS_ADMIN))
+- return -EPERM;
+-
+- *new_nsp = create_new_namespaces(unshare_flags, current,
+- new_fs ? new_fs : current->fs);
+- if (IS_ERR(*new_nsp)) {
+- err = PTR_ERR(*new_nsp);
+- goto out;
+- }
+-
+- err = ns_container_clone(current);
+- if (err)
+- put_nsproxy(*new_nsp);
+-
+-out:
+- return err;
+-}
+-
+-static int __init nsproxy_cache_init(void)
+-{
+- nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
+- 0, SLAB_PANIC, NULL, NULL);
+- return 0;
+-}
+-
+-module_init(nsproxy_cache_init);
+diff -Nurb linux-2.6.22-594/kernel/user.c.orig linux-2.6.22-595/kernel/user.c.orig
+--- linux-2.6.22-594/kernel/user.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/kernel/user.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,227 +0,0 @@
+-/*
+- * The "user cache".
+- *
+- * (C) Copyright 1991-2000 Linus Torvalds
+- *
+- * We have a per-user structure to keep track of how many
+- * processes, files etc the user has claimed, in order to be
+- * able to have per-user limits for system resources.
+- */
+-
+-#include <linux/init.h>
+-#include <linux/sched.h>
+-#include <linux/slab.h>
+-#include <linux/bitops.h>
+-#include <linux/key.h>
+-#include <linux/interrupt.h>
+-#include <linux/module.h>
+-#include <linux/user_namespace.h>
+-
+-/*
+- * UID task count cache, to get fast user lookup in "alloc_uid"
+- * when changing user ID's (ie setuid() and friends).
+- */
+-
+-#define UIDHASH_MASK (UIDHASH_SZ - 1)
+-#define __uidhashfn(xid,uid) ((((uid) >> UIDHASH_BITS) + ((uid)^(xid))) & UIDHASH_MASK)
+-#define uidhashentry(ns, xid, uid) ((ns)->uidhash_table + __uidhashfn(xid, uid))
+-
+-static struct kmem_cache *uid_cachep;
+-static struct list_head uidhash_table[UIDHASH_SZ];
+-
+-/*
+- * The uidhash_lock is mostly taken from process context, but it is
+- * occasionally also taken from softirq/tasklet context, when
+- * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+- * But free_uid() is also called with local interrupts disabled, and running
+- * local_bh_enable() with local interrupts disabled is an error - we'll run
+- * softirq callbacks, and they can unconditionally enable interrupts, and
+- * the caller of free_uid() didn't expect that..
+- */
+-static DEFINE_SPINLOCK(uidhash_lock);
+-
+-struct user_struct root_user = {
+- .__count = ATOMIC_INIT(1),
+- .processes = ATOMIC_INIT(1),
+- .files = ATOMIC_INIT(0),
+- .sigpending = ATOMIC_INIT(0),
+- .mq_bytes = 0,
+- .locked_shm = 0,
+-#ifdef CONFIG_KEYS
+- .uid_keyring = &root_user_keyring,
+- .session_keyring = &root_session_keyring,
+-#endif
+-};
+-
+-/*
+- * These routines must be called with the uidhash spinlock held!
+- */
+-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+-{
+- list_add(&up->uidhash_list, hashent);
+-}
+-
+-static inline void uid_hash_remove(struct user_struct *up)
+-{
+- list_del(&up->uidhash_list);
+-}
+-
+-static inline struct user_struct *uid_hash_find(xid_t xid, uid_t uid, struct list_head *hashent)
+-{
+- struct list_head *up;
+-
+- list_for_each(up, hashent) {
+- struct user_struct *user;
+-
+- user = list_entry(up, struct user_struct, uidhash_list);
+-
+- if(user->uid == uid && user->xid == xid) {
+- atomic_inc(&user->__count);
+- return user;
+- }
+- }
+-
+- return NULL;
+-}
+-
+-/*
+- * Locate the user_struct for the passed UID. If found, take a ref on it. The
+- * caller must undo that ref with free_uid().
+- *
+- * If the user_struct could not be found, return NULL.
+- */
+-struct user_struct *find_user(xid_t xid, uid_t uid)
+-{
+- struct user_struct *ret;
+- unsigned long flags;
+- struct user_namespace *ns = current->nsproxy->user_ns;
+-
+- spin_lock_irqsave(&uidhash_lock, flags);
+- ret = uid_hash_find(xid, uid, uidhashentry(ns, xid, uid));
+- spin_unlock_irqrestore(&uidhash_lock, flags);
+- return ret;
+-}
+-
+-void free_uid(struct user_struct *up)
+-{
+- unsigned long flags;
+-
+- if (!up)
+- return;
+-
+- local_irq_save(flags);
+- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+- uid_hash_remove(up);
+- spin_unlock_irqrestore(&uidhash_lock, flags);
+- key_put(up->uid_keyring);
+- key_put(up->session_keyring);
+- kmem_cache_free(uid_cachep, up);
+- } else {
+- local_irq_restore(flags);
+- }
+-}
+-
+-struct user_struct * alloc_uid(xid_t xid, uid_t uid)
+-{
+- struct user_namespace *ns = current->nsproxy->user_ns;
+- struct list_head *hashent = uidhashentry(ns,xid, uid);
+- struct user_struct *up;
+-
+- spin_lock_irq(&uidhash_lock);
+- up = uid_hash_find(xid, uid, hashent);
+- spin_unlock_irq(&uidhash_lock);
+-
+- if (!up) {
+- struct user_struct *new;
+-
+- new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
+- if (!new)
+- return NULL;
+- new->uid = uid;
+- new->xid = xid;
+- atomic_set(&new->__count, 1);
+- atomic_set(&new->processes, 0);
+- atomic_set(&new->files, 0);
+- atomic_set(&new->sigpending, 0);
+-#ifdef CONFIG_INOTIFY_USER
+- atomic_set(&new->inotify_watches, 0);
+- atomic_set(&new->inotify_devs, 0);
+-#endif
+-
+- new->mq_bytes = 0;
+- new->locked_shm = 0;
+-
+- if (alloc_uid_keyring(new, current) < 0) {
+- kmem_cache_free(uid_cachep, new);
+- return NULL;
+- }
+-
+- /*
+- * Before adding this, check whether we raced
+- * on adding the same user already..
+- */
+- spin_lock_irq(&uidhash_lock);
+- up = uid_hash_find(xid, uid, hashent);
+- if (up) {
+- key_put(new->uid_keyring);
+- key_put(new->session_keyring);
+- kmem_cache_free(uid_cachep, new);
+- } else {
+- uid_hash_insert(new, hashent);
+- up = new;
+- }
+- spin_unlock_irq(&uidhash_lock);
+-
+- }
+- return up;
+-}
+-
+-void switch_uid(struct user_struct *new_user)
+-{
+- struct user_struct *old_user;
+-
+- /* What if a process setreuid()'s and this brings the
+- * new uid over his NPROC rlimit? We can check this now
+- * cheaply with the new uid cache, so if it matters
+- * we should be checking for it. -DaveM
+- */
+- old_user = current->user;
+- atomic_inc(&new_user->processes);
+- atomic_dec(&old_user->processes);
+- switch_uid_keyring(new_user);
+- current->user = new_user;
+-
+- /*
+- * We need to synchronize with __sigqueue_alloc()
+- * doing a get_uid(p->user).. If that saw the old
+- * user value, we need to wait until it has exited
+- * its critical region before we can free the old
+- * structure.
+- */
+- smp_mb();
+- spin_unlock_wait(¤t->sighand->siglock);
+-
+- free_uid(old_user);
+- suid_keys(current);
+-}
+-
+-
+-static int __init uid_cache_init(void)
+-{
+- int n;
+-
+- uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+-
+- for(n = 0; n < UIDHASH_SZ; ++n)
+- INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
+-
+- /* Insert the root user immediately (init already runs as root) */
+- spin_lock_irq(&uidhash_lock);
+- uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0, 0));
+- spin_unlock_irq(&uidhash_lock);
+-
+- return 0;
+-}
+-
+-module_init(uid_cache_init);
+diff -Nurb linux-2.6.22-594/kernel/vserver/context.c linux-2.6.22-595/kernel/vserver/context.c
+--- linux-2.6.22-594/kernel/vserver/context.c 2008-03-20 00:04:46.000000000 -0400
++++ linux-2.6.22-595/kernel/vserver/context.c 2008-03-20 00:13:22.000000000 -0400
+@@ -589,13 +589,13 @@
+ struct nsproxy *old_nsp, *new_nsp;
+
+ ret = unshare_nsproxy_namespaces(
+- CLONE_NEWUTS | CLONE_NEWIPC,
++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET,
+ &new_nsp, NULL);
+ if (ret)
+ goto out;
+
+ old_nsp = xchg(&p->nsproxy, new_nsp);
+- vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC);
++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+ put_nsproxy(old_nsp);
+ }
+ }
+@@ -781,7 +781,7 @@
+ if (vs_state_change(new_vxi, VSC_STARTUP))
+ goto out;
+
+- ret = vx_migrate_task(current, new_vxi, (!data));
++ ret = vx_migrate_task(current, new_vxi, 1 /*(!data) Hack no. 1 - Sapan*/);
+ if (ret)
+ goto out;
+
+diff -Nurb linux-2.6.22-594/kernel/vserver/context.c.orig linux-2.6.22-595/kernel/vserver/context.c.orig
+--- linux-2.6.22-594/kernel/vserver/context.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/kernel/vserver/context.c.orig 2008-03-20 00:04:46.000000000 -0400
+@@ -0,0 +1,966 @@
++/*
++ * linux/kernel/vserver/context.c
++ *
++ * Virtual Server: Context Support
++ *
++ * Copyright (C) 2003-2007 Herbert Pötzl
++ *
++ * V0.01 context helper
++ * V0.02 vx_ctx_kill syscall command
++ * V0.03 replaced context_info calls
++ * V0.04 redesign of struct (de)alloc
++ * V0.05 rlimit basic implementation
++ * V0.06 task_xid and info commands
++ * V0.07 context flags and caps
++ * V0.08 switch to RCU based hash
++ * V0.09 revert to non RCU for now
++ * V0.10 and back to working RCU hash
++ * V0.11 and back to locking again
++ * V0.12 referenced context store
++ * V0.13 separate per cpu data
++ * V0.14 changed vcmds to vxi arg
++ * V0.15 added context stat
++ * V0.16 have __create claim() the vxi
++ * V0.17 removed older and legacy stuff
++ *
++ */
++
++#include <linux/slab.h>
++#include <linux/types.h>
++#include <linux/pid_namespace.h>
++
++#include <linux/vserver/context.h>
++#include <linux/vserver/network.h>
++#include <linux/vserver/debug.h>
++#include <linux/vserver/limit.h>
++#include <linux/vserver/limit_int.h>
++#include <linux/vserver/space.h>
++
++#include <linux/vs_context.h>
++#include <linux/vs_limit.h>
++#include <linux/vserver/context_cmd.h>
++
++#include "cvirt_init.h"
++#include "cacct_init.h"
++#include "limit_init.h"
++#include "sched_init.h"
++
++
++atomic_t vx_global_ctotal = ATOMIC_INIT(0);
++atomic_t vx_global_cactive = ATOMIC_INIT(0);
++
++
++/* now inactive context structures */
++
++static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT;
++
++static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED;
++
++
++/* __alloc_vx_info()
++
++ * allocate an initialized vx_info struct
++ * doesn't make it visible (hash) */
++
++static struct vx_info *__alloc_vx_info(xid_t xid)
++{
++ struct vx_info *new = NULL;
++ int cpu;
++
++ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid);
++
++ /* would this benefit from a slab cache? */
++ new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
++ if (!new)
++ return 0;
++
++ memset(new, 0, sizeof(struct vx_info));
++#ifdef CONFIG_SMP
++ new->ptr_pc = alloc_percpu(struct _vx_info_pc);
++ if (!new->ptr_pc)
++ goto error;
++#endif
++ new->vx_id = xid;
++ INIT_HLIST_NODE(&new->vx_hlist);
++ atomic_set(&new->vx_usecnt, 0);
++ atomic_set(&new->vx_tasks, 0);
++ new->vx_parent = NULL;
++ new->vx_state = 0;
++ init_waitqueue_head(&new->vx_wait);
++
++ /* prepare reaper */
++ get_task_struct(init_pid_ns.child_reaper);
++ new->vx_reaper = init_pid_ns.child_reaper;
++ new->vx_badness_bias = 0;
++
++ /* rest of init goes here */
++ vx_info_init_limit(&new->limit);
++ vx_info_init_sched(&new->sched);
++ vx_info_init_cvirt(&new->cvirt);
++ vx_info_init_cacct(&new->cacct);
++
++ /* per cpu data structures */
++ for_each_possible_cpu(cpu) {
++ vx_info_init_sched_pc(
++ &vx_per_cpu(new, sched_pc, cpu), cpu);
++ vx_info_init_cvirt_pc(
++ &vx_per_cpu(new, cvirt_pc, cpu), cpu);
++ }
++
++ new->vx_flags = VXF_INIT_SET;
++ new->vx_bcaps = CAP_INIT_EFF_SET;
++ new->vx_ccaps = 0;
++ new->vx_cap_bset = cap_bset;
++
++ new->reboot_cmd = 0;
++ new->exit_code = 0;
++
++ new->vx_nsproxy = copy_nsproxy(current->nsproxy);
++
++ vxdprintk(VXD_CBIT(xid, 0),
++ "alloc_vx_info(%d) = %p", xid, new);
++ vxh_alloc_vx_info(new);
++ atomic_inc(&vx_global_ctotal);
++ return new;
++#ifdef CONFIG_SMP
++error:
++ kfree(new);
++ return 0;
++#endif
++}
++
++/* __dealloc_vx_info()
++
++ * final disposal of vx_info */
++
++static void __dealloc_vx_info(struct vx_info *vxi)
++{
++ int cpu;
++
++ vxdprintk(VXD_CBIT(xid, 0),
++ "dealloc_vx_info(%p)", vxi);
++ vxh_dealloc_vx_info(vxi);
++
++ vxi->vx_id = -1;
++
++ vx_info_exit_limit(&vxi->limit);
++ vx_info_exit_sched(&vxi->sched);
++ vx_info_exit_cvirt(&vxi->cvirt);
++ vx_info_exit_cacct(&vxi->cacct);
++
++ for_each_possible_cpu(cpu) {
++ vx_info_exit_sched_pc(
++ &vx_per_cpu(vxi, sched_pc, cpu), cpu);
++ vx_info_exit_cvirt_pc(
++ &vx_per_cpu(vxi, cvirt_pc, cpu), cpu);
++ }
++
++ vxi->vx_state |= VXS_RELEASED;
++
++#ifdef CONFIG_SMP
++ free_percpu(vxi->ptr_pc);
++#endif
++ kfree(vxi);
++ atomic_dec(&vx_global_ctotal);
++}
++
++static void __shutdown_vx_info(struct vx_info *vxi)
++{
++ struct nsproxy *nsproxy;
++ struct fs_struct *fs;
++
++ might_sleep();
++
++ vxi->vx_state |= VXS_SHUTDOWN;
++ vs_state_change(vxi, VSC_SHUTDOWN);
++
++ nsproxy = xchg(&vxi->vx_nsproxy, NULL);
++ fs = xchg(&vxi->vx_fs, NULL);
++
++ if (nsproxy)
++ put_nsproxy(nsproxy);
++ if (fs)
++ put_fs_struct(fs);
++}
++
++/* exported stuff */
++
++void free_vx_info(struct vx_info *vxi)
++{
++ unsigned long flags;
++
++ /* check for reference counts first */
++ BUG_ON(atomic_read(&vxi->vx_usecnt));
++ BUG_ON(atomic_read(&vxi->vx_tasks));
++
++ /* context must not be hashed */
++ BUG_ON(vx_info_state(vxi, VXS_HASHED));
++
++ /* context shutdown is mandatory */
++ BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN));
++
++ BUG_ON(vxi->vx_nsproxy);
++ BUG_ON(vxi->vx_fs);
++
++ spin_lock_irqsave(&vx_info_inactive_lock, flags);
++ hlist_del(&vxi->vx_hlist);
++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
++
++ __dealloc_vx_info(vxi);
++}
++
++
++/* hash table for vx_info hash */
++
++#define VX_HASH_SIZE 13
++
++static struct hlist_head vx_info_hash[VX_HASH_SIZE] =
++ { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT };
++
++static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED;
++
++
++static inline unsigned int __hashval(xid_t xid)
++{
++ return (xid % VX_HASH_SIZE);
++}
++
++
++
++/* __hash_vx_info()
++
++ * add the vxi to the global hash table
++ * requires the hash_lock to be held */
++
++static inline void __hash_vx_info(struct vx_info *vxi)
++{
++ struct hlist_head *head;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ vxdprintk(VXD_CBIT(xid, 4),
++ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id);
++ vxh_hash_vx_info(vxi);
++
++ /* context must not be hashed */
++ BUG_ON(vx_info_state(vxi, VXS_HASHED));
++
++ vxi->vx_state |= VXS_HASHED;
++ head = &vx_info_hash[__hashval(vxi->vx_id)];
++ hlist_add_head(&vxi->vx_hlist, head);
++ atomic_inc(&vx_global_cactive);
++}
++
++/* __unhash_vx_info()
++
++ * remove the vxi from the global hash table
++ * requires the hash_lock to be held */
++
++static inline void __unhash_vx_info(struct vx_info *vxi)
++{
++ unsigned long flags;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ vxdprintk(VXD_CBIT(xid, 4),
++ "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id,
++ atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks));
++ vxh_unhash_vx_info(vxi);
++
++ /* context must be hashed */
++ BUG_ON(!vx_info_state(vxi, VXS_HASHED));
++ /* but without tasks */
++ BUG_ON(atomic_read(&vxi->vx_tasks));
++
++ vxi->vx_state &= ~VXS_HASHED;
++ hlist_del_init(&vxi->vx_hlist);
++ spin_lock_irqsave(&vx_info_inactive_lock, flags);
++ hlist_add_head(&vxi->vx_hlist, &vx_info_inactive);
++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags);
++ atomic_dec(&vx_global_cactive);
++}
++
++
++/* __lookup_vx_info()
++
++ * requires the hash_lock to be held
++ * doesn't increment the vx_refcnt */
++
++static inline struct vx_info *__lookup_vx_info(xid_t xid)
++{
++ struct hlist_head *head = &vx_info_hash[__hashval(xid)];
++ struct hlist_node *pos;
++ struct vx_info *vxi;
++
++ vxd_assert_lock(&vx_info_hash_lock);
++ hlist_for_each(pos, head) {
++ vxi = hlist_entry(pos, struct vx_info, vx_hlist);
++
++ if (vxi->vx_id == xid)
++ goto found;
++ }
++ vxi = NULL;
++found:
++ vxdprintk(VXD_CBIT(xid, 0),
++ "__lookup_vx_info(#%u): %p[#%u]",
++ xid, vxi, vxi ? vxi->vx_id : 0);
++ vxh_lookup_vx_info(vxi, xid);
++ return vxi;
++}
++
++
++/* __create_vx_info()
++
++ * create the requested context
++ * get(), claim() and hash it */
++
++static struct vx_info *__create_vx_info(int id)
++{
++ struct vx_info *new, *vxi = NULL;
++
++ vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id);
++
++ if (!(new = __alloc_vx_info(id)))
++ return ERR_PTR(-ENOMEM);
++
++ /* required to make dynamic xids unique */
++ spin_lock(&vx_info_hash_lock);
++
++ /* static context requested */
++ if ((vxi = __lookup_vx_info(id))) {
++ vxdprintk(VXD_CBIT(xid, 0),
++ "create_vx_info(%d) = %p (already there)", id, vxi);
++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0))
++ vxi = ERR_PTR(-EBUSY);
++ else
++ vxi = ERR_PTR(-EEXIST);
++ goto out_unlock;
++ }
++ /* new context */
++ vxdprintk(VXD_CBIT(xid, 0),
++ "create_vx_info(%d) = %p (new)", id, new);
++ claim_vx_info(new, NULL);
++ __hash_vx_info(get_vx_info(new));
++ vxi = new, new = NULL;
++
++out_unlock:
++ spin_unlock(&vx_info_hash_lock);
++ vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id);
++ if (new)
++ __dealloc_vx_info(new);
++ return vxi;
++}
++
++
++/* exported stuff */
++
++
++void unhash_vx_info(struct vx_info *vxi)
++{
++ __shutdown_vx_info(vxi);
++ spin_lock(&vx_info_hash_lock);
++ __unhash_vx_info(vxi);
++ spin_unlock(&vx_info_hash_lock);
++ __wakeup_vx_info(vxi);
++}
++
++
++/* lookup_vx_info()
++
++ * search for a vx_info and get() it
++ * negative id means current */
++
++struct vx_info *lookup_vx_info(int id)
++{
++ struct vx_info *vxi = NULL;
++
++ if (id < 0) {
++ vxi = get_vx_info(current->vx_info);
++ } else if (id > 1) {
++ spin_lock(&vx_info_hash_lock);
++ vxi = get_vx_info(__lookup_vx_info(id));
++ spin_unlock(&vx_info_hash_lock);
++ }
++ return vxi;
++}
++
++/* xid_is_hashed()
++
++ * verify that xid is still hashed */
++
++int xid_is_hashed(xid_t xid)
++{
++ int hashed;
++
++ spin_lock(&vx_info_hash_lock);
++ hashed = (__lookup_vx_info(xid) != NULL);
++ spin_unlock(&vx_info_hash_lock);
++ return hashed;
++}
++
++#ifdef CONFIG_PROC_FS
++
++/* get_xid_list()
++
++ * get a subset of hashed xids for proc
++ * assumes size is at least one */
++
++int get_xid_list(int index, unsigned int *xids, int size)
++{
++ int hindex, nr_xids = 0;
++
++ /* only show current and children */
++ if (!vx_check(0, VS_ADMIN | VS_WATCH)) {
++ if (index > 0)
++ return 0;
++ xids[nr_xids] = vx_current_xid();
++ return 1;
++ }
++
++ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) {
++ struct hlist_head *head = &vx_info_hash[hindex];
++ struct hlist_node *pos;
++
++ spin_lock(&vx_info_hash_lock);
++ hlist_for_each(pos, head) {
++ struct vx_info *vxi;
++
++ if (--index > 0)
++ continue;
++
++ vxi = hlist_entry(pos, struct vx_info, vx_hlist);
++ xids[nr_xids] = vxi->vx_id;
++ if (++nr_xids >= size) {
++ spin_unlock(&vx_info_hash_lock);
++ goto out;
++ }
++ }
++ /* keep the lock time short */
++ spin_unlock(&vx_info_hash_lock);
++ }
++out:
++ return nr_xids;
++}
++#endif
++
++#ifdef CONFIG_VSERVER_DEBUG
++
++void dump_vx_info_inactive(int level)
++{
++ struct hlist_node *entry, *next;
++
++ hlist_for_each_safe(entry, next, &vx_info_inactive) {
++ struct vx_info *vxi =
++ list_entry(entry, struct vx_info, vx_hlist);
++
++ dump_vx_info(vxi, level);
++ }
++}
++
++#endif
++
++int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
++{
++ struct user_struct *new_user, *old_user;
++
++ if (!p || !vxi)
++ BUG();
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
++ return -EACCES;
++
++ new_user = alloc_uid(vxi->vx_id, p->uid);
++ if (!new_user)
++ return -ENOMEM;
++
++ old_user = p->user;
++ if (new_user != old_user) {
++ atomic_inc(&new_user->processes);
++ atomic_dec(&old_user->processes);
++ p->user = new_user;
++ }
++ free_uid(old_user);
++ return 0;
++}
++
++void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p)
++{
++ p->cap_effective &= vxi->vx_cap_bset;
++ p->cap_inheritable &= vxi->vx_cap_bset;
++ p->cap_permitted &= vxi->vx_cap_bset;
++}
++
++
++#include <linux/file.h>
++
++static int vx_openfd_task(struct task_struct *tsk)
++{
++ struct files_struct *files = tsk->files;
++ struct fdtable *fdt;
++ const unsigned long *bptr;
++ int count, total;
++
++ /* no rcu_read_lock() because of spin_lock() */
++ spin_lock(&files->file_lock);
++ fdt = files_fdtable(files);
++ bptr = fdt->open_fds->fds_bits;
++ count = fdt->max_fds / (sizeof(unsigned long) * 8);
++ for (total = 0; count > 0; count--) {
++ if (*bptr)
++ total += hweight_long(*bptr);
++ bptr++;
++ }
++ spin_unlock(&files->file_lock);
++ return total;
++}
++
++
++/* for *space compatibility */
++
++asmlinkage long sys_unshare(unsigned long);
++
++/*
++ * migrate task to new context
++ * gets vxi, puts old_vxi on change
++ * optionally unshares namespaces (hack)
++ */
++
++int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare)
++{
++ struct vx_info *old_vxi;
++ int ret = 0;
++
++ if (!p || !vxi)
++ BUG();
++
++ vxdprintk(VXD_CBIT(xid, 5),
++ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi,
++ vxi->vx_id, atomic_read(&vxi->vx_usecnt));
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) &&
++ !vx_info_flags(vxi, VXF_STATE_SETUP, 0))
++ return -EACCES;
++
++ if (vx_info_state(vxi, VXS_SHUTDOWN))
++ return -EFAULT;
++
++ old_vxi = task_get_vx_info(p);
++ if (old_vxi == vxi)
++ goto out;
++
++ if (!(ret = vx_migrate_user(p, vxi))) {
++ int openfd;
++
++ task_lock(p);
++ openfd = vx_openfd_task(p);
++
++ if (old_vxi) {
++ atomic_dec(&old_vxi->cvirt.nr_threads);
++ atomic_dec(&old_vxi->cvirt.nr_running);
++ __rlim_dec(&old_vxi->limit, RLIMIT_NPROC);
++ /* FIXME: what about the struct files here? */
++ __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd);
++ /* account for the executable */
++ __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY);
++ }
++ atomic_inc(&vxi->cvirt.nr_threads);
++ atomic_inc(&vxi->cvirt.nr_running);
++ __rlim_inc(&vxi->limit, RLIMIT_NPROC);
++ /* FIXME: what about the struct files here? */
++ __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd);
++ /* account for the executable */
++ __rlim_inc(&vxi->limit, VLIMIT_DENTRY);
++
++ if (old_vxi) {
++ release_vx_info(old_vxi, p);
++ clr_vx_info(&p->vx_info);
++ }
++ claim_vx_info(vxi, p);
++ set_vx_info(&p->vx_info, vxi);
++ p->xid = vxi->vx_id;
++
++ vxdprintk(VXD_CBIT(xid, 5),
++ "moved task %p into vxi:%p[#%d]",
++ p, vxi, vxi->vx_id);
++
++ vx_mask_cap_bset(vxi, p);
++ task_unlock(p);
++
++ /* hack for *spaces to provide compatibility */
++ if (unshare) {
++ struct nsproxy *old_nsp, *new_nsp;
++
++ ret = unshare_nsproxy_namespaces(
++ CLONE_NEWUTS | CLONE_NEWIPC,
++ &new_nsp, NULL);
++ if (ret)
++ goto out;
++
++ old_nsp = xchg(&p->nsproxy, new_nsp);
++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC);
++ put_nsproxy(old_nsp);
++ }
++ }
++out:
++ put_vx_info(old_vxi);
++ return ret;
++}
++
++int vx_set_reaper(struct vx_info *vxi, struct task_struct *p)
++{
++ struct task_struct *old_reaper;
++
++ if (!vxi)
++ return -EINVAL;
++
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_reaper(%p[#%d],%p[#%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid);
++
++ old_reaper = vxi->vx_reaper;
++ if (old_reaper == p)
++ return 0;
++
++ /* set new child reaper */
++ get_task_struct(p);
++ vxi->vx_reaper = p;
++ put_task_struct(old_reaper);
++ return 0;
++}
++
++int vx_set_init(struct vx_info *vxi, struct task_struct *p)
++{
++ if (!vxi)
++ return -EINVAL;
++
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
++
++ vxi->vx_flags &= ~VXF_STATE_INIT;
++ vxi->vx_initpid = p->tgid;
++ return 0;
++}
++
++void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])",
++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid);
++
++ vxi->exit_code = code;
++ vxi->vx_initpid = 0;
++}
++
++
++void vx_set_persistent(struct vx_info *vxi)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id);
++
++ get_vx_info(vxi);
++ claim_vx_info(vxi, NULL);
++}
++
++void vx_clear_persistent(struct vx_info *vxi)
++{
++ vxdprintk(VXD_CBIT(xid, 6),
++ "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id);
++
++ release_vx_info(vxi, NULL);
++ put_vx_info(vxi);
++}
++
++void vx_update_persistent(struct vx_info *vxi)
++{
++ if (vx_info_flags(vxi, VXF_PERSISTENT, 0))
++ vx_set_persistent(vxi);
++ else
++ vx_clear_persistent(vxi);
++}
++
++
++/* task must be current or locked */
++
++void exit_vx_info(struct task_struct *p, int code)
++{
++ struct vx_info *vxi = p->vx_info;
++
++ if (vxi) {
++ atomic_dec(&vxi->cvirt.nr_threads);
++ vx_nproc_dec(p);
++
++ vxi->exit_code = code;
++ release_vx_info(vxi, p);
++ }
++}
++
++void exit_vx_info_early(struct task_struct *p, int code)
++{
++ struct vx_info *vxi = p->vx_info;
++
++ if (vxi) {
++ if (vxi->vx_initpid == p->tgid)
++ vx_exit_init(vxi, p, code);
++ if (vxi->vx_reaper == p)
++ vx_set_reaper(vxi, init_pid_ns.child_reaper);
++ }
++}
++
++
++/* vserver syscall commands below here */
++
++/* taks xid and vx_info functions */
++
++#include <asm/uaccess.h>
++
++
++int vc_task_xid(uint32_t id)
++{
++ xid_t xid;
++
++ if (id) {
++ struct task_struct *tsk;
++
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_real_pid(id);
++ xid = (tsk) ? tsk->xid : -ESRCH;
++ read_unlock(&tasklist_lock);
++ } else
++ xid = vx_current_xid();
++ return xid;
++}
++
++
++int vc_vx_info(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_vx_info_v0 vc_data;
++
++ vc_data.xid = vxi->vx_id;
++ vc_data.initpid = vxi->vx_initpid;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++
++int vc_ctx_stat(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_stat_v0 vc_data;
++
++ vc_data.usecnt = atomic_read(&vxi->vx_usecnt);
++ vc_data.tasks = atomic_read(&vxi->vx_tasks);
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++
++/* context functions */
++
++int vc_ctx_create(uint32_t xid, void __user *data)
++{
++ struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET };
++ struct vx_info *new_vxi;
++ int ret;
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ if ((xid > MAX_S_CONTEXT) || (xid < 2))
++ return -EINVAL;
++
++ new_vxi = __create_vx_info(xid);
++ if (IS_ERR(new_vxi))
++ return PTR_ERR(new_vxi);
++
++ /* initial flags */
++ new_vxi->vx_flags = vc_data.flagword;
++
++ ret = -ENOEXEC;
++ if (vs_state_change(new_vxi, VSC_STARTUP))
++ goto out;
++
++ ret = vx_migrate_task(current, new_vxi, (!data));
++ if (ret)
++ goto out;
++
++ /* return context id on success */
++ ret = new_vxi->vx_id;
++
++ /* get a reference for persistent contexts */
++ if ((vc_data.flagword & VXF_PERSISTENT))
++ vx_set_persistent(new_vxi);
++out:
++ release_vx_info(new_vxi, NULL);
++ put_vx_info(new_vxi);
++ return ret;
++}
++
++
++int vc_ctx_migrate(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_migrate vc_data = { .flagword = 0 };
++ int ret;
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ ret = vx_migrate_task(current, vxi, 0);
++ if (ret)
++ return ret;
++ if (vc_data.flagword & VXM_SET_INIT)
++ ret = vx_set_init(vxi, current);
++ if (ret)
++ return ret;
++ if (vc_data.flagword & VXM_SET_REAPER)
++ ret = vx_set_reaper(vxi, current);
++ return ret;
++}
++
++
++int vc_get_cflags(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_flags_v0 vc_data;
++
++ vc_data.flagword = vxi->vx_flags;
++
++ /* special STATE flag handling */
++ vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME);
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_cflags(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_flags_v0 vc_data;
++ uint64_t mask, trigger;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ /* special STATE flag handling */
++ mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME);
++ trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword);
++
++ if (vxi == current->vx_info) {
++ if (trigger & VXF_STATE_SETUP)
++ vx_mask_cap_bset(vxi, current);
++ if (trigger & VXF_STATE_INIT) {
++ int ret;
++
++ ret = vx_set_init(vxi, current);
++ if (ret)
++ return ret;
++ ret = vx_set_reaper(vxi, current);
++ if (ret)
++ return ret;
++ }
++ }
++
++ vxi->vx_flags = vs_mask_flags(vxi->vx_flags,
++ vc_data.flagword, mask);
++ if (trigger & VXF_PERSISTENT)
++ vx_update_persistent(vxi);
++
++ return 0;
++}
++
++static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps)
++{
++ if (bcaps)
++ *bcaps = vxi->vx_bcaps;
++ if (ccaps)
++ *ccaps = vxi->vx_ccaps;
++
++ return 0;
++}
++
++int vc_get_ccaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_caps_v1 vc_data;
++ int ret;
++
++ ret = do_get_caps(vxi, NULL, &vc_data.ccaps);
++ if (ret)
++ return ret;
++ vc_data.cmask = ~0ULL;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++static int do_set_caps(struct vx_info *vxi,
++ uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask)
++{
++ vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask);
++ vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask);
++
++ return 0;
++}
++
++int vc_set_ccaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_ctx_caps_v1 vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask);
++}
++
++int vc_get_bcaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_bcaps vc_data;
++ int ret;
++
++ ret = do_get_caps(vxi, &vc_data.bcaps, NULL);
++ if (ret)
++ return ret;
++ vc_data.bmask = ~0ULL;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_bcaps(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_bcaps vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0);
++}
++
++
++int vc_get_badness(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_badness_v0 vc_data;
++
++ vc_data.bias = vxi->vx_badness_bias;
++
++ if (copy_to_user(data, &vc_data, sizeof(vc_data)))
++ return -EFAULT;
++ return 0;
++}
++
++int vc_set_badness(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_badness_v0 vc_data;
++
++ if (copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ vxi->vx_badness_bias = vc_data.bias;
++ return 0;
++}
++
++#include <linux/module.h>
++
++EXPORT_SYMBOL_GPL(free_vx_info);
++
+diff -Nurb linux-2.6.22-594/kernel/vserver/space.c linux-2.6.22-595/kernel/vserver/space.c
+--- linux-2.6.22-594/kernel/vserver/space.c 2008-03-20 00:05:21.000000000 -0400
++++ linux-2.6.22-595/kernel/vserver/space.c 2008-03-20 00:08:28.000000000 -0400
@@ -15,6 +15,7 @@
#include <linux/utsname.h>
#include <linux/nsproxy.h>
#include <asm/uaccess.h>
#include <linux/vs_context.h>
-@@ -54,6 +55,7 @@
+@@ -55,6 +56,7 @@
struct mnt_namespace *old_ns;
struct uts_namespace *old_uts;
struct ipc_namespace *old_ipc;
struct nsproxy *nsproxy;
nsproxy = copy_nsproxy(old_nsproxy);
-@@ -83,6 +85,17 @@
- get_ipc_ns(nsproxy->ipc_ns);
+@@ -85,12 +87,26 @@
} else
old_ipc = NULL;
-+
+
+ if (mask & CLONE_NEWNET) {
+ old_net = nsproxy->net_ns;
+ nsproxy->net_ns = new_nsproxy->net_ns;
+ } else
+ old_net = NULL;
+
-
++
if (old_ns)
put_mnt_ns(old_ns);
-@@ -90,6 +101,9 @@
+ if (old_uts)
put_uts_ns(old_uts);
if (old_ipc)
put_ipc_ns(old_ipc);
out:
return nsproxy;
}
-@@ -250,7 +264,8 @@
+@@ -251,6 +267,7 @@
int vc_enter_space(struct vx_info *vxi, void __user *data)
{
-- struct vcmd_space_mask vc_data = { .mask = 0 };
+ /* Ask dhozac how to pass this flag from user space - Sapan */
-+ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET };
+ struct vcmd_space_mask vc_data = { .mask = 0 };
if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
- return -EFAULT;
+diff -Nurb linux-2.6.22-594/kernel/vserver/space.c.orig linux-2.6.22-595/kernel/vserver/space.c.orig
+--- linux-2.6.22-594/kernel/vserver/space.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/kernel/vserver/space.c.orig 2008-03-20 00:05:28.000000000 -0400
+@@ -0,0 +1,295 @@
++/*
++ * linux/kernel/vserver/space.c
++ *
++ * Virtual Server: Context Space Support
++ *
++ * Copyright (C) 2003-2007 Herbert Pötzl
++ *
++ * V0.01 broken out from context.c 0.07
++ * V0.02 added task locking for namespace
++ * V0.03 broken out vx_enter_namespace
++ * V0.04 added *space support and commands
++ *
++ */
++
++#include <linux/utsname.h>
++#include <linux/nsproxy.h>
++#include <linux/err.h>
++#include <net/net_namespace.h>
++#include <asm/uaccess.h>
++
++#include <linux/vs_context.h>
++#include <linux/vserver/space.h>
++#include <linux/vserver/space_cmd.h>
++
++
++atomic_t vs_global_nsproxy = ATOMIC_INIT(0);
++atomic_t vs_global_fs = ATOMIC_INIT(0);
++atomic_t vs_global_mnt_ns = ATOMIC_INIT(0);
++atomic_t vs_global_uts_ns = ATOMIC_INIT(0);
++atomic_t vs_global_ipc_ns = ATOMIC_INIT(0);
++
++
++/* namespace functions */
++
++#include <linux/mnt_namespace.h>
++
++const struct vcmd_space_mask space_mask = {
++ .mask = CLONE_NEWNS |
++ CLONE_NEWUTS |
++ CLONE_NEWIPC |
++ CLONE_FS |
++ CLONE_NEWNET
++};
++
++
++/*
++ * build a new nsproxy mix
++ * assumes that both proxies are 'const'
++ * does not touch nsproxy refcounts
++ * will hold a reference on the result.
++ */
++
++struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy,
++ struct nsproxy *new_nsproxy, unsigned long mask)
++{
++ struct mnt_namespace *old_ns;
++ struct uts_namespace *old_uts;
++ struct ipc_namespace *old_ipc;
++ struct net *old_net;
++ struct nsproxy *nsproxy;
++
++ nsproxy = copy_nsproxy(old_nsproxy);
++ if (!nsproxy)
++ goto out;
++
++ if (mask & CLONE_NEWNS) {
++ old_ns = nsproxy->mnt_ns;
++ nsproxy->mnt_ns = new_nsproxy->mnt_ns;
++ if (nsproxy->mnt_ns)
++ get_mnt_ns(nsproxy->mnt_ns);
++ } else
++ old_ns = NULL;
++
++ if (mask & CLONE_NEWUTS) {
++ old_uts = nsproxy->uts_ns;
++ nsproxy->uts_ns = new_nsproxy->uts_ns;
++ if (nsproxy->uts_ns)
++ get_uts_ns(nsproxy->uts_ns);
++ } else
++ old_uts = NULL;
++
++ if (mask & CLONE_NEWIPC) {
++ old_ipc = nsproxy->ipc_ns;
++ nsproxy->ipc_ns = new_nsproxy->ipc_ns;
++ if (nsproxy->ipc_ns)
++ get_ipc_ns(nsproxy->ipc_ns);
++ } else
++ old_ipc = NULL;
++
++ if (mask & CLONE_NEWNET) {
++ old_net = nsproxy->net_ns;
++ nsproxy->net_ns = new_nsproxy->net_ns;
++ if (nsproxy->net_ns) {
++ get_net(nsproxy->net_ns);
++ printk(KERN_ALERT "Cloning network namespace\n");
++ }
++ } else
++ old_net = NULL;
++
++
++ if (old_ns)
++ put_mnt_ns(old_ns);
++ if (old_uts)
++ put_uts_ns(old_uts);
++ if (old_ipc)
++ put_ipc_ns(old_ipc);
++ if (old_net)
++ put_net(old_net);
++
++out:
++ return nsproxy;
++}
++
++
++/*
++ * merge two nsproxy structs into a new one.
++ * will hold a reference on the result.
++ */
++
++static inline
++struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old,
++ struct nsproxy *proxy, unsigned long mask)
++{
++ struct nsproxy null_proxy = { .mnt_ns = NULL };
++
++ if (!proxy)
++ return NULL;
++
++ if (mask) {
++ /* vs_mix_nsproxy returns with reference */
++ return vs_mix_nsproxy(old ? old : &null_proxy,
++ proxy, mask);
++ }
++ get_nsproxy(proxy);
++ return proxy;
++}
++
++/*
++ * merge two fs structs into a new one.
++ * will take a reference on the result.
++ */
++
++static inline
++struct fs_struct *__vs_merge_fs(struct fs_struct *old,
++ struct fs_struct *fs, unsigned long mask)
++{
++ if (!(mask & CLONE_FS)) {
++ if (old)
++ atomic_inc(&old->count);
++ return old;
++ }
++
++ if (!fs)
++ return NULL;
++
++ return copy_fs_struct(fs);
++}
++
++
++int vx_enter_space(struct vx_info *vxi, unsigned long mask)
++{
++ struct nsproxy *proxy, *proxy_cur, *proxy_new;
++ struct fs_struct *fs, *fs_cur, *fs_new;
++ int ret;
++
++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0))
++ return -EACCES;
++
++ if (!mask)
++ mask = vxi->vx_nsmask;
++
++ if ((mask & vxi->vx_nsmask) != mask)
++ return -EINVAL;
++
++ proxy = vxi->vx_nsproxy;
++ fs = vxi->vx_fs;
++
++ task_lock(current);
++ fs_cur = current->fs;
++ atomic_inc(&fs_cur->count);
++ proxy_cur = current->nsproxy;
++ get_nsproxy(proxy_cur);
++ task_unlock(current);
++
++ fs_new = __vs_merge_fs(fs_cur, fs, mask);
++ if (IS_ERR(fs_new)) {
++ ret = PTR_ERR(fs_new);
++ goto out_put;
++ }
++
++ proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask);
++ if (IS_ERR(proxy_new)) {
++ ret = PTR_ERR(proxy_new);
++ goto out_put_fs;
++ }
++
++ fs_new = xchg(¤t->fs, fs_new);
++ proxy_new = xchg(¤t->nsproxy, proxy_new);
++ ret = 0;
++
++ if (proxy_new)
++ put_nsproxy(proxy_new);
++out_put_fs:
++ if (fs_new)
++ put_fs_struct(fs_new);
++out_put:
++ if (proxy_cur)
++ put_nsproxy(proxy_cur);
++ if (fs_cur)
++ put_fs_struct(fs_cur);
++ return ret;
++}
++
++
++int vx_set_space(struct vx_info *vxi, unsigned long mask)
++{
++ struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new;
++ struct fs_struct *fs_vxi, *fs_cur, *fs_new;
++ int ret;
++
++ if (!mask)
++ mask = space_mask.mask;
++
++ if ((mask & space_mask.mask) != mask)
++ return -EINVAL;
++
++ proxy_vxi = vxi->vx_nsproxy;
++ fs_vxi = vxi->vx_fs;
++
++ task_lock(current);
++ fs_cur = current->fs;
++ atomic_inc(&fs_cur->count);
++ proxy_cur = current->nsproxy;
++ get_nsproxy(proxy_cur);
++ task_unlock(current);
++
++ fs_new = __vs_merge_fs(fs_vxi, fs_cur, mask);
++ if (IS_ERR(fs_new)) {
++ ret = PTR_ERR(fs_new);
++ goto out_put;
++ }
++
++ proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask);
++ if (IS_ERR(proxy_new)) {
++ ret = PTR_ERR(proxy_new);
++ goto out_put_fs;
++ }
++
++ fs_new = xchg(&vxi->vx_fs, fs_new);
++ proxy_new = xchg(&vxi->vx_nsproxy, proxy_new);
++ vxi->vx_nsmask |= mask;
++ ret = 0;
++
++ if (proxy_new)
++ put_nsproxy(proxy_new);
++out_put_fs:
++ if (fs_new)
++ put_fs_struct(fs_new);
++out_put:
++ if (proxy_cur)
++ put_nsproxy(proxy_cur);
++ if (fs_cur)
++ put_fs_struct(fs_cur);
++ return ret;
++}
++
++
++int vc_enter_space(struct vx_info *vxi, void __user *data)
++{
++ /* Ask dhozac how to pass this flag from user space - Sapan*/
++ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET };
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return vx_enter_space(vxi, vc_data.mask);
++}
++
++int vc_set_space(struct vx_info *vxi, void __user *data)
++{
++ struct vcmd_space_mask vc_data = { .mask = 0 };
++
++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data)))
++ return -EFAULT;
++
++ return vx_set_space(vxi, vc_data.mask);
++}
++
++int vc_get_space_mask(struct vx_info *vxi, void __user *data)
++{
++ if (copy_to_user(data, &space_mask, sizeof(space_mask)))
++ return -EFAULT;
++ return 0;
++}
++
+diff -Nurb linux-2.6.22-594/net/core/net_namespace.c linux-2.6.22-595/net/core/net_namespace.c
+--- linux-2.6.22-594/net/core/net_namespace.c 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/net/core/net_namespace.c 2008-03-20 00:14:56.000000000 -0400
+@@ -112,10 +112,12 @@
+ ops = list_entry(ptr, struct pernet_operations, list);
+ if (ops->init) {
+ error = ops->init(net);
+- if (error < 0)
++ if (error < 0) {
++ printk(KERN_ALERT "Error setting up netns: %p\n", ops->init);
+ goto out_undo;
+ }
+ }
++ }
+ out:
+ return error;
+ out_undo:
+diff -Nurb linux-2.6.22-594/net/core/net_namespace.c.orig linux-2.6.22-595/net/core/net_namespace.c.orig
+--- linux-2.6.22-594/net/core/net_namespace.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/net/core/net_namespace.c.orig 2008-03-20 00:05:18.000000000 -0400
+@@ -0,0 +1,332 @@
++#include <linux/workqueue.h>
++#include <linux/rtnetlink.h>
++#include <linux/cache.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <net/net_namespace.h>
++
++/*
++ * Our network namespace constructor/destructor lists
++ */
++
++static LIST_HEAD(pernet_list);
++static struct list_head *first_device = &pernet_list;
++static DEFINE_MUTEX(net_mutex);
++
++static DEFINE_MUTEX(net_list_mutex);
++LIST_HEAD(net_namespace_list);
++
++static struct kmem_cache *net_cachep;
++
++struct net init_net;
++EXPORT_SYMBOL_GPL(init_net);
++
++void net_lock(void)
++{
++ mutex_lock(&net_list_mutex);
++}
++
++void net_unlock(void)
++{
++ mutex_unlock(&net_list_mutex);
++}
++
++static struct net *net_alloc(void)
++{
++ return kmem_cache_alloc(net_cachep, GFP_KERNEL);
++}
++
++static void net_free(struct net *net)
++{
++ if (!net)
++ return;
++
++ if (unlikely(atomic_read(&net->use_count) != 0)) {
++ printk(KERN_EMERG "network namespace not free! Usage: %d\n",
++ atomic_read(&net->use_count));
++ return;
++ }
++
++ kmem_cache_free(net_cachep, net);
++}
++
++static void cleanup_net(struct work_struct *work)
++{
++ struct pernet_operations *ops;
++ struct list_head *ptr;
++ struct net *net;
++
++ net = container_of(work, struct net, work);
++
++ mutex_lock(&net_mutex);
++
++ /* Don't let anyone else find us. */
++ net_lock();
++ list_del(&net->list);
++ net_unlock();
++
++ /* Run all of the network namespace exit methods */
++ list_for_each_prev(ptr, &pernet_list) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->exit)
++ ops->exit(net);
++ }
++
++ mutex_unlock(&net_mutex);
++
++ /* Ensure there are no outstanding rcu callbacks using this
++ * network namespace.
++ */
++ rcu_barrier();
++
++ /* Finally it is safe to free my network namespace structure */
++ net_free(net);
++}
++
++
++void __put_net(struct net *net)
++{
++ /* Cleanup the network namespace in process context */
++ INIT_WORK(&net->work, cleanup_net);
++ schedule_work(&net->work);
++}
++EXPORT_SYMBOL_GPL(__put_net);
++
++/*
++ * setup_net runs the initializers for the network namespace object.
++ */
++static int setup_net(struct net *net)
++{
++ /* Must be called with net_mutex held */
++ struct pernet_operations *ops;
++ struct list_head *ptr;
++ int error;
++
++ memset(net, 0, sizeof(struct net));
++ atomic_set(&net->count, 1);
++ atomic_set(&net->use_count, 0);
++
++ error = 0;
++ list_for_each(ptr, &pernet_list) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->init) {
++ error = ops->init(net);
++ if (error < 0)
++ goto out_undo;
++ }
++ }
++out:
++ return error;
++out_undo:
++ /* Walk through the list backwards calling the exit functions
++ * for the pernet modules whose init functions did not fail.
++ */
++ for (ptr = ptr->prev; ptr != &pernet_list; ptr = ptr->prev) {
++ ops = list_entry(ptr, struct pernet_operations, list);
++ if (ops->exit)
++ ops->exit(net);
++ }
++ goto out;
++}
++
++struct net *copy_net_ns(unsigned long flags, struct net *old_net)
++{
++ struct net *new_net = NULL;
++ int err;
++
++ get_net(old_net);
++
++ if (!(flags & CLONE_NEWNET))
++ return old_net;
++
++ err = -EPERM;
++ if (!capable(CAP_SYS_ADMIN))
++ goto out;
++
++ err = -ENOMEM;
++ new_net = net_alloc();
++ if (!new_net)
++ goto out;
++
++ mutex_lock(&net_mutex);
++ err = setup_net(new_net);
++ if (err)
++ goto out_unlock;
++
++ net_lock();
++ list_add_tail(&new_net->list, &net_namespace_list);
++ net_unlock();
++
++
++out_unlock:
++ mutex_unlock(&net_mutex);
++out:
++ put_net(old_net);
++ if (err) {
++ net_free(new_net);
++ new_net = ERR_PTR(err);
++ }
++ return new_net;
++}
++
++static int __init net_ns_init(void)
++{
++ int err;
++
++ printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
++ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
++ SMP_CACHE_BYTES,
++ SLAB_PANIC, NULL, NULL);
++ mutex_lock(&net_mutex);
++ err = setup_net(&init_net);
++
++ net_lock();
++ list_add_tail(&init_net.list, &net_namespace_list);
++ net_unlock();
++
++ mutex_unlock(&net_mutex);
++ if (err)
++ panic("Could not setup the initial network namespace");
++
++ return 0;
++}
++
++pure_initcall(net_ns_init);
++
++static int register_pernet_operations(struct list_head *list,
++ struct pernet_operations *ops)
++{
++ struct net *net, *undo_net;
++ int error;
++
++ error = 0;
++ list_add_tail(&ops->list, list);
++ for_each_net(net) {
++ if (ops->init) {
++ error = ops->init(net);
++ if (error)
++ goto out_undo;
++ }
++ }
++out:
++ return error;
++
++out_undo:
++ /* If I have an error cleanup all namespaces I initialized */
++ list_del(&ops->list);
++ for_each_net(undo_net) {
++ if (undo_net == net)
++ goto undone;
++ if (ops->exit)
++ ops->exit(undo_net);
++ }
++undone:
++ goto out;
++}
++
++static void unregister_pernet_operations(struct pernet_operations *ops)
++{
++ struct net *net;
++
++ list_del(&ops->list);
++ for_each_net(net)
++ if (ops->exit)
++ ops->exit(net);
++}
++
++/**
++ * register_pernet_subsys - register a network namespace subsystem
++ * @ops: pernet operations structure for the subsystem
++ *
++ * Register a subsystem which has init and exit functions
++ * that are called when network namespaces are created and
++ * destroyed respectively.
++ *
++ * When registered all network namespace init functions are
++ * called for every existing network namespace. Allowing kernel
++ * modules to have a race free view of the set of network namespaces.
++ *
++ * When a new network namespace is created all of the init
++ * methods are called in the order in which they were registered.
++ *
++ * When a network namespace is destroyed all of the exit methods
++ * are called in the reverse of the order with which they were
++ * registered.
++ */
++int register_pernet_subsys(struct pernet_operations *ops)
++{
++ int error;
++ mutex_lock(&net_mutex);
++ error = register_pernet_operations(first_device, ops);
++ mutex_unlock(&net_mutex);
++ return error;
++}
++EXPORT_SYMBOL_GPL(register_pernet_subsys);
++
++/**
++ * unregister_pernet_subsys - unregister a network namespace subsystem
++ * @ops: pernet operations structure to manipulate
++ *
++ * Remove the pernet operations structure from the list to be
++ * used when network namespaces are created or destoryed. In
++ * addition run the exit method for all existing network
++ * namespaces.
++ */
++void unregister_pernet_subsys(struct pernet_operations *module)
++{
++ mutex_lock(&net_mutex);
++ unregister_pernet_operations(module);
++ mutex_unlock(&net_mutex);
++}
++EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
++
++/**
++ * register_pernet_device - register a network namespace device
++ * @ops: pernet operations structure for the subsystem
++ *
++ * Register a device which has init and exit functions
++ * that are called when network namespaces are created and
++ * destroyed respectively.
++ *
++ * When registered all network namespace init functions are
++ * called for every existing network namespace. Allowing kernel
++ * modules to have a race free view of the set of network namespaces.
++ *
++ * When a new network namespace is created all of the init
++ * methods are called in the order in which they were registered.
++ *
++ * When a network namespace is destroyed all of the exit methods
++ * are called in the reverse of the order with which they were
++ * registered.
++ */
++int register_pernet_device(struct pernet_operations *ops)
++{
++ int error;
++ mutex_lock(&net_mutex);
++ error = register_pernet_operations(&pernet_list, ops);
++ if (!error && (first_device == &pernet_list))
++ first_device = &ops->list;
++ mutex_unlock(&net_mutex);
++ return error;
++}
++EXPORT_SYMBOL_GPL(register_pernet_device);
++
++/**
++ * unregister_pernet_device - unregister a network namespace netdevice
++ * @ops: pernet operations structure to manipulate
++ *
++ * Remove the pernet operations structure from the list to be
++ * used when network namespaces are created or destoryed. In
++ * addition run the exit method for all existing network
++ * namespaces.
++ */
++void unregister_pernet_device(struct pernet_operations *ops)
++{
++ mutex_lock(&net_mutex);
++ if (&ops->list == first_device)
++ first_device = first_device->next;
++ unregister_pernet_operations(ops);
++ mutex_unlock(&net_mutex);
++}
++EXPORT_SYMBOL_GPL(unregister_pernet_device);
+diff -Nurb linux-2.6.22-594/net/ipv4/af_inet.c.orig linux-2.6.22-595/net/ipv4/af_inet.c.orig
+--- linux-2.6.22-594/net/ipv4/af_inet.c.orig 2008-03-20 00:05:18.000000000 -0400
++++ linux-2.6.22-595/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,1522 +0,0 @@
+-/*
+- * INET An implementation of the TCP/IP protocol suite for the LINUX
+- * operating system. INET is implemented using the BSD Socket
+- * interface as the means of communication with the user level.
+- *
+- * PF_INET protocol family socket handler.
+- *
+- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
+- *
+- * Authors: Ross Biro
+- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+- * Florian La Roche, <flla@stud.uni-sb.de>
+- * Alan Cox, <A.Cox@swansea.ac.uk>
+- *
+- * Changes (see also sock.c)
+- *
+- * piggy,
+- * Karl Knutson : Socket protocol table
+- * A.N.Kuznetsov : Socket death error in accept().
+- * John Richardson : Fix non blocking error in connect()
+- * so sockets that fail to connect
+- * don't return -EINPROGRESS.
+- * Alan Cox : Asynchronous I/O support
+- * Alan Cox : Keep correct socket pointer on sock
+- * structures
+- * when accept() ed
+- * Alan Cox : Semantics of SO_LINGER aren't state
+- * moved to close when you look carefully.
+- * With this fixed and the accept bug fixed
+- * some RPC stuff seems happier.
+- * Niibe Yutaka : 4.4BSD style write async I/O
+- * Alan Cox,
+- * Tony Gale : Fixed reuse semantics.
+- * Alan Cox : bind() shouldn't abort existing but dead
+- * sockets. Stops FTP netin:.. I hope.
+- * Alan Cox : bind() works correctly for RAW sockets.
+- * Note that FreeBSD at least was broken
+- * in this respect so be careful with
+- * compatibility tests...
+- * Alan Cox : routing cache support
+- * Alan Cox : memzero the socket structure for
+- * compactness.
+- * Matt Day : nonblock connect error handler
+- * Alan Cox : Allow large numbers of pending sockets
+- * (eg for big web sites), but only if
+- * specifically application requested.
+- * Alan Cox : New buffering throughout IP. Used
+- * dumbly.
+- * Alan Cox : New buffering now used smartly.
+- * Alan Cox : BSD rather than common sense
+- * interpretation of listen.
+- * Germano Caronni : Assorted small races.
+- * Alan Cox : sendmsg/recvmsg basic support.
+- * Alan Cox : Only sendmsg/recvmsg now supported.
+- * Alan Cox : Locked down bind (see security list).
+- * Alan Cox : Loosened bind a little.
+- * Mike McLagan : ADD/DEL DLCI Ioctls
+- * Willy Konynenberg : Transparent proxying support.
+- * David S. Miller : New socket lookup architecture.
+- * Some other random speedups.
+- * Cyrus Durgin : Cleaned up file for kmod hacks.
+- * Andi Kleen : Fix inet_stream_connect TCP race.
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- */
+-
+-#include <linux/err.h>
+-#include <linux/errno.h>
+-#include <linux/types.h>
+-#include <linux/socket.h>
+-#include <linux/in.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-#include <linux/sched.h>
+-#include <linux/timer.h>
+-#include <linux/string.h>
+-#include <linux/sockios.h>
+-#include <linux/net.h>
+-#include <linux/capability.h>
+-#include <linux/fcntl.h>
+-#include <linux/mm.h>
+-#include <linux/interrupt.h>
+-#include <linux/stat.h>
+-#include <linux/init.h>
+-#include <linux/poll.h>
+-#include <linux/netfilter_ipv4.h>
+-#include <linux/random.h>
+-
+-#include <asm/uaccess.h>
+-#include <asm/system.h>
+-
+-#include <linux/inet.h>
+-#include <linux/igmp.h>
+-#include <linux/inetdevice.h>
+-#include <linux/netdevice.h>
+-#include <net/ip.h>
+-#include <net/protocol.h>
+-#include <net/arp.h>
+-#include <net/route.h>
+-#include <net/ip_fib.h>
+-#include <net/inet_connection_sock.h>
+-#include <net/tcp.h>
+-#include <net/udp.h>
+-#include <net/udplite.h>
+-#include <linux/skbuff.h>
+-#include <net/sock.h>
+-#include <net/raw.h>
+-#include <net/icmp.h>
+-#include <net/ipip.h>
+-#include <net/inet_common.h>
+-#include <net/xfrm.h>
+-#ifdef CONFIG_IP_MROUTE
+-#include <linux/mroute.h>
+-#endif
+-#include <linux/vs_limit.h>
+-
+-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
+-
+-extern void ip_mc_drop_socket(struct sock *sk);
+-
+-/* The inetsw table contains everything that inet_create needs to
+- * build a new socket.
+- */
+-static struct list_head inetsw[SOCK_MAX];
+-static DEFINE_SPINLOCK(inetsw_lock);
+-
+-/* New destruction routine */
+-
+-void inet_sock_destruct(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+-
+- __skb_queue_purge(&sk->sk_receive_queue);
+- __skb_queue_purge(&sk->sk_error_queue);
+-
+- if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+- printk("Attempt to release TCP socket in state %d %p\n",
+- sk->sk_state, sk);
+- return;
+- }
+- if (!sock_flag(sk, SOCK_DEAD)) {
+- printk("Attempt to release alive inet socket %p\n", sk);
+- return;
+- }
+-
+- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+- BUG_TRAP(!sk->sk_wmem_queued);
+- BUG_TRAP(!sk->sk_forward_alloc);
+-
+- kfree(inet->opt);
+- dst_release(sk->sk_dst_cache);
+- sk_refcnt_debug_dec(sk);
+-}
+-
+-/*
+- * The routines beyond this point handle the behaviour of an AF_INET
+- * socket object. Mostly it punts to the subprotocols of IP to do
+- * the work.
+- */
+-
+-/*
+- * Automatically bind an unbound socket.
+- */
+-
+-static int inet_autobind(struct sock *sk)
+-{
+- struct inet_sock *inet;
+- /* We may need to bind the socket. */
+- lock_sock(sk);
+- inet = inet_sk(sk);
+- if (!inet->num) {
+- if (sk->sk_prot->get_port(sk, 0)) {
+- release_sock(sk);
+- return -EAGAIN;
+- }
+- inet->sport = htons(inet->num);
+- sk->sk_xid = vx_current_xid();
+- sk->sk_nid = nx_current_nid();
+- }
+- release_sock(sk);
+- return 0;
+-}
+-
+-/*
+- * Move a socket into listening state.
+- */
+-int inet_listen(struct socket *sock, int backlog)
+-{
+- struct sock *sk = sock->sk;
+- unsigned char old_state;
+- int err;
+-
+- lock_sock(sk);
+-
+- err = -EINVAL;
+- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+- goto out;
+-
+- old_state = sk->sk_state;
+- if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+- goto out;
+-
+- /* Really, if the socket is already in listen state
+- * we can only allow the backlog to be adjusted.
+- */
+- if (old_state != TCP_LISTEN) {
+- err = inet_csk_listen_start(sk, backlog);
+- if (err)
+- goto out;
+- }
+- sk->sk_max_ack_backlog = backlog;
+- err = 0;
+-
+-out:
+- release_sock(sk);
+- return err;
+-}
+-
+-u32 inet_ehash_secret __read_mostly;
+-EXPORT_SYMBOL(inet_ehash_secret);
+-
+-/*
+- * inet_ehash_secret must be set exactly once
+- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
+- */
+-void build_ehash_secret(void)
+-{
+- u32 rnd;
+- do {
+- get_random_bytes(&rnd, sizeof(rnd));
+- } while (rnd == 0);
+- spin_lock_bh(&inetsw_lock);
+- if (!inet_ehash_secret)
+- inet_ehash_secret = rnd;
+- spin_unlock_bh(&inetsw_lock);
+-}
+-EXPORT_SYMBOL(build_ehash_secret);
+-
+-/*
+- * Create an inet socket.
+- */
+-
+-static int inet_create(struct socket *sock, int protocol)
+-{
+- struct sock *sk;
+- struct list_head *p;
+- struct inet_protosw *answer;
+- struct inet_sock *inet;
+- struct proto *answer_prot;
+- unsigned char answer_flags;
+- char answer_no_check;
+- int try_loading_module = 0;
+- int err;
+-
+- if (sock->type != SOCK_RAW &&
+- sock->type != SOCK_DGRAM &&
+- !inet_ehash_secret)
+- build_ehash_secret();
+-
+- sock->state = SS_UNCONNECTED;
+-
+- /* Look for the requested type/protocol pair. */
+- answer = NULL;
+-lookup_protocol:
+- err = -ESOCKTNOSUPPORT;
+- rcu_read_lock();
+- list_for_each_rcu(p, &inetsw[sock->type]) {
+- answer = list_entry(p, struct inet_protosw, list);
+-
+- /* Check the non-wild match. */
+- if (protocol == answer->protocol) {
+- if (protocol != IPPROTO_IP)
+- break;
+- } else {
+- /* Check for the two wild cases. */
+- if (IPPROTO_IP == protocol) {
+- protocol = answer->protocol;
+- break;
+- }
+- if (IPPROTO_IP == answer->protocol)
+- break;
+- }
+- err = -EPROTONOSUPPORT;
+- answer = NULL;
+- }
+-
+- if (unlikely(answer == NULL)) {
+- if (try_loading_module < 2) {
+- rcu_read_unlock();
+- /*
+- * Be more specific, e.g. net-pf-2-proto-132-type-1
+- * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+- */
+- if (++try_loading_module == 1)
+- request_module("net-pf-%d-proto-%d-type-%d",
+- PF_INET, protocol, sock->type);
+- /*
+- * Fall back to generic, e.g. net-pf-2-proto-132
+- * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+- */
+- else
+- request_module("net-pf-%d-proto-%d",
+- PF_INET, protocol);
+- goto lookup_protocol;
+- } else
+- goto out_rcu_unlock;
+- }
+-
+- err = -EPERM;
+- if ((protocol == IPPROTO_ICMP) &&
+- nx_capable(answer->capability, NXC_RAW_ICMP))
+- goto override;
+- if (sock->type == SOCK_RAW &&
+- nx_capable(answer->capability, NXC_RAW_SOCKET))
+- goto override;
+- if (answer->capability > 0 && !capable(answer->capability))
+- goto out_rcu_unlock;
+-override:
+- sock->ops = answer->ops;
+- answer_prot = answer->prot;
+- answer_no_check = answer->no_check;
+- answer_flags = answer->flags;
+- rcu_read_unlock();
+-
+- BUG_TRAP(answer_prot->slab != NULL);
+-
+- err = -ENOBUFS;
+- sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+- if (sk == NULL)
+- goto out;
+-
+- err = 0;
+- sk->sk_no_check = answer_no_check;
+- if (INET_PROTOSW_REUSE & answer_flags)
+- sk->sk_reuse = 1;
+-
+- inet = inet_sk(sk);
+- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+-
+- if (SOCK_RAW == sock->type) {
+- inet->num = protocol;
+- if (IPPROTO_RAW == protocol)
+- inet->hdrincl = 1;
+- }
+-
+- if (ipv4_config.no_pmtu_disc)
+- inet->pmtudisc = IP_PMTUDISC_DONT;
+- else
+- inet->pmtudisc = IP_PMTUDISC_WANT;
+-
+- inet->id = 0;
+-
+- sock_init_data(sock, sk);
+-
+- sk->sk_destruct = inet_sock_destruct;
+- sk->sk_family = PF_INET;
+- sk->sk_protocol = protocol;
+- sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+-
+- inet->uc_ttl = -1;
+- inet->mc_loop = 1;
+- inet->mc_ttl = 1;
+- inet->mc_index = 0;
+- inet->mc_list = NULL;
+-
+- sk_refcnt_debug_inc(sk);
+-
+- if (inet->num) {
+- /* It assumes that any protocol which allows
+- * the user to assign a number at socket
+- * creation time automatically
+- * shares.
+- */
+- inet->sport = htons(inet->num);
+- /* Add to protocol hash chains. */
+- sk->sk_prot->hash(sk);
+- }
+-
+- if (sk->sk_prot->init) {
+- err = sk->sk_prot->init(sk);
+- if (err)
+- sk_common_release(sk);
+- }
+-out:
+- return err;
+-out_rcu_unlock:
+- rcu_read_unlock();
+- goto out;
+-}
+-
+-
+-/*
+- * The peer socket should always be NULL (or else). When we call this
+- * function we are destroying the object and from then on nobody
+- * should refer to it.
+- */
+-int inet_release(struct socket *sock)
+-{
+- struct sock *sk = sock->sk;
+-
+- if (sk) {
+- long timeout;
+-
+- /* Applications forget to leave groups before exiting */
+- ip_mc_drop_socket(sk);
+-
+- /* If linger is set, we don't return until the close
+- * is complete. Otherwise we return immediately. The
+- * actually closing is done the same either way.
+- *
+- * If the close is due to the process exiting, we never
+- * linger..
+- */
+- timeout = 0;
+- if (sock_flag(sk, SOCK_LINGER) &&
+- !(current->flags & PF_EXITING))
+- timeout = sk->sk_lingertime;
+- sock->sk = NULL;
+- sk->sk_prot->close(sk, timeout);
+- }
+- return 0;
+-}
+-
+-/* It is off by default, see below. */
+-int sysctl_ip_nonlocal_bind __read_mostly;
+-
+-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+- struct sock *sk = sock->sk;
+- struct inet_sock *inet = inet_sk(sk);
+- struct nx_v4_sock_addr nsa;
+- unsigned short snum;
+- int chk_addr_ret;
+- int err;
+-
+- /* If the socket has its own bind function then use it. (RAW) */
+- if (sk->sk_prot->bind) {
+- err = sk->sk_prot->bind(sk, uaddr, addr_len);
+- goto out;
+- }
+- err = -EINVAL;
+- if (addr_len < sizeof(struct sockaddr_in))
+- goto out;
+-
+- err = v4_map_sock_addr(inet, addr, &nsa);
+- if (err)
+- goto out;
+-
+- chk_addr_ret = inet_addr_type(nsa.saddr);
+-
+- /* Not specified by any standard per-se, however it breaks too
+- * many applications when removed. It is unfortunate since
+- * allowing applications to make a non-local bind solves
+- * several problems with systems using dynamic addressing.
+- * (ie. your servers still start up even if your ISDN link
+- * is temporarily down)
+- */
+- err = -EADDRNOTAVAIL;
+- if (!sysctl_ip_nonlocal_bind &&
+- !inet->freebind &&
+- nsa.saddr != INADDR_ANY &&
+- chk_addr_ret != RTN_LOCAL &&
+- chk_addr_ret != RTN_MULTICAST &&
+- chk_addr_ret != RTN_BROADCAST)
+- goto out;
+-
+- snum = ntohs(addr->sin_port);
+- err = -EACCES;
+- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+- goto out;
+-
+- /* We keep a pair of addresses. rcv_saddr is the one
+- * used by hash lookups, and saddr is used for transmit.
+- *
+- * In the BSD API these are the same except where it
+- * would be illegal to use them (multicast/broadcast) in
+- * which case the sending device address is used.
+- */
+- lock_sock(sk);
+-
+- /* Check these errors (active socket, double bind). */
+- err = -EINVAL;
+- if (sk->sk_state != TCP_CLOSE || inet->num)
+- goto out_release_sock;
+-
+- v4_set_sock_addr(inet, &nsa);
+- if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+- inet->saddr = 0; /* Use device */
+-
+- /* Make sure we are allowed to bind here. */
+- if (sk->sk_prot->get_port(sk, snum)) {
+- inet->saddr = inet->rcv_saddr = 0;
+- err = -EADDRINUSE;
+- goto out_release_sock;
+- }
+-
+- if (inet->rcv_saddr)
+- sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+- if (snum)
+- sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+- inet->sport = htons(inet->num);
+- inet->daddr = 0;
+- inet->dport = 0;
+- sk_dst_reset(sk);
+- err = 0;
+-out_release_sock:
+- release_sock(sk);
+-out:
+- return err;
+-}
+-
+-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+- int addr_len, int flags)
+-{
+- struct sock *sk = sock->sk;
+-
+- if (uaddr->sa_family == AF_UNSPEC)
+- return sk->sk_prot->disconnect(sk, flags);
+-
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+-}
+-
+-static long inet_wait_for_connect(struct sock *sk, long timeo)
+-{
+- DEFINE_WAIT(wait);
+-
+- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+-
+- /* Basic assumption: if someone sets sk->sk_err, he _must_
+- * change state of the socket from TCP_SYN_*.
+- * Connect() does not allow to get error notifications
+- * without closing the socket.
+- */
+- while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+- release_sock(sk);
+- timeo = schedule_timeout(timeo);
+- lock_sock(sk);
+- if (signal_pending(current) || !timeo)
+- break;
+- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+- }
+- finish_wait(sk->sk_sleep, &wait);
+- return timeo;
+-}
+-
+-/*
+- * Connect to a remote host. There is regrettably still a little
+- * TCP 'magic' in here.
+- */
+-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+- int addr_len, int flags)
+-{
+- struct sock *sk = sock->sk;
+- int err;
+- long timeo;
+-
+- lock_sock(sk);
+-
+- if (uaddr->sa_family == AF_UNSPEC) {
+- err = sk->sk_prot->disconnect(sk, flags);
+- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+- goto out;
+- }
+-
+- switch (sock->state) {
+- default:
+- err = -EINVAL;
+- goto out;
+- case SS_CONNECTED:
+- err = -EISCONN;
+- goto out;
+- case SS_CONNECTING:
+- err = -EALREADY;
+- /* Fall out of switch with err, set for this state */
+- break;
+- case SS_UNCONNECTED:
+- err = -EISCONN;
+- if (sk->sk_state != TCP_CLOSE)
+- goto out;
+-
+- err = sk->sk_prot->connect(sk, uaddr, addr_len);
+- if (err < 0)
+- goto out;
+-
+- sock->state = SS_CONNECTING;
+-
+- /* Just entered SS_CONNECTING state; the only
+- * difference is that return value in non-blocking
+- * case is EINPROGRESS, rather than EALREADY.
+- */
+- err = -EINPROGRESS;
+- break;
+- }
+-
+- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+-
+- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+- /* Error code is set above */
+- if (!timeo || !inet_wait_for_connect(sk, timeo))
+- goto out;
+-
+- err = sock_intr_errno(timeo);
+- if (signal_pending(current))
+- goto out;
+- }
+-
+- /* Connection was closed by RST, timeout, ICMP error
+- * or another process disconnected us.
+- */
+- if (sk->sk_state == TCP_CLOSE)
+- goto sock_error;
+-
+- /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+- * and error was received after socket entered established state.
+- * Hence, it is handled normally after connect() return successfully.
+- */
+-
+- sock->state = SS_CONNECTED;
+- err = 0;
+-out:
+- release_sock(sk);
+- return err;
+-
+-sock_error:
+- err = sock_error(sk) ? : -ECONNABORTED;
+- sock->state = SS_UNCONNECTED;
+- if (sk->sk_prot->disconnect(sk, flags))
+- sock->state = SS_DISCONNECTING;
+- goto out;
+-}
+-
+-/*
+- * Accept a pending connection. The TCP layer now gives BSD semantics.
+- */
+-
+-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+-{
+- struct sock *sk1 = sock->sk;
+- int err = -EINVAL;
+- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+-
+- if (!sk2)
+- goto do_err;
+-
+- lock_sock(sk2);
+-
+- BUG_TRAP((1 << sk2->sk_state) &
+- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+-
+- sock_graft(sk2, newsock);
+-
+- newsock->state = SS_CONNECTED;
+- err = 0;
+- release_sock(sk2);
+-do_err:
+- return err;
+-}
+-
+-
+-/*
+- * This does both peername and sockname.
+- */
+-int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct sock *sk = sock->sk;
+- struct inet_sock *inet = inet_sk(sk);
+- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+-
+- sin->sin_family = AF_INET;
+- if (peer) {
+- if (!inet->dport ||
+- (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+- peer == 1))
+- return -ENOTCONN;
+- sin->sin_port = inet->dport;
+- sin->sin_addr.s_addr =
+- nx_map_sock_lback(sk->sk_nx_info, inet->daddr);
+- } else {
+- __be32 addr = inet->rcv_saddr;
+- if (!addr)
+- addr = inet->saddr;
+- addr = nx_map_sock_lback(sk->sk_nx_info, addr);
+- sin->sin_port = inet->sport;
+- sin->sin_addr.s_addr = addr;
+- }
+- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+- *uaddr_len = sizeof(*sin);
+- return 0;
+-}
+-
+-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+- size_t size)
+-{
+- struct sock *sk = sock->sk;
+-
+- /* We may need to bind the socket. */
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+-
+- return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+-}
+-
+-
+-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+-{
+- struct sock *sk = sock->sk;
+-
+- /* We may need to bind the socket. */
+- if (!inet_sk(sk)->num && inet_autobind(sk))
+- return -EAGAIN;
+-
+- if (sk->sk_prot->sendpage)
+- return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+- return sock_no_sendpage(sock, page, offset, size, flags);
+-}
+-
+-
+-int inet_shutdown(struct socket *sock, int how)
+-{
+- struct sock *sk = sock->sk;
+- int err = 0;
+-
+- /* This should really check to make sure
+- * the socket is a TCP socket. (WHY AC...)
+- */
+- how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+- 1->2 bit 2 snds.
+- 2->3 */
+- if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+- return -EINVAL;
+-
+- lock_sock(sk);
+- if (sock->state == SS_CONNECTING) {
+- if ((1 << sk->sk_state) &
+- (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+- sock->state = SS_DISCONNECTING;
+- else
+- sock->state = SS_CONNECTED;
+- }
+-
+- switch (sk->sk_state) {
+- case TCP_CLOSE:
+- err = -ENOTCONN;
+- /* Hack to wake up other listeners, who can poll for
+- POLLHUP, even on eg. unconnected UDP sockets -- RR */
+- default:
+- sk->sk_shutdown |= how;
+- if (sk->sk_prot->shutdown)
+- sk->sk_prot->shutdown(sk, how);
+- break;
+-
+- /* Remaining two branches are temporary solution for missing
+- * close() in multithreaded environment. It is _not_ a good idea,
+- * but we have no choice until close() is repaired at VFS level.
+- */
+- case TCP_LISTEN:
+- if (!(how & RCV_SHUTDOWN))
+- break;
+- /* Fall through */
+- case TCP_SYN_SENT:
+- err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+- break;
+- }
+-
+- /* Wake up anyone sleeping in poll. */
+- sk->sk_state_change(sk);
+- release_sock(sk);
+- return err;
+-}
+-
+-/*
+- * ioctl() calls you can issue on an INET socket. Most of these are
+- * device configuration and stuff and very rarely used. Some ioctls
+- * pass on to the socket itself.
+- *
+- * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+- * loads the devconfigure module does its configuring and unloads it.
+- * There's a good 20K of config code hanging around the kernel.
+- */
+-
+-int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+-{
+- struct sock *sk = sock->sk;
+- int err = 0;
+-
+- switch (cmd) {
+- case SIOCGSTAMP:
+- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+- break;
+- case SIOCGSTAMPNS:
+- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+- break;
+- case SIOCADDRT:
+- case SIOCDELRT:
+- case SIOCRTMSG:
+- err = ip_rt_ioctl(cmd, (void __user *)arg);
+- break;
+- case SIOCDARP:
+- case SIOCGARP:
+- case SIOCSARP:
+- err = arp_ioctl(cmd, (void __user *)arg);
+- break;
+- case SIOCGIFADDR:
+- case SIOCSIFADDR:
+- case SIOCGIFBRDADDR:
+- case SIOCSIFBRDADDR:
+- case SIOCGIFNETMASK:
+- case SIOCSIFNETMASK:
+- case SIOCGIFDSTADDR:
+- case SIOCSIFDSTADDR:
+- case SIOCSIFPFLAGS:
+- case SIOCGIFPFLAGS:
+- case SIOCSIFFLAGS:
+- err = devinet_ioctl(cmd, (void __user *)arg);
+- break;
+- default:
+- if (sk->sk_prot->ioctl)
+- err = sk->sk_prot->ioctl(sk, cmd, arg);
+- else
+- err = -ENOIOCTLCMD;
+- break;
+- }
+- return err;
+-}
+-
+-const struct proto_ops inet_stream_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_stream_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = inet_accept,
+- .getname = inet_getname,
+- .poll = tcp_poll,
+- .ioctl = inet_ioctl,
+- .listen = inet_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = tcp_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = tcp_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-const struct proto_ops inet_dgram_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_dgram_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = inet_getname,
+- .poll = udp_poll,
+- .ioctl = inet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = inet_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = inet_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-/*
+- * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+- * udp_poll
+- */
+-static const struct proto_ops inet_sockraw_ops = {
+- .family = PF_INET,
+- .owner = THIS_MODULE,
+- .release = inet_release,
+- .bind = inet_bind,
+- .connect = inet_dgram_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = inet_getname,
+- .poll = datagram_poll,
+- .ioctl = inet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = inet_shutdown,
+- .setsockopt = sock_common_setsockopt,
+- .getsockopt = sock_common_getsockopt,
+- .sendmsg = inet_sendmsg,
+- .recvmsg = sock_common_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = inet_sendpage,
+-#ifdef CONFIG_COMPAT
+- .compat_setsockopt = compat_sock_common_setsockopt,
+- .compat_getsockopt = compat_sock_common_getsockopt,
+-#endif
+-};
+-
+-static struct net_proto_family inet_family_ops = {
+- .family = PF_INET,
+- .create = inet_create,
+- .owner = THIS_MODULE,
+-};
+-
+-/* Upon startup we insert all the elements in inetsw_array[] into
+- * the linked list inetsw.
+- */
+-static struct inet_protosw inetsw_array[] =
+-{
+- {
+- .type = SOCK_STREAM,
+- .protocol = IPPROTO_TCP,
+- .prot = &tcp_prot,
+- .ops = &inet_stream_ops,
+- .capability = -1,
+- .no_check = 0,
+- .flags = INET_PROTOSW_PERMANENT |
+- INET_PROTOSW_ICSK,
+- },
+-
+- {
+- .type = SOCK_DGRAM,
+- .protocol = IPPROTO_UDP,
+- .prot = &udp_prot,
+- .ops = &inet_dgram_ops,
+- .capability = -1,
+- .no_check = UDP_CSUM_DEFAULT,
+- .flags = INET_PROTOSW_PERMANENT,
+- },
+-
+-
+- {
+- .type = SOCK_RAW,
+- .protocol = IPPROTO_IP, /* wild card */
+- .prot = &raw_prot,
+- .ops = &inet_sockraw_ops,
+- .capability = CAP_NET_RAW,
+- .no_check = UDP_CSUM_DEFAULT,
+- .flags = INET_PROTOSW_REUSE,
+- }
+-};
+-
+-#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+-
+-void inet_register_protosw(struct inet_protosw *p)
+-{
+- struct list_head *lh;
+- struct inet_protosw *answer;
+- int protocol = p->protocol;
+- struct list_head *last_perm;
+-
+- spin_lock_bh(&inetsw_lock);
+-
+- if (p->type >= SOCK_MAX)
+- goto out_illegal;
+-
+- /* If we are trying to override a permanent protocol, bail. */
+- answer = NULL;
+- last_perm = &inetsw[p->type];
+- list_for_each(lh, &inetsw[p->type]) {
+- answer = list_entry(lh, struct inet_protosw, list);
+-
+- /* Check only the non-wild match. */
+- if (INET_PROTOSW_PERMANENT & answer->flags) {
+- if (protocol == answer->protocol)
+- break;
+- last_perm = lh;
+- }
+-
+- answer = NULL;
+- }
+- if (answer)
+- goto out_permanent;
+-
+- /* Add the new entry after the last permanent entry if any, so that
+- * the new entry does not override a permanent entry when matched with
+- * a wild-card protocol. But it is allowed to override any existing
+- * non-permanent entry. This means that when we remove this entry, the
+- * system automatically returns to the old behavior.
+- */
+- list_add_rcu(&p->list, last_perm);
+-out:
+- spin_unlock_bh(&inetsw_lock);
+-
+- synchronize_net();
+-
+- return;
+-
+-out_permanent:
+- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+- protocol);
+- goto out;
+-
+-out_illegal:
+- printk(KERN_ERR
+- "Ignoring attempt to register invalid socket type %d.\n",
+- p->type);
+- goto out;
+-}
+-
+-void inet_unregister_protosw(struct inet_protosw *p)
+-{
+- if (INET_PROTOSW_PERMANENT & p->flags) {
+- printk(KERN_ERR
+- "Attempt to unregister permanent protocol %d.\n",
+- p->protocol);
+- } else {
+- spin_lock_bh(&inetsw_lock);
+- list_del_rcu(&p->list);
+- spin_unlock_bh(&inetsw_lock);
+-
+- synchronize_net();
+- }
+-}
+-
+-/*
+- * Shall we try to damage output packets if routing dev changes?
+- */
+-
+-int sysctl_ip_dynaddr __read_mostly;
+-
+-static int inet_sk_reselect_saddr(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+- int err;
+- struct rtable *rt;
+- __be32 old_saddr = inet->saddr;
+- __be32 new_saddr;
+- __be32 daddr = inet->daddr;
+-
+- if (inet->opt && inet->opt->srr)
+- daddr = inet->opt->faddr;
+-
+- /* Query new route. */
+- err = ip_route_connect(&rt, daddr, 0,
+- RT_CONN_FLAGS(sk),
+- sk->sk_bound_dev_if,
+- sk->sk_protocol,
+- inet->sport, inet->dport, sk, 0);
+- if (err)
+- return err;
+-
+- sk_setup_caps(sk, &rt->u.dst);
+-
+- new_saddr = rt->rt_src;
+-
+- if (new_saddr == old_saddr)
+- return 0;
+-
+- if (sysctl_ip_dynaddr > 1) {
+- printk(KERN_INFO "%s(): shifting inet->"
+- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+- __FUNCTION__,
+- NIPQUAD(old_saddr),
+- NIPQUAD(new_saddr));
+- }
+-
+- inet->saddr = inet->rcv_saddr = new_saddr;
+-
+- /*
+- * XXX The only one ugly spot where we need to
+- * XXX really change the sockets identity after
+- * XXX it has entered the hashes. -DaveM
+- *
+- * Besides that, it does not check for connection
+- * uniqueness. Wait for troubles.
+- */
+- __sk_prot_rehash(sk);
+- return 0;
+-}
+-
+-int inet_sk_rebuild_header(struct sock *sk)
+-{
+- struct inet_sock *inet = inet_sk(sk);
+- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+- __be32 daddr;
+- int err;
+-
+- /* Route is OK, nothing to do. */
+- if (rt)
+- return 0;
+-
+- /* Reroute. */
+- daddr = inet->daddr;
+- if (inet->opt && inet->opt->srr)
+- daddr = inet->opt->faddr;
+-{
+- struct flowi fl = {
+- .oif = sk->sk_bound_dev_if,
+- .nl_u = {
+- .ip4_u = {
+- .daddr = daddr,
+- .saddr = inet->saddr,
+- .tos = RT_CONN_FLAGS(sk),
+- },
+- },
+- .proto = sk->sk_protocol,
+- .uli_u = {
+- .ports = {
+- .sport = inet->sport,
+- .dport = inet->dport,
+- },
+- },
+- };
+-
+- security_sk_classify_flow(sk, &fl);
+- err = ip_route_output_flow(&rt, &fl, sk, 0);
+-}
+- if (!err)
+- sk_setup_caps(sk, &rt->u.dst);
+- else {
+- /* Routing failed... */
+- sk->sk_route_caps = 0;
+- /*
+- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+- */
+- if (!sysctl_ip_dynaddr ||
+- sk->sk_state != TCP_SYN_SENT ||
+- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+- (err = inet_sk_reselect_saddr(sk)) != 0)
+- sk->sk_err_soft = -err;
+- }
+-
+- return err;
+-}
+-
+-EXPORT_SYMBOL(inet_sk_rebuild_header);
+-
+-static int inet_gso_send_check(struct sk_buff *skb)
+-{
+- struct iphdr *iph;
+- struct net_protocol *ops;
+- int proto;
+- int ihl;
+- int err = -EINVAL;
+-
+- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+- goto out;
+-
+- iph = ip_hdr(skb);
+- ihl = iph->ihl * 4;
+- if (ihl < sizeof(*iph))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, ihl)))
+- goto out;
+-
+- __skb_pull(skb, ihl);
+- skb_reset_transport_header(skb);
+- iph = ip_hdr(skb);
+- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+- err = -EPROTONOSUPPORT;
+-
+- rcu_read_lock();
+- ops = rcu_dereference(inet_protos[proto]);
+- if (likely(ops && ops->gso_send_check))
+- err = ops->gso_send_check(skb);
+- rcu_read_unlock();
+-
+-out:
+- return err;
+-}
+-
+-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+-{
+- struct sk_buff *segs = ERR_PTR(-EINVAL);
+- struct iphdr *iph;
+- struct net_protocol *ops;
+- int proto;
+- int ihl;
+- int id;
+-
+- if (unlikely(skb_shinfo(skb)->gso_type &
+- ~(SKB_GSO_TCPV4 |
+- SKB_GSO_UDP |
+- SKB_GSO_DODGY |
+- SKB_GSO_TCP_ECN |
+- 0)))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+- goto out;
+-
+- iph = ip_hdr(skb);
+- ihl = iph->ihl * 4;
+- if (ihl < sizeof(*iph))
+- goto out;
+-
+- if (unlikely(!pskb_may_pull(skb, ihl)))
+- goto out;
+-
+- __skb_pull(skb, ihl);
+- skb_reset_transport_header(skb);
+- iph = ip_hdr(skb);
+- id = ntohs(iph->id);
+- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+- segs = ERR_PTR(-EPROTONOSUPPORT);
+-
+- rcu_read_lock();
+- ops = rcu_dereference(inet_protos[proto]);
+- if (likely(ops && ops->gso_segment))
+- segs = ops->gso_segment(skb, features);
+- rcu_read_unlock();
+-
+- if (!segs || unlikely(IS_ERR(segs)))
+- goto out;
+-
+- skb = segs;
+- do {
+- iph = ip_hdr(skb);
+- iph->id = htons(id++);
+- iph->tot_len = htons(skb->len - skb->mac_len);
+- iph->check = 0;
+- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+- } while ((skb = skb->next));
+-
+-out:
+- return segs;
+-}
+-
+-unsigned long snmp_fold_field(void *mib[], int offt)
+-{
+- unsigned long res = 0;
+- int i;
+-
+- for_each_possible_cpu(i) {
+- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+- }
+- return res;
+-}
+-EXPORT_SYMBOL_GPL(snmp_fold_field);
+-
+-int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
+-{
+- BUG_ON(ptr == NULL);
+- ptr[0] = __alloc_percpu(mibsize);
+- if (!ptr[0])
+- goto err0;
+- ptr[1] = __alloc_percpu(mibsize);
+- if (!ptr[1])
+- goto err1;
+- return 0;
+-err1:
+- free_percpu(ptr[0]);
+- ptr[0] = NULL;
+-err0:
+- return -ENOMEM;
+-}
+-EXPORT_SYMBOL_GPL(snmp_mib_init);
+-
+-void snmp_mib_free(void *ptr[2])
+-{
+- BUG_ON(ptr == NULL);
+- free_percpu(ptr[0]);
+- free_percpu(ptr[1]);
+- ptr[0] = ptr[1] = NULL;
+-}
+-EXPORT_SYMBOL_GPL(snmp_mib_free);
+-
+-#ifdef CONFIG_IP_MULTICAST
+-static struct net_protocol igmp_protocol = {
+- .handler = igmp_rcv,
+-};
+-#endif
+-
+-static struct net_protocol tcp_protocol = {
+- .handler = tcp_v4_rcv,
+- .err_handler = tcp_v4_err,
+- .gso_send_check = tcp_v4_gso_send_check,
+- .gso_segment = tcp_tso_segment,
+- .no_policy = 1,
+-};
+-
+-static struct net_protocol udp_protocol = {
+- .handler = udp_rcv,
+- .err_handler = udp_err,
+- .no_policy = 1,
+-};
+-
+-static struct net_protocol icmp_protocol = {
+- .handler = icmp_rcv,
+-};
+-
+-static int __init init_ipv4_mibs(void)
+-{
+- if (snmp_mib_init((void **)net_statistics,
+- sizeof(struct linux_mib),
+- __alignof__(struct linux_mib)) < 0)
+- goto err_net_mib;
+- if (snmp_mib_init((void **)ip_statistics,
+- sizeof(struct ipstats_mib),
+- __alignof__(struct ipstats_mib)) < 0)
+- goto err_ip_mib;
+- if (snmp_mib_init((void **)icmp_statistics,
+- sizeof(struct icmp_mib),
+- __alignof__(struct icmp_mib)) < 0)
+- goto err_icmp_mib;
+- if (snmp_mib_init((void **)tcp_statistics,
+- sizeof(struct tcp_mib),
+- __alignof__(struct tcp_mib)) < 0)
+- goto err_tcp_mib;
+- if (snmp_mib_init((void **)udp_statistics,
+- sizeof(struct udp_mib),
+- __alignof__(struct udp_mib)) < 0)
+- goto err_udp_mib;
+- if (snmp_mib_init((void **)udplite_statistics,
+- sizeof(struct udp_mib),
+- __alignof__(struct udp_mib)) < 0)
+- goto err_udplite_mib;
+-
+- tcp_mib_init();
+-
+- return 0;
+-
+-err_udplite_mib:
+- snmp_mib_free((void **)udp_statistics);
+-err_udp_mib:
+- snmp_mib_free((void **)tcp_statistics);
+-err_tcp_mib:
+- snmp_mib_free((void **)icmp_statistics);
+-err_icmp_mib:
+- snmp_mib_free((void **)ip_statistics);
+-err_ip_mib:
+- snmp_mib_free((void **)net_statistics);
+-err_net_mib:
+- return -ENOMEM;
+-}
+-
+-static int ipv4_proc_init(void);
+-
+-/*
+- * IP protocol layer initialiser
+- */
+-
+-static struct packet_type ip_packet_type = {
+- .type = __constant_htons(ETH_P_IP),
+- .func = ip_rcv,
+- .gso_send_check = inet_gso_send_check,
+- .gso_segment = inet_gso_segment,
+-};
+-
+-static int __init inet_init(void)
+-{
+- struct sk_buff *dummy_skb;
+- struct inet_protosw *q;
+- struct list_head *r;
+- int rc = -EINVAL;
+-
+- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+-
+- rc = proto_register(&tcp_prot, 1);
+- if (rc)
+- goto out;
+-
+- rc = proto_register(&udp_prot, 1);
+- if (rc)
+- goto out_unregister_tcp_proto;
+-
+- rc = proto_register(&raw_prot, 1);
+- if (rc)
+- goto out_unregister_udp_proto;
+-
+- /*
+- * Tell SOCKET that we are alive...
+- */
+-
+- (void)sock_register(&inet_family_ops);
+-
+- /*
+- * Add all the base protocols.
+- */
+-
+- if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+-#ifdef CONFIG_IP_MULTICAST
+- if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+-#endif
+-
+- /* Register the socket-side information for inet_create. */
+- for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+- INIT_LIST_HEAD(r);
+-
+- for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+- inet_register_protosw(q);
+-
+- /*
+- * Set the ARP module up
+- */
+-
+- arp_init();
+-
+- /*
+- * Set the IP module up
+- */
+-
+- ip_init();
+-
+- tcp_v4_init(&inet_family_ops);
+-
+- /* Setup TCP slab cache for open requests. */
+- tcp_init();
+-
+- /* Add UDP-Lite (RFC 3828) */
+- udplite4_register();
+-
+- /*
+- * Set the ICMP layer up
+- */
+-
+- icmp_init(&inet_family_ops);
+-
+- /*
+- * Initialise the multicast router
+- */
+-#if defined(CONFIG_IP_MROUTE)
+- ip_mr_init();
+-#endif
+- /*
+- * Initialise per-cpu ipv4 mibs
+- */
+-
+- if (init_ipv4_mibs())
+- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+-
+- ipv4_proc_init();
+-
+- ipfrag_init();
+-
+- dev_add_pack(&ip_packet_type);
+-
+- rc = 0;
+-out:
+- return rc;
+-out_unregister_udp_proto:
+- proto_unregister(&udp_prot);
+-out_unregister_tcp_proto:
+- proto_unregister(&tcp_prot);
+- goto out;
+-}
+-
+-fs_initcall(inet_init);
+-
+-/* ------------------------------------------------------------------------ */
+-
+-#ifdef CONFIG_PROC_FS
+-static int __init ipv4_proc_init(void)
+-{
+- int rc = 0;
+-
+- if (raw_proc_init())
+- goto out_raw;
+- if (tcp4_proc_init())
+- goto out_tcp;
+- if (udp4_proc_init())
+- goto out_udp;
+- if (fib_proc_init())
+- goto out_fib;
+- if (ip_misc_proc_init())
+- goto out_misc;
+-out:
+- return rc;
+-out_misc:
+- fib_proc_exit();
+-out_fib:
+- udp4_proc_exit();
+-out_udp:
+- tcp4_proc_exit();
+-out_tcp:
+- raw_proc_exit();
+-out_raw:
+- rc = -ENOMEM;
+- goto out;
+-}
+-
+-#else /* CONFIG_PROC_FS */
+-static int __init ipv4_proc_init(void)
+-{
+- return 0;
+-}
+-#endif /* CONFIG_PROC_FS */
+-
+-MODULE_ALIAS_NETPROTO(PF_INET);
+-
+-EXPORT_SYMBOL(inet_accept);
+-EXPORT_SYMBOL(inet_bind);
+-EXPORT_SYMBOL(inet_dgram_connect);
+-EXPORT_SYMBOL(inet_dgram_ops);
+-EXPORT_SYMBOL(inet_getname);
+-EXPORT_SYMBOL(inet_ioctl);
+-EXPORT_SYMBOL(inet_listen);
+-EXPORT_SYMBOL(inet_register_protosw);
+-EXPORT_SYMBOL(inet_release);
+-EXPORT_SYMBOL(inet_sendmsg);
+-EXPORT_SYMBOL(inet_shutdown);
+-EXPORT_SYMBOL(inet_sock_destruct);
+-EXPORT_SYMBOL(inet_stream_connect);
+-EXPORT_SYMBOL(inet_stream_ops);
+-EXPORT_SYMBOL(inet_unregister_protosw);
+-EXPORT_SYMBOL(net_statistics);
+-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
+diff -Nurb linux-2.6.22-594/net/netfilter/xt_MARK.c.orig linux-2.6.22-595/net/netfilter/xt_MARK.c.orig
+--- linux-2.6.22-594/net/netfilter/xt_MARK.c.orig 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/netfilter/xt_MARK.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,283 +0,0 @@
+-/* This is a module which is used for setting the NFMARK field of an skb. */
+-
+-/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- */
+-
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/skbuff.h>
+-#include <linux/ip.h>
+-#include <net/checksum.h>
+-#include <net/route.h>
+-#include <net/inet_hashtables.h>
+-
+-#include <net/netfilter/nf_conntrack.h>
+-#include <linux/netfilter/x_tables.h>
+-#include <linux/netfilter/xt_MARK.h>
+-
+-MODULE_LICENSE("GPL");
+-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+-MODULE_DESCRIPTION("ip[6]tables MARK modification module");
+-MODULE_ALIAS("ipt_MARK");
+-MODULE_ALIAS("ip6t_MARK");
+-
+-static inline u_int16_t
+-get_dst_port(struct nf_conntrack_tuple *tuple)
+-{
+- switch (tuple->dst.protonum) {
+- case IPPROTO_GRE:
+- /* XXX Truncate 32-bit GRE key to 16 bits */
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
+- return tuple->dst.u.gre.key;
+-#else
+- return htons(ntohl(tuple->dst.u.gre.key));
+-#endif
+- case IPPROTO_ICMP:
+- /* Bind on ICMP echo ID */
+- return tuple->src.u.icmp.id;
+- case IPPROTO_TCP:
+- return tuple->dst.u.tcp.port;
+- case IPPROTO_UDP:
+- return tuple->dst.u.udp.port;
+- default:
+- return tuple->dst.u.all;
+- }
+-}
+-
+-static inline u_int16_t
+-get_src_port(struct nf_conntrack_tuple *tuple)
+-{
+- switch (tuple->dst.protonum) {
+- case IPPROTO_GRE:
+- /* XXX Truncate 32-bit GRE key to 16 bits */
+- return htons(ntohl(tuple->src.u.gre.key));
+- case IPPROTO_ICMP:
+- /* Bind on ICMP echo ID */
+- return tuple->src.u.icmp.id;
+- case IPPROTO_TCP:
+- return tuple->src.u.tcp.port;
+- case IPPROTO_UDP:
+- return tuple->src.u.udp.port;
+- default:
+- return tuple->src.u.all;
+- }
+-}
+-
+-static unsigned int
+-target_v0(struct sk_buff **pskb,
+- const struct net_device *in,
+- const struct net_device *out,
+- unsigned int hooknum,
+- const struct xt_target *target,
+- const void *targinfo)
+-{
+- const struct xt_mark_target_info *markinfo = targinfo;
+-
+- (*pskb)->mark = markinfo->mark;
+- return XT_CONTINUE;
+-}
+-
+-static unsigned int
+-target_v1(struct sk_buff **pskb,
+- const struct net_device *in,
+- const struct net_device *out,
+- unsigned int hooknum,
+- const struct xt_target *target,
+- const void *targinfo)
+-{
+- const struct xt_mark_target_info_v1 *markinfo = targinfo;
+- int mark = -1;
+-
+- switch (markinfo->mode) {
+- case XT_MARK_SET:
+- mark = markinfo->mark;
+- break;
+-
+- case XT_MARK_AND:
+- mark = (*pskb)->mark & markinfo->mark;
+- break;
+-
+- case XT_MARK_OR:
+- mark = (*pskb)->mark | markinfo->mark;
+- break;
+-
+- case XT_MARK_COPYXID: {
+- enum ip_conntrack_info ctinfo;
+- struct sock *connection_sk=NULL;
+- int dif;
+-
+- struct nf_conn *ct = nf_ct_get((*pskb), &ctinfo);
+- extern struct inet_hashinfo tcp_hashinfo;
+- enum ip_conntrack_dir dir;
+- if (!ct)
+- break;
+-
+- dir = CTINFO2DIR(ctinfo);
+- u_int32_t src_ip = ct->tuplehash[dir].tuple.src.u3.ip;
+- u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);
+- u_int16_t proto = ct->tuplehash[dir].tuple.dst.protonum;
+-
+- u_int32_t ip;
+- u_int16_t port;
+-
+- dif = ((struct rtable *)(*pskb)->dst)->rt_iif;
+- ip = ct->tuplehash[dir].tuple.dst.u3.ip;
+- port = get_dst_port(&ct->tuplehash[dir].tuple);
+-
+- if (proto == 1 || proto == 17) {
+- if (((*pskb)->mark!=-1) && (*pskb)->mark)
+- ct->xid[0]=(*pskb)->mark;
+- if (ct->xid[0])
+- mark = ct->xid[0];
+-
+- }
+- else if (proto == 6) {
+- if ((*pskb)->sk)
+- connection_sk = (*pskb)->sk;
+- else {
+- connection_sk = inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
+- }
+-
+- if (connection_sk) {
+- connection_sk->sk_peercred.gid = connection_sk->sk_peercred.uid = ct->xid[dir];
+- ct->xid[!dir]=connection_sk->sk_xid;
+- if (connection_sk->sk_xid != 0)
+- mark = connection_sk->sk_xid;
+- if (connection_sk != (*pskb)->sk)
+- sock_put(connection_sk);
+- }
+- break;
+- }
+- }
+- }
+-
+- if (mark != -1)
+- (*pskb)->mark = mark;
+- return XT_CONTINUE;
+-}
+-
+-
+-static int
+-checkentry_v0(const char *tablename,
+- const void *entry,
+- const struct xt_target *target,
+- void *targinfo,
+- unsigned int hook_mask)
+-{
+- struct xt_mark_target_info *markinfo = targinfo;
+-
+- if (markinfo->mark > 0xffffffff) {
+- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+- return 0;
+- }
+- return 1;
+-}
+-
+-static int
+-checkentry_v1(const char *tablename,
+- const void *entry,
+- const struct xt_target *target,
+- void *targinfo,
+- unsigned int hook_mask)
+-{
+- struct xt_mark_target_info_v1 *markinfo = targinfo;
+-
+- if (markinfo->mode != XT_MARK_SET
+- && markinfo->mode != XT_MARK_AND
+- && markinfo->mode != XT_MARK_OR
+- && markinfo->mode != XT_MARK_COPYXID) {
+- printk(KERN_WARNING "MARK: unknown mode %u\n",
+- markinfo->mode);
+- return 0;
+- }
+- if (markinfo->mark > 0xffffffff) {
+- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+- return 0;
+- }
+- return 1;
+-}
+-
+-#ifdef CONFIG_COMPAT
+-struct compat_xt_mark_target_info_v1 {
+- compat_ulong_t mark;
+- u_int8_t mode;
+- u_int8_t __pad1;
+- u_int16_t __pad2;
+-};
+-
+-static void compat_from_user_v1(void *dst, void *src)
+-{
+- struct compat_xt_mark_target_info_v1 *cm = src;
+- struct xt_mark_target_info_v1 m = {
+- .mark = cm->mark,
+- .mode = cm->mode,
+- };
+- memcpy(dst, &m, sizeof(m));
+-}
+-
+-static int compat_to_user_v1(void __user *dst, void *src)
+-{
+- struct xt_mark_target_info_v1 *m = src;
+- struct compat_xt_mark_target_info_v1 cm = {
+- .mark = m->mark,
+- .mode = m->mode,
+- };
+- return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+-}
+-#endif /* CONFIG_COMPAT */
+-
+-static struct xt_target xt_mark_target[] = {
+- {
+- .name = "MARK",
+- .family = AF_INET,
+- .revision = 0,
+- .checkentry = checkentry_v0,
+- .target = target_v0,
+- .targetsize = sizeof(struct xt_mark_target_info),
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+- {
+- .name = "MARK",
+- .family = AF_INET,
+- .revision = 1,
+- .checkentry = checkentry_v1,
+- .target = target_v1,
+- .targetsize = sizeof(struct xt_mark_target_info_v1),
+-#ifdef CONFIG_COMPAT
+- .compatsize = sizeof(struct compat_xt_mark_target_info_v1),
+- .compat_from_user = compat_from_user_v1,
+- .compat_to_user = compat_to_user_v1,
+-#endif
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+- {
+- .name = "MARK",
+- .family = AF_INET6,
+- .revision = 0,
+- .checkentry = checkentry_v0,
+- .target = target_v0,
+- .targetsize = sizeof(struct xt_mark_target_info),
+- .table = "mangle",
+- .me = THIS_MODULE,
+- },
+-};
+-
+-static int __init xt_mark_init(void)
+-{
+- return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target));
+-}
+-
+-static void __exit xt_mark_fini(void)
+-{
+- xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target));
+-}
+-
+-module_init(xt_mark_init);
+-module_exit(xt_mark_fini);
+diff -Nurb linux-2.6.22-594/net/packet/af_packet.c.orig linux-2.6.22-595/net/packet/af_packet.c.orig
+--- linux-2.6.22-594/net/packet/af_packet.c.orig 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/packet/af_packet.c.orig 1969-12-31 19:00:00.000000000 -0500
+@@ -1,1989 +0,0 @@
+-/*
+- * INET An implementation of the TCP/IP protocol suite for the LINUX
+- * operating system. INET is implemented using the BSD Socket
+- * interface as the means of communication with the user level.
+- *
+- * PACKET - implements raw packet sockets.
+- *
+- * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
+- *
+- * Authors: Ross Biro
+- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+- * Alan Cox, <gw4pts@gw4pts.ampr.org>
+- *
+- * Fixes:
+- * Alan Cox : verify_area() now used correctly
+- * Alan Cox : new skbuff lists, look ma no backlogs!
+- * Alan Cox : tidied skbuff lists.
+- * Alan Cox : Now uses generic datagram routines I
+- * added. Also fixed the peek/read crash
+- * from all old Linux datagram code.
+- * Alan Cox : Uses the improved datagram code.
+- * Alan Cox : Added NULL's for socket options.
+- * Alan Cox : Re-commented the code.
+- * Alan Cox : Use new kernel side addressing
+- * Rob Janssen : Correct MTU usage.
+- * Dave Platt : Counter leaks caused by incorrect
+- * interrupt locking and some slightly
+- * dubious gcc output. Can you read
+- * compiler: it said _VOLATILE_
+- * Richard Kooijman : Timestamp fixes.
+- * Alan Cox : New buffers. Use sk->mac.raw.
+- * Alan Cox : sendmsg/recvmsg support.
+- * Alan Cox : Protocol setting support
+- * Alexey Kuznetsov : Untied from IPv4 stack.
+- * Cyrus Durgin : Fixed kerneld for kmod.
+- * Michal Ostrowski : Module initialization cleanup.
+- * Ulises Alonso : Frame number limit removal and
+- * packet_set_ring memory leak.
+- * Eric Biederman : Allow for > 8 byte hardware addresses.
+- * The convention is that longer addresses
+- * will simply extend the hardware address
+- * byte arrays at the end of sockaddr_ll
+- * and packet_mreq.
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- *
+- */
+-
+-#include <linux/types.h>
+-#include <linux/mm.h>
+-#include <linux/capability.h>
+-#include <linux/fcntl.h>
+-#include <linux/socket.h>
+-#include <linux/in.h>
+-#include <linux/inet.h>
+-#include <linux/netdevice.h>
+-#include <linux/if_packet.h>
+-#include <linux/wireless.h>
+-#include <linux/kernel.h>
+-#include <linux/kmod.h>
+-#include <net/ip.h>
+-#include <net/protocol.h>
+-#include <linux/skbuff.h>
+-#include <net/sock.h>
+-#include <linux/errno.h>
+-#include <linux/timer.h>
+-#include <asm/system.h>
+-#include <asm/uaccess.h>
+-#include <asm/ioctls.h>
+-#include <asm/page.h>
+-#include <asm/cacheflush.h>
+-#include <asm/io.h>
+-#include <linux/proc_fs.h>
+-#include <linux/seq_file.h>
+-#include <linux/poll.h>
+-#include <linux/module.h>
+-#include <linux/init.h>
+-#include <linux/vs_network.h>
+-
+-#ifdef CONFIG_INET
+-#include <net/inet_common.h>
+-#endif
+-
+-/*
+- Assumptions:
+- - if device has no dev->hard_header routine, it adds and removes ll header
+- inside itself. In this case ll header is invisible outside of device,
+- but higher levels still should reserve dev->hard_header_len.
+- Some devices are enough clever to reallocate skb, when header
+- will not fit to reserved space (tunnel), another ones are silly
+- (PPP).
+- - packet socket receives packets with pulled ll header,
+- so that SOCK_RAW should push it back.
+-
+-On receive:
+------------
+-
+-Incoming, dev->hard_header!=NULL
+- mac_header -> ll header
+- data -> data
+-
+-Outgoing, dev->hard_header!=NULL
+- mac_header -> ll header
+- data -> ll header
+-
+-Incoming, dev->hard_header==NULL
+- mac_header -> UNKNOWN position. It is very likely, that it points to ll
+- header. PPP makes it, that is wrong, because introduce
+- assymetry between rx and tx paths.
+- data -> data
+-
+-Outgoing, dev->hard_header==NULL
+- mac_header -> data. ll header is still not built!
+- data -> data
+-
+-Resume
+- If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+-
+-
+-On transmit:
+-------------
+-
+-dev->hard_header != NULL
+- mac_header -> ll header
+- data -> ll header
+-
+-dev->hard_header == NULL (ll header is added by device, we cannot control it)
+- mac_header -> data
+- data -> data
+-
+- We should set nh.raw on output to correct posistion,
+- packet classifier depends on it.
+- */
+-
+-/* List of all packet sockets. */
+-static HLIST_HEAD(packet_sklist);
+-static DEFINE_RWLOCK(packet_sklist_lock);
+-
+-static atomic_t packet_socks_nr;
+-
+-
+-/* Private packet socket structures. */
+-
+-struct packet_mclist
+-{
+- struct packet_mclist *next;
+- int ifindex;
+- int count;
+- unsigned short type;
+- unsigned short alen;
+- unsigned char addr[MAX_ADDR_LEN];
+-};
+-/* identical to struct packet_mreq except it has
+- * a longer address field.
+- */
+-struct packet_mreq_max
+-{
+- int mr_ifindex;
+- unsigned short mr_type;
+- unsigned short mr_alen;
+- unsigned char mr_address[MAX_ADDR_LEN];
+-};
+-
+-#ifdef CONFIG_PACKET_MMAP
+-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
+-#endif
+-
+-static void packet_flush_mclist(struct sock *sk);
+-
+-struct packet_sock {
+- /* struct sock has to be the first member of packet_sock */
+- struct sock sk;
+- struct tpacket_stats stats;
+-#ifdef CONFIG_PACKET_MMAP
+- char * *pg_vec;
+- unsigned int head;
+- unsigned int frames_per_block;
+- unsigned int frame_size;
+- unsigned int frame_max;
+- int copy_thresh;
+-#endif
+- struct packet_type prot_hook;
+- spinlock_t bind_lock;
+- unsigned int running:1, /* prot_hook is attached*/
+- auxdata:1,
+- origdev:1;
+- int ifindex; /* bound device */
+- __be16 num;
+- struct packet_mclist *mclist;
+-#ifdef CONFIG_PACKET_MMAP
+- atomic_t mapped;
+- unsigned int pg_vec_order;
+- unsigned int pg_vec_pages;
+- unsigned int pg_vec_len;
+-#endif
+-};
+-
+-struct packet_skb_cb {
+- unsigned int origlen;
+- union {
+- struct sockaddr_pkt pkt;
+- struct sockaddr_ll ll;
+- } sa;
+-};
+-
+-#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
+-
+-#ifdef CONFIG_PACKET_MMAP
+-
+-static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
+-{
+- unsigned int pg_vec_pos, frame_offset;
+-
+- pg_vec_pos = position / po->frames_per_block;
+- frame_offset = position % po->frames_per_block;
+-
+- return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
+-}
+-#endif
+-
+-static inline struct packet_sock *pkt_sk(struct sock *sk)
+-{
+- return (struct packet_sock *)sk;
+-}
+-
+-static void packet_sock_destruct(struct sock *sk)
+-{
+- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+-
+- if (!sock_flag(sk, SOCK_DEAD)) {
+- printk("Attempt to release alive packet socket: %p\n", sk);
+- return;
+- }
+-
+- atomic_dec(&packet_socks_nr);
+-#ifdef PACKET_REFCNT_DEBUG
+- printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
+-#endif
+-}
+-
+-
+-static const struct proto_ops packet_ops;
+-
+-static const struct proto_ops packet_ops_spkt;
+-
+-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct sockaddr_pkt *spkt;
+-
+- /*
+- * When we registered the protocol we saved the socket in the data
+- * field for just this event.
+- */
+-
+- sk = pt->af_packet_priv;
+-
+- /*
+- * Yank back the headers [hope the device set this
+- * right or kerboom...]
+- *
+- * Incoming packets have ll header pulled,
+- * push it back.
+- *
+- * For outgoing ones skb->data == skb_mac_header(skb)
+- * so that this procedure is noop.
+- */
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto out;
+-
+- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+- goto oom;
+-
+- /* drop any routing info */
+- dst_release(skb->dst);
+- skb->dst = NULL;
+-
+- /* drop conntrack reference */
+- nf_reset(skb);
+-
+- spkt = &PACKET_SKB_CB(skb)->sa.pkt;
+-
+- skb_push(skb, skb->data - skb_mac_header(skb));
+-
+- /*
+- * The SOCK_PACKET socket receives _all_ frames.
+- */
+-
+- spkt->spkt_family = dev->type;
+- strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
+- spkt->spkt_protocol = skb->protocol;
+-
+- /*
+- * Charge the memory to the socket. This is done specifically
+- * to prevent sockets using all the memory up.
+- */
+-
+- if (sock_queue_rcv_skb(sk,skb) == 0)
+- return 0;
+-
+-out:
+- kfree_skb(skb);
+-oom:
+- return 0;
+-}
+-
+-
+-/*
+- * Output a raw packet to a device layer. This bypasses all the other
+- * protocol layers and you must therefore supply it with a complete frame
+- */
+-
+-static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len)
+-{
+- struct sock *sk = sock->sk;
+- struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
+- struct sk_buff *skb;
+- struct net_device *dev;
+- __be16 proto=0;
+- int err;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+- return -EPERM;
+-
+- /*
+- * Get and verify the address.
+- */
+-
+- if (saddr)
+- {
+- if (msg->msg_namelen < sizeof(struct sockaddr))
+- return(-EINVAL);
+- if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
+- proto=saddr->spkt_protocol;
+- }
+- else
+- return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
+-
+- /*
+- * Find the device first to size check it
+- */
+-
+- saddr->spkt_device[13] = 0;
+- dev = dev_get_by_name(saddr->spkt_device);
+- err = -ENODEV;
+- if (dev == NULL)
+- goto out_unlock;
+-
+- err = -ENETDOWN;
+- if (!(dev->flags & IFF_UP))
+- goto out_unlock;
+-
+- /*
+- * You may not queue a frame bigger than the mtu. This is the lowest level
+- * raw protocol and you must do your own fragmentation at this level.
+- */
+-
+- err = -EMSGSIZE;
+- if (len > dev->mtu + dev->hard_header_len)
+- goto out_unlock;
+-
+- err = -ENOBUFS;
+- skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
+-
+- /*
+- * If the write buffer is full, then tough. At this level the user gets to
+- * deal with the problem - do your own algorithmic backoffs. That's far
+- * more flexible.
+- */
+-
+- if (skb == NULL)
+- goto out_unlock;
+-
+- /*
+- * Fill it in
+- */
+-
+- /* FIXME: Save some space for broken drivers that write a
+- * hard header at transmission time by themselves. PPP is the
+- * notable one here. This should really be fixed at the driver level.
+- */
+- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+- skb_reset_network_header(skb);
+-
+- /* Try to align data part correctly */
+- if (dev->hard_header) {
+- skb->data -= dev->hard_header_len;
+- skb->tail -= dev->hard_header_len;
+- if (len < dev->hard_header_len)
+- skb_reset_network_header(skb);
+- }
+-
+- /* Returns -EFAULT on error */
+- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+- skb->protocol = proto;
+- skb->dev = dev;
+- skb->priority = sk->sk_priority;
+- if (err)
+- goto out_free;
+-
+- /*
+- * Now send it
+- */
+-
+- dev_queue_xmit(skb);
+- dev_put(dev);
+- return(len);
+-
+-out_free:
+- kfree_skb(skb);
+-out_unlock:
+- if (dev)
+- dev_put(dev);
+- return err;
+-}
+-
+-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+- unsigned int res)
+-{
+- struct sk_filter *filter;
+- int tag = skb->skb_tag;
+-
+- if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag))
+- return 0;
+-
+- rcu_read_lock_bh();
+- filter = rcu_dereference(sk->sk_filter);
+- if (filter != NULL)
+- res = sk_run_filter(skb, filter->insns, filter->len);
+- rcu_read_unlock_bh();
+-
+- return res;
+-}
+-
+-/*
+- This function makes lazy skb cloning in hope that most of packets
+- are discarded by BPF.
+-
+- Note tricky part: we DO mangle shared skb! skb->data, skb->len
+- and skb->cb are mangled. It works because (and until) packets
+- falling here are owned by current CPU. Output packets are cloned
+- by dev_queue_xmit_nit(), input packets are processed by net_bh
+- sequencially, so that if we return skb to original state on exit,
+- we will not harm anyone.
+- */
+-
+-static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct sockaddr_ll *sll;
+- struct packet_sock *po;
+- u8 * skb_head = skb->data;
+- int skb_len = skb->len;
+- unsigned int snaplen, res;
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto drop;
+-
+- sk = pt->af_packet_priv;
+- po = pkt_sk(sk);
+-
+- skb->dev = dev;
+-
+- if (dev->hard_header) {
+- /* The device has an explicit notion of ll header,
+- exported to higher levels.
+-
+- Otherwise, the device hides datails of it frame
+- structure, so that corresponding packet head
+- never delivered to user.
+- */
+- if (sk->sk_type != SOCK_DGRAM)
+- skb_push(skb, skb->data - skb_mac_header(skb));
+- else if (skb->pkt_type == PACKET_OUTGOING) {
+- /* Special case: outgoing packets have ll header at head */
+- skb_pull(skb, skb_network_offset(skb));
+- }
+- }
+-
+- snaplen = skb->len;
+-
+- res = run_filter(skb, sk, snaplen);
+- if (!res)
+- goto drop_n_restore;
+- if (snaplen > res)
+- snaplen = res;
+-
+- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+- (unsigned)sk->sk_rcvbuf)
+- goto drop_n_acct;
+-
+- if (skb_shared(skb)) {
+- struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+- if (nskb == NULL)
+- goto drop_n_acct;
+-
+- if (skb_head != skb->data) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+- kfree_skb(skb);
+- skb = nskb;
+- }
+-
+- BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
+- sizeof(skb->cb));
+-
+- sll = &PACKET_SKB_CB(skb)->sa.ll;
+- sll->sll_family = AF_PACKET;
+- sll->sll_hatype = dev->type;
+- sll->sll_protocol = skb->protocol;
+- sll->sll_pkttype = skb->pkt_type;
+- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
+- sll->sll_ifindex = orig_dev->ifindex;
+- else
+- sll->sll_ifindex = dev->ifindex;
+- sll->sll_halen = 0;
+-
+- if (dev->hard_header_parse)
+- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+-
+- PACKET_SKB_CB(skb)->origlen = skb->len;
+-
+- if (pskb_trim(skb, snaplen))
+- goto drop_n_acct;
+-
+- skb_set_owner_r(skb, sk);
+- skb->dev = NULL;
+- dst_release(skb->dst);
+- skb->dst = NULL;
+-
+- /* drop conntrack reference */
+- nf_reset(skb);
+-
+- spin_lock(&sk->sk_receive_queue.lock);
+- po->stats.tp_packets++;
+- __skb_queue_tail(&sk->sk_receive_queue, skb);
+- spin_unlock(&sk->sk_receive_queue.lock);
+- sk->sk_data_ready(sk, skb->len);
+- return 0;
+-
+-drop_n_acct:
+- spin_lock(&sk->sk_receive_queue.lock);
+- po->stats.tp_drops++;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+-drop_n_restore:
+- if (skb_head != skb->data && skb_shared(skb)) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+-drop:
+- kfree_skb(skb);
+- return 0;
+-}
+-
+-#ifdef CONFIG_PACKET_MMAP
+-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+-{
+- struct sock *sk;
+- struct packet_sock *po;
+- struct sockaddr_ll *sll;
+- struct tpacket_hdr *h;
+- u8 * skb_head = skb->data;
+- int skb_len = skb->len;
+- unsigned int snaplen, res;
+- unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+- unsigned short macoff, netoff;
+- struct sk_buff *copy_skb = NULL;
+- struct timeval tv;
+-
+- if (skb->pkt_type == PACKET_LOOPBACK)
+- goto drop;
+-
+- sk = pt->af_packet_priv;
+- po = pkt_sk(sk);
+-
+- if (dev->hard_header) {
+- if (sk->sk_type != SOCK_DGRAM)
+- skb_push(skb, skb->data - skb_mac_header(skb));
+- else if (skb->pkt_type == PACKET_OUTGOING) {
+- /* Special case: outgoing packets have ll header at head */
+- skb_pull(skb, skb_network_offset(skb));
+- }
+- }
+-
+- if (skb->ip_summed == CHECKSUM_PARTIAL)
+- status |= TP_STATUS_CSUMNOTREADY;
+-
+- snaplen = skb->len;
+-
+- res = run_filter(skb, sk, snaplen);
+- if (!res)
+- goto drop_n_restore;
+- if (snaplen > res)
+- snaplen = res;
+-
+- if (sk->sk_type == SOCK_DGRAM) {
+- macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
+- } else {
+- unsigned maclen = skb_network_offset(skb);
+- netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
+- macoff = netoff - maclen;
+- }
+-
+- if (macoff + snaplen > po->frame_size) {
+- if (po->copy_thresh &&
+- atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
+- (unsigned)sk->sk_rcvbuf) {
+- if (skb_shared(skb)) {
+- copy_skb = skb_clone(skb, GFP_ATOMIC);
+- } else {
+- copy_skb = skb_get(skb);
+- skb_head = skb->data;
+- }
+- if (copy_skb)
+- skb_set_owner_r(copy_skb, sk);
+- }
+- snaplen = po->frame_size - macoff;
+- if ((int)snaplen < 0)
+- snaplen = 0;
+- }
+-
+- spin_lock(&sk->sk_receive_queue.lock);
+- h = packet_lookup_frame(po, po->head);
+-
+- if (h->tp_status)
+- goto ring_is_full;
+- po->head = po->head != po->frame_max ? po->head+1 : 0;
+- po->stats.tp_packets++;
+- if (copy_skb) {
+- status |= TP_STATUS_COPY;
+- __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
+- }
+- if (!po->stats.tp_drops)
+- status &= ~TP_STATUS_LOSING;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+- skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
+-
+- h->tp_len = skb->len;
+- h->tp_snaplen = snaplen;
+- h->tp_mac = macoff;
+- h->tp_net = netoff;
+- if (skb->tstamp.tv64 == 0) {
+- __net_timestamp(skb);
+- sock_enable_timestamp(sk);
+- }
+- tv = ktime_to_timeval(skb->tstamp);
+- h->tp_sec = tv.tv_sec;
+- h->tp_usec = tv.tv_usec;
+-
+- sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
+- sll->sll_halen = 0;
+- if (dev->hard_header_parse)
+- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+- sll->sll_family = AF_PACKET;
+- sll->sll_hatype = dev->type;
+- sll->sll_protocol = skb->protocol;
+- sll->sll_pkttype = skb->pkt_type;
+- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
+- sll->sll_ifindex = orig_dev->ifindex;
+- else
+- sll->sll_ifindex = dev->ifindex;
+-
+- h->tp_status = status;
+- smp_mb();
+-
+- {
+- struct page *p_start, *p_end;
+- u8 *h_end = (u8 *)h + macoff + snaplen - 1;
+-
+- p_start = virt_to_page(h);
+- p_end = virt_to_page(h_end);
+- while (p_start <= p_end) {
+- flush_dcache_page(p_start);
+- p_start++;
+- }
+- }
+-
+- sk->sk_data_ready(sk, 0);
+-
+-drop_n_restore:
+- if (skb_head != skb->data && skb_shared(skb)) {
+- skb->data = skb_head;
+- skb->len = skb_len;
+- }
+-drop:
+- kfree_skb(skb);
+- return 0;
+-
+-ring_is_full:
+- po->stats.tp_drops++;
+- spin_unlock(&sk->sk_receive_queue.lock);
+-
+- sk->sk_data_ready(sk, 0);
+- if (copy_skb)
+- kfree_skb(copy_skb);
+- goto drop_n_restore;
+-}
+-
+-#endif
+-
+-
+-static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len)
+-{
+- struct sock *sk = sock->sk;
+- struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
+- struct sk_buff *skb;
+- struct net_device *dev;
+- __be16 proto;
+- unsigned char *addr;
+- int ifindex, err, reserve = 0;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+- return -EPERM;
+-
+- /*
+- * Get and verify the address.
+- */
+-
+- if (saddr == NULL) {
+- struct packet_sock *po = pkt_sk(sk);
+-
+- ifindex = po->ifindex;
+- proto = po->num;
+- addr = NULL;
+- } else {
+- err = -EINVAL;
+- if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+- goto out;
+- if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
+- goto out;
+- ifindex = saddr->sll_ifindex;
+- proto = saddr->sll_protocol;
+- addr = saddr->sll_addr;
+- }
+-
+-
+- dev = dev_get_by_index(ifindex);
+- err = -ENXIO;
+- if (dev == NULL)
+- goto out_unlock;
+- if (sock->type == SOCK_RAW)
+- reserve = dev->hard_header_len;
+-
+- err = -ENETDOWN;
+- if (!(dev->flags & IFF_UP))
+- goto out_unlock;
+-
+- err = -EMSGSIZE;
+- if (len > dev->mtu+reserve)
+- goto out_unlock;
+-
+- skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
+- msg->msg_flags & MSG_DONTWAIT, &err);
+- if (skb==NULL)
+- goto out_unlock;
+-
+- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+- skb_reset_network_header(skb);
+-
+- if (dev->hard_header) {
+- int res;
+- err = -EINVAL;
+- res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
+- if (sock->type != SOCK_DGRAM) {
+- skb_reset_tail_pointer(skb);
+- skb->len = 0;
+- } else if (res < 0)
+- goto out_free;
+- }
+-
+- /* Returns -EFAULT on error */
+- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+- if (err)
+- goto out_free;
+-
+- skb->protocol = proto;
+- skb->dev = dev;
+- skb->priority = sk->sk_priority;
+-
+- /*
+- * Now send it
+- */
+-
+- err = dev_queue_xmit(skb);
+- if (err > 0 && (err = net_xmit_errno(err)) != 0)
+- goto out_unlock;
+-
+- dev_put(dev);
+-
+- return(len);
+-
+-out_free:
+- kfree_skb(skb);
+-out_unlock:
+- if (dev)
+- dev_put(dev);
+-out:
+- return err;
+-}
+-
+-/*
+- * Close a PACKET socket. This is fairly simple. We immediately go
+- * to 'closed' state and remove our protocol entry in the device list.
+- */
+-
+-static int packet_release(struct socket *sock)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po;
+-
+- if (!sk)
+- return 0;
+-
+- po = pkt_sk(sk);
+-
+- write_lock_bh(&packet_sklist_lock);
+- sk_del_node_init(sk);
+- write_unlock_bh(&packet_sklist_lock);
+-
+- /*
+- * Unhook packet receive handler.
+- */
+-
+- if (po->running) {
+- /*
+- * Remove the protocol hook
+- */
+- dev_remove_pack(&po->prot_hook);
+- po->running = 0;
+- po->num = 0;
+- __sock_put(sk);
+- }
+-
+- packet_flush_mclist(sk);
+-
+-#ifdef CONFIG_PACKET_MMAP
+- if (po->pg_vec) {
+- struct tpacket_req req;
+- memset(&req, 0, sizeof(req));
+- packet_set_ring(sk, &req, 1);
+- }
+-#endif
+-
+- /*
+- * Now the socket is dead. No more input will appear.
+- */
+-
+- sock_orphan(sk);
+- sock->sk = NULL;
+-
+- /* Purge queues */
+-
+- skb_queue_purge(&sk->sk_receive_queue);
+-
+- sock_put(sk);
+- return 0;
+-}
+-
+-/*
+- * Attach a packet hook.
+- */
+-
+-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- /*
+- * Detach an existing hook if present.
+- */
+-
+- lock_sock(sk);
+-
+- spin_lock(&po->bind_lock);
+- if (po->running) {
+- __sock_put(sk);
+- po->running = 0;
+- po->num = 0;
+- spin_unlock(&po->bind_lock);
+- dev_remove_pack(&po->prot_hook);
+- spin_lock(&po->bind_lock);
+- }
+-
+- po->num = protocol;
+- po->prot_hook.type = protocol;
+- po->prot_hook.dev = dev;
+-
+- po->ifindex = dev ? dev->ifindex : 0;
+-
+- if (protocol == 0)
+- goto out_unlock;
+-
+- if (dev) {
+- if (dev->flags&IFF_UP) {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- } else {
+- sk->sk_err = ENETDOWN;
+- if (!sock_flag(sk, SOCK_DEAD))
+- sk->sk_error_report(sk);
+- }
+- } else {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+-
+-out_unlock:
+- spin_unlock(&po->bind_lock);
+- release_sock(sk);
+- return 0;
+-}
+-
+-/*
+- * Bind a packet socket to a device
+- */
+-
+-static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sock *sk=sock->sk;
+- char name[15];
+- struct net_device *dev;
+- int err = -ENODEV;
+-
+- /*
+- * Check legality
+- */
+-
+- if (addr_len != sizeof(struct sockaddr))
+- return -EINVAL;
+- strlcpy(name,uaddr->sa_data,sizeof(name));
+-
+- dev = dev_get_by_name(name);
+- if (dev) {
+- err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
+- dev_put(dev);
+- }
+- return err;
+-}
+-
+-static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+-{
+- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
+- struct sock *sk=sock->sk;
+- struct net_device *dev = NULL;
+- int err;
+-
+-
+- /*
+- * Check legality
+- */
+-
+- if (addr_len < sizeof(struct sockaddr_ll))
+- return -EINVAL;
+- if (sll->sll_family != AF_PACKET)
+- return -EINVAL;
+-
+- if (sll->sll_ifindex) {
+- err = -ENODEV;
+- dev = dev_get_by_index(sll->sll_ifindex);
+- if (dev == NULL)
+- goto out;
+- }
+- err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
+- if (dev)
+- dev_put(dev);
+-
+-out:
+- return err;
+-}
+-
+-static struct proto packet_proto = {
+- .name = "PACKET",
+- .owner = THIS_MODULE,
+- .obj_size = sizeof(struct packet_sock),
+-};
+-
+-/*
+- * Create a packet of type SOCK_PACKET.
+- */
+-
+-static int packet_create(struct socket *sock, int protocol)
+-{
+- struct sock *sk;
+- struct packet_sock *po;
+- __be16 proto = (__force __be16)protocol; /* weird, but documented */
+- int err;
+-
+- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
+- return -EPERM;
+- if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+- sock->type != SOCK_PACKET)
+- return -ESOCKTNOSUPPORT;
+-
+- sock->state = SS_UNCONNECTED;
+-
+- err = -ENOBUFS;
+- sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
+- if (sk == NULL)
+- goto out;
+-
+- sock->ops = &packet_ops;
+- if (sock->type == SOCK_PACKET)
+- sock->ops = &packet_ops_spkt;
+-
+- sock_init_data(sock, sk);
+-
+- po = pkt_sk(sk);
+- sk->sk_family = PF_PACKET;
+- po->num = proto;
+-
+- sk->sk_destruct = packet_sock_destruct;
+- atomic_inc(&packet_socks_nr);
+-
+- /*
+- * Attach a protocol block
+- */
+-
+- spin_lock_init(&po->bind_lock);
+- po->prot_hook.func = packet_rcv;
+-
+- if (sock->type == SOCK_PACKET)
+- po->prot_hook.func = packet_rcv_spkt;
+-
+- po->prot_hook.af_packet_priv = sk;
+-
+- if (proto) {
+- po->prot_hook.type = proto;
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+-
+- write_lock_bh(&packet_sklist_lock);
+- sk_add_node(sk, &packet_sklist);
+- write_unlock_bh(&packet_sklist_lock);
+- return(0);
+-out:
+- return err;
+-}
+-
+-/*
+- * Pull a packet from our receive queue and hand it to the user.
+- * If necessary we block.
+- */
+-
+-static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
+- struct msghdr *msg, size_t len, int flags)
+-{
+- struct sock *sk = sock->sk;
+- struct sk_buff *skb;
+- int copied, err;
+- struct sockaddr_ll *sll;
+-
+- err = -EINVAL;
+- if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
+- goto out;
+-
+-#if 0
+- /* What error should we return now? EUNATTACH? */
+- if (pkt_sk(sk)->ifindex < 0)
+- return -ENODEV;
+-#endif
+-
+- /*
+- * Call the generic datagram receiver. This handles all sorts
+- * of horrible races and re-entrancy so we can forget about it
+- * in the protocol layers.
+- *
+- * Now it will return ENETDOWN, if device have just gone down,
+- * but then it will block.
+- */
+-
+- skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
+-
+- /*
+- * An error occurred so return it. Because skb_recv_datagram()
+- * handles the blocking we don't see and worry about blocking
+- * retries.
+- */
+-
+- if (skb == NULL)
+- goto out;
+-
+- /*
+- * If the address length field is there to be filled in, we fill
+- * it in now.
+- */
+-
+- sll = &PACKET_SKB_CB(skb)->sa.ll;
+- if (sock->type == SOCK_PACKET)
+- msg->msg_namelen = sizeof(struct sockaddr_pkt);
+- else
+- msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
+-
+- /*
+- * You lose any data beyond the buffer you gave. If it worries a
+- * user program they can ask the device for its MTU anyway.
+- */
+-
+- copied = skb->len;
+- if (copied > len)
+- {
+- copied=len;
+- msg->msg_flags|=MSG_TRUNC;
+- }
+-
+- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+- if (err)
+- goto out_free;
+-
+- sock_recv_timestamp(msg, sk, skb);
+-
+- if (msg->msg_name)
+- memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
+- msg->msg_namelen);
+-
+- if (pkt_sk(sk)->auxdata) {
+- struct tpacket_auxdata aux;
+-
+- aux.tp_status = TP_STATUS_USER;
+- if (skb->ip_summed == CHECKSUM_PARTIAL)
+- aux.tp_status |= TP_STATUS_CSUMNOTREADY;
+- aux.tp_len = PACKET_SKB_CB(skb)->origlen;
+- aux.tp_snaplen = skb->len;
+- aux.tp_mac = 0;
+- aux.tp_net = skb_network_offset(skb);
+-
+- put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
+- }
+-
+- /*
+- * Free or return the buffer as appropriate. Again this
+- * hides all the races and re-entrancy issues from us.
+- */
+- err = (flags&MSG_TRUNC) ? skb->len : copied;
+-
+-out_free:
+- skb_free_datagram(sk, skb);
+-out:
+- return err;
+-}
+-
+-static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct net_device *dev;
+- struct sock *sk = sock->sk;
+-
+- if (peer)
+- return -EOPNOTSUPP;
+-
+- uaddr->sa_family = AF_PACKET;
+- dev = dev_get_by_index(pkt_sk(sk)->ifindex);
+- if (dev) {
+- strlcpy(uaddr->sa_data, dev->name, 15);
+- dev_put(dev);
+- } else
+- memset(uaddr->sa_data, 0, 14);
+- *uaddr_len = sizeof(*uaddr);
+-
+- return 0;
+-}
+-
+-static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
+- int *uaddr_len, int peer)
+-{
+- struct net_device *dev;
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
+-
+- if (peer)
+- return -EOPNOTSUPP;
+-
+- sll->sll_family = AF_PACKET;
+- sll->sll_ifindex = po->ifindex;
+- sll->sll_protocol = po->num;
+- dev = dev_get_by_index(po->ifindex);
+- if (dev) {
+- sll->sll_hatype = dev->type;
+- sll->sll_halen = dev->addr_len;
+- memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+- dev_put(dev);
+- } else {
+- sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
+- sll->sll_halen = 0;
+- }
+- *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
+-
+- return 0;
+-}
+-
+-static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
+-{
+- switch (i->type) {
+- case PACKET_MR_MULTICAST:
+- if (what > 0)
+- dev_mc_add(dev, i->addr, i->alen, 0);
+- else
+- dev_mc_delete(dev, i->addr, i->alen, 0);
+- break;
+- case PACKET_MR_PROMISC:
+- dev_set_promiscuity(dev, what);
+- break;
+- case PACKET_MR_ALLMULTI:
+- dev_set_allmulti(dev, what);
+- break;
+- default:;
+- }
+-}
+-
+-static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
+-{
+- for ( ; i; i=i->next) {
+- if (i->ifindex == dev->ifindex)
+- packet_dev_mc(dev, i, what);
+- }
+-}
+-
+-static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- struct packet_mclist *ml, *i;
+- struct net_device *dev;
+- int err;
+-
+- rtnl_lock();
+-
+- err = -ENODEV;
+- dev = __dev_get_by_index(mreq->mr_ifindex);
+- if (!dev)
+- goto done;
+-
+- err = -EINVAL;
+- if (mreq->mr_alen > dev->addr_len)
+- goto done;
+-
+- err = -ENOBUFS;
+- i = kmalloc(sizeof(*i), GFP_KERNEL);
+- if (i == NULL)
+- goto done;
+-
+- err = 0;
+- for (ml = po->mclist; ml; ml = ml->next) {
+- if (ml->ifindex == mreq->mr_ifindex &&
+- ml->type == mreq->mr_type &&
+- ml->alen == mreq->mr_alen &&
+- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+- ml->count++;
+- /* Free the new element ... */
+- kfree(i);
+- goto done;
+- }
+- }
+-
+- i->type = mreq->mr_type;
+- i->ifindex = mreq->mr_ifindex;
+- i->alen = mreq->mr_alen;
+- memcpy(i->addr, mreq->mr_address, i->alen);
+- i->count = 1;
+- i->next = po->mclist;
+- po->mclist = i;
+- packet_dev_mc(dev, i, +1);
+-
+-done:
+- rtnl_unlock();
+- return err;
+-}
+-
+-static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
+-{
+- struct packet_mclist *ml, **mlp;
+-
+- rtnl_lock();
+-
+- for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
+- if (ml->ifindex == mreq->mr_ifindex &&
+- ml->type == mreq->mr_type &&
+- ml->alen == mreq->mr_alen &&
+- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+- if (--ml->count == 0) {
+- struct net_device *dev;
+- *mlp = ml->next;
+- dev = dev_get_by_index(ml->ifindex);
+- if (dev) {
+- packet_dev_mc(dev, ml, -1);
+- dev_put(dev);
+- }
+- kfree(ml);
+- }
+- rtnl_unlock();
+- return 0;
+- }
+- }
+- rtnl_unlock();
+- return -EADDRNOTAVAIL;
+-}
+-
+-static void packet_flush_mclist(struct sock *sk)
+-{
+- struct packet_sock *po = pkt_sk(sk);
+- struct packet_mclist *ml;
+-
+- if (!po->mclist)
+- return;
+-
+- rtnl_lock();
+- while ((ml = po->mclist) != NULL) {
+- struct net_device *dev;
+-
+- po->mclist = ml->next;
+- if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
+- packet_dev_mc(dev, ml, -1);
+- dev_put(dev);
+- }
+- kfree(ml);
+- }
+- rtnl_unlock();
+-}
+-
+-static int
+-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- int ret;
+-
+- if (level != SOL_PACKET)
+- return -ENOPROTOOPT;
+-
+- switch(optname) {
+- case PACKET_ADD_MEMBERSHIP:
+- case PACKET_DROP_MEMBERSHIP:
+- {
+- struct packet_mreq_max mreq;
+- int len = optlen;
+- memset(&mreq, 0, sizeof(mreq));
+- if (len < sizeof(struct packet_mreq))
+- return -EINVAL;
+- if (len > sizeof(mreq))
+- len = sizeof(mreq);
+- if (copy_from_user(&mreq,optval,len))
+- return -EFAULT;
+- if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
+- return -EINVAL;
+- if (optname == PACKET_ADD_MEMBERSHIP)
+- ret = packet_mc_add(sk, &mreq);
+- else
+- ret = packet_mc_drop(sk, &mreq);
+- return ret;
+- }
+-
+-#ifdef CONFIG_PACKET_MMAP
+- case PACKET_RX_RING:
+- {
+- struct tpacket_req req;
+-
+- if (optlen<sizeof(req))
+- return -EINVAL;
+- if (copy_from_user(&req,optval,sizeof(req)))
+- return -EFAULT;
+- return packet_set_ring(sk, &req, 0);
+- }
+- case PACKET_COPY_THRESH:
+- {
+- int val;
+-
+- if (optlen!=sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val,optval,sizeof(val)))
+- return -EFAULT;
+-
+- pkt_sk(sk)->copy_thresh = val;
+- return 0;
+- }
+-#endif
+- case PACKET_AUXDATA:
+- {
+- int val;
+-
+- if (optlen < sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val, optval, sizeof(val)))
+- return -EFAULT;
+-
+- po->auxdata = !!val;
+- return 0;
+- }
+- case PACKET_ORIGDEV:
+- {
+- int val;
+-
+- if (optlen < sizeof(val))
+- return -EINVAL;
+- if (copy_from_user(&val, optval, sizeof(val)))
+- return -EFAULT;
+-
+- po->origdev = !!val;
+- return 0;
+- }
+- default:
+- return -ENOPROTOOPT;
+- }
+-}
+-
+-static int packet_getsockopt(struct socket *sock, int level, int optname,
+- char __user *optval, int __user *optlen)
+-{
+- int len;
+- int val;
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- void *data;
+- struct tpacket_stats st;
+-
+- if (level != SOL_PACKET)
+- return -ENOPROTOOPT;
+-
+- if (get_user(len, optlen))
+- return -EFAULT;
+-
+- if (len < 0)
+- return -EINVAL;
+-
+- switch(optname) {
+- case PACKET_STATISTICS:
+- if (len > sizeof(struct tpacket_stats))
+- len = sizeof(struct tpacket_stats);
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- st = po->stats;
+- memset(&po->stats, 0, sizeof(st));
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- st.tp_packets += st.tp_drops;
+-
+- data = &st;
+- break;
+- case PACKET_AUXDATA:
+- if (len > sizeof(int))
+- len = sizeof(int);
+- val = po->auxdata;
+-
+- data = &val;
+- break;
+- case PACKET_ORIGDEV:
+- if (len > sizeof(int))
+- len = sizeof(int);
+- val = po->origdev;
+-
+- data = &val;
+- break;
+- default:
+- return -ENOPROTOOPT;
+- }
+-
+- if (put_user(len, optlen))
+- return -EFAULT;
+- if (copy_to_user(optval, data, len))
+- return -EFAULT;
+- return 0;
+-}
+-
+-
+-static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
+-{
+- struct sock *sk;
+- struct hlist_node *node;
+- struct net_device *dev = data;
+-
+- read_lock(&packet_sklist_lock);
+- sk_for_each(sk, node, &packet_sklist) {
+- struct packet_sock *po = pkt_sk(sk);
+-
+- switch (msg) {
+- case NETDEV_UNREGISTER:
+- if (po->mclist)
+- packet_dev_mclist(dev, po->mclist, -1);
+- /* fallthrough */
+-
+- case NETDEV_DOWN:
+- if (dev->ifindex == po->ifindex) {
+- spin_lock(&po->bind_lock);
+- if (po->running) {
+- __dev_remove_pack(&po->prot_hook);
+- __sock_put(sk);
+- po->running = 0;
+- sk->sk_err = ENETDOWN;
+- if (!sock_flag(sk, SOCK_DEAD))
+- sk->sk_error_report(sk);
+- }
+- if (msg == NETDEV_UNREGISTER) {
+- po->ifindex = -1;
+- po->prot_hook.dev = NULL;
+- }
+- spin_unlock(&po->bind_lock);
+- }
+- break;
+- case NETDEV_UP:
+- spin_lock(&po->bind_lock);
+- if (dev->ifindex == po->ifindex && po->num &&
+- !po->running) {
+- dev_add_pack(&po->prot_hook);
+- sock_hold(sk);
+- po->running = 1;
+- }
+- spin_unlock(&po->bind_lock);
+- break;
+- }
+- }
+- read_unlock(&packet_sklist_lock);
+- return NOTIFY_DONE;
+-}
+-
+-
+-static int packet_ioctl(struct socket *sock, unsigned int cmd,
+- unsigned long arg)
+-{
+- struct sock *sk = sock->sk;
+-
+- switch(cmd) {
+- case SIOCOUTQ:
+- {
+- int amount = atomic_read(&sk->sk_wmem_alloc);
+- return put_user(amount, (int __user *)arg);
+- }
+- case SIOCINQ:
+- {
+- struct sk_buff *skb;
+- int amount = 0;
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- skb = skb_peek(&sk->sk_receive_queue);
+- if (skb)
+- amount = skb->len;
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- return put_user(amount, (int __user *)arg);
+- }
+- case SIOCGSTAMP:
+- return sock_get_timestamp(sk, (struct timeval __user *)arg);
+- case SIOCGSTAMPNS:
+- return sock_get_timestampns(sk, (struct timespec __user *)arg);
+-
+-#ifdef CONFIG_INET
+- case SIOCADDRT:
+- case SIOCDELRT:
+- case SIOCDARP:
+- case SIOCGARP:
+- case SIOCSARP:
+- case SIOCGIFADDR:
+- case SIOCSIFADDR:
+- case SIOCGIFBRDADDR:
+- case SIOCSIFBRDADDR:
+- case SIOCGIFNETMASK:
+- case SIOCSIFNETMASK:
+- case SIOCGIFDSTADDR:
+- case SIOCSIFDSTADDR:
+- case SIOCSIFFLAGS:
+- return inet_dgram_ops.ioctl(sock, cmd, arg);
+-#endif
+-
+- default:
+- return -ENOIOCTLCMD;
+- }
+- return 0;
+-}
+-
+-#ifndef CONFIG_PACKET_MMAP
+-#define packet_mmap sock_no_mmap
+-#define packet_poll datagram_poll
+-#else
+-
+-static unsigned int packet_poll(struct file * file, struct socket *sock,
+- poll_table *wait)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- unsigned int mask = datagram_poll(file, sock, wait);
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- if (po->pg_vec) {
+- unsigned last = po->head ? po->head-1 : po->frame_max;
+- struct tpacket_hdr *h;
+-
+- h = packet_lookup_frame(po, last);
+-
+- if (h->tp_status)
+- mask |= POLLIN | POLLRDNORM;
+- }
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+- return mask;
+-}
+-
+-
+-/* Dirty? Well, I still did not learn better way to account
+- * for user mmaps.
+- */
+-
+-static void packet_mm_open(struct vm_area_struct *vma)
+-{
+- struct file *file = vma->vm_file;
+- struct socket * sock = file->private_data;
+- struct sock *sk = sock->sk;
+-
+- if (sk)
+- atomic_inc(&pkt_sk(sk)->mapped);
+-}
+-
+-static void packet_mm_close(struct vm_area_struct *vma)
+-{
+- struct file *file = vma->vm_file;
+- struct socket * sock = file->private_data;
+- struct sock *sk = sock->sk;
+-
+- if (sk)
+- atomic_dec(&pkt_sk(sk)->mapped);
+-}
+-
+-static struct vm_operations_struct packet_mmap_ops = {
+- .open = packet_mm_open,
+- .close =packet_mm_close,
+-};
+-
+-static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
+-{
+- return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
+-}
+-
+-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+-{
+- int i;
+-
+- for (i = 0; i < len; i++) {
+- if (likely(pg_vec[i]))
+- free_pages((unsigned long) pg_vec[i], order);
+- }
+- kfree(pg_vec);
+-}
+-
+-static inline char *alloc_one_pg_vec_page(unsigned long order)
+-{
+- return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+- order);
+-}
+-
+-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+-{
+- unsigned int block_nr = req->tp_block_nr;
+- char **pg_vec;
+- int i;
+-
+- pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+- if (unlikely(!pg_vec))
+- goto out;
+-
+- for (i = 0; i < block_nr; i++) {
+- pg_vec[i] = alloc_one_pg_vec_page(order);
+- if (unlikely(!pg_vec[i]))
+- goto out_free_pgvec;
+- }
+-
+-out:
+- return pg_vec;
+-
+-out_free_pgvec:
+- free_pg_vec(pg_vec, order, block_nr);
+- pg_vec = NULL;
+- goto out;
+-}
+-
+-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
+-{
+- char **pg_vec = NULL;
+- struct packet_sock *po = pkt_sk(sk);
+- int was_running, order = 0;
+- __be16 num;
+- int err = 0;
+-
+- if (req->tp_block_nr) {
+- int i, l;
+-
+- /* Sanity tests and some calculations */
+-
+- if (unlikely(po->pg_vec))
+- return -EBUSY;
+-
+- if (unlikely((int)req->tp_block_size <= 0))
+- return -EINVAL;
+- if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+- return -EINVAL;
+- if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
+- return -EINVAL;
+- if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
+- return -EINVAL;
+-
+- po->frames_per_block = req->tp_block_size/req->tp_frame_size;
+- if (unlikely(po->frames_per_block <= 0))
+- return -EINVAL;
+- if (unlikely((po->frames_per_block * req->tp_block_nr) !=
+- req->tp_frame_nr))
+- return -EINVAL;
+-
+- err = -ENOMEM;
+- order = get_order(req->tp_block_size);
+- pg_vec = alloc_pg_vec(req, order);
+- if (unlikely(!pg_vec))
+- goto out;
+-
+- l = 0;
+- for (i = 0; i < req->tp_block_nr; i++) {
+- char *ptr = pg_vec[i];
+- struct tpacket_hdr *header;
+- int k;
+-
+- for (k = 0; k < po->frames_per_block; k++) {
+- header = (struct tpacket_hdr *) ptr;
+- header->tp_status = TP_STATUS_KERNEL;
+- ptr += req->tp_frame_size;
+- }
+- }
+- /* Done */
+- } else {
+- if (unlikely(req->tp_frame_nr))
+- return -EINVAL;
+- }
+-
+- lock_sock(sk);
+-
+- /* Detach socket from network */
+- spin_lock(&po->bind_lock);
+- was_running = po->running;
+- num = po->num;
+- if (was_running) {
+- __dev_remove_pack(&po->prot_hook);
+- po->num = 0;
+- po->running = 0;
+- __sock_put(sk);
+- }
+- spin_unlock(&po->bind_lock);
+-
+- synchronize_net();
+-
+- err = -EBUSY;
+- if (closing || atomic_read(&po->mapped) == 0) {
+- err = 0;
+-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
+-
+- spin_lock_bh(&sk->sk_receive_queue.lock);
+- pg_vec = XC(po->pg_vec, pg_vec);
+- po->frame_max = (req->tp_frame_nr - 1);
+- po->head = 0;
+- po->frame_size = req->tp_frame_size;
+- spin_unlock_bh(&sk->sk_receive_queue.lock);
+-
+- order = XC(po->pg_vec_order, order);
+- req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
+-
+- po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+- po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
+- skb_queue_purge(&sk->sk_receive_queue);
+-#undef XC
+- if (atomic_read(&po->mapped))
+- printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
+- }
+-
+- spin_lock(&po->bind_lock);
+- if (was_running && !po->running) {
+- sock_hold(sk);
+- po->running = 1;
+- po->num = num;
+- dev_add_pack(&po->prot_hook);
+- }
+- spin_unlock(&po->bind_lock);
+-
+- release_sock(sk);
+-
+- if (pg_vec)
+- free_pg_vec(pg_vec, order, req->tp_block_nr);
+-out:
+- return err;
+-}
+-
+-static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+-{
+- struct sock *sk = sock->sk;
+- struct packet_sock *po = pkt_sk(sk);
+- unsigned long size;
+- unsigned long start;
+- int err = -EINVAL;
+- int i;
+-
+- if (vma->vm_pgoff)
+- return -EINVAL;
+-
+- size = vma->vm_end - vma->vm_start;
+-
+- lock_sock(sk);
+- if (po->pg_vec == NULL)
+- goto out;
+- if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
+- goto out;
+-
+- start = vma->vm_start;
+- for (i = 0; i < po->pg_vec_len; i++) {
+- struct page *page = virt_to_page(po->pg_vec[i]);
+- int pg_num;
+-
+- for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
+- err = vm_insert_page(vma, start, page);
+- if (unlikely(err))
+- goto out;
+- start += PAGE_SIZE;
+- }
+- }
+- atomic_inc(&po->mapped);
+- vma->vm_ops = &packet_mmap_ops;
+- err = 0;
+-
+-out:
+- release_sock(sk);
+- return err;
+-}
+-#endif
+-
+-
+-static const struct proto_ops packet_ops_spkt = {
+- .family = PF_PACKET,
+- .owner = THIS_MODULE,
+- .release = packet_release,
+- .bind = packet_bind_spkt,
+- .connect = sock_no_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = packet_getname_spkt,
+- .poll = datagram_poll,
+- .ioctl = packet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = sock_no_shutdown,
+- .setsockopt = sock_no_setsockopt,
+- .getsockopt = sock_no_getsockopt,
+- .sendmsg = packet_sendmsg_spkt,
+- .recvmsg = packet_recvmsg,
+- .mmap = sock_no_mmap,
+- .sendpage = sock_no_sendpage,
+-};
+-
+-static const struct proto_ops packet_ops = {
+- .family = PF_PACKET,
+- .owner = THIS_MODULE,
+- .release = packet_release,
+- .bind = packet_bind,
+- .connect = sock_no_connect,
+- .socketpair = sock_no_socketpair,
+- .accept = sock_no_accept,
+- .getname = packet_getname,
+- .poll = packet_poll,
+- .ioctl = packet_ioctl,
+- .listen = sock_no_listen,
+- .shutdown = sock_no_shutdown,
+- .setsockopt = packet_setsockopt,
+- .getsockopt = packet_getsockopt,
+- .sendmsg = packet_sendmsg,
+- .recvmsg = packet_recvmsg,
+- .mmap = packet_mmap,
+- .sendpage = sock_no_sendpage,
+-};
+-
+-static struct net_proto_family packet_family_ops = {
+- .family = PF_PACKET,
+- .create = packet_create,
+- .owner = THIS_MODULE,
+-};
+-
+-static struct notifier_block packet_netdev_notifier = {
+- .notifier_call =packet_notifier,
+-};
+-
+-#ifdef CONFIG_PROC_FS
+-static inline struct sock *packet_seq_idx(loff_t off)
+-{
+- struct sock *s;
+- struct hlist_node *node;
+-
+- sk_for_each(s, node, &packet_sklist) {
+- if (!off--)
+- return s;
+- }
+- return NULL;
+-}
+-
+-static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
+-{
+- read_lock(&packet_sklist_lock);
+- return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+-}
+-
+-static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+-{
+- ++*pos;
+- return (v == SEQ_START_TOKEN)
+- ? sk_head(&packet_sklist)
+- : sk_next((struct sock*)v) ;
+-}
+-
+-static void packet_seq_stop(struct seq_file *seq, void *v)
+-{
+- read_unlock(&packet_sklist_lock);
+-}
+-
+-static int packet_seq_show(struct seq_file *seq, void *v)
+-{
+- if (v == SEQ_START_TOKEN)
+- seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
+- else {
+- struct sock *s = v;
+- const struct packet_sock *po = pkt_sk(s);
+-
+- seq_printf(seq,
+- "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
+- s,
+- atomic_read(&s->sk_refcnt),
+- s->sk_type,
+- ntohs(po->num),
+- po->ifindex,
+- po->running,
+- atomic_read(&s->sk_rmem_alloc),
+- sock_i_uid(s),
+- sock_i_ino(s) );
+- }
+-
+- return 0;
+-}
+-
+-static struct seq_operations packet_seq_ops = {
+- .start = packet_seq_start,
+- .next = packet_seq_next,
+- .stop = packet_seq_stop,
+- .show = packet_seq_show,
+-};
+-
+-static int packet_seq_open(struct inode *inode, struct file *file)
+-{
+- return seq_open(file, &packet_seq_ops);
+-}
+-
+-static const struct file_operations packet_seq_fops = {
+- .owner = THIS_MODULE,
+- .open = packet_seq_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = seq_release,
+-};
+-
+-#endif
+-
+-static void __exit packet_exit(void)
+-{
+- proc_net_remove("packet");
+- unregister_netdevice_notifier(&packet_netdev_notifier);
+- sock_unregister(PF_PACKET);
+- proto_unregister(&packet_proto);
+-}
+-
+-static int __init packet_init(void)
+-{
+- int rc = proto_register(&packet_proto, 0);
+-
+- if (rc != 0)
+- goto out;
+-
+- sock_register(&packet_family_ops);
+- register_netdevice_notifier(&packet_netdev_notifier);
+- proc_net_fops_create("packet", 0, &packet_seq_fops);
+-out:
+- return rc;
+-}
+-
+-module_init(packet_init);
+-module_exit(packet_exit);
+-MODULE_LICENSE("GPL");
+-MODULE_ALIAS_NETPROTO(PF_PACKET);
+diff -Nurb linux-2.6.22-594/net/socket.c linux-2.6.22-595/net/socket.c
+--- linux-2.6.22-594/net/socket.c 2008-03-20 00:05:19.000000000 -0400
++++ linux-2.6.22-595/net/socket.c 2008-03-20 00:14:03.000000000 -0400
+@@ -1122,12 +1122,17 @@
+ if (type < 0 || type >= SOCK_MAX)
+ return -EINVAL;
+
++ /*
++ * Hack no. 2 - Sapan
++ * Clean this up later
++ *
+ if (!nx_check(0, VS_ADMIN)) {
+ if (family == PF_INET && !current_nx_info_has_v4())
+ return -EAFNOSUPPORT;
+ if (family == PF_INET6 && !current_nx_info_has_v6())
+ return -EAFNOSUPPORT;
+ }
++ */
+
+ /* Compatibility.
+
+diff -Nurb linux-2.6.22-594/net/socket.c.orig linux-2.6.22-595/net/socket.c.orig
+--- linux-2.6.22-594/net/socket.c.orig 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.22-595/net/socket.c.orig 2008-03-20 00:05:19.000000000 -0400
+@@ -0,0 +1,2400 @@
++/*
++ * NET An implementation of the SOCKET network access protocol.
++ *
++ * Version: @(#)socket.c 1.1.93 18/02/95
++ *
++ * Authors: Orest Zborowski, <obz@Kodak.COM>
++ * Ross Biro
++ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
++ *
++ * Fixes:
++ * Anonymous : NOTSOCK/BADF cleanup. Error fix in
++ * shutdown()
++ * Alan Cox : verify_area() fixes
++ * Alan Cox : Removed DDI
++ * Jonathan Kamens : SOCK_DGRAM reconnect bug
++ * Alan Cox : Moved a load of checks to the very
++ * top level.
++ * Alan Cox : Move address structures to/from user
++ * mode above the protocol layers.
++ * Rob Janssen : Allow 0 length sends.
++ * Alan Cox : Asynchronous I/O support (cribbed from the
++ * tty drivers).
++ * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style)
++ * Jeff Uphoff : Made max number of sockets command-line
++ * configurable.
++ * Matti Aarnio : Made the number of sockets dynamic,
++ * to be allocated when needed, and mr.
++ * Uphoff's max is used as max to be
++ * allowed to allocate.
++ * Linus : Argh. removed all the socket allocation
++ * altogether: it's in the inode now.
++ * Alan Cox : Made sock_alloc()/sock_release() public
++ * for NetROM and future kernel nfsd type
++ * stuff.
++ * Alan Cox : sendmsg/recvmsg basics.
++ * Tom Dyas : Export net symbols.
++ * Marcin Dalecki : Fixed problems with CONFIG_NET="n".
++ * Alan Cox : Added thread locking to sys_* calls
++ * for sockets. May have errors at the
++ * moment.
++ * Kevin Buhr : Fixed the dumb errors in the above.
++ * Andi Kleen : Some small cleanups, optimizations,
++ * and fixed a copy_from_user() bug.
++ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0)
++ * Tigran Aivazian : Made listen(2) backlog sanity checks
++ * protocol-independent
++ *
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ *
++ * This module is effectively the top level interface to the BSD socket
++ * paradigm.
++ *
++ * Based upon Swansea University Computer Society NET3.039
++ */
++
++#include <linux/mm.h>
++#include <linux/socket.h>
++#include <linux/file.h>
++#include <linux/net.h>
++#include <linux/interrupt.h>
++#include <linux/rcupdate.h>
++#include <linux/netdevice.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/mutex.h>
++#include <linux/wanrouter.h>
++#include <linux/if_bridge.h>
++#include <linux/if_frad.h>
++#include <linux/if_vlan.h>
++#include <linux/init.h>
++#include <linux/poll.h>
++#include <linux/cache.h>
++#include <linux/module.h>
++#include <linux/highmem.h>
++#include <linux/mount.h>
++#include <linux/security.h>
++#include <linux/syscalls.h>
++#include <linux/compat.h>
++#include <linux/kmod.h>
++#include <linux/audit.h>
++#include <linux/wireless.h>
++#include <linux/nsproxy.h>
++
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++
++#include <net/compat.h>
++
++#include <net/sock.h>
++#include <linux/netfilter.h>
++#include <linux/vs_base.h>
++#include <linux/vs_socket.h>
++#include <linux/vs_inet.h>
++#include <linux/vs_inet6.h>
++
++static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos);
++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos);
++static int sock_mmap(struct file *file, struct vm_area_struct *vma);
++
++static int sock_close(struct inode *inode, struct file *file);
++static unsigned int sock_poll(struct file *file,
++ struct poll_table_struct *wait);
++static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
++#ifdef CONFIG_COMPAT
++static long compat_sock_ioctl(struct file *file,
++ unsigned int cmd, unsigned long arg);
++#endif
++static int sock_fasync(int fd, struct file *filp, int on);
++static ssize_t sock_sendpage(struct file *file, struct page *page,
++ int offset, size_t size, loff_t *ppos, int more);
++
++/*
++ * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
++ * in the operation structures but are done directly via the socketcall() multiplexor.
++ */
++
++static const struct file_operations socket_file_ops = {
++ .owner = THIS_MODULE,
++ .llseek = no_llseek,
++ .aio_read = sock_aio_read,
++ .aio_write = sock_aio_write,
++ .poll = sock_poll,
++ .unlocked_ioctl = sock_ioctl,
++#ifdef CONFIG_COMPAT
++ .compat_ioctl = compat_sock_ioctl,
++#endif
++ .mmap = sock_mmap,
++ .open = sock_no_open, /* special open code to disallow open via /proc */
++ .release = sock_close,
++ .fasync = sock_fasync,
++ .sendpage = sock_sendpage,
++ .splice_write = generic_splice_sendpage,
++};
++
++/*
++ * The protocol list. Each protocol is registered in here.
++ */
++
++static DEFINE_SPINLOCK(net_family_lock);
++static const struct net_proto_family *net_families[NPROTO] __read_mostly;
++
++/*
++ * Statistics counters of the socket lists
++ */
++
++static DEFINE_PER_CPU(int, sockets_in_use) = 0;
++
++/*
++ * Support routines.
++ * Move socket addresses back and forth across the kernel/user
++ * divide and look after the messy bits.
++ */
++
++#define MAX_SOCK_ADDR 128 /* 108 for Unix domain -
++ 16 for IP, 16 for IPX,
++ 24 for IPv6,
++ about 80 for AX.25
++ must be at least one bigger than
++ the AF_UNIX size (see net/unix/af_unix.c
++ :unix_mkname()).
++ */
++
++/**
++ * move_addr_to_kernel - copy a socket address into kernel space
++ * @uaddr: Address in user space
++ * @kaddr: Address in kernel space
++ * @ulen: Length in user space
++ *
++ * The address is copied into kernel space. If the provided address is
++ * too long an error code of -EINVAL is returned. If the copy gives
++ * invalid addresses -EFAULT is returned. On a success 0 is returned.
++ */
++
++int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
++{
++ if (ulen < 0 || ulen > MAX_SOCK_ADDR)
++ return -EINVAL;
++ if (ulen == 0)
++ return 0;
++ if (copy_from_user(kaddr, uaddr, ulen))
++ return -EFAULT;
++ return audit_sockaddr(ulen, kaddr);
++}
++
++/**
++ * move_addr_to_user - copy an address to user space
++ * @kaddr: kernel space address
++ * @klen: length of address in kernel
++ * @uaddr: user space address
++ * @ulen: pointer to user length field
++ *
++ * The value pointed to by ulen on entry is the buffer length available.
++ * This is overwritten with the buffer space used. -EINVAL is returned
++ * if an overlong buffer is specified or a negative buffer size. -EFAULT
++ * is returned if either the buffer or the length field are not
++ * accessible.
++ * After copying the data up to the limit the user specifies, the true
++ * length of the data is written over the length limit the user
++ * specified. Zero is returned for a success.
++ */
++
++int move_addr_to_user(void *kaddr, int klen, void __user *uaddr,
++ int __user *ulen)
++{
++ int err;
++ int len;
++
++ err = get_user(len, ulen);
++ if (err)
++ return err;
++ if (len > klen)
++ len = klen;
++ if (len < 0 || len > MAX_SOCK_ADDR)
++ return -EINVAL;
++ if (len) {
++ if (audit_sockaddr(klen, kaddr))
++ return -ENOMEM;
++ if (copy_to_user(uaddr, kaddr, len))
++ return -EFAULT;
++ }
++ /*
++ * "fromlen shall refer to the value before truncation.."
++ * 1003.1g
++ */
++ return __put_user(klen, ulen);
++}
++
++#define SOCKFS_MAGIC 0x534F434B
++
++static struct kmem_cache *sock_inode_cachep __read_mostly;
++
++static struct inode *sock_alloc_inode(struct super_block *sb)
++{
++ struct socket_alloc *ei;
++
++ ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
++ if (!ei)
++ return NULL;
++ init_waitqueue_head(&ei->socket.wait);
++
++ ei->socket.fasync_list = NULL;
++ ei->socket.state = SS_UNCONNECTED;
++ ei->socket.flags = 0;
++ ei->socket.ops = NULL;
++ ei->socket.sk = NULL;
++ ei->socket.file = NULL;
++
++ return &ei->vfs_inode;
++}
++
++static void sock_destroy_inode(struct inode *inode)
++{
++ kmem_cache_free(sock_inode_cachep,
++ container_of(inode, struct socket_alloc, vfs_inode));
++}
++
++static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
++{
++ struct socket_alloc *ei = (struct socket_alloc *)foo;
++
++ inode_init_once(&ei->vfs_inode);
++}
++
++static int init_inodecache(void)
++{
++ sock_inode_cachep = kmem_cache_create("sock_inode_cache",
++ sizeof(struct socket_alloc),
++ 0,
++ (SLAB_HWCACHE_ALIGN |
++ SLAB_RECLAIM_ACCOUNT |
++ SLAB_MEM_SPREAD),
++ init_once,
++ NULL);
++ if (sock_inode_cachep == NULL)
++ return -ENOMEM;
++ return 0;
++}
++
++static struct super_operations sockfs_ops = {
++ .alloc_inode = sock_alloc_inode,
++ .destroy_inode =sock_destroy_inode,
++ .statfs = simple_statfs,
++};
++
/*
 * Mount callback for sockfs: a pseudo filesystem with a single anonymous
 * superblock, never user-mountable, used only to give sockets inodes.
 */
static int sockfs_get_sb(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data,
			 struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
			     mnt);
}
++
++static struct vfsmount *sock_mnt __read_mostly;
++
/* The sockfs filesystem type; superblocks are anonymous (kill_anon_super). */
static struct file_system_type sock_fs_type = {
	.name = "sockfs",
	.get_sb = sockfs_get_sb,
	.kill_sb = kill_anon_super,
};
++
/*
 * d_delete hook for sockfs dentries.  Returns 0 so the dentry is not
 * retained in the dcache after the last reference is dropped.
 */
static int sockfs_delete_dentry(struct dentry *dentry)
{
	/*
	 * At creation time, we pretended this dentry was hashed
	 * (by clearing DCACHE_UNHASHED bit in d_flags)
	 * At delete time, we restore the truth : not hashed.
	 * (so that dput() can proceed correctly)
	 */
	dentry->d_flags |= DCACHE_UNHASHED;
	return 0;
}
++
++/*
++ * sockfs_dname() is called from d_path().
++ */
++static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
++{
++ return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
++ dentry->d_inode->i_ino);
++}
++
/* Dentry operations for sockfs: immediate deletion, synthetic d_path name. */
static struct dentry_operations sockfs_dentry_operations = {
	.d_delete = sockfs_delete_dentry,
	.d_dname = sockfs_dname,
};
++
++/*
++ * Obtains the first available file descriptor and sets it up for use.
++ *
++ * These functions create file structures and maps them to fd space
++ * of the current process. On success it returns file descriptor
++ * and file struct implicitly stored in sock->file.
++ * Note that another thread may close file descriptor before we return
++ * from this function. We use the fact that now we do not refer
++ * to socket after mapping. If one day we will need it, this
++ * function will increment ref. count on file by 1.
++ *
++ * In any case returned fd MAY BE not valid!
++ * This race condition is unavoidable
++ * with shared fd spaces, we cannot solve it inside kernel,
++ * but we take care of internal coherence yet.
++ */
++
++static int sock_alloc_fd(struct file **filep)
++{
++ int fd;
++
++ fd = get_unused_fd();
++ if (likely(fd >= 0)) {
++ struct file *file = get_empty_filp();
++
++ *filep = file;
++ if (unlikely(!file)) {
++ put_unused_fd(fd);
++ return -ENFILE;
++ }
++ } else
++ *filep = NULL;
++ return fd;
++}
++
++static int sock_attach_fd(struct socket *sock, struct file *file)
++{
++ struct qstr name = { .name = "" };
++
++ file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
++ if (unlikely(!file->f_path.dentry))
++ return -ENOMEM;
++
++ file->f_path.dentry->d_op = &sockfs_dentry_operations;
++ /*
++ * We dont want to push this dentry into global dentry hash table.
++ * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
++ * This permits a working /proc/$pid/fd/XXX on sockets
++ */
++ file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED;
++ d_instantiate(file->f_path.dentry, SOCK_INODE(sock));
++ file->f_path.mnt = mntget(sock_mnt);
++ file->f_mapping = file->f_path.dentry->d_inode->i_mapping;
++
++ sock->file = file;
++ file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
++ file->f_mode = FMODE_READ | FMODE_WRITE;
++ file->f_flags = O_RDWR;
++ file->f_pos = 0;
++ file->private_data = sock;
++
++ return 0;
++}
++
/*
 * Map a socket into the caller's fd space.  Returns the new descriptor,
 * or a negative errno; on success the file is also stored in sock->file.
 */
int sock_map_fd(struct socket *sock)
{
	struct file *newfile;
	int err;
	int fd = sock_alloc_fd(&newfile);

	if (fd < 0)
		return fd;

	err = sock_attach_fd(sock, newfile);
	if (err < 0) {
		/* Undo both reservations; the socket itself stays alive. */
		put_filp(newfile);
		put_unused_fd(fd);
		return err;
	}

	fd_install(fd, newfile);
	return fd;
}
++
++static struct socket *sock_from_file(struct file *file, int *err)
++{
++ if (file->f_op == &socket_file_ops)
++ return file->private_data; /* set in sock_map_fd */
++
++ *err = -ENOTSOCK;
++ return NULL;
++}
++
++/**
++ * sockfd_lookup - Go from a file number to its socket slot
++ * @fd: file handle
++ * @err: pointer to an error code return
++ *
++ * The file handle passed in is locked and the socket it is bound
++ * too is returned. If an error occurs the err pointer is overwritten
++ * with a negative errno code and NULL is returned. The function checks
++ * for both invalid handles and passing a handle which is not a socket.
++ *
++ * On a success the socket object pointer is returned.
++ */
++
++struct socket *sockfd_lookup(int fd, int *err)
++{
++ struct file *file;
++ struct socket *sock;
++
++ file = fget(fd);
++ if (!file) {
++ *err = -EBADF;
++ return NULL;
++ }
++
++ sock = sock_from_file(file, err);
++ if (!sock)
++ fput(file);
++ return sock;
++}
++
++static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
++{
++ struct file *file;
++ struct socket *sock;
++
++ *err = -EBADF;
++ file = fget_light(fd, fput_needed);
++ if (file) {
++ sock = sock_from_file(file, err);
++ if (sock)
++ return sock;
++ fput_light(file, *fput_needed);
++ }
++ return NULL;
++}
++
/**
 *	sock_alloc	-	allocate a socket
 *
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
 *	NULL is returned.
 */
static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/* new_inode() calls back into sock_alloc_inode() via sockfs_ops. */
	inode = new_inode(sock_mnt->mnt_sb);
	if (!inode)
		return NULL;

	/* The socket is embedded alongside the inode in socket_alloc. */
	sock = SOCKET_I(inode);

	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	/* Per-CPU bookkeeping of live sockets (summed for /proc). */
	get_cpu_var(sockets_in_use)++;
	put_cpu_var(sockets_in_use);
	return sock;
}
++
/*
 * In theory you can't get an open on this inode, but /proc provides
 * a back door. Remember to keep it shut otherwise you'll let the
 * creepy crawlies in.
 */
static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
	return -ENXIO;
}
++
/* File operations rejecting any attempt to re-open a socket inode. */
const struct file_operations bad_sock_fops = {
	.owner = THIS_MODULE,
	.open = sock_no_open,
};
++
/**
 *	sock_release	-	close a socket
 *	@sock: socket to close
 *
 *	The socket is released from the protocol stack if it has a release
 *	callback, and the inode is then released if the socket is bound to
 *	an inode not a file.
 */
void sock_release(struct socket *sock)
{
	if (sock->ops) {
		/* Save the owner: ->release() clears our handle on it. */
		struct module *owner = sock->ops->owner;

		sock->ops->release(sock);
		sock->ops = NULL;
		/* Drop the refcount taken at socket creation time. */
		module_put(owner);
	}

	if (sock->fasync_list)
		printk(KERN_ERR "sock_release: fasync list not empty!\n");

	get_cpu_var(sockets_in_use)--;
	put_cpu_var(sockets_in_use);
	/* Without an attached file we own the inode and must drop it here;
	 * otherwise the final fput() of sock->file releases it. */
	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
	sock->file = NULL;
}
++
/*
 * Core sendmsg path: fill in the per-request sock_iocb, run the LSM hook,
 * hand off to the protocol, and account the result to the vserver context.
 * Returns bytes sent or a negative errno (possibly -EIOCBQUEUED for AIO).
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	struct sock_iocb *si = kiocb_to_siocb(iocb);
	int err, len;

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;

	err = security_socket_sendmsg(sock, msg, size);
	if (err)
		return err;

	len = sock->ops->sendmsg(iocb, sock, msg, size);
	/* vserver accounting: a full send counts as success, anything
	 * else (short send or error) is recorded as a failure. */
	if (sock->sk) {
		if (len == size)
			vx_sock_send(sock->sk, size);
		else
			vx_sock_fail(sock->sk, size);
	}
	vxdprintk(VXD_CBIT(net, 7),
		"__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%d",
		sock, sock->sk,
		(sock->sk)?sock->sk->sk_nx_info:0,
		(sock->sk)?sock->sk->sk_vx_info:0,
		(sock->sk)?sock->sk->sk_xid:0,
		(sock->sk)?sock->sk->sk_nid:0,
		(unsigned int)size, len);
	return len;
}
++
++int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
++{
++ struct kiocb iocb;
++ struct sock_iocb siocb;
++ int ret;
++
++ init_sync_kiocb(&iocb, NULL);
++ iocb.private = &siocb;
++ ret = __sock_sendmsg(&iocb, sock, msg, size);
++ if (-EIOCBQUEUED == ret)
++ ret = wait_on_sync_kiocb(&iocb);
++ return ret;
++}
++
/*
 * Send a message from kernel space: temporarily widen the address limit
 * so the kvec (kernel buffers) passes the user-copy checks, then restore it.
 */
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size)
{
	mm_segment_t oldfs = get_fs();
	int result;

	set_fs(KERNEL_DS);
	/*
	 * the following is safe, since for compiler definitions of kvec and
	 * iovec are identical, yielding the same in-core layout and alignment
	 */
	msg->msg_iov = (struct iovec *)vec;
	msg->msg_iovlen = num;
	result = sock_sendmsg(sock, msg, size);
	set_fs(oldfs);
	return result;
}
++
++/*
++ * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
++ */
++void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
++ struct sk_buff *skb)
++{
++ ktime_t kt = skb->tstamp;
++
++ if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
++ struct timeval tv;
++ /* Race occurred between timestamp enabling and packet
++ receiving. Fill in the current time for now. */
++ if (kt.tv64 == 0)
++ kt = ktime_get_real();
++ skb->tstamp = kt;
++ tv = ktime_to_timeval(kt);
++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv);
++ } else {
++ struct timespec ts;
++ /* Race occurred between timestamp enabling and packet
++ receiving. Fill in the current time for now. */
++ if (kt.tv64 == 0)
++ kt = ktime_get_real();
++ skb->tstamp = kt;
++ ts = ktime_to_timespec(kt);
++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts);
++ }
++}
++
++EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
++
/*
 * Core recvmsg path: fill in the per-request sock_iocb, run the LSM hook,
 * hand off to the protocol, and account received bytes to the vserver
 * context.  Returns bytes received or a negative errno.
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size, int flags)
{
	int err, len;
	struct sock_iocb *si = kiocb_to_siocb(iocb);

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;
	si->flags = flags;

	err = security_socket_recvmsg(sock, msg, size, flags);
	if (err)
		return err;

	len = sock->ops->recvmsg(iocb, sock, msg, size, flags);
	/* Only successful receives are accounted. */
	if ((len >= 0) && sock->sk)
		vx_sock_recv(sock->sk, len);
	vxdprintk(VXD_CBIT(net, 7),
		"__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d",
		sock, sock->sk,
		(sock->sk)?sock->sk->sk_nx_info:0,
		(sock->sk)?sock->sk->sk_vx_info:0,
		(sock->sk)?sock->sk->sk_xid:0,
		(sock->sk)?sock->sk->sk_nid:0,
		(unsigned int)size, len);
	return len;
}
++
++int sock_recvmsg(struct socket *sock, struct msghdr *msg,
++ size_t size, int flags)
++{
++ struct kiocb iocb;
++ struct sock_iocb siocb;
++ int ret;
++
++ init_sync_kiocb(&iocb, NULL);
++ iocb.private = &siocb;
++ ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
++ if (-EIOCBQUEUED == ret)
++ ret = wait_on_sync_kiocb(&iocb);
++ return ret;
++}
++
++int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
++ struct kvec *vec, size_t num, size_t size, int flags)
++{
++ mm_segment_t oldfs = get_fs();
++ int result;
++
++ set_fs(KERNEL_DS);
++ /*
++ * the following is safe, since for compiler definitions of kvec and
++ * iovec are identical, yielding the same in-core layout and alignment
++ */
++ msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
++ result = sock_recvmsg(sock, msg, size, flags);
++ set_fs(oldfs);
++ return result;
++}
++
/* kiocb destructor: free the heap-allocated sock_iocb for async requests. */
static void sock_aio_dtor(struct kiocb *iocb)
{
	kfree(iocb->private);
}
++
++static ssize_t sock_sendpage(struct file *file, struct page *page,
++ int offset, size_t size, loff_t *ppos, int more)
++{
++ struct socket *sock;
++ int flags;
++
++ sock = file->private_data;
++
++ flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
++ if (more)
++ flags |= MSG_MORE;
++
++ return sock->ops->sendpage(sock, page, offset, size, flags);
++}
++
/*
 * Pick the sock_iocb for a request: the caller's stack copy for
 * synchronous kiocbs, a kmalloc'd one (freed by sock_aio_dtor) for
 * asynchronous ones.  Returns NULL on allocation failure.
 */
static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
					 struct sock_iocb *siocb)
{
	if (!is_sync_kiocb(iocb)) {
		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
		if (!siocb)
			return NULL;
		iocb->ki_dtor = sock_aio_dtor;
	}

	siocb->kiocb = iocb;
	iocb->private = siocb;
	return siocb;
}
++
++static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
++ struct file *file, const struct iovec *iov,
++ unsigned long nr_segs)
++{
++ struct socket *sock = file->private_data;
++ size_t size = 0;
++ int i;
++
++ for (i = 0; i < nr_segs; i++)
++ size += iov[i].iov_len;
++
++ msg->msg_name = NULL;
++ msg->msg_namelen = 0;
++ msg->msg_control = NULL;
++ msg->msg_controllen = 0;
++ msg->msg_iov = (struct iovec *)iov;
++ msg->msg_iovlen = nr_segs;
++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
++
++ return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
++}
++
++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct sock_iocb siocb, *x;
++
++ if (pos != 0)
++ return -ESPIPE;
++
++ if (iocb->ki_left == 0) /* Match SYS5 behaviour */
++ return 0;
++
++
++ x = alloc_sock_iocb(iocb, &siocb);
++ if (!x)
++ return -ENOMEM;
++ return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
++}
++
++static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
++ struct file *file, const struct iovec *iov,
++ unsigned long nr_segs)
++{
++ struct socket *sock = file->private_data;
++ size_t size = 0;
++ int i;
++
++ for (i = 0; i < nr_segs; i++)
++ size += iov[i].iov_len;
++
++ msg->msg_name = NULL;
++ msg->msg_namelen = 0;
++ msg->msg_control = NULL;
++ msg->msg_controllen = 0;
++ msg->msg_iov = (struct iovec *)iov;
++ msg->msg_iovlen = nr_segs;
++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
++ if (sock->type == SOCK_SEQPACKET)
++ msg->msg_flags |= MSG_EOR;
++
++ return __sock_sendmsg(iocb, sock, msg, size);
++}
++
++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct sock_iocb siocb, *x;
++
++ if (pos != 0)
++ return -ESPIPE;
++
++ x = alloc_sock_iocb(iocb, &siocb);
++ if (!x)
++ return -ENOMEM;
++
++ return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
++}
++
/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

static DEFINE_MUTEX(br_ioctl_mutex);
/* Installed by the bridge module; NULL while it is not loaded. */
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL;

/* Register (or clear, with NULL) the bridge ioctl handler. */
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
	mutex_lock(&br_ioctl_mutex);
	br_ioctl_hook = hook;
	mutex_unlock(&br_ioctl_mutex);
}

EXPORT_SYMBOL(brioctl_set);
++
static DEFINE_MUTEX(vlan_ioctl_mutex);
/* Installed by the 8021q module; NULL while it is not loaded. */
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/* Register (or clear, with NULL) the VLAN ioctl handler. */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
	mutex_lock(&vlan_ioctl_mutex);
	vlan_ioctl_hook = hook;
	mutex_unlock(&vlan_ioctl_mutex);
}

EXPORT_SYMBOL(vlan_ioctl_set);
++
static DEFINE_MUTEX(dlci_ioctl_mutex);
/* Installed by the dlci module; NULL while it is not loaded. */
static int (*dlci_ioctl_hook) (unsigned int, void __user *);

/* Register (or clear, with NULL) the DLCI ioctl handler. */
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
{
	mutex_lock(&dlci_ioctl_mutex);
	dlci_ioctl_hook = hook;
	mutex_unlock(&dlci_ioctl_mutex);
}

EXPORT_SYMBOL(dlci_ioctl_set);
++
/*
 * With an ioctl, arg may well be a user mode pointer, but we don't know
 * what to do with it - that's up to the protocol still.
 */
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct socket *sock;
	struct sock *sk;
	void __user *argp = (void __user *)arg;
	int pid, err;
	struct net *net;

	sock = file->private_data;
	sk = sock->sk;
	net = sk->sk_net;
	/* Device-private ioctls go straight to the network device layer. */
	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
		err = dev_ioctl(net, cmd, argp);
	} else
#ifdef CONFIG_WIRELESS_EXT
	/* So do wireless-extension ioctls. */
	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
		err = dev_ioctl(net, cmd, argp);
	} else
#endif				/* CONFIG_WIRELESS_EXT */
		switch (cmd) {
		case FIOSETOWN:
		case SIOCSPGRP:
			err = -EFAULT;
			if (get_user(pid, (int __user *)argp))
				break;
			err = f_setown(sock->file, pid, 1);
			break;
		case FIOGETOWN:
		case SIOCGPGRP:
			err = put_user(f_getown(sock->file),
				       (int __user *)argp);
			break;
		case SIOCGIFBR:
		case SIOCSIFBR:
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			err = -ENOPKG;
			if (!br_ioctl_hook)
				request_module("bridge");

			/* The mutex keeps the hook stable against unload
			 * while we call it. */
			mutex_lock(&br_ioctl_mutex);
			if (br_ioctl_hook)
				err = br_ioctl_hook(net, cmd, argp);
			mutex_unlock(&br_ioctl_mutex);
			break;
		case SIOCGIFVLAN:
		case SIOCSIFVLAN:
			err = -ENOPKG;
			if (!vlan_ioctl_hook)
				request_module("8021q");

			mutex_lock(&vlan_ioctl_mutex);
			if (vlan_ioctl_hook)
				err = vlan_ioctl_hook(net, argp);
			mutex_unlock(&vlan_ioctl_mutex);
			break;
		case SIOCADDDLCI:
		case SIOCDELDLCI:
			err = -ENOPKG;
			if (!dlci_ioctl_hook)
				request_module("dlci");

			if (dlci_ioctl_hook) {
				mutex_lock(&dlci_ioctl_mutex);
				err = dlci_ioctl_hook(cmd, argp);
				mutex_unlock(&dlci_ioctl_mutex);
			}
			break;
		default:
			/* Let the protocol try first... */
			err = sock->ops->ioctl(sock, cmd, arg);

			/*
			 * If this ioctl is unknown try to hand it down
			 * to the NIC driver.
			 */
			if (err == -ENOIOCTLCMD)
				err = dev_ioctl(net, cmd, argp);
			break;
		}
	return err;
}
++
++int sock_create_lite(int family, int type, int protocol, struct socket **res)
++{
++ int err;
++ struct socket *sock = NULL;
++
++ err = security_socket_create(family, type, protocol, 1);
++ if (err)
++ goto out;
++
++ sock = sock_alloc();
++ if (!sock) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ sock->type = type;
++ err = security_socket_post_create(sock, family, type, protocol, 1);
++ if (err)
++ goto out_release;
++
++out:
++ *res = sock;
++ return err;
++out_release:
++ sock_release(sock);
++ sock = NULL;
++ goto out;
++}
++
++/* No kernel lock held - perfect */
++static unsigned int sock_poll(struct file *file, poll_table *wait)
++{
++ struct socket *sock;
++
++ /*
++ * We can't return errors to poll, so it's either yes or no.
++ */
++ sock = file->private_data;
++ return sock->ops->poll(file, sock, wait);
++}
++
/* mmap on a socket file: delegated entirely to the protocol. */
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct socket *sock = file->private_data;

	return sock->ops->mmap(file, sock, vma);
}
++
/* release() for socket files: tear down fasync state and the socket. */
static int sock_close(struct inode *inode, struct file *filp)
{
	/*
	 * It was possible the inode is NULL we were
	 * closing an unfinished socket.
	 */

	if (!inode) {
		printk(KERN_DEBUG "sock_close: NULL inode\n");
		return 0;
	}
	/* fd -1 / on 0 removes this file from the fasync list. */
	sock_fasync(-1, filp, 0);
	sock_release(SOCKET_I(inode));
	return 0;
}
++
/*
 * Update the socket async list
 *
 * Fasync_list locking strategy.
 *
 * 1. fasync_list is modified only under process context socket lock
 * i.e. under semaphore.
 * 2. fasync_list is used under read_lock(&sk->sk_callback_lock)
 * or under socket lock.
 * 3. fasync_list can be used from softirq context, so that
 * modification under socket lock have to be enhanced with
 * write_lock_bh(&sk->sk_callback_lock).
 * --ANK (990710)
 */
static int sock_fasync(int fd, struct file *filp, int on)
{
	struct fasync_struct *fa, *fna = NULL, **prev;
	struct socket *sock;
	struct sock *sk;

	/* Allocate up front, before taking the socket lock. */
	if (on) {
		fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
		if (fna == NULL)
			return -ENOMEM;
	}

	sock = filp->private_data;

	sk = sock->sk;
	if (sk == NULL) {
		kfree(fna);
		return -EINVAL;
	}

	lock_sock(sk);

	prev = &(sock->fasync_list);

	/* Find an existing entry for this file, keeping its back link. */
	for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev)
		if (fa->fa_file == filp)
			break;

	if (on) {
		if (fa != NULL) {
			/* Already registered: just refresh the fd. */
			write_lock_bh(&sk->sk_callback_lock);
			fa->fa_fd = fd;
			write_unlock_bh(&sk->sk_callback_lock);

			kfree(fna);
			goto out;
		}
		fna->fa_file = filp;
		fna->fa_fd = fd;
		fna->magic = FASYNC_MAGIC;
		fna->fa_next = sock->fasync_list;
		/* Publish the new list head under the callback lock (rule 3). */
		write_lock_bh(&sk->sk_callback_lock);
		sock->fasync_list = fna;
		write_unlock_bh(&sk->sk_callback_lock);
	} else {
		if (fa != NULL) {
			/* Unlink and free the entry for this file. */
			write_lock_bh(&sk->sk_callback_lock);
			*prev = fa->fa_next;
			write_unlock_bh(&sk->sk_callback_lock);
			kfree(fa);
		}
	}

out:
	release_sock(sock->sk);
	return 0;
}
++
/* This function may be called only under socket lock or callback_lock */

/*
 * Signal waiting async readers/writers.  @how selects the event:
 *   0 - unconditional SIGIO, 1 - SIGIO unless a reader is waiting for
 *   data, 2 - SIGIO only if write space just became available,
 *   3 - SIGURG (out-of-band data).  Returns -1 if there is nothing to
 *   notify, 0 otherwise.
 */
int sock_wake_async(struct socket *sock, int how, int band)
{
	if (!sock || !sock->fasync_list)
		return -1;
	switch (how) {
	case 1:

		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
			break;
		goto call_kill;
	case 2:
		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
			break;
		/* fall through */
	case 0:
call_kill:
		__kill_fasync(sock->fasync_list, SIGIO, band);
		break;
	case 3:
		__kill_fasync(sock->fasync_list, SIGURG, band);
	}
	return 0;
}
++
/*
 * Common socket creation: validate family/type, enforce vserver network
 * context restrictions, allocate the socket, and let the protocol family
 * (loading its module on demand) initialise it.  @kern distinguishes
 * kernel-internal from user-requested sockets for the LSM hooks.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 * Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* vserver: guests may only create sockets for address families
	 * their network context actually provides. */
	if (!nx_check(0, VS_ADMIN)) {
		if (family == PF_INET && !current_nx_info_has_v4())
			return -EAFNOSUPPORT;
		if (family == PF_INET6 && !current_nx_info_has_v6())
			return -EAFNOSUPPORT;
	}

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 */
	sock = sock_alloc();
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

#if defined(CONFIG_KMOD)
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	/* Clear ops so sock_release() does not module_put() again. */
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}
++
/* Create a user socket in the caller's network namespace. */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
++
/* Create a kernel-internal socket, always in the initial net namespace. */
int sock_create_kern(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(&init_net, family, type, protocol, res, 1);
}
++
++asmlinkage long sys_socket(int family, int type, int protocol)
++{
++ int retval;
++ struct socket *sock;
++
++ retval = sock_create(family, type, protocol, &sock);
++ if (retval < 0)
++ goto out;
++
++ set_bit(SOCK_USER_SOCKET, &sock->flags);
++ retval = sock_map_fd(sock);
++ if (retval < 0)
++ goto out_release;
++
++out:
++ /* It may be already another descriptor 8) Not kernel problem. */
++ return retval;
++
++out_release:
++ sock_release(sock);
++ return retval;
++}
++
/*
 * Create a pair of connected sockets.
 */
asmlinkage long sys_socketpair(int family, int type, int protocol,
			       int __user *usockvec)
{
	struct socket *sock1, *sock2;
	int fd1, fd2, err;
	struct file *newfile1, *newfile2;

	/*
	 * Obtain the first socket and check if the underlying protocol
	 * supports the socketpair call.
	 */

	err = sock_create(family, type, protocol, &sock1);
	if (err < 0)
		goto out;
	set_bit(SOCK_USER_SOCKET, &sock1->flags);

	err = sock_create(family, type, protocol, &sock2);
	if (err < 0)
		goto out_release_1;
	set_bit(SOCK_USER_SOCKET, &sock2->flags);

	err = sock1->ops->socketpair(sock1, sock2);
	if (err < 0)
		goto out_release_both;

	fd1 = sock_alloc_fd(&newfile1);
	if (unlikely(fd1 < 0)) {
		err = fd1;
		goto out_release_both;
	}

	fd2 = sock_alloc_fd(&newfile2);
	if (unlikely(fd2 < 0)) {
		err = fd2;
		put_filp(newfile1);
		put_unused_fd(fd1);
		goto out_release_both;
	}

	err = sock_attach_fd(sock1, newfile1);
	if (unlikely(err < 0)) {
		goto out_fd2;
	}

	err = sock_attach_fd(sock2, newfile2);
	if (unlikely(err < 0)) {
		/* newfile1 now owns sock1; fput releases both. */
		fput(newfile1);
		goto out_fd1;
	}

	err = audit_fd_pair(fd1, fd2);
	if (err < 0) {
		/* Both files own their sockets now; fput releases them. */
		fput(newfile1);
		fput(newfile2);
		goto out_fd;
	}

	fd_install(fd1, newfile1);
	fd_install(fd2, newfile2);
	/* fd1 and fd2 may be already another descriptors.
	 * Not kernel problem.
	 */

	err = put_user(fd1, &usockvec[0]);
	if (!err)
		err = put_user(fd2, &usockvec[1]);
	if (!err)
		return 0;

	/* Couldn't report the fds to userspace: close them again. */
	sys_close(fd2);
	sys_close(fd1);
	return err;

out_release_both:
	sock_release(sock2);
out_release_1:
	sock_release(sock1);
out:
	return err;

/* NB: the three labels below deliberately fall through into each other,
 * unwinding progressively less state. */
out_fd2:
	put_filp(newfile1);
	sock_release(sock1);
out_fd1:
	put_filp(newfile2);
	sock_release(sock2);
out_fd:
	put_unused_fd(fd1);
	put_unused_fd(fd2);
	goto out;
}
++
++/*
++ * Bind a name to a socket. Nothing much to do here since it's
++ * the protocol's responsibility to handle the local address.
++ *
++ * We move the socket address to kernel space before we call
++ * the protocol layer (having also checked the address is ok).
++ */
++
++asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock) {
++ err = move_addr_to_kernel(umyaddr, addrlen, address);
++ if (err >= 0) {
++ err = security_socket_bind(sock,
++ (struct sockaddr *)address,
++ addrlen);
++ if (!err)
++ err = sock->ops->bind(sock,
++ (struct sockaddr *)
++ address, addrlen);
++ }
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Perform a listen. Basically, we allow the protocol to do anything
++ * necessary for a listen, and if that works, we mark the socket as
++ * ready for listening.
++ */
++
++asmlinkage long sys_listen(int fd, int backlog)
++{
++ struct socket *sock;
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock) {
++ struct net *net = sock->sk->sk_net;
++ if ((unsigned)backlog > net->sysctl_somaxconn)
++ backlog = net->sysctl_somaxconn;
++
++ err = security_socket_listen(sock, backlog);
++ if (!err)
++ err = sock->ops->listen(sock, backlog);
++
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
/*
 * For accept, we attempt to create a new socket, set up the link
 * with the client, wake up the client, then return the new
 * connected fd. We collect the address of the connector in kernel
 * space and move it to user at the very end. This is unclean because
 * we open the socket then return an error.
 *
 * 1003.1g adds the ability to recvmsg() to query connection pending
 * status to recvmsg. We need to add that support in a way thats
 * clean when we restucture accept also.
 */
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
			   int __user *upeer_addrlen)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	char address[MAX_SOCK_ADDR];

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	if (!(newsock = sock_alloc()))
		goto out_put;

	/* The new socket inherits type and ops from the listener. */
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

	newfd = sock_alloc_fd(&newfile);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}

	err = sock_attach_fd(newsock, newfile);
	if (err < 0)
		goto out_fd_simple;

	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;

	/* Blocks (unless O_NONBLOCK) until a connection is pending. */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	if (upeer_sockaddr) {
		/* getname mode 2 = peer address of the accepted connection. */
		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr,
					upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	fd_install(newfd, newfile);
	err = newfd;

	security_socket_post_accept(sock, newsock);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd_simple:
	/* attach failed: file never owned the socket, release separately */
	sock_release(newsock);
	put_filp(newfile);
	put_unused_fd(newfd);
	goto out_put;
out_fd:
	/* file owns the socket now: fput releases both */
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}
++
++/*
++ * Attempt to connect to a socket with the server address. The address
++ * is in user space so we verify it is OK and move it to kernel space.
++ *
++ * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
++ * break bindings
++ *
++ * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
++ * other SEQPACKET protocols that take time to connect() as it doesn't
++ * include the -EINPROGRESS status for such sockets.
++ */
++
++asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
++ int addrlen)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++ err = move_addr_to_kernel(uservaddr, addrlen, address);
++ if (err < 0)
++ goto out_put;
++
++ err =
++ security_socket_connect(sock, (struct sockaddr *)address, addrlen);
++ if (err)
++ goto out_put;
++
++ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
++ sock->file->f_flags);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Get the local address ('name') of a socket object. Move the obtained
++ * name to user space.
++ */
++
++asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr,
++ int __user *usockaddr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int len, err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ err = security_socket_getsockname(sock);
++ if (err)
++ goto out_put;
++
++ err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0);
++ if (err)
++ goto out_put;
++ err = move_addr_to_user(address, len, usockaddr, usockaddr_len);
++
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Get the remote address ('name') of a socket object. Move the obtained
++ * name to user space.
++ */
++
++asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr,
++ int __user *usockaddr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int len, err, fput_needed;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_getpeername(sock);
++ if (err) {
++ fput_light(sock->file, fput_needed);
++ return err;
++ }
++
++ err =
++ sock->ops->getname(sock, (struct sockaddr *)address, &len,
++ 1);
++ if (!err)
++ err = move_addr_to_user(address, len, usockaddr,
++ usockaddr_len);
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Send a datagram to a given address. We move the address into kernel
++ * space and check the user space data area is readable before invoking
++ * the protocol.
++ */
++
++asmlinkage long sys_sendto(int fd, void __user *buff, size_t len,
++ unsigned flags, struct sockaddr __user *addr,
++ int addr_len)
++{
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ int err;
++ struct msghdr msg;
++ struct iovec iov;
++ int fput_needed;
++ struct file *sock_file;
++
++ sock_file = fget_light(fd, &fput_needed);
++ err = -EBADF;
++ if (!sock_file)
++ goto out;
++
++ sock = sock_from_file(sock_file, &err);
++ if (!sock)
++ goto out_put;
++ iov.iov_base = buff;
++ iov.iov_len = len;
++ msg.msg_name = NULL;
++ msg.msg_iov = &iov;
++ msg.msg_iovlen = 1;
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_namelen = 0;
++ if (addr) {
++ err = move_addr_to_kernel(addr, addr_len, address);
++ if (err < 0)
++ goto out_put;
++ msg.msg_name = address;
++ msg.msg_namelen = addr_len;
++ }
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ msg.msg_flags = flags;
++ err = sock_sendmsg(sock, &msg, len);
++
++out_put:
++ fput_light(sock_file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Send a datagram down a socket.
++ */
++
++asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags)
++{
++ return sys_sendto(fd, buff, len, flags, NULL, 0);
++}
++
++/*
++ * Receive a frame from the socket and optionally record the address of the
++ * sender. We verify the buffers are writable and if needed move the
++ * sender address from kernel to user space.
++ */
++
++asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size,
++ unsigned flags, struct sockaddr __user *addr,
++ int __user *addr_len)
++{
++ struct socket *sock;
++ struct iovec iov;
++ struct msghdr msg;
++ char address[MAX_SOCK_ADDR];
++ int err, err2;
++ struct file *sock_file;
++ int fput_needed;
++
++ sock_file = fget_light(fd, &fput_needed);
++ err = -EBADF;
++ if (!sock_file)
++ goto out;
++
++ sock = sock_from_file(sock_file, &err);
++ if (!sock)
++ goto out_put;
++
++ msg.msg_control = NULL;
++ msg.msg_controllen = 0;
++ msg.msg_iovlen = 1;
++ msg.msg_iov = &iov;
++ iov.iov_len = size;
++ iov.iov_base = ubuf;
++ msg.msg_name = address;
++ msg.msg_namelen = MAX_SOCK_ADDR;
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ err = sock_recvmsg(sock, &msg, size, flags);
++
++ if (err >= 0 && addr != NULL) {
++ err2 = move_addr_to_user(address, msg.msg_namelen, addr, addr_len);
++ if (err2 < 0)
++ err = err2;
++ }
++out_put:
++ fput_light(sock_file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * Receive a datagram from a socket.
++ */
++
++asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
++ unsigned flags)
++{
++ return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
++}
++
++/*
++ * Set a socket option. Because we don't know the option lengths we have
++ * to pass the user mode parameter for the protocols to sort out.
++ */
++
++asmlinkage long sys_setsockopt(int fd, int level, int optname,
++ char __user *optval, int optlen)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ if (optlen < 0)
++ return -EINVAL;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_setsockopt(sock, level, optname);
++ if (err)
++ goto out_put;
++
++ if (level == SOL_SOCKET)
++ err =
++ sock_setsockopt(sock, level, optname, optval,
++ optlen);
++ else
++ err =
++ sock->ops->setsockopt(sock, level, optname, optval,
++ optlen);
++out_put:
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Get a socket option. Because we don't know the option lengths we have
++ * to pass a user mode parameter for the protocols to sort out.
++ */
++
++asmlinkage long sys_getsockopt(int fd, int level, int optname,
++ char __user *optval, int __user *optlen)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_getsockopt(sock, level, optname);
++ if (err)
++ goto out_put;
++
++ if (level == SOL_SOCKET)
++ err =
++ sock_getsockopt(sock, level, optname, optval,
++ optlen);
++ else
++ err =
++ sock->ops->getsockopt(sock, level, optname, optval,
++ optlen);
++out_put:
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/*
++ * Shutdown a socket.
++ */
++
++asmlinkage long sys_shutdown(int fd, int how)
++{
++ int err, fput_needed;
++ struct socket *sock;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (sock != NULL) {
++ err = security_socket_shutdown(sock, how);
++ if (!err)
++ err = sock->ops->shutdown(sock, how);
++ fput_light(sock->file, fput_needed);
++ }
++ return err;
++}
++
++/* A couple of helpful macros for getting the address of the 32/64 bit
++ * fields which are the same type (int / unsigned) on our platforms.
++ */
++#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
++#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen)
++#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags)
++
++/*
++ * BSD sendmsg interface
++ */
++
++asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
++{
++ struct compat_msghdr __user *msg_compat =
++ (struct compat_msghdr __user *)msg;
++ struct socket *sock;
++ char address[MAX_SOCK_ADDR];
++ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
++ unsigned char ctl[sizeof(struct cmsghdr) + 20]
++ __attribute__ ((aligned(sizeof(__kernel_size_t))));
++ /* 20 is size of ipv6_pktinfo */
++ unsigned char *ctl_buf = ctl;
++ struct msghdr msg_sys;
++ int err, ctl_len, iov_size, total_len;
++ int fput_needed;
++
++ err = -EFAULT;
++ if (MSG_CMSG_COMPAT & flags) {
++ if (get_compat_msghdr(&msg_sys, msg_compat))
++ return -EFAULT;
++ }
++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
++ return -EFAULT;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ /* do not move before msg_sys is valid */
++ err = -EMSGSIZE;
++ if (msg_sys.msg_iovlen > UIO_MAXIOV)
++ goto out_put;
++
++ /* Check whether to allocate the iovec area */
++ err = -ENOMEM;
++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
++ if (msg_sys.msg_iovlen > UIO_FASTIOV) {
++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
++ if (!iov)
++ goto out_put;
++ }
++
++ /* This will also move the address data into kernel space */
++ if (MSG_CMSG_COMPAT & flags) {
++ err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ);
++ } else
++ err = verify_iovec(&msg_sys, iov, address, VERIFY_READ);
++ if (err < 0)
++ goto out_freeiov;
++ total_len = err;
++
++ err = -ENOBUFS;
++
++ if (msg_sys.msg_controllen > INT_MAX)
++ goto out_freeiov;
++ ctl_len = msg_sys.msg_controllen;
++ if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
++ err =
++ cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl,
++ sizeof(ctl));
++ if (err)
++ goto out_freeiov;
++ ctl_buf = msg_sys.msg_control;
++ ctl_len = msg_sys.msg_controllen;
++ } else if (ctl_len) {
++ if (ctl_len > sizeof(ctl)) {
++ ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
++ if (ctl_buf == NULL)
++ goto out_freeiov;
++ }
++ err = -EFAULT;
++ /*
++ * Careful! Before this, msg_sys.msg_control contains a user pointer.
++ * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
++ * checking falls down on this.
++ */
++ if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
++ ctl_len))
++ goto out_freectl;
++ msg_sys.msg_control = ctl_buf;
++ }
++ msg_sys.msg_flags = flags;
++
++ if (sock->file->f_flags & O_NONBLOCK)
++ msg_sys.msg_flags |= MSG_DONTWAIT;
++ err = sock_sendmsg(sock, &msg_sys, total_len);
++
++out_freectl:
++ if (ctl_buf != ctl)
++ sock_kfree_s(sock->sk, ctl_buf, ctl_len);
++out_freeiov:
++ if (iov != iovstack)
++ sock_kfree_s(sock->sk, iov, iov_size);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++/*
++ * BSD recvmsg interface
++ */
++
++asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg,
++ unsigned int flags)
++{
++ struct compat_msghdr __user *msg_compat =
++ (struct compat_msghdr __user *)msg;
++ struct socket *sock;
++ struct iovec iovstack[UIO_FASTIOV];
++ struct iovec *iov = iovstack;
++ struct msghdr msg_sys;
++ unsigned long cmsg_ptr;
++ int err, iov_size, total_len, len;
++ int fput_needed;
++
++ /* kernel mode address */
++ char addr[MAX_SOCK_ADDR];
++
++ /* user mode address pointers */
++ struct sockaddr __user *uaddr;
++ int __user *uaddr_len;
++
++ if (MSG_CMSG_COMPAT & flags) {
++ if (get_compat_msghdr(&msg_sys, msg_compat))
++ return -EFAULT;
++ }
++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
++ return -EFAULT;
++
++ sock = sockfd_lookup_light(fd, &err, &fput_needed);
++ if (!sock)
++ goto out;
++
++ err = -EMSGSIZE;
++ if (msg_sys.msg_iovlen > UIO_MAXIOV)
++ goto out_put;
++
++ /* Check whether to allocate the iovec area */
++ err = -ENOMEM;
++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
++ if (msg_sys.msg_iovlen > UIO_FASTIOV) {
++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
++ if (!iov)
++ goto out_put;
++ }
++
++ /*
++ * Save the user-mode address (verify_iovec will change the
++ * kernel msghdr to use the kernel address space)
++ */
++
++ uaddr = (void __user *)msg_sys.msg_name;
++ uaddr_len = COMPAT_NAMELEN(msg);
++ if (MSG_CMSG_COMPAT & flags) {
++ err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
++ } else
++ err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
++ if (err < 0)
++ goto out_freeiov;
++ total_len = err;
++
++ cmsg_ptr = (unsigned long)msg_sys.msg_control;
++ msg_sys.msg_flags = 0;
++ if (MSG_CMSG_COMPAT & flags)
++ msg_sys.msg_flags = MSG_CMSG_COMPAT;
++
++ if (sock->file->f_flags & O_NONBLOCK)
++ flags |= MSG_DONTWAIT;
++ err = sock_recvmsg(sock, &msg_sys, total_len, flags);
++ if (err < 0)
++ goto out_freeiov;
++ len = err;
++
++ if (uaddr != NULL) {
++ err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr,
++ uaddr_len);
++ if (err < 0)
++ goto out_freeiov;
++ }
++ err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
++ COMPAT_FLAGS(msg));
++ if (err)
++ goto out_freeiov;
++ if (MSG_CMSG_COMPAT & flags)
++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
++ &msg_compat->msg_controllen);
++ else
++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
++ &msg->msg_controllen);
++ if (err)
++ goto out_freeiov;
++ err = len;
++
++out_freeiov:
++ if (iov != iovstack)
++ sock_kfree_s(sock->sk, iov, iov_size);
++out_put:
++ fput_light(sock->file, fput_needed);
++out:
++ return err;
++}
++
++#ifdef __ARCH_WANT_SYS_SOCKETCALL
++
++/* Argument list sizes for sys_socketcall */
++#define AL(x) ((x) * sizeof(unsigned long))
++static const unsigned char nargs[18]={
++ AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
++ AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
++ AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
++};
++
++#undef AL
++
++/*
++ * System call vectors.
++ *
++ * Argument checking cleaned up. Saved 20% in size.
++ * This function doesn't need to set the kernel lock because
++ * it is set by the callees.
++ */
++
++asmlinkage long sys_socketcall(int call, unsigned long __user *args)
++{
++ unsigned long a[6];
++ unsigned long a0, a1;
++ int err;
++
++ if (call < 1 || call > SYS_RECVMSG)
++ return -EINVAL;
++
++ /* copy_from_user should be SMP safe. */
++ if (copy_from_user(a, args, nargs[call]))
++ return -EFAULT;
++
++ err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
++ if (err)
++ return err;
++
++ a0 = a[0];
++ a1 = a[1];
++
++ switch (call) {
++ case SYS_SOCKET:
++ err = sys_socket(a0, a1, a[2]);
++ break;
++ case SYS_BIND:
++ err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
++ break;
++ case SYS_CONNECT:
++ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
++ break;
++ case SYS_LISTEN:
++ err = sys_listen(a0, a1);
++ break;
++ case SYS_ACCEPT:
++ err =
++ sys_accept(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_GETSOCKNAME:
++ err =
++ sys_getsockname(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_GETPEERNAME:
++ err =
++ sys_getpeername(a0, (struct sockaddr __user *)a1,
++ (int __user *)a[2]);
++ break;
++ case SYS_SOCKETPAIR:
++ err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
++ break;
++ case SYS_SEND:
++ err = sys_send(a0, (void __user *)a1, a[2], a[3]);
++ break;
++ case SYS_SENDTO:
++ err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
++ (struct sockaddr __user *)a[4], a[5]);
++ break;
++ case SYS_RECV:
++ err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
++ break;
++ case SYS_RECVFROM:
++ err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
++ (struct sockaddr __user *)a[4],
++ (int __user *)a[5]);
++ break;
++ case SYS_SHUTDOWN:
++ err = sys_shutdown(a0, a1);
++ break;
++ case SYS_SETSOCKOPT:
++ err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
++ break;
++ case SYS_GETSOCKOPT:
++ err =
++ sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
++ (int __user *)a[4]);
++ break;
++ case SYS_SENDMSG:
++ err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
++ break;
++ case SYS_RECVMSG:
++ err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
++ break;
++ default:
++ err = -EINVAL;
++ break;
++ }
++ return err;
++}
++
++#endif /* __ARCH_WANT_SYS_SOCKETCALL */
++
++/**
++ * sock_register - add a socket protocol handler
++ * @ops: description of protocol
++ *
++ * This function is called by a protocol handler that wants to
++ * advertise its address family, and have it linked into the
++ * socket interface. The value ops->family corresponds to the
++ * socket system call protocol family.
++ */
++int sock_register(const struct net_proto_family *ops)
++{
++ int err;
++
++ if (ops->family >= NPROTO) {
++ printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
++ NPROTO);
++ return -ENOBUFS;
++ }
++
++ spin_lock(&net_family_lock);
++ if (net_families[ops->family])
++ err = -EEXIST;
++ else {
++ net_families[ops->family] = ops;
++ err = 0;
++ }
++ spin_unlock(&net_family_lock);
++
++ printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
++ return err;
++}
++
++/**
++ * sock_unregister - remove a protocol handler
++ * @family: protocol family to remove
++ *
++ * This function is called by a protocol handler that wants to
++ * remove its address family, and have it unlinked from the
++ * new socket creation.
++ *
++ * If protocol handler is a module, then it can use module reference
++ * counts to protect against new references. If protocol handler is not
++ * a module then it needs to provide its own protection in
++ * the ops->create routine.
++ */
++void sock_unregister(int family)
++{
++ BUG_ON(family < 0 || family >= NPROTO);
++
++ spin_lock(&net_family_lock);
++ net_families[family] = NULL;
++ spin_unlock(&net_family_lock);
++
++ synchronize_rcu();
++
++ printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
++}
++
++static int sock_pernet_init(struct net *net)
++{
++ net->sysctl_somaxconn = SOMAXCONN;
++ return 0;
++}
++
++static struct pernet_operations sock_net_ops = {
++ .init = sock_pernet_init,
++};
++
++static int __init sock_init(void)
++{
++ /*
++ * Initialize sock SLAB cache.
++ */
++
++ sk_init();
++
++ /*
++ * Initialize skbuff SLAB cache
++ */
++ skb_init();
++
++ /*
++ * Initialize the protocols module.
++ */
++
++ init_inodecache();
++ register_filesystem(&sock_fs_type);
++ sock_mnt = kern_mount(&sock_fs_type);
++
++ /* The real protocol initialization is performed in later initcalls.
++ */
++
++#ifdef CONFIG_NETFILTER
++ netfilter_init();
++#endif
++
++ register_pernet_subsys(&sock_net_ops);
++
++ return 0;
++}
++
++core_initcall(sock_init); /* early initcall */
++
++#ifdef CONFIG_PROC_FS
++void socket_seq_show(struct seq_file *seq)
++{
++ int cpu;
++ int counter = 0;
++
++ for_each_possible_cpu(cpu)
++ counter += per_cpu(sockets_in_use, cpu);
++
++ /* It can be negative, by the way. 8) */
++ if (counter < 0)
++ counter = 0;
++
++ seq_printf(seq, "sockets: used %d\n", counter);
++}
++#endif /* CONFIG_PROC_FS */
++
++#ifdef CONFIG_COMPAT
++static long compat_sock_ioctl(struct file *file, unsigned cmd,
++ unsigned long arg)
++{
++ struct socket *sock = file->private_data;
++ int ret = -ENOIOCTLCMD;
++
++ if (sock->ops->compat_ioctl)
++ ret = sock->ops->compat_ioctl(sock, cmd, arg);
++
++ return ret;
++}
++#endif
++
++int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
++{
++ return sock->ops->bind(sock, addr, addrlen);
++}
++
++int kernel_listen(struct socket *sock, int backlog)
++{
++ return sock->ops->listen(sock, backlog);
++}
++
++int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
++{
++ struct sock *sk = sock->sk;
++ int err;
++
++ err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
++ newsock);
++ if (err < 0)
++ goto done;
++
++ err = sock->ops->accept(sock, *newsock, flags);
++ if (err < 0) {
++ sock_release(*newsock);
++ goto done;
++ }
++
++ (*newsock)->ops = sock->ops;
++
++done:
++ return err;
++}
++
++int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
++ int flags)
++{
++ return sock->ops->connect(sock, addr, addrlen, flags);
++}
++
++int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
++ int *addrlen)
++{
++ return sock->ops->getname(sock, addr, addrlen, 0);
++}
++
++int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
++ int *addrlen)
++{
++ return sock->ops->getname(sock, addr, addrlen, 1);
++}
++
++int kernel_getsockopt(struct socket *sock, int level, int optname,
++ char *optval, int *optlen)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ if (level == SOL_SOCKET)
++ err = sock_getsockopt(sock, level, optname, optval, optlen);
++ else
++ err = sock->ops->getsockopt(sock, level, optname, optval,
++ optlen);
++ set_fs(oldfs);
++ return err;
++}
++
++int kernel_setsockopt(struct socket *sock, int level, int optname,
++ char *optval, int optlen)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ if (level == SOL_SOCKET)
++ err = sock_setsockopt(sock, level, optname, optval, optlen);
++ else
++ err = sock->ops->setsockopt(sock, level, optname, optval,
++ optlen);
++ set_fs(oldfs);
++ return err;
++}
++
++int kernel_sendpage(struct socket *sock, struct page *page, int offset,
++ size_t size, int flags)
++{
++ if (sock->ops->sendpage)
++ return sock->ops->sendpage(sock, page, offset, size, flags);
++
++ return sock_no_sendpage(sock, page, offset, size, flags);
++}
++
++int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
++{
++ mm_segment_t oldfs = get_fs();
++ int err;
++
++ set_fs(KERNEL_DS);
++ err = sock->ops->ioctl(sock, cmd, arg);
++ set_fs(oldfs);
++
++ return err;
++}
++
++/* ABI emulation layers need these two */
++EXPORT_SYMBOL(move_addr_to_kernel);
++EXPORT_SYMBOL(move_addr_to_user);
++EXPORT_SYMBOL(sock_create);
++EXPORT_SYMBOL(sock_create_kern);
++EXPORT_SYMBOL(sock_create_lite);
++EXPORT_SYMBOL(sock_map_fd);
++EXPORT_SYMBOL(sock_recvmsg);
++EXPORT_SYMBOL(sock_register);
++EXPORT_SYMBOL(sock_release);
++EXPORT_SYMBOL(sock_sendmsg);
++EXPORT_SYMBOL(sock_unregister);
++EXPORT_SYMBOL(sock_wake_async);
++EXPORT_SYMBOL(sockfd_lookup);
++EXPORT_SYMBOL(kernel_sendmsg);
++EXPORT_SYMBOL(kernel_recvmsg);
++EXPORT_SYMBOL(kernel_bind);
++EXPORT_SYMBOL(kernel_listen);
++EXPORT_SYMBOL(kernel_accept);
++EXPORT_SYMBOL(kernel_connect);
++EXPORT_SYMBOL(kernel_getsockname);
++EXPORT_SYMBOL(kernel_getpeername);
++EXPORT_SYMBOL(kernel_getsockopt);
++EXPORT_SYMBOL(kernel_setsockopt);
++EXPORT_SYMBOL(kernel_sendpage);
++EXPORT_SYMBOL(kernel_sock_ioctl);