From: Sapan Bhatia Date: Thu, 20 Mar 2008 04:15:10 +0000 (+0000) Subject: Vserver/NetNS fix hopeful X-Git-Tag: trellis-2.6.22-Jan-2009~35 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=21c23b172d34b301ca2656efea87a9d6155d9909;p=linux-2.6.git Vserver/NetNS fix hopeful --- diff --git a/linux-2.6-595-vserver-new-netns.patch b/linux-2.6-595-vserver-new-netns.patch index f946b5037..c2d92cd65 100644 --- a/linux-2.6-595-vserver-new-netns.patch +++ b/linux-2.6-595-vserver-new-netns.patch @@ -1,5 +1,1650 @@ ---- linux-2.6.22-590/kernel/vserver/space.c.orig 2008-02-29 09:01:28.000000000 -0500 -+++ linux-2.6.22-590/kernel/vserver/space.c 2008-03-06 15:47:26.000000000 -0500 +diff -Nurb linux-2.6.22-594/include/linux/vserver/network.h.orig.orig linux-2.6.22-595/include/linux/vserver/network.h.orig.orig +--- linux-2.6.22-594/include/linux/vserver/network.h.orig.orig 2008-03-20 00:04:54.000000000 -0400 ++++ linux-2.6.22-595/include/linux/vserver/network.h.orig.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,143 +0,0 @@ +-#ifndef _VX_NETWORK_H +-#define _VX_NETWORK_H +- +-#include +- +- +-#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ +- +- +-/* network flags */ +- +-#define NXF_INFO_PRIVATE 0x00000008 +- +-#define NXF_SINGLE_IP 0x00000100 +-#define NXF_LBACK_REMAP 0x00000200 +- +-#define NXF_HIDE_NETIF 0x02000000 +-#define NXF_HIDE_LBACK 0x04000000 +- +-#define NXF_STATE_SETUP (1ULL << 32) +-#define NXF_STATE_ADMIN (1ULL << 34) +- +-#define NXF_SC_HELPER (1ULL << 36) +-#define NXF_PERSISTENT (1ULL << 38) +- +-#define NXF_ONE_TIME (0x0005ULL << 32) +- +- +-#define NXF_INIT_SET (__nxf_init_set()) +- +-static inline uint64_t __nxf_init_set(void) { +- return NXF_STATE_ADMIN +-#ifdef CONFIG_VSERVER_AUTO_LBACK +- | NXF_LBACK_REMAP +- | NXF_HIDE_LBACK +-#endif +-#ifdef CONFIG_VSERVER_AUTO_SINGLE +- | NXF_SINGLE_IP +-#endif +- | NXF_HIDE_NETIF; +-} +- +- +-/* network caps */ +- +-#define NXC_RAW_ICMP 0x00000100 +- +- +-/* address types */ +- +-#define NXA_TYPE_IPV4 0x0001 +-#define NXA_TYPE_IPV6 0x0002 +- +-#define NXA_TYPE_NONE 0x0000 +-#define NXA_TYPE_ANY 0x00FF +- +-#define NXA_TYPE_ADDR 0x0010 +-#define NXA_TYPE_MASK 0x0020 +-#define NXA_TYPE_RANGE 0x0040 +- +-#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE) +- +-#define NXA_MOD_BCAST 0x0100 +-#define NXA_MOD_LBACK 0x0200 +- +-#define NXA_LOOPBACK 0x1000 +- +-#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK) +-#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK) +- +-#ifdef __KERNEL__ +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-struct nx_addr_v4 { +- struct nx_addr_v4 *next; +- struct in_addr ip[2]; +- struct in_addr mask; +- uint16_t type; +- uint16_t flags; +-}; +- +-struct nx_addr_v6 { +- struct nx_addr_v6 *next; +- struct in6_addr ip; +- struct in6_addr mask; +- uint32_t prefix; +- uint16_t type; +- uint16_t flags; +-}; +- +-struct nx_info { +- struct hlist_node nx_hlist; /* linked list of nxinfos */ +- nid_t nx_id; /* vnet id */ +- atomic_t nx_usecnt; /* usage count */ +- atomic_t nx_tasks; /* tasks count */ +- int nx_state; /* context state */ +- +- uint64_t nx_flags; /* network flag word */ +- uint64_t nx_ncaps; /* network capabilities */ +- +- struct in_addr v4_lback; /* Loopback address */ +- struct in_addr v4_bcast; /* Broadcast address */ +- struct nx_addr_v4 v4; /* First/Single ipv4 address */ +-#ifdef CONFIG_IPV6 +- struct nx_addr_v6 v6; /* First/Single ipv6 address */ +-#endif +- char nx_name[65]; /* network context name */ +-}; +- +- +-/* status flags */ +- 
+-#define NXS_HASHED 0x0001 +-#define NXS_SHUTDOWN 0x0100 +-#define NXS_RELEASED 0x8000 +- +-extern struct nx_info *lookup_nx_info(int); +- +-extern int get_nid_list(int, unsigned int *, int); +-extern int nid_is_hashed(nid_t); +- +-extern int nx_migrate_task(struct task_struct *, struct nx_info *); +- +-extern long vs_net_change(struct nx_info *, unsigned int); +- +-struct sock; +- +- +-#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE) +-#ifdef CONFIG_IPV6 +-#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE) +-#else +-#define NX_IPV6(n) (0) +-#endif +- +-#endif /* __KERNEL__ */ +-#endif /* _VX_NETWORK_H */ +diff -Nurb linux-2.6.22-594/kernel/nsproxy.c.orig linux-2.6.22-595/kernel/nsproxy.c.orig +--- linux-2.6.22-594/kernel/nsproxy.c.orig 2008-03-20 00:05:18.000000000 -0400 ++++ linux-2.6.22-595/kernel/nsproxy.c.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,264 +0,0 @@ +-/* +- * Copyright (C) 2006 IBM Corporation +- * +- * Author: Serge Hallyn +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU General Public License as +- * published by the Free Software Foundation, version 2 of the +- * License. +- * +- * Jun 2006 - namespaces support +- * OpenVZ, SWsoft Inc. +- * Pavel Emelianov +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-static struct kmem_cache *nsproxy_cachep; +- +-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +- +-void get_task_namespaces(struct task_struct *tsk) +-{ +- struct nsproxy *ns = tsk->nsproxy; +- if (ns) { +- get_nsproxy(ns); +- } +-} +- +-/* +- * creates a copy of "orig" with refcount 1. +- */ +-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) +-{ +- struct nsproxy *ns; +- +- ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); +- if (ns) +- atomic_set(&ns->count, 1); +- vxdprintk(VXD_CBIT(space, 2), "clone_nsproxy(%p[%u] = %p[1]", +- orig, atomic_read(&orig->count), ns); +- atomic_inc(&vs_global_nsproxy); +- return ns; +-} +- +-/* +- * Create new nsproxy and all of its the associated namespaces. +- * Return the newly created nsproxy. Do not attach this to the task, +- * leave it to the caller to do proper locking and attach it to task. 
+- */ +-static struct nsproxy *unshare_namespaces(int flags, struct nsproxy *orig, +- struct fs_struct *new_fs) +-{ +- struct nsproxy *new_nsp; +- int err = -ENOMEM; +- +- vxdprintk(VXD_CBIT(space, 4), +- "unshare_namespaces(0x%08x,%p,%p)", +- flags, orig, new_fs); +- +- new_nsp = clone_nsproxy(orig); +- if (!new_nsp) +- return ERR_PTR(-ENOMEM); +- +- new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs); +- if (IS_ERR(new_nsp->mnt_ns)) +- goto out_ns; +- +- new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns); +- if (IS_ERR(new_nsp->uts_ns)) +- goto out_uts; +- +- new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns); +- if (IS_ERR(new_nsp->ipc_ns)) +- goto out_ipc; +- +- new_nsp->pid_ns = copy_pid_ns(flags, orig->pid_ns); +- if (IS_ERR(new_nsp->pid_ns)) +- goto out_pid; +- +- new_nsp->user_ns = copy_user_ns(flags, orig->user_ns); +- if (IS_ERR(new_nsp->user_ns)) +- goto out_user; +- +- new_nsp->net_ns = copy_net_ns(flags, orig->net_ns); +- if (IS_ERR(new_nsp->net_ns)) +- goto out_net; +- +- return new_nsp; +- +-out_net: +- if (new_nsp->user_ns) +- put_user_ns(new_nsp->user_ns); +- if (new_nsp->net_ns) +- put_net(new_nsp->net_ns); +-out_user: +- if (new_nsp->pid_ns) +- put_pid_ns(new_nsp->pid_ns); +-out_pid: +- if (new_nsp->ipc_ns) +- put_ipc_ns(new_nsp->ipc_ns); +-out_ipc: +- if (new_nsp->uts_ns) +- put_uts_ns(new_nsp->uts_ns); +-out_uts: +- if (new_nsp->mnt_ns) +- put_mnt_ns(new_nsp->mnt_ns); +-out_ns: +- kmem_cache_free(nsproxy_cachep, new_nsp); +- return ERR_PTR(err); +-} +- +-static struct nsproxy *create_new_namespaces(unsigned long flags, struct task_struct *tsk, +- struct fs_struct *new_fs) +-{ +- return unshare_namespaces(flags, tsk->nsproxy, new_fs); +-} +- +-/* +- * copies the nsproxy, setting refcount to 1, and grabbing a +- * reference to all contained namespaces. +- */ +-struct nsproxy *copy_nsproxy(struct nsproxy *orig) +-{ +- struct nsproxy *ns = clone_nsproxy(orig); +- +- if (ns) { +- if (ns->mnt_ns) +- get_mnt_ns(ns->mnt_ns); +- if (ns->uts_ns) +- get_uts_ns(ns->uts_ns); +- if (ns->ipc_ns) +- get_ipc_ns(ns->ipc_ns); +- if (ns->pid_ns) +- get_pid_ns(ns->pid_ns); +- } +- return ns; +-} +- +-/* +- * called from clone. This now handles copy for nsproxy and all +- * namespaces therein. 
+- */ +-int copy_namespaces(unsigned long flags, struct task_struct *tsk) +-{ +- struct nsproxy *old_ns = tsk->nsproxy; +- struct nsproxy *new_ns = NULL; +- int err = 0; +- +- vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08x,%p[%p])", +- flags, tsk, old_ns); +- +- if (!old_ns) +- return 0; +- +- get_nsproxy(old_ns); +- return 0; +- +- if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) +- return 0; +- +- #ifndef CONFIG_NET_NS +- if (unshare_flags & CLONE_NEWNET) +- return -EINVAL; +- #endif +- +- +- if (!capable(CAP_SYS_ADMIN)) { +- err = -EPERM; +- goto out; +- } +- +- new_ns = create_new_namespaces(flags, tsk, tsk->fs); +- if (IS_ERR(new_ns)) { +- err = PTR_ERR(new_ns); +- goto out; +- } +- +- err = ns_container_clone(tsk); +- if (err) { +- put_nsproxy(new_ns); +- goto out; +- } +- +- tsk->nsproxy = new_ns; +- +-out: +- put_nsproxy(old_ns); +- vxdprintk(VXD_CBIT(space, 3), +- "copy_namespaces(0x%08x,%p[%p]) = %d [%p]", +- flags, tsk, old_ns, err, new_ns); +- return err; +-} +- +-void free_nsproxy(struct nsproxy *ns) +-{ +- if (ns->mnt_ns) +- put_mnt_ns(ns->mnt_ns); +- if (ns->uts_ns) +- put_uts_ns(ns->uts_ns); +- if (ns->ipc_ns) +- put_ipc_ns(ns->ipc_ns); +- if (ns->pid_ns) +- put_pid_ns(ns->pid_ns); +- atomic_dec(&vs_global_nsproxy); +- kfree(ns); +-} +- +-/* +- * Called from unshare. Unshare all the namespaces part of nsproxy. +- * On success, returns the new nsproxy. +- */ +-int unshare_nsproxy_namespaces(unsigned long unshare_flags, +- struct nsproxy **new_nsp, struct fs_struct *new_fs) +-{ +- int err = 0; +- +- vxdprintk(VXD_CBIT(space, 4), +- "unshare_nsproxy_namespaces(0x%08lx,[%p])", +- unshare_flags, current->nsproxy); +- +- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | +- CLONE_NEWUSER | CLONE_NEWNET))) +- return 0; +- +-#ifndef CONFIG_NET_NS +- if (unshare_flags & CLONE_NEWNET) +- return -EINVAL; +-#endif +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; +- +- *new_nsp = create_new_namespaces(unshare_flags, current, +- new_fs ? new_fs : current->fs); +- if (IS_ERR(*new_nsp)) { +- err = PTR_ERR(*new_nsp); +- goto out; +- } +- +- err = ns_container_clone(current); +- if (err) +- put_nsproxy(*new_nsp); +- +-out: +- return err; +-} +- +-static int __init nsproxy_cache_init(void) +-{ +- nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), +- 0, SLAB_PANIC, NULL, NULL); +- return 0; +-} +- +-module_init(nsproxy_cache_init); +diff -Nurb linux-2.6.22-594/kernel/user.c.orig linux-2.6.22-595/kernel/user.c.orig +--- linux-2.6.22-594/kernel/user.c.orig 2008-03-20 00:05:18.000000000 -0400 ++++ linux-2.6.22-595/kernel/user.c.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,227 +0,0 @@ +-/* +- * The "user cache". +- * +- * (C) Copyright 1991-2000 Linus Torvalds +- * +- * We have a per-user structure to keep track of how many +- * processes, files etc the user has claimed, in order to be +- * able to have per-user limits for system resources. +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * UID task count cache, to get fast user lookup in "alloc_uid" +- * when changing user ID's (ie setuid() and friends). 
+- */ +- +-#define UIDHASH_MASK (UIDHASH_SZ - 1) +-#define __uidhashfn(xid,uid) ((((uid) >> UIDHASH_BITS) + ((uid)^(xid))) & UIDHASH_MASK) +-#define uidhashentry(ns, xid, uid) ((ns)->uidhash_table + __uidhashfn(xid, uid)) +- +-static struct kmem_cache *uid_cachep; +-static struct list_head uidhash_table[UIDHASH_SZ]; +- +-/* +- * The uidhash_lock is mostly taken from process context, but it is +- * occasionally also taken from softirq/tasklet context, when +- * task-structs get RCU-freed. Hence all locking must be softirq-safe. +- * But free_uid() is also called with local interrupts disabled, and running +- * local_bh_enable() with local interrupts disabled is an error - we'll run +- * softirq callbacks, and they can unconditionally enable interrupts, and +- * the caller of free_uid() didn't expect that.. +- */ +-static DEFINE_SPINLOCK(uidhash_lock); +- +-struct user_struct root_user = { +- .__count = ATOMIC_INIT(1), +- .processes = ATOMIC_INIT(1), +- .files = ATOMIC_INIT(0), +- .sigpending = ATOMIC_INIT(0), +- .mq_bytes = 0, +- .locked_shm = 0, +-#ifdef CONFIG_KEYS +- .uid_keyring = &root_user_keyring, +- .session_keyring = &root_session_keyring, +-#endif +-}; +- +-/* +- * These routines must be called with the uidhash spinlock held! +- */ +-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +-{ +- list_add(&up->uidhash_list, hashent); +-} +- +-static inline void uid_hash_remove(struct user_struct *up) +-{ +- list_del(&up->uidhash_list); +-} +- +-static inline struct user_struct *uid_hash_find(xid_t xid, uid_t uid, struct list_head *hashent) +-{ +- struct list_head *up; +- +- list_for_each(up, hashent) { +- struct user_struct *user; +- +- user = list_entry(up, struct user_struct, uidhash_list); +- +- if(user->uid == uid && user->xid == xid) { +- atomic_inc(&user->__count); +- return user; +- } +- } +- +- return NULL; +-} +- +-/* +- * Locate the user_struct for the passed UID. If found, take a ref on it. The +- * caller must undo that ref with free_uid(). +- * +- * If the user_struct could not be found, return NULL. 
+- */ +-struct user_struct *find_user(xid_t xid, uid_t uid) +-{ +- struct user_struct *ret; +- unsigned long flags; +- struct user_namespace *ns = current->nsproxy->user_ns; +- +- spin_lock_irqsave(&uidhash_lock, flags); +- ret = uid_hash_find(xid, uid, uidhashentry(ns, xid, uid)); +- spin_unlock_irqrestore(&uidhash_lock, flags); +- return ret; +-} +- +-void free_uid(struct user_struct *up) +-{ +- unsigned long flags; +- +- if (!up) +- return; +- +- local_irq_save(flags); +- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { +- uid_hash_remove(up); +- spin_unlock_irqrestore(&uidhash_lock, flags); +- key_put(up->uid_keyring); +- key_put(up->session_keyring); +- kmem_cache_free(uid_cachep, up); +- } else { +- local_irq_restore(flags); +- } +-} +- +-struct user_struct * alloc_uid(xid_t xid, uid_t uid) +-{ +- struct user_namespace *ns = current->nsproxy->user_ns; +- struct list_head *hashent = uidhashentry(ns,xid, uid); +- struct user_struct *up; +- +- spin_lock_irq(&uidhash_lock); +- up = uid_hash_find(xid, uid, hashent); +- spin_unlock_irq(&uidhash_lock); +- +- if (!up) { +- struct user_struct *new; +- +- new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); +- if (!new) +- return NULL; +- new->uid = uid; +- new->xid = xid; +- atomic_set(&new->__count, 1); +- atomic_set(&new->processes, 0); +- atomic_set(&new->files, 0); +- atomic_set(&new->sigpending, 0); +-#ifdef CONFIG_INOTIFY_USER +- atomic_set(&new->inotify_watches, 0); +- atomic_set(&new->inotify_devs, 0); +-#endif +- +- new->mq_bytes = 0; +- new->locked_shm = 0; +- +- if (alloc_uid_keyring(new, current) < 0) { +- kmem_cache_free(uid_cachep, new); +- return NULL; +- } +- +- /* +- * Before adding this, check whether we raced +- * on adding the same user already.. +- */ +- spin_lock_irq(&uidhash_lock); +- up = uid_hash_find(xid, uid, hashent); +- if (up) { +- key_put(new->uid_keyring); +- key_put(new->session_keyring); +- kmem_cache_free(uid_cachep, new); +- } else { +- uid_hash_insert(new, hashent); +- up = new; +- } +- spin_unlock_irq(&uidhash_lock); +- +- } +- return up; +-} +- +-void switch_uid(struct user_struct *new_user) +-{ +- struct user_struct *old_user; +- +- /* What if a process setreuid()'s and this brings the +- * new uid over his NPROC rlimit? We can check this now +- * cheaply with the new uid cache, so if it matters +- * we should be checking for it. -DaveM +- */ +- old_user = current->user; +- atomic_inc(&new_user->processes); +- atomic_dec(&old_user->processes); +- switch_uid_keyring(new_user); +- current->user = new_user; +- +- /* +- * We need to synchronize with __sigqueue_alloc() +- * doing a get_uid(p->user).. If that saw the old +- * user value, we need to wait until it has exited +- * its critical region before we can free the old +- * structure. 
+- */ +- smp_mb(); +- spin_unlock_wait(¤t->sighand->siglock); +- +- free_uid(old_user); +- suid_keys(current); +-} +- +- +-static int __init uid_cache_init(void) +-{ +- int n; +- +- uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), +- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); +- +- for(n = 0; n < UIDHASH_SZ; ++n) +- INIT_LIST_HEAD(init_user_ns.uidhash_table + n); +- +- /* Insert the root user immediately (init already runs as root) */ +- spin_lock_irq(&uidhash_lock); +- uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0, 0)); +- spin_unlock_irq(&uidhash_lock); +- +- return 0; +-} +- +-module_init(uid_cache_init); +diff -Nurb linux-2.6.22-594/kernel/vserver/context.c linux-2.6.22-595/kernel/vserver/context.c +--- linux-2.6.22-594/kernel/vserver/context.c 2008-03-20 00:04:46.000000000 -0400 ++++ linux-2.6.22-595/kernel/vserver/context.c 2008-03-20 00:13:22.000000000 -0400 +@@ -589,13 +589,13 @@ + struct nsproxy *old_nsp, *new_nsp; + + ret = unshare_nsproxy_namespaces( +- CLONE_NEWUTS | CLONE_NEWIPC, ++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET, + &new_nsp, NULL); + if (ret) + goto out; + + old_nsp = xchg(&p->nsproxy, new_nsp); +- vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC); ++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + put_nsproxy(old_nsp); + } + } +@@ -781,7 +781,7 @@ + if (vs_state_change(new_vxi, VSC_STARTUP)) + goto out; + +- ret = vx_migrate_task(current, new_vxi, (!data)); ++ ret = vx_migrate_task(current, new_vxi, 1 /*(!data) Hack no. 1 - Sapan*/); + if (ret) + goto out; + +diff -Nurb linux-2.6.22-594/kernel/vserver/context.c.orig linux-2.6.22-595/kernel/vserver/context.c.orig +--- linux-2.6.22-594/kernel/vserver/context.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.22-595/kernel/vserver/context.c.orig 2008-03-20 00:04:46.000000000 -0400 +@@ -0,0 +1,966 @@ ++/* ++ * linux/kernel/vserver/context.c ++ * ++ * Virtual Server: Context Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 context helper ++ * V0.02 vx_ctx_kill syscall command ++ * V0.03 replaced context_info calls ++ * V0.04 redesign of struct (de)alloc ++ * V0.05 rlimit basic implementation ++ * V0.06 task_xid and info commands ++ * V0.07 context flags and caps ++ * V0.08 switch to RCU based hash ++ * V0.09 revert to non RCU for now ++ * V0.10 and back to working RCU hash ++ * V0.11 and back to locking again ++ * V0.12 referenced context store ++ * V0.13 separate per cpu data ++ * V0.14 changed vcmds to vxi arg ++ * V0.15 added context stat ++ * V0.16 have __create claim() the vxi ++ * V0.17 removed older and legacy stuff ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "cvirt_init.h" ++#include "cacct_init.h" ++#include "limit_init.h" ++#include "sched_init.h" ++ ++ ++atomic_t vx_global_ctotal = ATOMIC_INIT(0); ++atomic_t vx_global_cactive = ATOMIC_INIT(0); ++ ++ ++/* now inactive context structures */ ++ ++static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT; ++ ++static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED; ++ ++ ++/* __alloc_vx_info() ++ ++ * allocate an initialized vx_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct vx_info *__alloc_vx_info(xid_t xid) ++{ ++ struct vx_info *new = NULL; ++ int cpu; ++ ++ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid); ++ ++ /* would this benefit from a slab cache? 
*/ ++ new = kmalloc(sizeof(struct vx_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct vx_info)); ++#ifdef CONFIG_SMP ++ new->ptr_pc = alloc_percpu(struct _vx_info_pc); ++ if (!new->ptr_pc) ++ goto error; ++#endif ++ new->vx_id = xid; ++ INIT_HLIST_NODE(&new->vx_hlist); ++ atomic_set(&new->vx_usecnt, 0); ++ atomic_set(&new->vx_tasks, 0); ++ new->vx_parent = NULL; ++ new->vx_state = 0; ++ init_waitqueue_head(&new->vx_wait); ++ ++ /* prepare reaper */ ++ get_task_struct(init_pid_ns.child_reaper); ++ new->vx_reaper = init_pid_ns.child_reaper; ++ new->vx_badness_bias = 0; ++ ++ /* rest of init goes here */ ++ vx_info_init_limit(&new->limit); ++ vx_info_init_sched(&new->sched); ++ vx_info_init_cvirt(&new->cvirt); ++ vx_info_init_cacct(&new->cacct); ++ ++ /* per cpu data structures */ ++ for_each_possible_cpu(cpu) { ++ vx_info_init_sched_pc( ++ &vx_per_cpu(new, sched_pc, cpu), cpu); ++ vx_info_init_cvirt_pc( ++ &vx_per_cpu(new, cvirt_pc, cpu), cpu); ++ } ++ ++ new->vx_flags = VXF_INIT_SET; ++ new->vx_bcaps = CAP_INIT_EFF_SET; ++ new->vx_ccaps = 0; ++ new->vx_cap_bset = cap_bset; ++ ++ new->reboot_cmd = 0; ++ new->exit_code = 0; ++ ++ new->vx_nsproxy = copy_nsproxy(current->nsproxy); ++ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "alloc_vx_info(%d) = %p", xid, new); ++ vxh_alloc_vx_info(new); ++ atomic_inc(&vx_global_ctotal); ++ return new; ++#ifdef CONFIG_SMP ++error: ++ kfree(new); ++ return 0; ++#endif ++} ++ ++/* __dealloc_vx_info() ++ ++ * final disposal of vx_info */ ++ ++static void __dealloc_vx_info(struct vx_info *vxi) ++{ ++ int cpu; ++ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "dealloc_vx_info(%p)", vxi); ++ vxh_dealloc_vx_info(vxi); ++ ++ vxi->vx_id = -1; ++ ++ vx_info_exit_limit(&vxi->limit); ++ vx_info_exit_sched(&vxi->sched); ++ vx_info_exit_cvirt(&vxi->cvirt); ++ vx_info_exit_cacct(&vxi->cacct); ++ ++ for_each_possible_cpu(cpu) { ++ vx_info_exit_sched_pc( ++ &vx_per_cpu(vxi, sched_pc, cpu), cpu); ++ vx_info_exit_cvirt_pc( ++ &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); ++ } ++ ++ vxi->vx_state |= VXS_RELEASED; ++ ++#ifdef CONFIG_SMP ++ free_percpu(vxi->ptr_pc); ++#endif ++ kfree(vxi); ++ atomic_dec(&vx_global_ctotal); ++} ++ ++static void __shutdown_vx_info(struct vx_info *vxi) ++{ ++ struct nsproxy *nsproxy; ++ struct fs_struct *fs; ++ ++ might_sleep(); ++ ++ vxi->vx_state |= VXS_SHUTDOWN; ++ vs_state_change(vxi, VSC_SHUTDOWN); ++ ++ nsproxy = xchg(&vxi->vx_nsproxy, NULL); ++ fs = xchg(&vxi->vx_fs, NULL); ++ ++ if (nsproxy) ++ put_nsproxy(nsproxy); ++ if (fs) ++ put_fs_struct(fs); ++} ++ ++/* exported stuff */ ++ ++void free_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ ++ /* check for reference counts first */ ++ BUG_ON(atomic_read(&vxi->vx_usecnt)); ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ /* context shutdown is mandatory */ ++ BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN)); ++ ++ BUG_ON(vxi->vx_nsproxy); ++ BUG_ON(vxi->vx_fs); ++ ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_del(&vxi->vx_hlist); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ ++ __dealloc_vx_info(vxi); ++} ++ ++ ++/* hash table for vx_info hash */ ++ ++#define VX_HASH_SIZE 13 ++ ++static struct hlist_head vx_info_hash[VX_HASH_SIZE] = ++ { [0 ... 
VX_HASH_SIZE-1] = HLIST_HEAD_INIT }; ++ ++static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED; ++ ++ ++static inline unsigned int __hashval(xid_t xid) ++{ ++ return (xid % VX_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_vx_info() ++ ++ * add the vxi to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __hash_vx_info(struct vx_info *vxi) ++{ ++ struct hlist_head *head; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); ++ vxh_hash_vx_info(vxi); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ vxi->vx_state |= VXS_HASHED; ++ head = &vx_info_hash[__hashval(vxi->vx_id)]; ++ hlist_add_head(&vxi->vx_hlist, head); ++ atomic_inc(&vx_global_cactive); ++} ++ ++/* __unhash_vx_info() ++ ++ * remove the vxi from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id, ++ atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks)); ++ vxh_unhash_vx_info(vxi); ++ ++ /* context must be hashed */ ++ BUG_ON(!vx_info_state(vxi, VXS_HASHED)); ++ /* but without tasks */ ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ vxi->vx_state &= ~VXS_HASHED; ++ hlist_del_init(&vxi->vx_hlist); ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_add_head(&vxi->vx_hlist, &vx_info_inactive); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ atomic_dec(&vx_global_cactive); ++} ++ ++ ++/* __lookup_vx_info() ++ ++ * requires the hash_lock to be held ++ * doesn't increment the vx_refcnt */ ++ ++static inline struct vx_info *__lookup_vx_info(xid_t xid) ++{ ++ struct hlist_head *head = &vx_info_hash[__hashval(xid)]; ++ struct hlist_node *pos; ++ struct vx_info *vxi; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ ++ if (vxi->vx_id == xid) ++ goto found; ++ } ++ vxi = NULL; ++found: ++ vxdprintk(VXD_CBIT(xid, 0), ++ "__lookup_vx_info(#%u): %p[#%u]", ++ xid, vxi, vxi ? vxi->vx_id : 0); ++ vxh_lookup_vx_info(vxi, xid); ++ return vxi; ++} ++ ++ ++/* __create_vx_info() ++ ++ * create the requested context ++ * get(), claim() and hash it */ ++ ++static struct vx_info *__create_vx_info(int id) ++{ ++ struct vx_info *new, *vxi = NULL; ++ ++ vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id); ++ ++ if (!(new = __alloc_vx_info(id))) ++ return ERR_PTR(-ENOMEM); ++ ++ /* required to make dynamic xids unique */ ++ spin_lock(&vx_info_hash_lock); ++ ++ /* static context requested */ ++ if ((vxi = __lookup_vx_info(id))) { ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (already there)", id, vxi); ++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ vxi = ERR_PTR(-EBUSY); ++ else ++ vxi = ERR_PTR(-EEXIST); ++ goto out_unlock; ++ } ++ /* new context */ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (new)", id, new); ++ claim_vx_info(new, NULL); ++ __hash_vx_info(get_vx_info(new)); ++ vxi = new, new = NULL; ++ ++out_unlock: ++ spin_unlock(&vx_info_hash_lock); ++ vxh_create_vx_info(IS_ERR(vxi) ? 
NULL : vxi, id); ++ if (new) ++ __dealloc_vx_info(new); ++ return vxi; ++} ++ ++ ++/* exported stuff */ ++ ++ ++void unhash_vx_info(struct vx_info *vxi) ++{ ++ __shutdown_vx_info(vxi); ++ spin_lock(&vx_info_hash_lock); ++ __unhash_vx_info(vxi); ++ spin_unlock(&vx_info_hash_lock); ++ __wakeup_vx_info(vxi); ++} ++ ++ ++/* lookup_vx_info() ++ ++ * search for a vx_info and get() it ++ * negative id means current */ ++ ++struct vx_info *lookup_vx_info(int id) ++{ ++ struct vx_info *vxi = NULL; ++ ++ if (id < 0) { ++ vxi = get_vx_info(current->vx_info); ++ } else if (id > 1) { ++ spin_lock(&vx_info_hash_lock); ++ vxi = get_vx_info(__lookup_vx_info(id)); ++ spin_unlock(&vx_info_hash_lock); ++ } ++ return vxi; ++} ++ ++/* xid_is_hashed() ++ ++ * verify that xid is still hashed */ ++ ++int xid_is_hashed(xid_t xid) ++{ ++ int hashed; ++ ++ spin_lock(&vx_info_hash_lock); ++ hashed = (__lookup_vx_info(xid) != NULL); ++ spin_unlock(&vx_info_hash_lock); ++ return hashed; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* get_xid_list() ++ ++ * get a subset of hashed xids for proc ++ * assumes size is at least one */ ++ ++int get_xid_list(int index, unsigned int *xids, int size) ++{ ++ int hindex, nr_xids = 0; ++ ++ /* only show current and children */ ++ if (!vx_check(0, VS_ADMIN | VS_WATCH)) { ++ if (index > 0) ++ return 0; ++ xids[nr_xids] = vx_current_xid(); ++ return 1; ++ } ++ ++ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) { ++ struct hlist_head *head = &vx_info_hash[hindex]; ++ struct hlist_node *pos; ++ ++ spin_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ struct vx_info *vxi; ++ ++ if (--index > 0) ++ continue; ++ ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ xids[nr_xids] = vxi->vx_id; ++ if (++nr_xids >= size) { ++ spin_unlock(&vx_info_hash_lock); ++ goto out; ++ } ++ } ++ /* keep the lock time short */ ++ spin_unlock(&vx_info_hash_lock); ++ } ++out: ++ return nr_xids; ++} ++#endif ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++void dump_vx_info_inactive(int level) ++{ ++ struct hlist_node *entry, *next; ++ ++ hlist_for_each_safe(entry, next, &vx_info_inactive) { ++ struct vx_info *vxi = ++ list_entry(entry, struct vx_info, vx_hlist); ++ ++ dump_vx_info(vxi, level); ++ } ++} ++ ++#endif ++ ++int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) ++{ ++ struct user_struct *new_user, *old_user; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ new_user = alloc_uid(vxi->vx_id, p->uid); ++ if (!new_user) ++ return -ENOMEM; ++ ++ old_user = p->user; ++ if (new_user != old_user) { ++ atomic_inc(&new_user->processes); ++ atomic_dec(&old_user->processes); ++ p->user = new_user; ++ } ++ free_uid(old_user); ++ return 0; ++} ++ ++void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p) ++{ ++ p->cap_effective &= vxi->vx_cap_bset; ++ p->cap_inheritable &= vxi->vx_cap_bset; ++ p->cap_permitted &= vxi->vx_cap_bset; ++} ++ ++ ++#include ++ ++static int vx_openfd_task(struct task_struct *tsk) ++{ ++ struct files_struct *files = tsk->files; ++ struct fdtable *fdt; ++ const unsigned long *bptr; ++ int count, total; ++ ++ /* no rcu_read_lock() because of spin_lock() */ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ bptr = fdt->open_fds->fds_bits; ++ count = fdt->max_fds / (sizeof(unsigned long) * 8); ++ for (total = 0; count > 0; count--) { ++ if (*bptr) ++ total += hweight_long(*bptr); ++ bptr++; ++ } ++ spin_unlock(&files->file_lock); ++ return total; ++} ++ ++ ++/* for *space compatibility */ ++ 
++asmlinkage long sys_unshare(unsigned long); ++ ++/* ++ * migrate task to new context ++ * gets vxi, puts old_vxi on change ++ * optionally unshares namespaces (hack) ++ */ ++ ++int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare) ++{ ++ struct vx_info *old_vxi; ++ int ret = 0; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi, ++ vxi->vx_id, atomic_read(&vxi->vx_usecnt)); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) && ++ !vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ return -EACCES; ++ ++ if (vx_info_state(vxi, VXS_SHUTDOWN)) ++ return -EFAULT; ++ ++ old_vxi = task_get_vx_info(p); ++ if (old_vxi == vxi) ++ goto out; ++ ++ if (!(ret = vx_migrate_user(p, vxi))) { ++ int openfd; ++ ++ task_lock(p); ++ openfd = vx_openfd_task(p); ++ ++ if (old_vxi) { ++ atomic_dec(&old_vxi->cvirt.nr_threads); ++ atomic_dec(&old_vxi->cvirt.nr_running); ++ __rlim_dec(&old_vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? */ ++ __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY); ++ } ++ atomic_inc(&vxi->cvirt.nr_threads); ++ atomic_inc(&vxi->cvirt.nr_running); ++ __rlim_inc(&vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? */ ++ __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_inc(&vxi->limit, VLIMIT_DENTRY); ++ ++ if (old_vxi) { ++ release_vx_info(old_vxi, p); ++ clr_vx_info(&p->vx_info); ++ } ++ claim_vx_info(vxi, p); ++ set_vx_info(&p->vx_info, vxi); ++ p->xid = vxi->vx_id; ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "moved task %p into vxi:%p[#%d]", ++ p, vxi, vxi->vx_id); ++ ++ vx_mask_cap_bset(vxi, p); ++ task_unlock(p); ++ ++ /* hack for *spaces to provide compatibility */ ++ if (unshare) { ++ struct nsproxy *old_nsp, *new_nsp; ++ ++ ret = unshare_nsproxy_namespaces( ++ CLONE_NEWUTS | CLONE_NEWIPC, ++ &new_nsp, NULL); ++ if (ret) ++ goto out; ++ ++ old_nsp = xchg(&p->nsproxy, new_nsp); ++ vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC); ++ put_nsproxy(old_nsp); ++ } ++ } ++out: ++ put_vx_info(old_vxi); ++ return ret; ++} ++ ++int vx_set_reaper(struct vx_info *vxi, struct task_struct *p) ++{ ++ struct task_struct *old_reaper; ++ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_reaper(%p[#%d],%p[#%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid); ++ ++ old_reaper = vxi->vx_reaper; ++ if (old_reaper == p) ++ return 0; ++ ++ /* set new child reaper */ ++ get_task_struct(p); ++ vxi->vx_reaper = p; ++ put_task_struct(old_reaper); ++ return 0; ++} ++ ++int vx_set_init(struct vx_info *vxi, struct task_struct *p) ++{ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->vx_flags &= ~VXF_STATE_INIT; ++ vxi->vx_initpid = p->tgid; ++ return 0; ++} ++ ++void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->exit_code = code; ++ vxi->vx_initpid = 0; ++} ++ ++ ++void vx_set_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id); ++ ++ get_vx_info(vxi); ++ claim_vx_info(vxi, NULL); ++} ++ ++void vx_clear_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_clear_persistent(%p[#%d])", vxi, 
vxi->vx_id); ++ ++ release_vx_info(vxi, NULL); ++ put_vx_info(vxi); ++} ++ ++void vx_update_persistent(struct vx_info *vxi) ++{ ++ if (vx_info_flags(vxi, VXF_PERSISTENT, 0)) ++ vx_set_persistent(vxi); ++ else ++ vx_clear_persistent(vxi); ++} ++ ++ ++/* task must be current or locked */ ++ ++void exit_vx_info(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ atomic_dec(&vxi->cvirt.nr_threads); ++ vx_nproc_dec(p); ++ ++ vxi->exit_code = code; ++ release_vx_info(vxi, p); ++ } ++} ++ ++void exit_vx_info_early(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ if (vxi->vx_initpid == p->tgid) ++ vx_exit_init(vxi, p, code); ++ if (vxi->vx_reaper == p) ++ vx_set_reaper(vxi, init_pid_ns.child_reaper); ++ } ++} ++ ++ ++/* vserver syscall commands below here */ ++ ++/* taks xid and vx_info functions */ ++ ++#include ++ ++ ++int vc_task_xid(uint32_t id) ++{ ++ xid_t xid; ++ ++ if (id) { ++ struct task_struct *tsk; ++ ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_real_pid(id); ++ xid = (tsk) ? tsk->xid : -ESRCH; ++ read_unlock(&tasklist_lock); ++ } else ++ xid = vx_current_xid(); ++ return xid; ++} ++ ++ ++int vc_vx_info(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vx_info_v0 vc_data; ++ ++ vc_data.xid = vxi->vx_id; ++ vc_data.initpid = vxi->vx_initpid; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++int vc_ctx_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_stat_v0 vc_data; ++ ++ vc_data.usecnt = atomic_read(&vxi->vx_usecnt); ++ vc_data.tasks = atomic_read(&vxi->vx_tasks); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++/* context functions */ ++ ++int vc_ctx_create(uint32_t xid, void __user *data) ++{ ++ struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET }; ++ struct vx_info *new_vxi; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if ((xid > MAX_S_CONTEXT) || (xid < 2)) ++ return -EINVAL; ++ ++ new_vxi = __create_vx_info(xid); ++ if (IS_ERR(new_vxi)) ++ return PTR_ERR(new_vxi); ++ ++ /* initial flags */ ++ new_vxi->vx_flags = vc_data.flagword; ++ ++ ret = -ENOEXEC; ++ if (vs_state_change(new_vxi, VSC_STARTUP)) ++ goto out; ++ ++ ret = vx_migrate_task(current, new_vxi, (!data)); ++ if (ret) ++ goto out; ++ ++ /* return context id on success */ ++ ret = new_vxi->vx_id; ++ ++ /* get a reference for persistent contexts */ ++ if ((vc_data.flagword & VXF_PERSISTENT)) ++ vx_set_persistent(new_vxi); ++out: ++ release_vx_info(new_vxi, NULL); ++ put_vx_info(new_vxi); ++ return ret; ++} ++ ++ ++int vc_ctx_migrate(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_migrate vc_data = { .flagword = 0 }; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = vx_migrate_task(current, vxi, 0); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_INIT) ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_REAPER) ++ ret = vx_set_reaper(vxi, current); ++ return ret; ++} ++ ++ ++int vc_get_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ ++ vc_data.flagword = vxi->vx_flags; ++ ++ /* special STATE flag handling */ ++ vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; 
++} ++ ++int vc_set_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ uint64_t mask, trigger; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special STATE flag handling */ ++ mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); ++ trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); ++ ++ if (vxi == current->vx_info) { ++ if (trigger & VXF_STATE_SETUP) ++ vx_mask_cap_bset(vxi, current); ++ if (trigger & VXF_STATE_INIT) { ++ int ret; ++ ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ ret = vx_set_reaper(vxi, current); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ vxi->vx_flags = vs_mask_flags(vxi->vx_flags, ++ vc_data.flagword, mask); ++ if (trigger & VXF_PERSISTENT) ++ vx_update_persistent(vxi); ++ ++ return 0; ++} ++ ++static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps) ++{ ++ if (bcaps) ++ *bcaps = vxi->vx_bcaps; ++ if (ccaps) ++ *ccaps = vxi->vx_ccaps; ++ ++ return 0; ++} ++ ++int vc_get_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, NULL, &vc_data.ccaps); ++ if (ret) ++ return ret; ++ vc_data.cmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int do_set_caps(struct vx_info *vxi, ++ uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask) ++{ ++ vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask); ++ vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask); ++ ++ return 0; ++} ++ ++int vc_set_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask); ++} ++ ++int vc_get_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, &vc_data.bcaps, NULL); ++ if (ret) ++ return ret; ++ vc_data.bmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0); ++} ++ ++ ++int vc_get_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ vc_data.bias = vxi->vx_badness_bias; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_badness_bias = vc_data.bias; ++ return 0; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(free_vx_info); ++ +diff -Nurb linux-2.6.22-594/kernel/vserver/space.c linux-2.6.22-595/kernel/vserver/space.c +--- linux-2.6.22-594/kernel/vserver/space.c 2008-03-20 00:05:21.000000000 -0400 ++++ linux-2.6.22-595/kernel/vserver/space.c 2008-03-20 00:08:28.000000000 -0400 @@ -15,6 +15,7 @@ #include #include @@ -8,7 +1653,7 @@ #include #include -@@ -54,6 +55,7 @@ +@@ -55,6 +56,7 @@ struct mnt_namespace *old_ns; struct uts_namespace *old_uts; struct ipc_namespace *old_ipc; @@ -16,11 +1661,10 @@ struct nsproxy *nsproxy; nsproxy = copy_nsproxy(old_nsproxy); -@@ -83,6 +85,17 @@ - get_ipc_ns(nsproxy->ipc_ns); +@@ -85,12 
+87,26 @@ } else old_ipc = NULL; -+ + + if (mask & CLONE_NEWNET) { + old_net = nsproxy->net_ns; + nsproxy->net_ns = new_nsproxy->net_ns; @@ -31,10 +1675,10 @@ + } else + old_net = NULL; + - ++ if (old_ns) put_mnt_ns(old_ns); -@@ -90,6 +101,9 @@ + if (old_uts) put_uts_ns(old_uts); if (old_ipc) put_ipc_ns(old_ipc); @@ -44,13 +1688,6894 @@ out: return nsproxy; } -@@ -250,7 +264,8 @@ +@@ -251,6 +267,7 @@ int vc_enter_space(struct vx_info *vxi, void __user *data) { -- struct vcmd_space_mask vc_data = { .mask = 0 }; + /* Ask dhozac how to pass this flag from user space - Sapan*/ -+ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET }; + struct vcmd_space_mask vc_data = { .mask = 0 }; if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) - return -EFAULT; +diff -Nurb linux-2.6.22-594/kernel/vserver/space.c.orig linux-2.6.22-595/kernel/vserver/space.c.orig +--- linux-2.6.22-594/kernel/vserver/space.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.22-595/kernel/vserver/space.c.orig 2008-03-20 00:05:28.000000000 -0400 +@@ -0,0 +1,295 @@ ++/* ++ * linux/kernel/vserver/space.c ++ * ++ * Virtual Server: Context Space Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from context.c 0.07 ++ * V0.02 added task locking for namespace ++ * V0.03 broken out vx_enter_namespace ++ * V0.04 added *space support and commands ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++atomic_t vs_global_nsproxy = ATOMIC_INIT(0); ++atomic_t vs_global_fs = ATOMIC_INIT(0); ++atomic_t vs_global_mnt_ns = ATOMIC_INIT(0); ++atomic_t vs_global_uts_ns = ATOMIC_INIT(0); ++atomic_t vs_global_ipc_ns = ATOMIC_INIT(0); ++ ++ ++/* namespace functions */ ++ ++#include ++ ++const struct vcmd_space_mask space_mask = { ++ .mask = CLONE_NEWNS | ++ CLONE_NEWUTS | ++ CLONE_NEWIPC | ++ CLONE_FS | ++ CLONE_NEWNET ++}; ++ ++ ++/* ++ * build a new nsproxy mix ++ * assumes that both proxies are 'const' ++ * does not touch nsproxy refcounts ++ * will hold a reference on the result. ++ */ ++ ++struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy, ++ struct nsproxy *new_nsproxy, unsigned long mask) ++{ ++ struct mnt_namespace *old_ns; ++ struct uts_namespace *old_uts; ++ struct ipc_namespace *old_ipc; ++ struct net *old_net; ++ struct nsproxy *nsproxy; ++ ++ nsproxy = copy_nsproxy(old_nsproxy); ++ if (!nsproxy) ++ goto out; ++ ++ if (mask & CLONE_NEWNS) { ++ old_ns = nsproxy->mnt_ns; ++ nsproxy->mnt_ns = new_nsproxy->mnt_ns; ++ if (nsproxy->mnt_ns) ++ get_mnt_ns(nsproxy->mnt_ns); ++ } else ++ old_ns = NULL; ++ ++ if (mask & CLONE_NEWUTS) { ++ old_uts = nsproxy->uts_ns; ++ nsproxy->uts_ns = new_nsproxy->uts_ns; ++ if (nsproxy->uts_ns) ++ get_uts_ns(nsproxy->uts_ns); ++ } else ++ old_uts = NULL; ++ ++ if (mask & CLONE_NEWIPC) { ++ old_ipc = nsproxy->ipc_ns; ++ nsproxy->ipc_ns = new_nsproxy->ipc_ns; ++ if (nsproxy->ipc_ns) ++ get_ipc_ns(nsproxy->ipc_ns); ++ } else ++ old_ipc = NULL; ++ ++ if (mask & CLONE_NEWNET) { ++ old_net = nsproxy->net_ns; ++ nsproxy->net_ns = new_nsproxy->net_ns; ++ if (nsproxy->net_ns) { ++ get_net(nsproxy->net_ns); ++ printk(KERN_ALERT "Cloning network namespace\n"); ++ } ++ } else ++ old_net = NULL; ++ ++ ++ if (old_ns) ++ put_mnt_ns(old_ns); ++ if (old_uts) ++ put_uts_ns(old_uts); ++ if (old_ipc) ++ put_ipc_ns(old_ipc); ++ if (old_net) ++ put_net(old_net); ++ ++out: ++ return nsproxy; ++} ++ ++ ++/* ++ * merge two nsproxy structs into a new one. ++ * will hold a reference on the result. 
++ */ ++ ++static inline ++struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old, ++ struct nsproxy *proxy, unsigned long mask) ++{ ++ struct nsproxy null_proxy = { .mnt_ns = NULL }; ++ ++ if (!proxy) ++ return NULL; ++ ++ if (mask) { ++ /* vs_mix_nsproxy returns with reference */ ++ return vs_mix_nsproxy(old ? old : &null_proxy, ++ proxy, mask); ++ } ++ get_nsproxy(proxy); ++ return proxy; ++} ++ ++/* ++ * merge two fs structs into a new one. ++ * will take a reference on the result. ++ */ ++ ++static inline ++struct fs_struct *__vs_merge_fs(struct fs_struct *old, ++ struct fs_struct *fs, unsigned long mask) ++{ ++ if (!(mask & CLONE_FS)) { ++ if (old) ++ atomic_inc(&old->count); ++ return old; ++ } ++ ++ if (!fs) ++ return NULL; ++ ++ return copy_fs_struct(fs); ++} ++ ++ ++int vx_enter_space(struct vx_info *vxi, unsigned long mask) ++{ ++ struct nsproxy *proxy, *proxy_cur, *proxy_new; ++ struct fs_struct *fs, *fs_cur, *fs_new; ++ int ret; ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ if (!mask) ++ mask = vxi->vx_nsmask; ++ ++ if ((mask & vxi->vx_nsmask) != mask) ++ return -EINVAL; ++ ++ proxy = vxi->vx_nsproxy; ++ fs = vxi->vx_fs; ++ ++ task_lock(current); ++ fs_cur = current->fs; ++ atomic_inc(&fs_cur->count); ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ fs_new = __vs_merge_fs(fs_cur, fs, mask); ++ if (IS_ERR(fs_new)) { ++ ret = PTR_ERR(fs_new); ++ goto out_put; ++ } ++ ++ proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put_fs; ++ } ++ ++ fs_new = xchg(¤t->fs, fs_new); ++ proxy_new = xchg(¤t->nsproxy, proxy_new); ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put_fs: ++ if (fs_new) ++ put_fs_struct(fs_new); ++out_put: ++ if (proxy_cur) ++ put_nsproxy(proxy_cur); ++ if (fs_cur) ++ put_fs_struct(fs_cur); ++ return ret; ++} ++ ++ ++int vx_set_space(struct vx_info *vxi, unsigned long mask) ++{ ++ struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new; ++ struct fs_struct *fs_vxi, *fs_cur, *fs_new; ++ int ret; ++ ++ if (!mask) ++ mask = space_mask.mask; ++ ++ if ((mask & space_mask.mask) != mask) ++ return -EINVAL; ++ ++ proxy_vxi = vxi->vx_nsproxy; ++ fs_vxi = vxi->vx_fs; ++ ++ task_lock(current); ++ fs_cur = current->fs; ++ atomic_inc(&fs_cur->count); ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ fs_new = __vs_merge_fs(fs_vxi, fs_cur, mask); ++ if (IS_ERR(fs_new)) { ++ ret = PTR_ERR(fs_new); ++ goto out_put; ++ } ++ ++ proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put_fs; ++ } ++ ++ fs_new = xchg(&vxi->vx_fs, fs_new); ++ proxy_new = xchg(&vxi->vx_nsproxy, proxy_new); ++ vxi->vx_nsmask |= mask; ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put_fs: ++ if (fs_new) ++ put_fs_struct(fs_new); ++out_put: ++ if (proxy_cur) ++ put_nsproxy(proxy_cur); ++ if (fs_cur) ++ put_fs_struct(fs_cur); ++ return ret; ++} ++ ++ ++int vc_enter_space(struct vx_info *vxi, void __user *data) ++{ ++ /* Ask dhozac how to pass this flag from user space - Sapan*/ ++ struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_enter_space(vxi, vc_data.mask); ++} ++ ++int vc_set_space(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask vc_data = { .mask = 0 }; ++ ++ if (data && 
copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_set_space(vxi, vc_data.mask); ++} ++ ++int vc_get_space_mask(struct vx_info *vxi, void __user *data) ++{ ++ if (copy_to_user(data, &space_mask, sizeof(space_mask))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -Nurb linux-2.6.22-594/net/core/net_namespace.c linux-2.6.22-595/net/core/net_namespace.c +--- linux-2.6.22-594/net/core/net_namespace.c 2008-03-20 00:05:18.000000000 -0400 ++++ linux-2.6.22-595/net/core/net_namespace.c 2008-03-20 00:14:56.000000000 -0400 +@@ -112,10 +112,12 @@ + ops = list_entry(ptr, struct pernet_operations, list); + if (ops->init) { + error = ops->init(net); +- if (error < 0) ++ if (error < 0) { ++ printk(KERN_ALERT "Error setting up netns: %x\n", ops->init); + goto out_undo; + } + } ++ } + out: + return error; + out_undo: +diff -Nurb linux-2.6.22-594/net/core/net_namespace.c.orig linux-2.6.22-595/net/core/net_namespace.c.orig +--- linux-2.6.22-594/net/core/net_namespace.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.22-595/net/core/net_namespace.c.orig 2008-03-20 00:05:18.000000000 -0400 +@@ -0,0 +1,332 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Our network namespace constructor/destructor lists ++ */ ++ ++static LIST_HEAD(pernet_list); ++static struct list_head *first_device = &pernet_list; ++static DEFINE_MUTEX(net_mutex); ++ ++static DEFINE_MUTEX(net_list_mutex); ++LIST_HEAD(net_namespace_list); ++ ++static struct kmem_cache *net_cachep; ++ ++struct net init_net; ++EXPORT_SYMBOL_GPL(init_net); ++ ++void net_lock(void) ++{ ++ mutex_lock(&net_list_mutex); ++} ++ ++void net_unlock(void) ++{ ++ mutex_unlock(&net_list_mutex); ++} ++ ++static struct net *net_alloc(void) ++{ ++ return kmem_cache_alloc(net_cachep, GFP_KERNEL); ++} ++ ++static void net_free(struct net *net) ++{ ++ if (!net) ++ return; ++ ++ if (unlikely(atomic_read(&net->use_count) != 0)) { ++ printk(KERN_EMERG "network namespace not free! Usage: %d\n", ++ atomic_read(&net->use_count)); ++ return; ++ } ++ ++ kmem_cache_free(net_cachep, net); ++} ++ ++static void cleanup_net(struct work_struct *work) ++{ ++ struct pernet_operations *ops; ++ struct list_head *ptr; ++ struct net *net; ++ ++ net = container_of(work, struct net, work); ++ ++ mutex_lock(&net_mutex); ++ ++ /* Don't let anyone else find us. */ ++ net_lock(); ++ list_del(&net->list); ++ net_unlock(); ++ ++ /* Run all of the network namespace exit methods */ ++ list_for_each_prev(ptr, &pernet_list) { ++ ops = list_entry(ptr, struct pernet_operations, list); ++ if (ops->exit) ++ ops->exit(net); ++ } ++ ++ mutex_unlock(&net_mutex); ++ ++ /* Ensure there are no outstanding rcu callbacks using this ++ * network namespace. ++ */ ++ rcu_barrier(); ++ ++ /* Finally it is safe to free my network namespace structure */ ++ net_free(net); ++} ++ ++ ++void __put_net(struct net *net) ++{ ++ /* Cleanup the network namespace in process context */ ++ INIT_WORK(&net->work, cleanup_net); ++ schedule_work(&net->work); ++} ++EXPORT_SYMBOL_GPL(__put_net); ++ ++/* ++ * setup_net runs the initializers for the network namespace object. 
++ */ ++static int setup_net(struct net *net) ++{ ++ /* Must be called with net_mutex held */ ++ struct pernet_operations *ops; ++ struct list_head *ptr; ++ int error; ++ ++ memset(net, 0, sizeof(struct net)); ++ atomic_set(&net->count, 1); ++ atomic_set(&net->use_count, 0); ++ ++ error = 0; ++ list_for_each(ptr, &pernet_list) { ++ ops = list_entry(ptr, struct pernet_operations, list); ++ if (ops->init) { ++ error = ops->init(net); ++ if (error < 0) ++ goto out_undo; ++ } ++ } ++out: ++ return error; ++out_undo: ++ /* Walk through the list backwards calling the exit functions ++ * for the pernet modules whose init functions did not fail. ++ */ ++ for (ptr = ptr->prev; ptr != &pernet_list; ptr = ptr->prev) { ++ ops = list_entry(ptr, struct pernet_operations, list); ++ if (ops->exit) ++ ops->exit(net); ++ } ++ goto out; ++} ++ ++struct net *copy_net_ns(unsigned long flags, struct net *old_net) ++{ ++ struct net *new_net = NULL; ++ int err; ++ ++ get_net(old_net); ++ ++ if (!(flags & CLONE_NEWNET)) ++ return old_net; ++ ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ err = -ENOMEM; ++ new_net = net_alloc(); ++ if (!new_net) ++ goto out; ++ ++ mutex_lock(&net_mutex); ++ err = setup_net(new_net); ++ if (err) ++ goto out_unlock; ++ ++ net_lock(); ++ list_add_tail(&new_net->list, &net_namespace_list); ++ net_unlock(); ++ ++ ++out_unlock: ++ mutex_unlock(&net_mutex); ++out: ++ put_net(old_net); ++ if (err) { ++ net_free(new_net); ++ new_net = ERR_PTR(err); ++ } ++ return new_net; ++} ++ ++static int __init net_ns_init(void) ++{ ++ int err; ++ ++ printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); ++ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), ++ SMP_CACHE_BYTES, ++ SLAB_PANIC, NULL, NULL); ++ mutex_lock(&net_mutex); ++ err = setup_net(&init_net); ++ ++ net_lock(); ++ list_add_tail(&init_net.list, &net_namespace_list); ++ net_unlock(); ++ ++ mutex_unlock(&net_mutex); ++ if (err) ++ panic("Could not setup the initial network namespace"); ++ ++ return 0; ++} ++ ++pure_initcall(net_ns_init); ++ ++static int register_pernet_operations(struct list_head *list, ++ struct pernet_operations *ops) ++{ ++ struct net *net, *undo_net; ++ int error; ++ ++ error = 0; ++ list_add_tail(&ops->list, list); ++ for_each_net(net) { ++ if (ops->init) { ++ error = ops->init(net); ++ if (error) ++ goto out_undo; ++ } ++ } ++out: ++ return error; ++ ++out_undo: ++ /* If I have an error cleanup all namespaces I initialized */ ++ list_del(&ops->list); ++ for_each_net(undo_net) { ++ if (undo_net == net) ++ goto undone; ++ if (ops->exit) ++ ops->exit(undo_net); ++ } ++undone: ++ goto out; ++} ++ ++static void unregister_pernet_operations(struct pernet_operations *ops) ++{ ++ struct net *net; ++ ++ list_del(&ops->list); ++ for_each_net(net) ++ if (ops->exit) ++ ops->exit(net); ++} ++ ++/** ++ * register_pernet_subsys - register a network namespace subsystem ++ * @ops: pernet operations structure for the subsystem ++ * ++ * Register a subsystem which has init and exit functions ++ * that are called when network namespaces are created and ++ * destroyed respectively. ++ * ++ * When registered all network namespace init functions are ++ * called for every existing network namespace. Allowing kernel ++ * modules to have a race free view of the set of network namespaces. ++ * ++ * When a new network namespace is created all of the init ++ * methods are called in the order in which they were registered. 
++ * ++ * When a network namespace is destroyed all of the exit methods ++ * are called in the reverse of the order with which they were ++ * registered. ++ */ ++int register_pernet_subsys(struct pernet_operations *ops) ++{ ++ int error; ++ mutex_lock(&net_mutex); ++ error = register_pernet_operations(first_device, ops); ++ mutex_unlock(&net_mutex); ++ return error; ++} ++EXPORT_SYMBOL_GPL(register_pernet_subsys); ++ ++/** ++ * unregister_pernet_subsys - unregister a network namespace subsystem ++ * @ops: pernet operations structure to manipulate ++ * ++ * Remove the pernet operations structure from the list to be ++ * used when network namespaces are created or destoryed. In ++ * addition run the exit method for all existing network ++ * namespaces. ++ */ ++void unregister_pernet_subsys(struct pernet_operations *module) ++{ ++ mutex_lock(&net_mutex); ++ unregister_pernet_operations(module); ++ mutex_unlock(&net_mutex); ++} ++EXPORT_SYMBOL_GPL(unregister_pernet_subsys); ++ ++/** ++ * register_pernet_device - register a network namespace device ++ * @ops: pernet operations structure for the subsystem ++ * ++ * Register a device which has init and exit functions ++ * that are called when network namespaces are created and ++ * destroyed respectively. ++ * ++ * When registered all network namespace init functions are ++ * called for every existing network namespace. Allowing kernel ++ * modules to have a race free view of the set of network namespaces. ++ * ++ * When a new network namespace is created all of the init ++ * methods are called in the order in which they were registered. ++ * ++ * When a network namespace is destroyed all of the exit methods ++ * are called in the reverse of the order with which they were ++ * registered. ++ */ ++int register_pernet_device(struct pernet_operations *ops) ++{ ++ int error; ++ mutex_lock(&net_mutex); ++ error = register_pernet_operations(&pernet_list, ops); ++ if (!error && (first_device == &pernet_list)) ++ first_device = &ops->list; ++ mutex_unlock(&net_mutex); ++ return error; ++} ++EXPORT_SYMBOL_GPL(register_pernet_device); ++ ++/** ++ * unregister_pernet_device - unregister a network namespace netdevice ++ * @ops: pernet operations structure to manipulate ++ * ++ * Remove the pernet operations structure from the list to be ++ * used when network namespaces are created or destoryed. In ++ * addition run the exit method for all existing network ++ * namespaces. ++ */ ++void unregister_pernet_device(struct pernet_operations *ops) ++{ ++ mutex_lock(&net_mutex); ++ if (&ops->list == first_device) ++ first_device = first_device->next; ++ unregister_pernet_operations(ops); ++ mutex_unlock(&net_mutex); ++} ++EXPORT_SYMBOL_GPL(unregister_pernet_device); +diff -Nurb linux-2.6.22-594/net/ipv4/af_inet.c.orig linux-2.6.22-595/net/ipv4/af_inet.c.orig +--- linux-2.6.22-594/net/ipv4/af_inet.c.orig 2008-03-20 00:05:18.000000000 -0400 ++++ linux-2.6.22-595/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,1522 +0,0 @@ +-/* +- * INET An implementation of the TCP/IP protocol suite for the LINUX +- * operating system. INET is implemented using the BSD Socket +- * interface as the means of communication with the user level. +- * +- * PF_INET protocol family socket handler. +- * +- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ +- * +- * Authors: Ross Biro +- * Fred N. 
van Kempen, +- * Florian La Roche, +- * Alan Cox, +- * +- * Changes (see also sock.c) +- * +- * piggy, +- * Karl Knutson : Socket protocol table +- * A.N.Kuznetsov : Socket death error in accept(). +- * John Richardson : Fix non blocking error in connect() +- * so sockets that fail to connect +- * don't return -EINPROGRESS. +- * Alan Cox : Asynchronous I/O support +- * Alan Cox : Keep correct socket pointer on sock +- * structures +- * when accept() ed +- * Alan Cox : Semantics of SO_LINGER aren't state +- * moved to close when you look carefully. +- * With this fixed and the accept bug fixed +- * some RPC stuff seems happier. +- * Niibe Yutaka : 4.4BSD style write async I/O +- * Alan Cox, +- * Tony Gale : Fixed reuse semantics. +- * Alan Cox : bind() shouldn't abort existing but dead +- * sockets. Stops FTP netin:.. I hope. +- * Alan Cox : bind() works correctly for RAW sockets. +- * Note that FreeBSD at least was broken +- * in this respect so be careful with +- * compatibility tests... +- * Alan Cox : routing cache support +- * Alan Cox : memzero the socket structure for +- * compactness. +- * Matt Day : nonblock connect error handler +- * Alan Cox : Allow large numbers of pending sockets +- * (eg for big web sites), but only if +- * specifically application requested. +- * Alan Cox : New buffering throughout IP. Used +- * dumbly. +- * Alan Cox : New buffering now used smartly. +- * Alan Cox : BSD rather than common sense +- * interpretation of listen. +- * Germano Caronni : Assorted small races. +- * Alan Cox : sendmsg/recvmsg basic support. +- * Alan Cox : Only sendmsg/recvmsg now supported. +- * Alan Cox : Locked down bind (see security list). +- * Alan Cox : Loosened bind a little. +- * Mike McLagan : ADD/DEL DLCI Ioctls +- * Willy Konynenberg : Transparent proxying support. +- * David S. Miller : New socket lookup architecture. +- * Some other random speedups. +- * Cyrus Durgin : Cleaned up file for kmod hacks. +- * Andi Kleen : Fix inet_stream_connect TCP race. +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU General Public License +- * as published by the Free Software Foundation; either version +- * 2 of the License, or (at your option) any later version. +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#ifdef CONFIG_IP_MROUTE +-#include +-#endif +-#include +- +-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; +- +-extern void ip_mc_drop_socket(struct sock *sk); +- +-/* The inetsw table contains everything that inet_create needs to +- * build a new socket. 
+- */ +-static struct list_head inetsw[SOCK_MAX]; +-static DEFINE_SPINLOCK(inetsw_lock); +- +-/* New destruction routine */ +- +-void inet_sock_destruct(struct sock *sk) +-{ +- struct inet_sock *inet = inet_sk(sk); +- +- __skb_queue_purge(&sk->sk_receive_queue); +- __skb_queue_purge(&sk->sk_error_queue); +- +- if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { +- printk("Attempt to release TCP socket in state %d %p\n", +- sk->sk_state, sk); +- return; +- } +- if (!sock_flag(sk, SOCK_DEAD)) { +- printk("Attempt to release alive inet socket %p\n", sk); +- return; +- } +- +- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); +- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); +- BUG_TRAP(!sk->sk_wmem_queued); +- BUG_TRAP(!sk->sk_forward_alloc); +- +- kfree(inet->opt); +- dst_release(sk->sk_dst_cache); +- sk_refcnt_debug_dec(sk); +-} +- +-/* +- * The routines beyond this point handle the behaviour of an AF_INET +- * socket object. Mostly it punts to the subprotocols of IP to do +- * the work. +- */ +- +-/* +- * Automatically bind an unbound socket. +- */ +- +-static int inet_autobind(struct sock *sk) +-{ +- struct inet_sock *inet; +- /* We may need to bind the socket. */ +- lock_sock(sk); +- inet = inet_sk(sk); +- if (!inet->num) { +- if (sk->sk_prot->get_port(sk, 0)) { +- release_sock(sk); +- return -EAGAIN; +- } +- inet->sport = htons(inet->num); +- sk->sk_xid = vx_current_xid(); +- sk->sk_nid = nx_current_nid(); +- } +- release_sock(sk); +- return 0; +-} +- +-/* +- * Move a socket into listening state. +- */ +-int inet_listen(struct socket *sock, int backlog) +-{ +- struct sock *sk = sock->sk; +- unsigned char old_state; +- int err; +- +- lock_sock(sk); +- +- err = -EINVAL; +- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) +- goto out; +- +- old_state = sk->sk_state; +- if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) +- goto out; +- +- /* Really, if the socket is already in listen state +- * we can only allow the backlog to be adjusted. +- */ +- if (old_state != TCP_LISTEN) { +- err = inet_csk_listen_start(sk, backlog); +- if (err) +- goto out; +- } +- sk->sk_max_ack_backlog = backlog; +- err = 0; +- +-out: +- release_sock(sk); +- return err; +-} +- +-u32 inet_ehash_secret __read_mostly; +-EXPORT_SYMBOL(inet_ehash_secret); +- +-/* +- * inet_ehash_secret must be set exactly once +- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock +- */ +-void build_ehash_secret(void) +-{ +- u32 rnd; +- do { +- get_random_bytes(&rnd, sizeof(rnd)); +- } while (rnd == 0); +- spin_lock_bh(&inetsw_lock); +- if (!inet_ehash_secret) +- inet_ehash_secret = rnd; +- spin_unlock_bh(&inetsw_lock); +-} +-EXPORT_SYMBOL(build_ehash_secret); +- +-/* +- * Create an inet socket. +- */ +- +-static int inet_create(struct socket *sock, int protocol) +-{ +- struct sock *sk; +- struct list_head *p; +- struct inet_protosw *answer; +- struct inet_sock *inet; +- struct proto *answer_prot; +- unsigned char answer_flags; +- char answer_no_check; +- int try_loading_module = 0; +- int err; +- +- if (sock->type != SOCK_RAW && +- sock->type != SOCK_DGRAM && +- !inet_ehash_secret) +- build_ehash_secret(); +- +- sock->state = SS_UNCONNECTED; +- +- /* Look for the requested type/protocol pair. */ +- answer = NULL; +-lookup_protocol: +- err = -ESOCKTNOSUPPORT; +- rcu_read_lock(); +- list_for_each_rcu(p, &inetsw[sock->type]) { +- answer = list_entry(p, struct inet_protosw, list); +- +- /* Check the non-wild match. 
*/ +- if (protocol == answer->protocol) { +- if (protocol != IPPROTO_IP) +- break; +- } else { +- /* Check for the two wild cases. */ +- if (IPPROTO_IP == protocol) { +- protocol = answer->protocol; +- break; +- } +- if (IPPROTO_IP == answer->protocol) +- break; +- } +- err = -EPROTONOSUPPORT; +- answer = NULL; +- } +- +- if (unlikely(answer == NULL)) { +- if (try_loading_module < 2) { +- rcu_read_unlock(); +- /* +- * Be more specific, e.g. net-pf-2-proto-132-type-1 +- * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) +- */ +- if (++try_loading_module == 1) +- request_module("net-pf-%d-proto-%d-type-%d", +- PF_INET, protocol, sock->type); +- /* +- * Fall back to generic, e.g. net-pf-2-proto-132 +- * (net-pf-PF_INET-proto-IPPROTO_SCTP) +- */ +- else +- request_module("net-pf-%d-proto-%d", +- PF_INET, protocol); +- goto lookup_protocol; +- } else +- goto out_rcu_unlock; +- } +- +- err = -EPERM; +- if ((protocol == IPPROTO_ICMP) && +- nx_capable(answer->capability, NXC_RAW_ICMP)) +- goto override; +- if (sock->type == SOCK_RAW && +- nx_capable(answer->capability, NXC_RAW_SOCKET)) +- goto override; +- if (answer->capability > 0 && !capable(answer->capability)) +- goto out_rcu_unlock; +-override: +- sock->ops = answer->ops; +- answer_prot = answer->prot; +- answer_no_check = answer->no_check; +- answer_flags = answer->flags; +- rcu_read_unlock(); +- +- BUG_TRAP(answer_prot->slab != NULL); +- +- err = -ENOBUFS; +- sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); +- if (sk == NULL) +- goto out; +- +- err = 0; +- sk->sk_no_check = answer_no_check; +- if (INET_PROTOSW_REUSE & answer_flags) +- sk->sk_reuse = 1; +- +- inet = inet_sk(sk); +- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; +- +- if (SOCK_RAW == sock->type) { +- inet->num = protocol; +- if (IPPROTO_RAW == protocol) +- inet->hdrincl = 1; +- } +- +- if (ipv4_config.no_pmtu_disc) +- inet->pmtudisc = IP_PMTUDISC_DONT; +- else +- inet->pmtudisc = IP_PMTUDISC_WANT; +- +- inet->id = 0; +- +- sock_init_data(sock, sk); +- +- sk->sk_destruct = inet_sock_destruct; +- sk->sk_family = PF_INET; +- sk->sk_protocol = protocol; +- sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; +- +- inet->uc_ttl = -1; +- inet->mc_loop = 1; +- inet->mc_ttl = 1; +- inet->mc_index = 0; +- inet->mc_list = NULL; +- +- sk_refcnt_debug_inc(sk); +- +- if (inet->num) { +- /* It assumes that any protocol which allows +- * the user to assign a number at socket +- * creation time automatically +- * shares. +- */ +- inet->sport = htons(inet->num); +- /* Add to protocol hash chains. */ +- sk->sk_prot->hash(sk); +- } +- +- if (sk->sk_prot->init) { +- err = sk->sk_prot->init(sk); +- if (err) +- sk_common_release(sk); +- } +-out: +- return err; +-out_rcu_unlock: +- rcu_read_unlock(); +- goto out; +-} +- +- +-/* +- * The peer socket should always be NULL (or else). When we call this +- * function we are destroying the object and from then on nobody +- * should refer to it. +- */ +-int inet_release(struct socket *sock) +-{ +- struct sock *sk = sock->sk; +- +- if (sk) { +- long timeout; +- +- /* Applications forget to leave groups before exiting */ +- ip_mc_drop_socket(sk); +- +- /* If linger is set, we don't return until the close +- * is complete. Otherwise we return immediately. The +- * actually closing is done the same either way. +- * +- * If the close is due to the process exiting, we never +- * linger.. 
+- */ +- timeout = 0; +- if (sock_flag(sk, SOCK_LINGER) && +- !(current->flags & PF_EXITING)) +- timeout = sk->sk_lingertime; +- sock->sk = NULL; +- sk->sk_prot->close(sk, timeout); +- } +- return 0; +-} +- +-/* It is off by default, see below. */ +-int sysctl_ip_nonlocal_bind __read_mostly; +- +-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +-{ +- struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; +- struct sock *sk = sock->sk; +- struct inet_sock *inet = inet_sk(sk); +- struct nx_v4_sock_addr nsa; +- unsigned short snum; +- int chk_addr_ret; +- int err; +- +- /* If the socket has its own bind function then use it. (RAW) */ +- if (sk->sk_prot->bind) { +- err = sk->sk_prot->bind(sk, uaddr, addr_len); +- goto out; +- } +- err = -EINVAL; +- if (addr_len < sizeof(struct sockaddr_in)) +- goto out; +- +- err = v4_map_sock_addr(inet, addr, &nsa); +- if (err) +- goto out; +- +- chk_addr_ret = inet_addr_type(nsa.saddr); +- +- /* Not specified by any standard per-se, however it breaks too +- * many applications when removed. It is unfortunate since +- * allowing applications to make a non-local bind solves +- * several problems with systems using dynamic addressing. +- * (ie. your servers still start up even if your ISDN link +- * is temporarily down) +- */ +- err = -EADDRNOTAVAIL; +- if (!sysctl_ip_nonlocal_bind && +- !inet->freebind && +- nsa.saddr != INADDR_ANY && +- chk_addr_ret != RTN_LOCAL && +- chk_addr_ret != RTN_MULTICAST && +- chk_addr_ret != RTN_BROADCAST) +- goto out; +- +- snum = ntohs(addr->sin_port); +- err = -EACCES; +- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) +- goto out; +- +- /* We keep a pair of addresses. rcv_saddr is the one +- * used by hash lookups, and saddr is used for transmit. +- * +- * In the BSD API these are the same except where it +- * would be illegal to use them (multicast/broadcast) in +- * which case the sending device address is used. +- */ +- lock_sock(sk); +- +- /* Check these errors (active socket, double bind). */ +- err = -EINVAL; +- if (sk->sk_state != TCP_CLOSE || inet->num) +- goto out_release_sock; +- +- v4_set_sock_addr(inet, &nsa); +- if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) +- inet->saddr = 0; /* Use device */ +- +- /* Make sure we are allowed to bind here. */ +- if (sk->sk_prot->get_port(sk, snum)) { +- inet->saddr = inet->rcv_saddr = 0; +- err = -EADDRINUSE; +- goto out_release_sock; +- } +- +- if (inet->rcv_saddr) +- sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +- if (snum) +- sk->sk_userlocks |= SOCK_BINDPORT_LOCK; +- inet->sport = htons(inet->num); +- inet->daddr = 0; +- inet->dport = 0; +- sk_dst_reset(sk); +- err = 0; +-out_release_sock: +- release_sock(sk); +-out: +- return err; +-} +- +-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +- int addr_len, int flags) +-{ +- struct sock *sk = sock->sk; +- +- if (uaddr->sa_family == AF_UNSPEC) +- return sk->sk_prot->disconnect(sk, flags); +- +- if (!inet_sk(sk)->num && inet_autobind(sk)) +- return -EAGAIN; +- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +-} +- +-static long inet_wait_for_connect(struct sock *sk, long timeo) +-{ +- DEFINE_WAIT(wait); +- +- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); +- +- /* Basic assumption: if someone sets sk->sk_err, he _must_ +- * change state of the socket from TCP_SYN_*. +- * Connect() does not allow to get error notifications +- * without closing the socket. 
+- */ +- while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { +- release_sock(sk); +- timeo = schedule_timeout(timeo); +- lock_sock(sk); +- if (signal_pending(current) || !timeo) +- break; +- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); +- } +- finish_wait(sk->sk_sleep, &wait); +- return timeo; +-} +- +-/* +- * Connect to a remote host. There is regrettably still a little +- * TCP 'magic' in here. +- */ +-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +- int addr_len, int flags) +-{ +- struct sock *sk = sock->sk; +- int err; +- long timeo; +- +- lock_sock(sk); +- +- if (uaddr->sa_family == AF_UNSPEC) { +- err = sk->sk_prot->disconnect(sk, flags); +- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; +- goto out; +- } +- +- switch (sock->state) { +- default: +- err = -EINVAL; +- goto out; +- case SS_CONNECTED: +- err = -EISCONN; +- goto out; +- case SS_CONNECTING: +- err = -EALREADY; +- /* Fall out of switch with err, set for this state */ +- break; +- case SS_UNCONNECTED: +- err = -EISCONN; +- if (sk->sk_state != TCP_CLOSE) +- goto out; +- +- err = sk->sk_prot->connect(sk, uaddr, addr_len); +- if (err < 0) +- goto out; +- +- sock->state = SS_CONNECTING; +- +- /* Just entered SS_CONNECTING state; the only +- * difference is that return value in non-blocking +- * case is EINPROGRESS, rather than EALREADY. +- */ +- err = -EINPROGRESS; +- break; +- } +- +- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); +- +- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { +- /* Error code is set above */ +- if (!timeo || !inet_wait_for_connect(sk, timeo)) +- goto out; +- +- err = sock_intr_errno(timeo); +- if (signal_pending(current)) +- goto out; +- } +- +- /* Connection was closed by RST, timeout, ICMP error +- * or another process disconnected us. +- */ +- if (sk->sk_state == TCP_CLOSE) +- goto sock_error; +- +- /* sk->sk_err may be not zero now, if RECVERR was ordered by user +- * and error was received after socket entered established state. +- * Hence, it is handled normally after connect() return successfully. +- */ +- +- sock->state = SS_CONNECTED; +- err = 0; +-out: +- release_sock(sk); +- return err; +- +-sock_error: +- err = sock_error(sk) ? : -ECONNABORTED; +- sock->state = SS_UNCONNECTED; +- if (sk->sk_prot->disconnect(sk, flags)) +- sock->state = SS_DISCONNECTING; +- goto out; +-} +- +-/* +- * Accept a pending connection. The TCP layer now gives BSD semantics. +- */ +- +-int inet_accept(struct socket *sock, struct socket *newsock, int flags) +-{ +- struct sock *sk1 = sock->sk; +- int err = -EINVAL; +- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); +- +- if (!sk2) +- goto do_err; +- +- lock_sock(sk2); +- +- BUG_TRAP((1 << sk2->sk_state) & +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); +- +- sock_graft(sk2, newsock); +- +- newsock->state = SS_CONNECTED; +- err = 0; +- release_sock(sk2); +-do_err: +- return err; +-} +- +- +-/* +- * This does both peername and sockname. 
+- */ +-int inet_getname(struct socket *sock, struct sockaddr *uaddr, +- int *uaddr_len, int peer) +-{ +- struct sock *sk = sock->sk; +- struct inet_sock *inet = inet_sk(sk); +- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; +- +- sin->sin_family = AF_INET; +- if (peer) { +- if (!inet->dport || +- (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && +- peer == 1)) +- return -ENOTCONN; +- sin->sin_port = inet->dport; +- sin->sin_addr.s_addr = +- nx_map_sock_lback(sk->sk_nx_info, inet->daddr); +- } else { +- __be32 addr = inet->rcv_saddr; +- if (!addr) +- addr = inet->saddr; +- addr = nx_map_sock_lback(sk->sk_nx_info, addr); +- sin->sin_port = inet->sport; +- sin->sin_addr.s_addr = addr; +- } +- memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +- *uaddr_len = sizeof(*sin); +- return 0; +-} +- +-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, +- size_t size) +-{ +- struct sock *sk = sock->sk; +- +- /* We may need to bind the socket. */ +- if (!inet_sk(sk)->num && inet_autobind(sk)) +- return -EAGAIN; +- +- return sk->sk_prot->sendmsg(iocb, sk, msg, size); +-} +- +- +-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +-{ +- struct sock *sk = sock->sk; +- +- /* We may need to bind the socket. */ +- if (!inet_sk(sk)->num && inet_autobind(sk)) +- return -EAGAIN; +- +- if (sk->sk_prot->sendpage) +- return sk->sk_prot->sendpage(sk, page, offset, size, flags); +- return sock_no_sendpage(sock, page, offset, size, flags); +-} +- +- +-int inet_shutdown(struct socket *sock, int how) +-{ +- struct sock *sk = sock->sk; +- int err = 0; +- +- /* This should really check to make sure +- * the socket is a TCP socket. (WHY AC...) +- */ +- how++; /* maps 0->1 has the advantage of making bit 1 rcvs and +- 1->2 bit 2 snds. +- 2->3 */ +- if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ +- return -EINVAL; +- +- lock_sock(sk); +- if (sock->state == SS_CONNECTING) { +- if ((1 << sk->sk_state) & +- (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) +- sock->state = SS_DISCONNECTING; +- else +- sock->state = SS_CONNECTED; +- } +- +- switch (sk->sk_state) { +- case TCP_CLOSE: +- err = -ENOTCONN; +- /* Hack to wake up other listeners, who can poll for +- POLLHUP, even on eg. unconnected UDP sockets -- RR */ +- default: +- sk->sk_shutdown |= how; +- if (sk->sk_prot->shutdown) +- sk->sk_prot->shutdown(sk, how); +- break; +- +- /* Remaining two branches are temporary solution for missing +- * close() in multithreaded environment. It is _not_ a good idea, +- * but we have no choice until close() is repaired at VFS level. +- */ +- case TCP_LISTEN: +- if (!(how & RCV_SHUTDOWN)) +- break; +- /* Fall through */ +- case TCP_SYN_SENT: +- err = sk->sk_prot->disconnect(sk, O_NONBLOCK); +- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; +- break; +- } +- +- /* Wake up anyone sleeping in poll. */ +- sk->sk_state_change(sk); +- release_sock(sk); +- return err; +-} +- +-/* +- * ioctl() calls you can issue on an INET socket. Most of these are +- * device configuration and stuff and very rarely used. Some ioctls +- * pass on to the socket itself. +- * +- * NOTE: I like the idea of a module for the config stuff. ie ifconfig +- * loads the devconfigure module does its configuring and unloads it. +- * There's a good 20K of config code hanging around the kernel. 
+- */ +- +-int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +-{ +- struct sock *sk = sock->sk; +- int err = 0; +- +- switch (cmd) { +- case SIOCGSTAMP: +- err = sock_get_timestamp(sk, (struct timeval __user *)arg); +- break; +- case SIOCGSTAMPNS: +- err = sock_get_timestampns(sk, (struct timespec __user *)arg); +- break; +- case SIOCADDRT: +- case SIOCDELRT: +- case SIOCRTMSG: +- err = ip_rt_ioctl(cmd, (void __user *)arg); +- break; +- case SIOCDARP: +- case SIOCGARP: +- case SIOCSARP: +- err = arp_ioctl(cmd, (void __user *)arg); +- break; +- case SIOCGIFADDR: +- case SIOCSIFADDR: +- case SIOCGIFBRDADDR: +- case SIOCSIFBRDADDR: +- case SIOCGIFNETMASK: +- case SIOCSIFNETMASK: +- case SIOCGIFDSTADDR: +- case SIOCSIFDSTADDR: +- case SIOCSIFPFLAGS: +- case SIOCGIFPFLAGS: +- case SIOCSIFFLAGS: +- err = devinet_ioctl(cmd, (void __user *)arg); +- break; +- default: +- if (sk->sk_prot->ioctl) +- err = sk->sk_prot->ioctl(sk, cmd, arg); +- else +- err = -ENOIOCTLCMD; +- break; +- } +- return err; +-} +- +-const struct proto_ops inet_stream_ops = { +- .family = PF_INET, +- .owner = THIS_MODULE, +- .release = inet_release, +- .bind = inet_bind, +- .connect = inet_stream_connect, +- .socketpair = sock_no_socketpair, +- .accept = inet_accept, +- .getname = inet_getname, +- .poll = tcp_poll, +- .ioctl = inet_ioctl, +- .listen = inet_listen, +- .shutdown = inet_shutdown, +- .setsockopt = sock_common_setsockopt, +- .getsockopt = sock_common_getsockopt, +- .sendmsg = tcp_sendmsg, +- .recvmsg = sock_common_recvmsg, +- .mmap = sock_no_mmap, +- .sendpage = tcp_sendpage, +-#ifdef CONFIG_COMPAT +- .compat_setsockopt = compat_sock_common_setsockopt, +- .compat_getsockopt = compat_sock_common_getsockopt, +-#endif +-}; +- +-const struct proto_ops inet_dgram_ops = { +- .family = PF_INET, +- .owner = THIS_MODULE, +- .release = inet_release, +- .bind = inet_bind, +- .connect = inet_dgram_connect, +- .socketpair = sock_no_socketpair, +- .accept = sock_no_accept, +- .getname = inet_getname, +- .poll = udp_poll, +- .ioctl = inet_ioctl, +- .listen = sock_no_listen, +- .shutdown = inet_shutdown, +- .setsockopt = sock_common_setsockopt, +- .getsockopt = sock_common_getsockopt, +- .sendmsg = inet_sendmsg, +- .recvmsg = sock_common_recvmsg, +- .mmap = sock_no_mmap, +- .sendpage = inet_sendpage, +-#ifdef CONFIG_COMPAT +- .compat_setsockopt = compat_sock_common_setsockopt, +- .compat_getsockopt = compat_sock_common_getsockopt, +-#endif +-}; +- +-/* +- * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without +- * udp_poll +- */ +-static const struct proto_ops inet_sockraw_ops = { +- .family = PF_INET, +- .owner = THIS_MODULE, +- .release = inet_release, +- .bind = inet_bind, +- .connect = inet_dgram_connect, +- .socketpair = sock_no_socketpair, +- .accept = sock_no_accept, +- .getname = inet_getname, +- .poll = datagram_poll, +- .ioctl = inet_ioctl, +- .listen = sock_no_listen, +- .shutdown = inet_shutdown, +- .setsockopt = sock_common_setsockopt, +- .getsockopt = sock_common_getsockopt, +- .sendmsg = inet_sendmsg, +- .recvmsg = sock_common_recvmsg, +- .mmap = sock_no_mmap, +- .sendpage = inet_sendpage, +-#ifdef CONFIG_COMPAT +- .compat_setsockopt = compat_sock_common_setsockopt, +- .compat_getsockopt = compat_sock_common_getsockopt, +-#endif +-}; +- +-static struct net_proto_family inet_family_ops = { +- .family = PF_INET, +- .create = inet_create, +- .owner = THIS_MODULE, +-}; +- +-/* Upon startup we insert all the elements in inetsw_array[] into +- * the linked list inetsw. 
+- */ +-static struct inet_protosw inetsw_array[] = +-{ +- { +- .type = SOCK_STREAM, +- .protocol = IPPROTO_TCP, +- .prot = &tcp_prot, +- .ops = &inet_stream_ops, +- .capability = -1, +- .no_check = 0, +- .flags = INET_PROTOSW_PERMANENT | +- INET_PROTOSW_ICSK, +- }, +- +- { +- .type = SOCK_DGRAM, +- .protocol = IPPROTO_UDP, +- .prot = &udp_prot, +- .ops = &inet_dgram_ops, +- .capability = -1, +- .no_check = UDP_CSUM_DEFAULT, +- .flags = INET_PROTOSW_PERMANENT, +- }, +- +- +- { +- .type = SOCK_RAW, +- .protocol = IPPROTO_IP, /* wild card */ +- .prot = &raw_prot, +- .ops = &inet_sockraw_ops, +- .capability = CAP_NET_RAW, +- .no_check = UDP_CSUM_DEFAULT, +- .flags = INET_PROTOSW_REUSE, +- } +-}; +- +-#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) +- +-void inet_register_protosw(struct inet_protosw *p) +-{ +- struct list_head *lh; +- struct inet_protosw *answer; +- int protocol = p->protocol; +- struct list_head *last_perm; +- +- spin_lock_bh(&inetsw_lock); +- +- if (p->type >= SOCK_MAX) +- goto out_illegal; +- +- /* If we are trying to override a permanent protocol, bail. */ +- answer = NULL; +- last_perm = &inetsw[p->type]; +- list_for_each(lh, &inetsw[p->type]) { +- answer = list_entry(lh, struct inet_protosw, list); +- +- /* Check only the non-wild match. */ +- if (INET_PROTOSW_PERMANENT & answer->flags) { +- if (protocol == answer->protocol) +- break; +- last_perm = lh; +- } +- +- answer = NULL; +- } +- if (answer) +- goto out_permanent; +- +- /* Add the new entry after the last permanent entry if any, so that +- * the new entry does not override a permanent entry when matched with +- * a wild-card protocol. But it is allowed to override any existing +- * non-permanent entry. This means that when we remove this entry, the +- * system automatically returns to the old behavior. +- */ +- list_add_rcu(&p->list, last_perm); +-out: +- spin_unlock_bh(&inetsw_lock); +- +- synchronize_net(); +- +- return; +- +-out_permanent: +- printk(KERN_ERR "Attempt to override permanent protocol %d.\n", +- protocol); +- goto out; +- +-out_illegal: +- printk(KERN_ERR +- "Ignoring attempt to register invalid socket type %d.\n", +- p->type); +- goto out; +-} +- +-void inet_unregister_protosw(struct inet_protosw *p) +-{ +- if (INET_PROTOSW_PERMANENT & p->flags) { +- printk(KERN_ERR +- "Attempt to unregister permanent protocol %d.\n", +- p->protocol); +- } else { +- spin_lock_bh(&inetsw_lock); +- list_del_rcu(&p->list); +- spin_unlock_bh(&inetsw_lock); +- +- synchronize_net(); +- } +-} +- +-/* +- * Shall we try to damage output packets if routing dev changes? +- */ +- +-int sysctl_ip_dynaddr __read_mostly; +- +-static int inet_sk_reselect_saddr(struct sock *sk) +-{ +- struct inet_sock *inet = inet_sk(sk); +- int err; +- struct rtable *rt; +- __be32 old_saddr = inet->saddr; +- __be32 new_saddr; +- __be32 daddr = inet->daddr; +- +- if (inet->opt && inet->opt->srr) +- daddr = inet->opt->faddr; +- +- /* Query new route. 
*/ +- err = ip_route_connect(&rt, daddr, 0, +- RT_CONN_FLAGS(sk), +- sk->sk_bound_dev_if, +- sk->sk_protocol, +- inet->sport, inet->dport, sk, 0); +- if (err) +- return err; +- +- sk_setup_caps(sk, &rt->u.dst); +- +- new_saddr = rt->rt_src; +- +- if (new_saddr == old_saddr) +- return 0; +- +- if (sysctl_ip_dynaddr > 1) { +- printk(KERN_INFO "%s(): shifting inet->" +- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", +- __FUNCTION__, +- NIPQUAD(old_saddr), +- NIPQUAD(new_saddr)); +- } +- +- inet->saddr = inet->rcv_saddr = new_saddr; +- +- /* +- * XXX The only one ugly spot where we need to +- * XXX really change the sockets identity after +- * XXX it has entered the hashes. -DaveM +- * +- * Besides that, it does not check for connection +- * uniqueness. Wait for troubles. +- */ +- __sk_prot_rehash(sk); +- return 0; +-} +- +-int inet_sk_rebuild_header(struct sock *sk) +-{ +- struct inet_sock *inet = inet_sk(sk); +- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); +- __be32 daddr; +- int err; +- +- /* Route is OK, nothing to do. */ +- if (rt) +- return 0; +- +- /* Reroute. */ +- daddr = inet->daddr; +- if (inet->opt && inet->opt->srr) +- daddr = inet->opt->faddr; +-{ +- struct flowi fl = { +- .oif = sk->sk_bound_dev_if, +- .nl_u = { +- .ip4_u = { +- .daddr = daddr, +- .saddr = inet->saddr, +- .tos = RT_CONN_FLAGS(sk), +- }, +- }, +- .proto = sk->sk_protocol, +- .uli_u = { +- .ports = { +- .sport = inet->sport, +- .dport = inet->dport, +- }, +- }, +- }; +- +- security_sk_classify_flow(sk, &fl); +- err = ip_route_output_flow(&rt, &fl, sk, 0); +-} +- if (!err) +- sk_setup_caps(sk, &rt->u.dst); +- else { +- /* Routing failed... */ +- sk->sk_route_caps = 0; +- /* +- * Other protocols have to map its equivalent state to TCP_SYN_SENT. +- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. 
-acme +- */ +- if (!sysctl_ip_dynaddr || +- sk->sk_state != TCP_SYN_SENT || +- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || +- (err = inet_sk_reselect_saddr(sk)) != 0) +- sk->sk_err_soft = -err; +- } +- +- return err; +-} +- +-EXPORT_SYMBOL(inet_sk_rebuild_header); +- +-static int inet_gso_send_check(struct sk_buff *skb) +-{ +- struct iphdr *iph; +- struct net_protocol *ops; +- int proto; +- int ihl; +- int err = -EINVAL; +- +- if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) +- goto out; +- +- iph = ip_hdr(skb); +- ihl = iph->ihl * 4; +- if (ihl < sizeof(*iph)) +- goto out; +- +- if (unlikely(!pskb_may_pull(skb, ihl))) +- goto out; +- +- __skb_pull(skb, ihl); +- skb_reset_transport_header(skb); +- iph = ip_hdr(skb); +- proto = iph->protocol & (MAX_INET_PROTOS - 1); +- err = -EPROTONOSUPPORT; +- +- rcu_read_lock(); +- ops = rcu_dereference(inet_protos[proto]); +- if (likely(ops && ops->gso_send_check)) +- err = ops->gso_send_check(skb); +- rcu_read_unlock(); +- +-out: +- return err; +-} +- +-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +-{ +- struct sk_buff *segs = ERR_PTR(-EINVAL); +- struct iphdr *iph; +- struct net_protocol *ops; +- int proto; +- int ihl; +- int id; +- +- if (unlikely(skb_shinfo(skb)->gso_type & +- ~(SKB_GSO_TCPV4 | +- SKB_GSO_UDP | +- SKB_GSO_DODGY | +- SKB_GSO_TCP_ECN | +- 0))) +- goto out; +- +- if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) +- goto out; +- +- iph = ip_hdr(skb); +- ihl = iph->ihl * 4; +- if (ihl < sizeof(*iph)) +- goto out; +- +- if (unlikely(!pskb_may_pull(skb, ihl))) +- goto out; +- +- __skb_pull(skb, ihl); +- skb_reset_transport_header(skb); +- iph = ip_hdr(skb); +- id = ntohs(iph->id); +- proto = iph->protocol & (MAX_INET_PROTOS - 1); +- segs = ERR_PTR(-EPROTONOSUPPORT); +- +- rcu_read_lock(); +- ops = rcu_dereference(inet_protos[proto]); +- if (likely(ops && ops->gso_segment)) +- segs = ops->gso_segment(skb, features); +- rcu_read_unlock(); +- +- if (!segs || unlikely(IS_ERR(segs))) +- goto out; +- +- skb = segs; +- do { +- iph = ip_hdr(skb); +- iph->id = htons(id++); +- iph->tot_len = htons(skb->len - skb->mac_len); +- iph->check = 0; +- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); +- } while ((skb = skb->next)); +- +-out: +- return segs; +-} +- +-unsigned long snmp_fold_field(void *mib[], int offt) +-{ +- unsigned long res = 0; +- int i; +- +- for_each_possible_cpu(i) { +- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); +- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); +- } +- return res; +-} +-EXPORT_SYMBOL_GPL(snmp_fold_field); +- +-int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +-{ +- BUG_ON(ptr == NULL); +- ptr[0] = __alloc_percpu(mibsize); +- if (!ptr[0]) +- goto err0; +- ptr[1] = __alloc_percpu(mibsize); +- if (!ptr[1]) +- goto err1; +- return 0; +-err1: +- free_percpu(ptr[0]); +- ptr[0] = NULL; +-err0: +- return -ENOMEM; +-} +-EXPORT_SYMBOL_GPL(snmp_mib_init); +- +-void snmp_mib_free(void *ptr[2]) +-{ +- BUG_ON(ptr == NULL); +- free_percpu(ptr[0]); +- free_percpu(ptr[1]); +- ptr[0] = ptr[1] = NULL; +-} +-EXPORT_SYMBOL_GPL(snmp_mib_free); +- +-#ifdef CONFIG_IP_MULTICAST +-static struct net_protocol igmp_protocol = { +- .handler = igmp_rcv, +-}; +-#endif +- +-static struct net_protocol tcp_protocol = { +- .handler = tcp_v4_rcv, +- .err_handler = tcp_v4_err, +- .gso_send_check = tcp_v4_gso_send_check, +- .gso_segment = tcp_tso_segment, +- .no_policy = 1, +-}; +- +-static struct net_protocol udp_protocol = { +- .handler = udp_rcv, +- .err_handler = 
udp_err, +- .no_policy = 1, +-}; +- +-static struct net_protocol icmp_protocol = { +- .handler = icmp_rcv, +-}; +- +-static int __init init_ipv4_mibs(void) +-{ +- if (snmp_mib_init((void **)net_statistics, +- sizeof(struct linux_mib), +- __alignof__(struct linux_mib)) < 0) +- goto err_net_mib; +- if (snmp_mib_init((void **)ip_statistics, +- sizeof(struct ipstats_mib), +- __alignof__(struct ipstats_mib)) < 0) +- goto err_ip_mib; +- if (snmp_mib_init((void **)icmp_statistics, +- sizeof(struct icmp_mib), +- __alignof__(struct icmp_mib)) < 0) +- goto err_icmp_mib; +- if (snmp_mib_init((void **)tcp_statistics, +- sizeof(struct tcp_mib), +- __alignof__(struct tcp_mib)) < 0) +- goto err_tcp_mib; +- if (snmp_mib_init((void **)udp_statistics, +- sizeof(struct udp_mib), +- __alignof__(struct udp_mib)) < 0) +- goto err_udp_mib; +- if (snmp_mib_init((void **)udplite_statistics, +- sizeof(struct udp_mib), +- __alignof__(struct udp_mib)) < 0) +- goto err_udplite_mib; +- +- tcp_mib_init(); +- +- return 0; +- +-err_udplite_mib: +- snmp_mib_free((void **)udp_statistics); +-err_udp_mib: +- snmp_mib_free((void **)tcp_statistics); +-err_tcp_mib: +- snmp_mib_free((void **)icmp_statistics); +-err_icmp_mib: +- snmp_mib_free((void **)ip_statistics); +-err_ip_mib: +- snmp_mib_free((void **)net_statistics); +-err_net_mib: +- return -ENOMEM; +-} +- +-static int ipv4_proc_init(void); +- +-/* +- * IP protocol layer initialiser +- */ +- +-static struct packet_type ip_packet_type = { +- .type = __constant_htons(ETH_P_IP), +- .func = ip_rcv, +- .gso_send_check = inet_gso_send_check, +- .gso_segment = inet_gso_segment, +-}; +- +-static int __init inet_init(void) +-{ +- struct sk_buff *dummy_skb; +- struct inet_protosw *q; +- struct list_head *r; +- int rc = -EINVAL; +- +- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); +- +- rc = proto_register(&tcp_prot, 1); +- if (rc) +- goto out; +- +- rc = proto_register(&udp_prot, 1); +- if (rc) +- goto out_unregister_tcp_proto; +- +- rc = proto_register(&raw_prot, 1); +- if (rc) +- goto out_unregister_udp_proto; +- +- /* +- * Tell SOCKET that we are alive... +- */ +- +- (void)sock_register(&inet_family_ops); +- +- /* +- * Add all the base protocols. +- */ +- +- if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) +- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); +- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) +- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); +- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) +- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +-#ifdef CONFIG_IP_MULTICAST +- if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) +- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +-#endif +- +- /* Register the socket-side information for inet_create. */ +- for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) +- INIT_LIST_HEAD(r); +- +- for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) +- inet_register_protosw(q); +- +- /* +- * Set the ARP module up +- */ +- +- arp_init(); +- +- /* +- * Set the IP module up +- */ +- +- ip_init(); +- +- tcp_v4_init(&inet_family_ops); +- +- /* Setup TCP slab cache for open requests. 
*/ +- tcp_init(); +- +- /* Add UDP-Lite (RFC 3828) */ +- udplite4_register(); +- +- /* +- * Set the ICMP layer up +- */ +- +- icmp_init(&inet_family_ops); +- +- /* +- * Initialise the multicast router +- */ +-#if defined(CONFIG_IP_MROUTE) +- ip_mr_init(); +-#endif +- /* +- * Initialise per-cpu ipv4 mibs +- */ +- +- if (init_ipv4_mibs()) +- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; +- +- ipv4_proc_init(); +- +- ipfrag_init(); +- +- dev_add_pack(&ip_packet_type); +- +- rc = 0; +-out: +- return rc; +-out_unregister_udp_proto: +- proto_unregister(&udp_prot); +-out_unregister_tcp_proto: +- proto_unregister(&tcp_prot); +- goto out; +-} +- +-fs_initcall(inet_init); +- +-/* ------------------------------------------------------------------------ */ +- +-#ifdef CONFIG_PROC_FS +-static int __init ipv4_proc_init(void) +-{ +- int rc = 0; +- +- if (raw_proc_init()) +- goto out_raw; +- if (tcp4_proc_init()) +- goto out_tcp; +- if (udp4_proc_init()) +- goto out_udp; +- if (fib_proc_init()) +- goto out_fib; +- if (ip_misc_proc_init()) +- goto out_misc; +-out: +- return rc; +-out_misc: +- fib_proc_exit(); +-out_fib: +- udp4_proc_exit(); +-out_udp: +- tcp4_proc_exit(); +-out_tcp: +- raw_proc_exit(); +-out_raw: +- rc = -ENOMEM; +- goto out; +-} +- +-#else /* CONFIG_PROC_FS */ +-static int __init ipv4_proc_init(void) +-{ +- return 0; +-} +-#endif /* CONFIG_PROC_FS */ +- +-MODULE_ALIAS_NETPROTO(PF_INET); +- +-EXPORT_SYMBOL(inet_accept); +-EXPORT_SYMBOL(inet_bind); +-EXPORT_SYMBOL(inet_dgram_connect); +-EXPORT_SYMBOL(inet_dgram_ops); +-EXPORT_SYMBOL(inet_getname); +-EXPORT_SYMBOL(inet_ioctl); +-EXPORT_SYMBOL(inet_listen); +-EXPORT_SYMBOL(inet_register_protosw); +-EXPORT_SYMBOL(inet_release); +-EXPORT_SYMBOL(inet_sendmsg); +-EXPORT_SYMBOL(inet_shutdown); +-EXPORT_SYMBOL(inet_sock_destruct); +-EXPORT_SYMBOL(inet_stream_connect); +-EXPORT_SYMBOL(inet_stream_ops); +-EXPORT_SYMBOL(inet_unregister_protosw); +-EXPORT_SYMBOL(net_statistics); +-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); +diff -Nurb linux-2.6.22-594/net/netfilter/xt_MARK.c.orig linux-2.6.22-595/net/netfilter/xt_MARK.c.orig +--- linux-2.6.22-594/net/netfilter/xt_MARK.c.orig 2008-03-20 00:05:19.000000000 -0400 ++++ linux-2.6.22-595/net/netfilter/xt_MARK.c.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,283 +0,0 @@ +-/* This is a module which is used for setting the NFMARK field of an skb. */ +- +-/* (C) 1999-2001 Marc Boucher +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. 
+- * +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-MODULE_LICENSE("GPL"); +-MODULE_AUTHOR("Marc Boucher "); +-MODULE_DESCRIPTION("ip[6]tables MARK modification module"); +-MODULE_ALIAS("ipt_MARK"); +-MODULE_ALIAS("ip6t_MARK"); +- +-static inline u_int16_t +-get_dst_port(struct nf_conntrack_tuple *tuple) +-{ +- switch (tuple->dst.protonum) { +- case IPPROTO_GRE: +- /* XXX Truncate 32-bit GRE key to 16 bits */ +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) +- return tuple->dst.u.gre.key; +-#else +- return htons(ntohl(tuple->dst.u.gre.key)); +-#endif +- case IPPROTO_ICMP: +- /* Bind on ICMP echo ID */ +- return tuple->src.u.icmp.id; +- case IPPROTO_TCP: +- return tuple->dst.u.tcp.port; +- case IPPROTO_UDP: +- return tuple->dst.u.udp.port; +- default: +- return tuple->dst.u.all; +- } +-} +- +-static inline u_int16_t +-get_src_port(struct nf_conntrack_tuple *tuple) +-{ +- switch (tuple->dst.protonum) { +- case IPPROTO_GRE: +- /* XXX Truncate 32-bit GRE key to 16 bits */ +- return htons(ntohl(tuple->src.u.gre.key)); +- case IPPROTO_ICMP: +- /* Bind on ICMP echo ID */ +- return tuple->src.u.icmp.id; +- case IPPROTO_TCP: +- return tuple->src.u.tcp.port; +- case IPPROTO_UDP: +- return tuple->src.u.udp.port; +- default: +- return tuple->src.u.all; +- } +-} +- +-static unsigned int +-target_v0(struct sk_buff **pskb, +- const struct net_device *in, +- const struct net_device *out, +- unsigned int hooknum, +- const struct xt_target *target, +- const void *targinfo) +-{ +- const struct xt_mark_target_info *markinfo = targinfo; +- +- (*pskb)->mark = markinfo->mark; +- return XT_CONTINUE; +-} +- +-static unsigned int +-target_v1(struct sk_buff **pskb, +- const struct net_device *in, +- const struct net_device *out, +- unsigned int hooknum, +- const struct xt_target *target, +- const void *targinfo) +-{ +- const struct xt_mark_target_info_v1 *markinfo = targinfo; +- int mark = -1; +- +- switch (markinfo->mode) { +- case XT_MARK_SET: +- mark = markinfo->mark; +- break; +- +- case XT_MARK_AND: +- mark = (*pskb)->mark & markinfo->mark; +- break; +- +- case XT_MARK_OR: +- mark = (*pskb)->mark | markinfo->mark; +- break; +- +- case XT_MARK_COPYXID: { +- enum ip_conntrack_info ctinfo; +- struct sock *connection_sk=NULL; +- int dif; +- +- struct nf_conn *ct = nf_ct_get((*pskb), &ctinfo); +- extern struct inet_hashinfo tcp_hashinfo; +- enum ip_conntrack_dir dir; +- if (!ct) +- break; +- +- dir = CTINFO2DIR(ctinfo); +- u_int32_t src_ip = ct->tuplehash[dir].tuple.src.u3.ip; +- u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple); +- u_int16_t proto = ct->tuplehash[dir].tuple.dst.protonum; +- +- u_int32_t ip; +- u_int16_t port; +- +- dif = ((struct rtable *)(*pskb)->dst)->rt_iif; +- ip = ct->tuplehash[dir].tuple.dst.u3.ip; +- port = get_dst_port(&ct->tuplehash[dir].tuple); +- +- if (proto == 1 || proto == 17) { +- if (((*pskb)->mark!=-1) && (*pskb)->mark) +- ct->xid[0]=(*pskb)->mark; +- if (ct->xid[0]) +- mark = ct->xid[0]; +- +- } +- else if (proto == 6) { +- if ((*pskb)->sk) +- connection_sk = (*pskb)->sk; +- else { +- connection_sk = inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif); +- } +- +- if (connection_sk) { +- connection_sk->sk_peercred.gid = connection_sk->sk_peercred.uid = ct->xid[dir]; +- ct->xid[!dir]=connection_sk->sk_xid; +- if (connection_sk->sk_xid != 0) +- mark = connection_sk->sk_xid; +- if (connection_sk != (*pskb)->sk) +- sock_put(connection_sk); +- } +- break; +- } +- } +- } +- +- if (mark != 
-1) +- (*pskb)->mark = mark; +- return XT_CONTINUE; +-} +- +- +-static int +-checkentry_v0(const char *tablename, +- const void *entry, +- const struct xt_target *target, +- void *targinfo, +- unsigned int hook_mask) +-{ +- struct xt_mark_target_info *markinfo = targinfo; +- +- if (markinfo->mark > 0xffffffff) { +- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); +- return 0; +- } +- return 1; +-} +- +-static int +-checkentry_v1(const char *tablename, +- const void *entry, +- const struct xt_target *target, +- void *targinfo, +- unsigned int hook_mask) +-{ +- struct xt_mark_target_info_v1 *markinfo = targinfo; +- +- if (markinfo->mode != XT_MARK_SET +- && markinfo->mode != XT_MARK_AND +- && markinfo->mode != XT_MARK_OR +- && markinfo->mode != XT_MARK_COPYXID) { +- printk(KERN_WARNING "MARK: unknown mode %u\n", +- markinfo->mode); +- return 0; +- } +- if (markinfo->mark > 0xffffffff) { +- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); +- return 0; +- } +- return 1; +-} +- +-#ifdef CONFIG_COMPAT +-struct compat_xt_mark_target_info_v1 { +- compat_ulong_t mark; +- u_int8_t mode; +- u_int8_t __pad1; +- u_int16_t __pad2; +-}; +- +-static void compat_from_user_v1(void *dst, void *src) +-{ +- struct compat_xt_mark_target_info_v1 *cm = src; +- struct xt_mark_target_info_v1 m = { +- .mark = cm->mark, +- .mode = cm->mode, +- }; +- memcpy(dst, &m, sizeof(m)); +-} +- +-static int compat_to_user_v1(void __user *dst, void *src) +-{ +- struct xt_mark_target_info_v1 *m = src; +- struct compat_xt_mark_target_info_v1 cm = { +- .mark = m->mark, +- .mode = m->mode, +- }; +- return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +-} +-#endif /* CONFIG_COMPAT */ +- +-static struct xt_target xt_mark_target[] = { +- { +- .name = "MARK", +- .family = AF_INET, +- .revision = 0, +- .checkentry = checkentry_v0, +- .target = target_v0, +- .targetsize = sizeof(struct xt_mark_target_info), +- .table = "mangle", +- .me = THIS_MODULE, +- }, +- { +- .name = "MARK", +- .family = AF_INET, +- .revision = 1, +- .checkentry = checkentry_v1, +- .target = target_v1, +- .targetsize = sizeof(struct xt_mark_target_info_v1), +-#ifdef CONFIG_COMPAT +- .compatsize = sizeof(struct compat_xt_mark_target_info_v1), +- .compat_from_user = compat_from_user_v1, +- .compat_to_user = compat_to_user_v1, +-#endif +- .table = "mangle", +- .me = THIS_MODULE, +- }, +- { +- .name = "MARK", +- .family = AF_INET6, +- .revision = 0, +- .checkentry = checkentry_v0, +- .target = target_v0, +- .targetsize = sizeof(struct xt_mark_target_info), +- .table = "mangle", +- .me = THIS_MODULE, +- }, +-}; +- +-static int __init xt_mark_init(void) +-{ +- return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); +-} +- +-static void __exit xt_mark_fini(void) +-{ +- xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); +-} +- +-module_init(xt_mark_init); +-module_exit(xt_mark_fini); +diff -Nurb linux-2.6.22-594/net/packet/af_packet.c.orig linux-2.6.22-595/net/packet/af_packet.c.orig +--- linux-2.6.22-594/net/packet/af_packet.c.orig 2008-03-20 00:05:19.000000000 -0400 ++++ linux-2.6.22-595/net/packet/af_packet.c.orig 1969-12-31 19:00:00.000000000 -0500 +@@ -1,1989 +0,0 @@ +-/* +- * INET An implementation of the TCP/IP protocol suite for the LINUX +- * operating system. INET is implemented using the BSD Socket +- * interface as the means of communication with the user level. +- * +- * PACKET - implements raw packet sockets. 
+- * +- * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $ +- * +- * Authors: Ross Biro +- * Fred N. van Kempen, +- * Alan Cox, +- * +- * Fixes: +- * Alan Cox : verify_area() now used correctly +- * Alan Cox : new skbuff lists, look ma no backlogs! +- * Alan Cox : tidied skbuff lists. +- * Alan Cox : Now uses generic datagram routines I +- * added. Also fixed the peek/read crash +- * from all old Linux datagram code. +- * Alan Cox : Uses the improved datagram code. +- * Alan Cox : Added NULL's for socket options. +- * Alan Cox : Re-commented the code. +- * Alan Cox : Use new kernel side addressing +- * Rob Janssen : Correct MTU usage. +- * Dave Platt : Counter leaks caused by incorrect +- * interrupt locking and some slightly +- * dubious gcc output. Can you read +- * compiler: it said _VOLATILE_ +- * Richard Kooijman : Timestamp fixes. +- * Alan Cox : New buffers. Use sk->mac.raw. +- * Alan Cox : sendmsg/recvmsg support. +- * Alan Cox : Protocol setting support +- * Alexey Kuznetsov : Untied from IPv4 stack. +- * Cyrus Durgin : Fixed kerneld for kmod. +- * Michal Ostrowski : Module initialization cleanup. +- * Ulises Alonso : Frame number limit removal and +- * packet_set_ring memory leak. +- * Eric Biederman : Allow for > 8 byte hardware addresses. +- * The convention is that longer addresses +- * will simply extend the hardware address +- * byte arrays at the end of sockaddr_ll +- * and packet_mreq. +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU General Public License +- * as published by the Free Software Foundation; either version +- * 2 of the License, or (at your option) any later version. +- * +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_INET +-#include +-#endif +- +-/* +- Assumptions: +- - if device has no dev->hard_header routine, it adds and removes ll header +- inside itself. In this case ll header is invisible outside of device, +- but higher levels still should reserve dev->hard_header_len. +- Some devices are enough clever to reallocate skb, when header +- will not fit to reserved space (tunnel), another ones are silly +- (PPP). +- - packet socket receives packets with pulled ll header, +- so that SOCK_RAW should push it back. +- +-On receive: +------------ +- +-Incoming, dev->hard_header!=NULL +- mac_header -> ll header +- data -> data +- +-Outgoing, dev->hard_header!=NULL +- mac_header -> ll header +- data -> ll header +- +-Incoming, dev->hard_header==NULL +- mac_header -> UNKNOWN position. It is very likely, that it points to ll +- header. PPP makes it, that is wrong, because introduce +- assymetry between rx and tx paths. +- data -> data +- +-Outgoing, dev->hard_header==NULL +- mac_header -> data. ll header is still not built! +- data -> data +- +-Resume +- If dev->hard_header==NULL we are unlikely to restore sensible ll header. +- +- +-On transmit: +------------- +- +-dev->hard_header != NULL +- mac_header -> ll header +- data -> ll header +- +-dev->hard_header == NULL (ll header is added by device, we cannot control it) +- mac_header -> data +- data -> data +- +- We should set nh.raw on output to correct posistion, +- packet classifier depends on it. 
+- */ +- +-/* List of all packet sockets. */ +-static HLIST_HEAD(packet_sklist); +-static DEFINE_RWLOCK(packet_sklist_lock); +- +-static atomic_t packet_socks_nr; +- +- +-/* Private packet socket structures. */ +- +-struct packet_mclist +-{ +- struct packet_mclist *next; +- int ifindex; +- int count; +- unsigned short type; +- unsigned short alen; +- unsigned char addr[MAX_ADDR_LEN]; +-}; +-/* identical to struct packet_mreq except it has +- * a longer address field. +- */ +-struct packet_mreq_max +-{ +- int mr_ifindex; +- unsigned short mr_type; +- unsigned short mr_alen; +- unsigned char mr_address[MAX_ADDR_LEN]; +-}; +- +-#ifdef CONFIG_PACKET_MMAP +-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); +-#endif +- +-static void packet_flush_mclist(struct sock *sk); +- +-struct packet_sock { +- /* struct sock has to be the first member of packet_sock */ +- struct sock sk; +- struct tpacket_stats stats; +-#ifdef CONFIG_PACKET_MMAP +- char * *pg_vec; +- unsigned int head; +- unsigned int frames_per_block; +- unsigned int frame_size; +- unsigned int frame_max; +- int copy_thresh; +-#endif +- struct packet_type prot_hook; +- spinlock_t bind_lock; +- unsigned int running:1, /* prot_hook is attached*/ +- auxdata:1, +- origdev:1; +- int ifindex; /* bound device */ +- __be16 num; +- struct packet_mclist *mclist; +-#ifdef CONFIG_PACKET_MMAP +- atomic_t mapped; +- unsigned int pg_vec_order; +- unsigned int pg_vec_pages; +- unsigned int pg_vec_len; +-#endif +-}; +- +-struct packet_skb_cb { +- unsigned int origlen; +- union { +- struct sockaddr_pkt pkt; +- struct sockaddr_ll ll; +- } sa; +-}; +- +-#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) +- +-#ifdef CONFIG_PACKET_MMAP +- +-static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position) +-{ +- unsigned int pg_vec_pos, frame_offset; +- +- pg_vec_pos = position / po->frames_per_block; +- frame_offset = position % po->frames_per_block; +- +- return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); +-} +-#endif +- +-static inline struct packet_sock *pkt_sk(struct sock *sk) +-{ +- return (struct packet_sock *)sk; +-} +- +-static void packet_sock_destruct(struct sock *sk) +-{ +- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); +- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); +- +- if (!sock_flag(sk, SOCK_DEAD)) { +- printk("Attempt to release alive packet socket: %p\n", sk); +- return; +- } +- +- atomic_dec(&packet_socks_nr); +-#ifdef PACKET_REFCNT_DEBUG +- printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr)); +-#endif +-} +- +- +-static const struct proto_ops packet_ops; +- +-static const struct proto_ops packet_ops_spkt; +- +-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +-{ +- struct sock *sk; +- struct sockaddr_pkt *spkt; +- +- /* +- * When we registered the protocol we saved the socket in the data +- * field for just this event. +- */ +- +- sk = pt->af_packet_priv; +- +- /* +- * Yank back the headers [hope the device set this +- * right or kerboom...] +- * +- * Incoming packets have ll header pulled, +- * push it back. +- * +- * For outgoing ones skb->data == skb_mac_header(skb) +- * so that this procedure is noop. 
+- */ +- +- if (skb->pkt_type == PACKET_LOOPBACK) +- goto out; +- +- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) +- goto oom; +- +- /* drop any routing info */ +- dst_release(skb->dst); +- skb->dst = NULL; +- +- /* drop conntrack reference */ +- nf_reset(skb); +- +- spkt = &PACKET_SKB_CB(skb)->sa.pkt; +- +- skb_push(skb, skb->data - skb_mac_header(skb)); +- +- /* +- * The SOCK_PACKET socket receives _all_ frames. +- */ +- +- spkt->spkt_family = dev->type; +- strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); +- spkt->spkt_protocol = skb->protocol; +- +- /* +- * Charge the memory to the socket. This is done specifically +- * to prevent sockets using all the memory up. +- */ +- +- if (sock_queue_rcv_skb(sk,skb) == 0) +- return 0; +- +-out: +- kfree_skb(skb); +-oom: +- return 0; +-} +- +- +-/* +- * Output a raw packet to a device layer. This bypasses all the other +- * protocol layers and you must therefore supply it with a complete frame +- */ +- +-static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, +- struct msghdr *msg, size_t len) +-{ +- struct sock *sk = sock->sk; +- struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; +- struct sk_buff *skb; +- struct net_device *dev; +- __be16 proto=0; +- int err; +- +- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) +- return -EPERM; +- +- /* +- * Get and verify the address. +- */ +- +- if (saddr) +- { +- if (msg->msg_namelen < sizeof(struct sockaddr)) +- return(-EINVAL); +- if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) +- proto=saddr->spkt_protocol; +- } +- else +- return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ +- +- /* +- * Find the device first to size check it +- */ +- +- saddr->spkt_device[13] = 0; +- dev = dev_get_by_name(saddr->spkt_device); +- err = -ENODEV; +- if (dev == NULL) +- goto out_unlock; +- +- err = -ENETDOWN; +- if (!(dev->flags & IFF_UP)) +- goto out_unlock; +- +- /* +- * You may not queue a frame bigger than the mtu. This is the lowest level +- * raw protocol and you must do your own fragmentation at this level. +- */ +- +- err = -EMSGSIZE; +- if (len > dev->mtu + dev->hard_header_len) +- goto out_unlock; +- +- err = -ENOBUFS; +- skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL); +- +- /* +- * If the write buffer is full, then tough. At this level the user gets to +- * deal with the problem - do your own algorithmic backoffs. That's far +- * more flexible. +- */ +- +- if (skb == NULL) +- goto out_unlock; +- +- /* +- * Fill it in +- */ +- +- /* FIXME: Save some space for broken drivers that write a +- * hard header at transmission time by themselves. PPP is the +- * notable one here. This should really be fixed at the driver level. 
+- */ +- skb_reserve(skb, LL_RESERVED_SPACE(dev)); +- skb_reset_network_header(skb); +- +- /* Try to align data part correctly */ +- if (dev->hard_header) { +- skb->data -= dev->hard_header_len; +- skb->tail -= dev->hard_header_len; +- if (len < dev->hard_header_len) +- skb_reset_network_header(skb); +- } +- +- /* Returns -EFAULT on error */ +- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); +- skb->protocol = proto; +- skb->dev = dev; +- skb->priority = sk->sk_priority; +- if (err) +- goto out_free; +- +- /* +- * Now send it +- */ +- +- dev_queue_xmit(skb); +- dev_put(dev); +- return(len); +- +-out_free: +- kfree_skb(skb); +-out_unlock: +- if (dev) +- dev_put(dev); +- return err; +-} +- +-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, +- unsigned int res) +-{ +- struct sk_filter *filter; +- int tag = skb->skb_tag; +- +- if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) +- return 0; +- +- rcu_read_lock_bh(); +- filter = rcu_dereference(sk->sk_filter); +- if (filter != NULL) +- res = sk_run_filter(skb, filter->insns, filter->len); +- rcu_read_unlock_bh(); +- +- return res; +-} +- +-/* +- This function makes lazy skb cloning in hope that most of packets +- are discarded by BPF. +- +- Note tricky part: we DO mangle shared skb! skb->data, skb->len +- and skb->cb are mangled. It works because (and until) packets +- falling here are owned by current CPU. Output packets are cloned +- by dev_queue_xmit_nit(), input packets are processed by net_bh +- sequencially, so that if we return skb to original state on exit, +- we will not harm anyone. +- */ +- +-static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +-{ +- struct sock *sk; +- struct sockaddr_ll *sll; +- struct packet_sock *po; +- u8 * skb_head = skb->data; +- int skb_len = skb->len; +- unsigned int snaplen, res; +- +- if (skb->pkt_type == PACKET_LOOPBACK) +- goto drop; +- +- sk = pt->af_packet_priv; +- po = pkt_sk(sk); +- +- skb->dev = dev; +- +- if (dev->hard_header) { +- /* The device has an explicit notion of ll header, +- exported to higher levels. +- +- Otherwise, the device hides datails of it frame +- structure, so that corresponding packet head +- never delivered to user. 
+- */ +- if (sk->sk_type != SOCK_DGRAM) +- skb_push(skb, skb->data - skb_mac_header(skb)); +- else if (skb->pkt_type == PACKET_OUTGOING) { +- /* Special case: outgoing packets have ll header at head */ +- skb_pull(skb, skb_network_offset(skb)); +- } +- } +- +- snaplen = skb->len; +- +- res = run_filter(skb, sk, snaplen); +- if (!res) +- goto drop_n_restore; +- if (snaplen > res) +- snaplen = res; +- +- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= +- (unsigned)sk->sk_rcvbuf) +- goto drop_n_acct; +- +- if (skb_shared(skb)) { +- struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); +- if (nskb == NULL) +- goto drop_n_acct; +- +- if (skb_head != skb->data) { +- skb->data = skb_head; +- skb->len = skb_len; +- } +- kfree_skb(skb); +- skb = nskb; +- } +- +- BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > +- sizeof(skb->cb)); +- +- sll = &PACKET_SKB_CB(skb)->sa.ll; +- sll->sll_family = AF_PACKET; +- sll->sll_hatype = dev->type; +- sll->sll_protocol = skb->protocol; +- sll->sll_pkttype = skb->pkt_type; +- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) +- sll->sll_ifindex = orig_dev->ifindex; +- else +- sll->sll_ifindex = dev->ifindex; +- sll->sll_halen = 0; +- +- if (dev->hard_header_parse) +- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); +- +- PACKET_SKB_CB(skb)->origlen = skb->len; +- +- if (pskb_trim(skb, snaplen)) +- goto drop_n_acct; +- +- skb_set_owner_r(skb, sk); +- skb->dev = NULL; +- dst_release(skb->dst); +- skb->dst = NULL; +- +- /* drop conntrack reference */ +- nf_reset(skb); +- +- spin_lock(&sk->sk_receive_queue.lock); +- po->stats.tp_packets++; +- __skb_queue_tail(&sk->sk_receive_queue, skb); +- spin_unlock(&sk->sk_receive_queue.lock); +- sk->sk_data_ready(sk, skb->len); +- return 0; +- +-drop_n_acct: +- spin_lock(&sk->sk_receive_queue.lock); +- po->stats.tp_drops++; +- spin_unlock(&sk->sk_receive_queue.lock); +- +-drop_n_restore: +- if (skb_head != skb->data && skb_shared(skb)) { +- skb->data = skb_head; +- skb->len = skb_len; +- } +-drop: +- kfree_skb(skb); +- return 0; +-} +- +-#ifdef CONFIG_PACKET_MMAP +-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +-{ +- struct sock *sk; +- struct packet_sock *po; +- struct sockaddr_ll *sll; +- struct tpacket_hdr *h; +- u8 * skb_head = skb->data; +- int skb_len = skb->len; +- unsigned int snaplen, res; +- unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; +- unsigned short macoff, netoff; +- struct sk_buff *copy_skb = NULL; +- struct timeval tv; +- +- if (skb->pkt_type == PACKET_LOOPBACK) +- goto drop; +- +- sk = pt->af_packet_priv; +- po = pkt_sk(sk); +- +- if (dev->hard_header) { +- if (sk->sk_type != SOCK_DGRAM) +- skb_push(skb, skb->data - skb_mac_header(skb)); +- else if (skb->pkt_type == PACKET_OUTGOING) { +- /* Special case: outgoing packets have ll header at head */ +- skb_pull(skb, skb_network_offset(skb)); +- } +- } +- +- if (skb->ip_summed == CHECKSUM_PARTIAL) +- status |= TP_STATUS_CSUMNOTREADY; +- +- snaplen = skb->len; +- +- res = run_filter(skb, sk, snaplen); +- if (!res) +- goto drop_n_restore; +- if (snaplen > res) +- snaplen = res; +- +- if (sk->sk_type == SOCK_DGRAM) { +- macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; +- } else { +- unsigned maclen = skb_network_offset(skb); +- netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 
16 : maclen)); +- macoff = netoff - maclen; +- } +- +- if (macoff + snaplen > po->frame_size) { +- if (po->copy_thresh && +- atomic_read(&sk->sk_rmem_alloc) + skb->truesize < +- (unsigned)sk->sk_rcvbuf) { +- if (skb_shared(skb)) { +- copy_skb = skb_clone(skb, GFP_ATOMIC); +- } else { +- copy_skb = skb_get(skb); +- skb_head = skb->data; +- } +- if (copy_skb) +- skb_set_owner_r(copy_skb, sk); +- } +- snaplen = po->frame_size - macoff; +- if ((int)snaplen < 0) +- snaplen = 0; +- } +- +- spin_lock(&sk->sk_receive_queue.lock); +- h = packet_lookup_frame(po, po->head); +- +- if (h->tp_status) +- goto ring_is_full; +- po->head = po->head != po->frame_max ? po->head+1 : 0; +- po->stats.tp_packets++; +- if (copy_skb) { +- status |= TP_STATUS_COPY; +- __skb_queue_tail(&sk->sk_receive_queue, copy_skb); +- } +- if (!po->stats.tp_drops) +- status &= ~TP_STATUS_LOSING; +- spin_unlock(&sk->sk_receive_queue.lock); +- +- skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen); +- +- h->tp_len = skb->len; +- h->tp_snaplen = snaplen; +- h->tp_mac = macoff; +- h->tp_net = netoff; +- if (skb->tstamp.tv64 == 0) { +- __net_timestamp(skb); +- sock_enable_timestamp(sk); +- } +- tv = ktime_to_timeval(skb->tstamp); +- h->tp_sec = tv.tv_sec; +- h->tp_usec = tv.tv_usec; +- +- sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); +- sll->sll_halen = 0; +- if (dev->hard_header_parse) +- sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); +- sll->sll_family = AF_PACKET; +- sll->sll_hatype = dev->type; +- sll->sll_protocol = skb->protocol; +- sll->sll_pkttype = skb->pkt_type; +- if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) +- sll->sll_ifindex = orig_dev->ifindex; +- else +- sll->sll_ifindex = dev->ifindex; +- +- h->tp_status = status; +- smp_mb(); +- +- { +- struct page *p_start, *p_end; +- u8 *h_end = (u8 *)h + macoff + snaplen - 1; +- +- p_start = virt_to_page(h); +- p_end = virt_to_page(h_end); +- while (p_start <= p_end) { +- flush_dcache_page(p_start); +- p_start++; +- } +- } +- +- sk->sk_data_ready(sk, 0); +- +-drop_n_restore: +- if (skb_head != skb->data && skb_shared(skb)) { +- skb->data = skb_head; +- skb->len = skb_len; +- } +-drop: +- kfree_skb(skb); +- return 0; +- +-ring_is_full: +- po->stats.tp_drops++; +- spin_unlock(&sk->sk_receive_queue.lock); +- +- sk->sk_data_ready(sk, 0); +- if (copy_skb) +- kfree_skb(copy_skb); +- goto drop_n_restore; +-} +- +-#endif +- +- +-static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, +- struct msghdr *msg, size_t len) +-{ +- struct sock *sk = sock->sk; +- struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; +- struct sk_buff *skb; +- struct net_device *dev; +- __be16 proto; +- unsigned char *addr; +- int ifindex, err, reserve = 0; +- +- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) +- return -EPERM; +- +- /* +- * Get and verify the address. 
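tpacket_rcv() above is the mmap()-ed ring variant of the receive path: it picks the next free frame slot, copies the link-level address and up to snaplen bytes of the packet into it, fills in the struct tpacket_hdr offsets and timestamp, and finally flips tp_status so user space can see the frame. A rough user-space counterpart is sketched below; it is not part of the patch, assumes 4 KiB pages and a kernel built with CONFIG_PACKET_MMAP, and leaves out error handling and memory barriers for brevity.

/* User-space sketch of a PACKET_RX_RING consumer (TPACKET_V1 layout). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>    /* PACKET_RX_RING, struct tpacket_req/_hdr */

int main(void)
{
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    /* 64 one-page blocks, two 2 KiB frames per block -> 128 frames. */
    struct tpacket_req req = {
        .tp_block_size = 4096,
        .tp_block_nr   = 64,
        .tp_frame_size = 2048,
        .tp_frame_nr   = 128,
    };
    if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
        perror("PACKET_RX_RING");
        return 1;
    }

    size_t ring_len = (size_t)req.tp_block_size * req.tp_block_nr;
    char *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (ring == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    unsigned int frame = 0;
    for (;;) {
        struct tpacket_hdr *h =
            (struct tpacket_hdr *)(ring + frame * req.tp_frame_size);

        if (!(h->tp_status & TP_STATUS_USER)) {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };
            poll(&pfd, 1, -1);          /* wait until tpacket_rcv() fills this slot */
            continue;
        }
        printf("frame %u: %u bytes captured (%u on the wire)\n",
               frame, h->tp_snaplen, h->tp_len);

        h->tp_status = TP_STATUS_KERNEL;    /* hand the slot back to the kernel */
        frame = (frame + 1) % req.tp_frame_nr;
    }
}

TP_STATUS_KERNEL is zero, so the assignment at the bottom is what marks a slot free again; tpacket_rcv() treats any non-zero status as "ring full".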
+- */ +- +- if (saddr == NULL) { +- struct packet_sock *po = pkt_sk(sk); +- +- ifindex = po->ifindex; +- proto = po->num; +- addr = NULL; +- } else { +- err = -EINVAL; +- if (msg->msg_namelen < sizeof(struct sockaddr_ll)) +- goto out; +- if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) +- goto out; +- ifindex = saddr->sll_ifindex; +- proto = saddr->sll_protocol; +- addr = saddr->sll_addr; +- } +- +- +- dev = dev_get_by_index(ifindex); +- err = -ENXIO; +- if (dev == NULL) +- goto out_unlock; +- if (sock->type == SOCK_RAW) +- reserve = dev->hard_header_len; +- +- err = -ENETDOWN; +- if (!(dev->flags & IFF_UP)) +- goto out_unlock; +- +- err = -EMSGSIZE; +- if (len > dev->mtu+reserve) +- goto out_unlock; +- +- skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), +- msg->msg_flags & MSG_DONTWAIT, &err); +- if (skb==NULL) +- goto out_unlock; +- +- skb_reserve(skb, LL_RESERVED_SPACE(dev)); +- skb_reset_network_header(skb); +- +- if (dev->hard_header) { +- int res; +- err = -EINVAL; +- res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); +- if (sock->type != SOCK_DGRAM) { +- skb_reset_tail_pointer(skb); +- skb->len = 0; +- } else if (res < 0) +- goto out_free; +- } +- +- /* Returns -EFAULT on error */ +- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); +- if (err) +- goto out_free; +- +- skb->protocol = proto; +- skb->dev = dev; +- skb->priority = sk->sk_priority; +- +- /* +- * Now send it +- */ +- +- err = dev_queue_xmit(skb); +- if (err > 0 && (err = net_xmit_errno(err)) != 0) +- goto out_unlock; +- +- dev_put(dev); +- +- return(len); +- +-out_free: +- kfree_skb(skb); +-out_unlock: +- if (dev) +- dev_put(dev); +-out: +- return err; +-} +- +-/* +- * Close a PACKET socket. This is fairly simple. We immediately go +- * to 'closed' state and remove our protocol entry in the device list. +- */ +- +-static int packet_release(struct socket *sock) +-{ +- struct sock *sk = sock->sk; +- struct packet_sock *po; +- +- if (!sk) +- return 0; +- +- po = pkt_sk(sk); +- +- write_lock_bh(&packet_sklist_lock); +- sk_del_node_init(sk); +- write_unlock_bh(&packet_sklist_lock); +- +- /* +- * Unhook packet receive handler. +- */ +- +- if (po->running) { +- /* +- * Remove the protocol hook +- */ +- dev_remove_pack(&po->prot_hook); +- po->running = 0; +- po->num = 0; +- __sock_put(sk); +- } +- +- packet_flush_mclist(sk); +- +-#ifdef CONFIG_PACKET_MMAP +- if (po->pg_vec) { +- struct tpacket_req req; +- memset(&req, 0, sizeof(req)); +- packet_set_ring(sk, &req, 1); +- } +-#endif +- +- /* +- * Now the socket is dead. No more input will appear. +- */ +- +- sock_orphan(sk); +- sock->sk = NULL; +- +- /* Purge queues */ +- +- skb_queue_purge(&sk->sk_receive_queue); +- +- sock_put(sk); +- return 0; +-} +- +-/* +- * Attach a packet hook. +- */ +- +-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) +-{ +- struct packet_sock *po = pkt_sk(sk); +- /* +- * Detach an existing hook if present. +- */ +- +- lock_sock(sk); +- +- spin_lock(&po->bind_lock); +- if (po->running) { +- __sock_put(sk); +- po->running = 0; +- po->num = 0; +- spin_unlock(&po->bind_lock); +- dev_remove_pack(&po->prot_hook); +- spin_lock(&po->bind_lock); +- } +- +- po->num = protocol; +- po->prot_hook.type = protocol; +- po->prot_hook.dev = dev; +- +- po->ifindex = dev ? 
dev->ifindex : 0; +- +- if (protocol == 0) +- goto out_unlock; +- +- if (dev) { +- if (dev->flags&IFF_UP) { +- dev_add_pack(&po->prot_hook); +- sock_hold(sk); +- po->running = 1; +- } else { +- sk->sk_err = ENETDOWN; +- if (!sock_flag(sk, SOCK_DEAD)) +- sk->sk_error_report(sk); +- } +- } else { +- dev_add_pack(&po->prot_hook); +- sock_hold(sk); +- po->running = 1; +- } +- +-out_unlock: +- spin_unlock(&po->bind_lock); +- release_sock(sk); +- return 0; +-} +- +-/* +- * Bind a packet socket to a device +- */ +- +-static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) +-{ +- struct sock *sk=sock->sk; +- char name[15]; +- struct net_device *dev; +- int err = -ENODEV; +- +- /* +- * Check legality +- */ +- +- if (addr_len != sizeof(struct sockaddr)) +- return -EINVAL; +- strlcpy(name,uaddr->sa_data,sizeof(name)); +- +- dev = dev_get_by_name(name); +- if (dev) { +- err = packet_do_bind(sk, dev, pkt_sk(sk)->num); +- dev_put(dev); +- } +- return err; +-} +- +-static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +-{ +- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; +- struct sock *sk=sock->sk; +- struct net_device *dev = NULL; +- int err; +- +- +- /* +- * Check legality +- */ +- +- if (addr_len < sizeof(struct sockaddr_ll)) +- return -EINVAL; +- if (sll->sll_family != AF_PACKET) +- return -EINVAL; +- +- if (sll->sll_ifindex) { +- err = -ENODEV; +- dev = dev_get_by_index(sll->sll_ifindex); +- if (dev == NULL) +- goto out; +- } +- err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); +- if (dev) +- dev_put(dev); +- +-out: +- return err; +-} +- +-static struct proto packet_proto = { +- .name = "PACKET", +- .owner = THIS_MODULE, +- .obj_size = sizeof(struct packet_sock), +-}; +- +-/* +- * Create a packet of type SOCK_PACKET. +- */ +- +-static int packet_create(struct socket *sock, int protocol) +-{ +- struct sock *sk; +- struct packet_sock *po; +- __be16 proto = (__force __be16)protocol; /* weird, but documented */ +- int err; +- +- if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET)) +- return -EPERM; +- if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && +- sock->type != SOCK_PACKET) +- return -ESOCKTNOSUPPORT; +- +- sock->state = SS_UNCONNECTED; +- +- err = -ENOBUFS; +- sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); +- if (sk == NULL) +- goto out; +- +- sock->ops = &packet_ops; +- if (sock->type == SOCK_PACKET) +- sock->ops = &packet_ops_spkt; +- +- sock_init_data(sock, sk); +- +- po = pkt_sk(sk); +- sk->sk_family = PF_PACKET; +- po->num = proto; +- +- sk->sk_destruct = packet_sock_destruct; +- atomic_inc(&packet_socks_nr); +- +- /* +- * Attach a protocol block +- */ +- +- spin_lock_init(&po->bind_lock); +- po->prot_hook.func = packet_rcv; +- +- if (sock->type == SOCK_PACKET) +- po->prot_hook.func = packet_rcv_spkt; +- +- po->prot_hook.af_packet_priv = sk; +- +- if (proto) { +- po->prot_hook.type = proto; +- dev_add_pack(&po->prot_hook); +- sock_hold(sk); +- po->running = 1; +- } +- +- write_lock_bh(&packet_sklist_lock); +- sk_add_node(sk, &packet_sklist); +- write_unlock_bh(&packet_sklist_lock); +- return(0); +-out: +- return err; +-} +- +-/* +- * Pull a packet from our receive queue and hand it to the user. +- * If necessary we block. 
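packet_do_bind() above swaps the socket's prot_hook onto a specific device (or onto all devices when dev is NULL), and packet_create() registers the hook immediately when a non-zero protocol is passed to socket(2); with this patch, creating the socket is additionally gated by nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET). A minimal user-space sketch of the bind path follows; it is not part of the patch and the interface name "eth0" is only a placeholder.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>              /* if_nametoindex() */
#include <arpa/inet.h>
#include <linux/if_ether.h>      /* ETH_P_ALL */
#include <linux/if_packet.h>     /* struct sockaddr_ll */

int main(void)
{
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_ll sll;
    memset(&sll, 0, sizeof(sll));
    sll.sll_family   = AF_PACKET;
    sll.sll_protocol = htons(ETH_P_ALL);
    sll.sll_ifindex  = if_nametoindex("eth0");   /* placeholder interface name */

    if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
        perror("bind");
    else
        printf("bound to ifindex %d\n", sll.sll_ifindex);
    return 0;
}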
+- */ +- +-static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, +- struct msghdr *msg, size_t len, int flags) +-{ +- struct sock *sk = sock->sk; +- struct sk_buff *skb; +- int copied, err; +- struct sockaddr_ll *sll; +- +- err = -EINVAL; +- if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) +- goto out; +- +-#if 0 +- /* What error should we return now? EUNATTACH? */ +- if (pkt_sk(sk)->ifindex < 0) +- return -ENODEV; +-#endif +- +- /* +- * Call the generic datagram receiver. This handles all sorts +- * of horrible races and re-entrancy so we can forget about it +- * in the protocol layers. +- * +- * Now it will return ENETDOWN, if device have just gone down, +- * but then it will block. +- */ +- +- skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); +- +- /* +- * An error occurred so return it. Because skb_recv_datagram() +- * handles the blocking we don't see and worry about blocking +- * retries. +- */ +- +- if (skb == NULL) +- goto out; +- +- /* +- * If the address length field is there to be filled in, we fill +- * it in now. +- */ +- +- sll = &PACKET_SKB_CB(skb)->sa.ll; +- if (sock->type == SOCK_PACKET) +- msg->msg_namelen = sizeof(struct sockaddr_pkt); +- else +- msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); +- +- /* +- * You lose any data beyond the buffer you gave. If it worries a +- * user program they can ask the device for its MTU anyway. +- */ +- +- copied = skb->len; +- if (copied > len) +- { +- copied=len; +- msg->msg_flags|=MSG_TRUNC; +- } +- +- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); +- if (err) +- goto out_free; +- +- sock_recv_timestamp(msg, sk, skb); +- +- if (msg->msg_name) +- memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, +- msg->msg_namelen); +- +- if (pkt_sk(sk)->auxdata) { +- struct tpacket_auxdata aux; +- +- aux.tp_status = TP_STATUS_USER; +- if (skb->ip_summed == CHECKSUM_PARTIAL) +- aux.tp_status |= TP_STATUS_CSUMNOTREADY; +- aux.tp_len = PACKET_SKB_CB(skb)->origlen; +- aux.tp_snaplen = skb->len; +- aux.tp_mac = 0; +- aux.tp_net = skb_network_offset(skb); +- +- put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); +- } +- +- /* +- * Free or return the buffer as appropriate. Again this +- * hides all the races and re-entrancy issues from us. +- */ +- err = (flags&MSG_TRUNC) ? 
skb->len : copied; +- +-out_free: +- skb_free_datagram(sk, skb); +-out: +- return err; +-} +- +-static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, +- int *uaddr_len, int peer) +-{ +- struct net_device *dev; +- struct sock *sk = sock->sk; +- +- if (peer) +- return -EOPNOTSUPP; +- +- uaddr->sa_family = AF_PACKET; +- dev = dev_get_by_index(pkt_sk(sk)->ifindex); +- if (dev) { +- strlcpy(uaddr->sa_data, dev->name, 15); +- dev_put(dev); +- } else +- memset(uaddr->sa_data, 0, 14); +- *uaddr_len = sizeof(*uaddr); +- +- return 0; +-} +- +-static int packet_getname(struct socket *sock, struct sockaddr *uaddr, +- int *uaddr_len, int peer) +-{ +- struct net_device *dev; +- struct sock *sk = sock->sk; +- struct packet_sock *po = pkt_sk(sk); +- struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; +- +- if (peer) +- return -EOPNOTSUPP; +- +- sll->sll_family = AF_PACKET; +- sll->sll_ifindex = po->ifindex; +- sll->sll_protocol = po->num; +- dev = dev_get_by_index(po->ifindex); +- if (dev) { +- sll->sll_hatype = dev->type; +- sll->sll_halen = dev->addr_len; +- memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); +- dev_put(dev); +- } else { +- sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ +- sll->sll_halen = 0; +- } +- *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; +- +- return 0; +-} +- +-static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) +-{ +- switch (i->type) { +- case PACKET_MR_MULTICAST: +- if (what > 0) +- dev_mc_add(dev, i->addr, i->alen, 0); +- else +- dev_mc_delete(dev, i->addr, i->alen, 0); +- break; +- case PACKET_MR_PROMISC: +- dev_set_promiscuity(dev, what); +- break; +- case PACKET_MR_ALLMULTI: +- dev_set_allmulti(dev, what); +- break; +- default:; +- } +-} +- +-static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +-{ +- for ( ; i; i=i->next) { +- if (i->ifindex == dev->ifindex) +- packet_dev_mc(dev, i, what); +- } +-} +- +-static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) +-{ +- struct packet_sock *po = pkt_sk(sk); +- struct packet_mclist *ml, *i; +- struct net_device *dev; +- int err; +- +- rtnl_lock(); +- +- err = -ENODEV; +- dev = __dev_get_by_index(mreq->mr_ifindex); +- if (!dev) +- goto done; +- +- err = -EINVAL; +- if (mreq->mr_alen > dev->addr_len) +- goto done; +- +- err = -ENOBUFS; +- i = kmalloc(sizeof(*i), GFP_KERNEL); +- if (i == NULL) +- goto done; +- +- err = 0; +- for (ml = po->mclist; ml; ml = ml->next) { +- if (ml->ifindex == mreq->mr_ifindex && +- ml->type == mreq->mr_type && +- ml->alen == mreq->mr_alen && +- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { +- ml->count++; +- /* Free the new element ... 
*/ +- kfree(i); +- goto done; +- } +- } +- +- i->type = mreq->mr_type; +- i->ifindex = mreq->mr_ifindex; +- i->alen = mreq->mr_alen; +- memcpy(i->addr, mreq->mr_address, i->alen); +- i->count = 1; +- i->next = po->mclist; +- po->mclist = i; +- packet_dev_mc(dev, i, +1); +- +-done: +- rtnl_unlock(); +- return err; +-} +- +-static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) +-{ +- struct packet_mclist *ml, **mlp; +- +- rtnl_lock(); +- +- for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { +- if (ml->ifindex == mreq->mr_ifindex && +- ml->type == mreq->mr_type && +- ml->alen == mreq->mr_alen && +- memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { +- if (--ml->count == 0) { +- struct net_device *dev; +- *mlp = ml->next; +- dev = dev_get_by_index(ml->ifindex); +- if (dev) { +- packet_dev_mc(dev, ml, -1); +- dev_put(dev); +- } +- kfree(ml); +- } +- rtnl_unlock(); +- return 0; +- } +- } +- rtnl_unlock(); +- return -EADDRNOTAVAIL; +-} +- +-static void packet_flush_mclist(struct sock *sk) +-{ +- struct packet_sock *po = pkt_sk(sk); +- struct packet_mclist *ml; +- +- if (!po->mclist) +- return; +- +- rtnl_lock(); +- while ((ml = po->mclist) != NULL) { +- struct net_device *dev; +- +- po->mclist = ml->next; +- if ((dev = dev_get_by_index(ml->ifindex)) != NULL) { +- packet_dev_mc(dev, ml, -1); +- dev_put(dev); +- } +- kfree(ml); +- } +- rtnl_unlock(); +-} +- +-static int +-packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +-{ +- struct sock *sk = sock->sk; +- struct packet_sock *po = pkt_sk(sk); +- int ret; +- +- if (level != SOL_PACKET) +- return -ENOPROTOOPT; +- +- switch(optname) { +- case PACKET_ADD_MEMBERSHIP: +- case PACKET_DROP_MEMBERSHIP: +- { +- struct packet_mreq_max mreq; +- int len = optlen; +- memset(&mreq, 0, sizeof(mreq)); +- if (len < sizeof(struct packet_mreq)) +- return -EINVAL; +- if (len > sizeof(mreq)) +- len = sizeof(mreq); +- if (copy_from_user(&mreq,optval,len)) +- return -EFAULT; +- if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) +- return -EINVAL; +- if (optname == PACKET_ADD_MEMBERSHIP) +- ret = packet_mc_add(sk, &mreq); +- else +- ret = packet_mc_drop(sk, &mreq); +- return ret; +- } +- +-#ifdef CONFIG_PACKET_MMAP +- case PACKET_RX_RING: +- { +- struct tpacket_req req; +- +- if (optlencopy_thresh = val; +- return 0; +- } +-#endif +- case PACKET_AUXDATA: +- { +- int val; +- +- if (optlen < sizeof(val)) +- return -EINVAL; +- if (copy_from_user(&val, optval, sizeof(val))) +- return -EFAULT; +- +- po->auxdata = !!val; +- return 0; +- } +- case PACKET_ORIGDEV: +- { +- int val; +- +- if (optlen < sizeof(val)) +- return -EINVAL; +- if (copy_from_user(&val, optval, sizeof(val))) +- return -EFAULT; +- +- po->origdev = !!val; +- return 0; +- } +- default: +- return -ENOPROTOOPT; +- } +-} +- +-static int packet_getsockopt(struct socket *sock, int level, int optname, +- char __user *optval, int __user *optlen) +-{ +- int len; +- int val; +- struct sock *sk = sock->sk; +- struct packet_sock *po = pkt_sk(sk); +- void *data; +- struct tpacket_stats st; +- +- if (level != SOL_PACKET) +- return -ENOPROTOOPT; +- +- if (get_user(len, optlen)) +- return -EFAULT; +- +- if (len < 0) +- return -EINVAL; +- +- switch(optname) { +- case PACKET_STATISTICS: +- if (len > sizeof(struct tpacket_stats)) +- len = sizeof(struct tpacket_stats); +- spin_lock_bh(&sk->sk_receive_queue.lock); +- st = po->stats; +- memset(&po->stats, 0, sizeof(st)); +- spin_unlock_bh(&sk->sk_receive_queue.lock); +- 
st.tp_packets += st.tp_drops; +- +- data = &st; +- break; +- case PACKET_AUXDATA: +- if (len > sizeof(int)) +- len = sizeof(int); +- val = po->auxdata; +- +- data = &val; +- break; +- case PACKET_ORIGDEV: +- if (len > sizeof(int)) +- len = sizeof(int); +- val = po->origdev; +- +- data = &val; +- break; +- default: +- return -ENOPROTOOPT; +- } +- +- if (put_user(len, optlen)) +- return -EFAULT; +- if (copy_to_user(optval, data, len)) +- return -EFAULT; +- return 0; +-} +- +- +-static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +-{ +- struct sock *sk; +- struct hlist_node *node; +- struct net_device *dev = data; +- +- read_lock(&packet_sklist_lock); +- sk_for_each(sk, node, &packet_sklist) { +- struct packet_sock *po = pkt_sk(sk); +- +- switch (msg) { +- case NETDEV_UNREGISTER: +- if (po->mclist) +- packet_dev_mclist(dev, po->mclist, -1); +- /* fallthrough */ +- +- case NETDEV_DOWN: +- if (dev->ifindex == po->ifindex) { +- spin_lock(&po->bind_lock); +- if (po->running) { +- __dev_remove_pack(&po->prot_hook); +- __sock_put(sk); +- po->running = 0; +- sk->sk_err = ENETDOWN; +- if (!sock_flag(sk, SOCK_DEAD)) +- sk->sk_error_report(sk); +- } +- if (msg == NETDEV_UNREGISTER) { +- po->ifindex = -1; +- po->prot_hook.dev = NULL; +- } +- spin_unlock(&po->bind_lock); +- } +- break; +- case NETDEV_UP: +- spin_lock(&po->bind_lock); +- if (dev->ifindex == po->ifindex && po->num && +- !po->running) { +- dev_add_pack(&po->prot_hook); +- sock_hold(sk); +- po->running = 1; +- } +- spin_unlock(&po->bind_lock); +- break; +- } +- } +- read_unlock(&packet_sklist_lock); +- return NOTIFY_DONE; +-} +- +- +-static int packet_ioctl(struct socket *sock, unsigned int cmd, +- unsigned long arg) +-{ +- struct sock *sk = sock->sk; +- +- switch(cmd) { +- case SIOCOUTQ: +- { +- int amount = atomic_read(&sk->sk_wmem_alloc); +- return put_user(amount, (int __user *)arg); +- } +- case SIOCINQ: +- { +- struct sk_buff *skb; +- int amount = 0; +- +- spin_lock_bh(&sk->sk_receive_queue.lock); +- skb = skb_peek(&sk->sk_receive_queue); +- if (skb) +- amount = skb->len; +- spin_unlock_bh(&sk->sk_receive_queue.lock); +- return put_user(amount, (int __user *)arg); +- } +- case SIOCGSTAMP: +- return sock_get_timestamp(sk, (struct timeval __user *)arg); +- case SIOCGSTAMPNS: +- return sock_get_timestampns(sk, (struct timespec __user *)arg); +- +-#ifdef CONFIG_INET +- case SIOCADDRT: +- case SIOCDELRT: +- case SIOCDARP: +- case SIOCGARP: +- case SIOCSARP: +- case SIOCGIFADDR: +- case SIOCSIFADDR: +- case SIOCGIFBRDADDR: +- case SIOCSIFBRDADDR: +- case SIOCGIFNETMASK: +- case SIOCSIFNETMASK: +- case SIOCGIFDSTADDR: +- case SIOCSIFDSTADDR: +- case SIOCSIFFLAGS: +- return inet_dgram_ops.ioctl(sock, cmd, arg); +-#endif +- +- default: +- return -ENOIOCTLCMD; +- } +- return 0; +-} +- +-#ifndef CONFIG_PACKET_MMAP +-#define packet_mmap sock_no_mmap +-#define packet_poll datagram_poll +-#else +- +-static unsigned int packet_poll(struct file * file, struct socket *sock, +- poll_table *wait) +-{ +- struct sock *sk = sock->sk; +- struct packet_sock *po = pkt_sk(sk); +- unsigned int mask = datagram_poll(file, sock, wait); +- +- spin_lock_bh(&sk->sk_receive_queue.lock); +- if (po->pg_vec) { +- unsigned last = po->head ? po->head-1 : po->frame_max; +- struct tpacket_hdr *h; +- +- h = packet_lookup_frame(po, last); +- +- if (h->tp_status) +- mask |= POLLIN | POLLRDNORM; +- } +- spin_unlock_bh(&sk->sk_receive_queue.lock); +- return mask; +-} +- +- +-/* Dirty? 
Well, I still did not learn better way to account +- * for user mmaps. +- */ +- +-static void packet_mm_open(struct vm_area_struct *vma) +-{ +- struct file *file = vma->vm_file; +- struct socket * sock = file->private_data; +- struct sock *sk = sock->sk; +- +- if (sk) +- atomic_inc(&pkt_sk(sk)->mapped); +-} +- +-static void packet_mm_close(struct vm_area_struct *vma) +-{ +- struct file *file = vma->vm_file; +- struct socket * sock = file->private_data; +- struct sock *sk = sock->sk; +- +- if (sk) +- atomic_dec(&pkt_sk(sk)->mapped); +-} +- +-static struct vm_operations_struct packet_mmap_ops = { +- .open = packet_mm_open, +- .close =packet_mm_close, +-}; +- +-static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) +-{ +- return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); +-} +- +-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) +-{ +- int i; +- +- for (i = 0; i < len; i++) { +- if (likely(pg_vec[i])) +- free_pages((unsigned long) pg_vec[i], order); +- } +- kfree(pg_vec); +-} +- +-static inline char *alloc_one_pg_vec_page(unsigned long order) +-{ +- return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO, +- order); +-} +- +-static char **alloc_pg_vec(struct tpacket_req *req, int order) +-{ +- unsigned int block_nr = req->tp_block_nr; +- char **pg_vec; +- int i; +- +- pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); +- if (unlikely(!pg_vec)) +- goto out; +- +- for (i = 0; i < block_nr; i++) { +- pg_vec[i] = alloc_one_pg_vec_page(order); +- if (unlikely(!pg_vec[i])) +- goto out_free_pgvec; +- } +- +-out: +- return pg_vec; +- +-out_free_pgvec: +- free_pg_vec(pg_vec, order, block_nr); +- pg_vec = NULL; +- goto out; +-} +- +-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) +-{ +- char **pg_vec = NULL; +- struct packet_sock *po = pkt_sk(sk); +- int was_running, order = 0; +- __be16 num; +- int err = 0; +- +- if (req->tp_block_nr) { +- int i, l; +- +- /* Sanity tests and some calculations */ +- +- if (unlikely(po->pg_vec)) +- return -EBUSY; +- +- if (unlikely((int)req->tp_block_size <= 0)) +- return -EINVAL; +- if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) +- return -EINVAL; +- if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) +- return -EINVAL; +- if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) +- return -EINVAL; +- +- po->frames_per_block = req->tp_block_size/req->tp_frame_size; +- if (unlikely(po->frames_per_block <= 0)) +- return -EINVAL; +- if (unlikely((po->frames_per_block * req->tp_block_nr) != +- req->tp_frame_nr)) +- return -EINVAL; +- +- err = -ENOMEM; +- order = get_order(req->tp_block_size); +- pg_vec = alloc_pg_vec(req, order); +- if (unlikely(!pg_vec)) +- goto out; +- +- l = 0; +- for (i = 0; i < req->tp_block_nr; i++) { +- char *ptr = pg_vec[i]; +- struct tpacket_hdr *header; +- int k; +- +- for (k = 0; k < po->frames_per_block; k++) { +- header = (struct tpacket_hdr *) ptr; +- header->tp_status = TP_STATUS_KERNEL; +- ptr += req->tp_frame_size; +- } +- } +- /* Done */ +- } else { +- if (unlikely(req->tp_frame_nr)) +- return -EINVAL; +- } +- +- lock_sock(sk); +- +- /* Detach socket from network */ +- spin_lock(&po->bind_lock); +- was_running = po->running; +- num = po->num; +- if (was_running) { +- __dev_remove_pack(&po->prot_hook); +- po->num = 0; +- po->running = 0; +- __sock_put(sk); +- } +- spin_unlock(&po->bind_lock); +- +- synchronize_net(); +- +- err = -EBUSY; +- if (closing || atomic_read(&po->mapped) == 0) { +- err = 0; +-#define XC(a, b) ({ 
__typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) +- +- spin_lock_bh(&sk->sk_receive_queue.lock); +- pg_vec = XC(po->pg_vec, pg_vec); +- po->frame_max = (req->tp_frame_nr - 1); +- po->head = 0; +- po->frame_size = req->tp_frame_size; +- spin_unlock_bh(&sk->sk_receive_queue.lock); +- +- order = XC(po->pg_vec_order, order); +- req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); +- +- po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; +- po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv; +- skb_queue_purge(&sk->sk_receive_queue); +-#undef XC +- if (atomic_read(&po->mapped)) +- printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); +- } +- +- spin_lock(&po->bind_lock); +- if (was_running && !po->running) { +- sock_hold(sk); +- po->running = 1; +- po->num = num; +- dev_add_pack(&po->prot_hook); +- } +- spin_unlock(&po->bind_lock); +- +- release_sock(sk); +- +- if (pg_vec) +- free_pg_vec(pg_vec, order, req->tp_block_nr); +-out: +- return err; +-} +- +-static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +-{ +- struct sock *sk = sock->sk; +- struct packet_sock *po = pkt_sk(sk); +- unsigned long size; +- unsigned long start; +- int err = -EINVAL; +- int i; +- +- if (vma->vm_pgoff) +- return -EINVAL; +- +- size = vma->vm_end - vma->vm_start; +- +- lock_sock(sk); +- if (po->pg_vec == NULL) +- goto out; +- if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) +- goto out; +- +- start = vma->vm_start; +- for (i = 0; i < po->pg_vec_len; i++) { +- struct page *page = virt_to_page(po->pg_vec[i]); +- int pg_num; +- +- for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { +- err = vm_insert_page(vma, start, page); +- if (unlikely(err)) +- goto out; +- start += PAGE_SIZE; +- } +- } +- atomic_inc(&po->mapped); +- vma->vm_ops = &packet_mmap_ops; +- err = 0; +- +-out: +- release_sock(sk); +- return err; +-} +-#endif +- +- +-static const struct proto_ops packet_ops_spkt = { +- .family = PF_PACKET, +- .owner = THIS_MODULE, +- .release = packet_release, +- .bind = packet_bind_spkt, +- .connect = sock_no_connect, +- .socketpair = sock_no_socketpair, +- .accept = sock_no_accept, +- .getname = packet_getname_spkt, +- .poll = datagram_poll, +- .ioctl = packet_ioctl, +- .listen = sock_no_listen, +- .shutdown = sock_no_shutdown, +- .setsockopt = sock_no_setsockopt, +- .getsockopt = sock_no_getsockopt, +- .sendmsg = packet_sendmsg_spkt, +- .recvmsg = packet_recvmsg, +- .mmap = sock_no_mmap, +- .sendpage = sock_no_sendpage, +-}; +- +-static const struct proto_ops packet_ops = { +- .family = PF_PACKET, +- .owner = THIS_MODULE, +- .release = packet_release, +- .bind = packet_bind, +- .connect = sock_no_connect, +- .socketpair = sock_no_socketpair, +- .accept = sock_no_accept, +- .getname = packet_getname, +- .poll = packet_poll, +- .ioctl = packet_ioctl, +- .listen = sock_no_listen, +- .shutdown = sock_no_shutdown, +- .setsockopt = packet_setsockopt, +- .getsockopt = packet_getsockopt, +- .sendmsg = packet_sendmsg, +- .recvmsg = packet_recvmsg, +- .mmap = packet_mmap, +- .sendpage = sock_no_sendpage, +-}; +- +-static struct net_proto_family packet_family_ops = { +- .family = PF_PACKET, +- .create = packet_create, +- .owner = THIS_MODULE, +-}; +- +-static struct notifier_block packet_netdev_notifier = { +- .notifier_call =packet_notifier, +-}; +- +-#ifdef CONFIG_PROC_FS +-static inline struct sock *packet_seq_idx(loff_t off) +-{ +- struct sock *s; +- struct hlist_node *node; +- +- sk_for_each(s, node, &packet_sklist) { +- if (!off--) +- 
return s; +- } +- return NULL; +-} +- +-static void *packet_seq_start(struct seq_file *seq, loff_t *pos) +-{ +- read_lock(&packet_sklist_lock); +- return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN; +-} +- +-static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +-{ +- ++*pos; +- return (v == SEQ_START_TOKEN) +- ? sk_head(&packet_sklist) +- : sk_next((struct sock*)v) ; +-} +- +-static void packet_seq_stop(struct seq_file *seq, void *v) +-{ +- read_unlock(&packet_sklist_lock); +-} +- +-static int packet_seq_show(struct seq_file *seq, void *v) +-{ +- if (v == SEQ_START_TOKEN) +- seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); +- else { +- struct sock *s = v; +- const struct packet_sock *po = pkt_sk(s); +- +- seq_printf(seq, +- "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", +- s, +- atomic_read(&s->sk_refcnt), +- s->sk_type, +- ntohs(po->num), +- po->ifindex, +- po->running, +- atomic_read(&s->sk_rmem_alloc), +- sock_i_uid(s), +- sock_i_ino(s) ); +- } +- +- return 0; +-} +- +-static struct seq_operations packet_seq_ops = { +- .start = packet_seq_start, +- .next = packet_seq_next, +- .stop = packet_seq_stop, +- .show = packet_seq_show, +-}; +- +-static int packet_seq_open(struct inode *inode, struct file *file) +-{ +- return seq_open(file, &packet_seq_ops); +-} +- +-static const struct file_operations packet_seq_fops = { +- .owner = THIS_MODULE, +- .open = packet_seq_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = seq_release, +-}; +- +-#endif +- +-static void __exit packet_exit(void) +-{ +- proc_net_remove("packet"); +- unregister_netdevice_notifier(&packet_netdev_notifier); +- sock_unregister(PF_PACKET); +- proto_unregister(&packet_proto); +-} +- +-static int __init packet_init(void) +-{ +- int rc = proto_register(&packet_proto, 0); +- +- if (rc != 0) +- goto out; +- +- sock_register(&packet_family_ops); +- register_netdevice_notifier(&packet_netdev_notifier); +- proc_net_fops_create("packet", 0, &packet_seq_fops); +-out: +- return rc; +-} +- +-module_init(packet_init); +-module_exit(packet_exit); +-MODULE_LICENSE("GPL"); +-MODULE_ALIAS_NETPROTO(PF_PACKET); +diff -Nurb linux-2.6.22-594/net/socket.c linux-2.6.22-595/net/socket.c +--- linux-2.6.22-594/net/socket.c 2008-03-20 00:05:19.000000000 -0400 ++++ linux-2.6.22-595/net/socket.c 2008-03-20 00:14:03.000000000 -0400 +@@ -1122,12 +1122,17 @@ + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + ++ /* ++ * Hack no. 2 - Sapan ++ * Clean this up later ++ * + if (!nx_check(0, VS_ADMIN)) { + if (family == PF_INET && !current_nx_info_has_v4()) + return -EAFNOSUPPORT; + if (family == PF_INET6 && !current_nx_info_has_v6()) + return -EAFNOSUPPORT; + } ++ */ + + /* Compatibility. + +diff -Nurb linux-2.6.22-594/net/socket.c.orig linux-2.6.22-595/net/socket.c.orig +--- linux-2.6.22-594/net/socket.c.orig 1969-12-31 19:00:00.000000000 -0500 ++++ linux-2.6.22-595/net/socket.c.orig 2008-03-20 00:05:19.000000000 -0400 +@@ -0,0 +1,2400 @@ ++/* ++ * NET An implementation of the SOCKET network access protocol. ++ * ++ * Version: @(#)socket.c 1.1.93 18/02/95 ++ * ++ * Authors: Orest Zborowski, ++ * Ross Biro ++ * Fred N. van Kempen, ++ * ++ * Fixes: ++ * Anonymous : NOTSOCK/BADF cleanup. Error fix in ++ * shutdown() ++ * Alan Cox : verify_area() fixes ++ * Alan Cox : Removed DDI ++ * Jonathan Kamens : SOCK_DGRAM reconnect bug ++ * Alan Cox : Moved a load of checks to the very ++ * top level. ++ * Alan Cox : Move address structures to/from user ++ * mode above the protocol layers. 
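The functional change of this revision is the small net/socket.c hunk above: the per-network-context address-family gate (nx_check(0, VS_ADMIN) together with current_nx_info_has_v4()/_v6()) is commented out of the socket-creation path, so a context whose IPv4/IPv6 addresses are not set up yet can still create PF_INET/PF_INET6 sockets. Illustrative only, not part of the patch: with the gate active such a context saw socket(2) fail up front, as below; with the gate removed the call succeeds and later operations decide.

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0)
        /* With the nx_check() gate in place, a context without IPv4
         * addresses got EAFNOSUPPORT here; with the gate commented out
         * the call succeeds even before addresses are assigned. */
        printf("socket: %s\n", strerror(errno));
    else
        printf("socket created: fd %d\n", fd);
    return 0;
}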
++ * Rob Janssen : Allow 0 length sends. ++ * Alan Cox : Asynchronous I/O support (cribbed from the ++ * tty drivers). ++ * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) ++ * Jeff Uphoff : Made max number of sockets command-line ++ * configurable. ++ * Matti Aarnio : Made the number of sockets dynamic, ++ * to be allocated when needed, and mr. ++ * Uphoff's max is used as max to be ++ * allowed to allocate. ++ * Linus : Argh. removed all the socket allocation ++ * altogether: it's in the inode now. ++ * Alan Cox : Made sock_alloc()/sock_release() public ++ * for NetROM and future kernel nfsd type ++ * stuff. ++ * Alan Cox : sendmsg/recvmsg basics. ++ * Tom Dyas : Export net symbols. ++ * Marcin Dalecki : Fixed problems with CONFIG_NET="n". ++ * Alan Cox : Added thread locking to sys_* calls ++ * for sockets. May have errors at the ++ * moment. ++ * Kevin Buhr : Fixed the dumb errors in the above. ++ * Andi Kleen : Some small cleanups, optimizations, ++ * and fixed a copy_from_user() bug. ++ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) ++ * Tigran Aivazian : Made listen(2) backlog sanity checks ++ * protocol-independent ++ * ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * ++ * This module is effectively the top level interface to the BSD socket ++ * paradigm. ++ * ++ * Based upon Swansea University Computer Society NET3.039 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int sock_no_open(struct inode *irrelevant, struct file *dontcare); ++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos); ++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos); ++static int sock_mmap(struct file *file, struct vm_area_struct *vma); ++ ++static int sock_close(struct inode *inode, struct file *file); ++static unsigned int sock_poll(struct file *file, ++ struct poll_table_struct *wait); ++static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++static long compat_sock_ioctl(struct file *file, ++ unsigned int cmd, unsigned long arg); ++#endif ++static int sock_fasync(int fd, struct file *filp, int on); ++static ssize_t sock_sendpage(struct file *file, struct page *page, ++ int offset, size_t size, loff_t *ppos, int more); ++ ++/* ++ * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear ++ * in the operation structures but are done directly via the socketcall() multiplexor. 
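socket_file_ops above is what lets a socket descriptor behave like any other file: read(2)/write(2), poll(2), mmap(2) and ioctl(2) on the fd are routed through these file operations into the socket layer, while the socket-specific calls arrive via the socketcall() multiplexor instead. A small illustration, not taken from the patch, of plain write(2) on a connected UDP socket; port 9 (discard) is just a convenient destination that needs no listener.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst;

    memset(&dst, 0, sizeof(dst));
    dst.sin_family      = AF_INET;
    dst.sin_port        = htons(9);                  /* discard service */
    dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    if (fd < 0 || connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
        perror("socket/connect");
        return 1;
    }
    /* write(2) instead of send(2): the fd's file_operations forward to the socket layer. */
    if (write(fd, "ping", 4) != 4)
        perror("write");
    close(fd);
    return 0;
}

The write is funneled into the same sendmsg path the socket calls use, so accounting hooks such as vx_sock_send() further down see it as well.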
++ */ ++ ++static const struct file_operations socket_file_ops = { ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .aio_read = sock_aio_read, ++ .aio_write = sock_aio_write, ++ .poll = sock_poll, ++ .unlocked_ioctl = sock_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = compat_sock_ioctl, ++#endif ++ .mmap = sock_mmap, ++ .open = sock_no_open, /* special open code to disallow open via /proc */ ++ .release = sock_close, ++ .fasync = sock_fasync, ++ .sendpage = sock_sendpage, ++ .splice_write = generic_splice_sendpage, ++}; ++ ++/* ++ * The protocol list. Each protocol is registered in here. ++ */ ++ ++static DEFINE_SPINLOCK(net_family_lock); ++static const struct net_proto_family *net_families[NPROTO] __read_mostly; ++ ++/* ++ * Statistics counters of the socket lists ++ */ ++ ++static DEFINE_PER_CPU(int, sockets_in_use) = 0; ++ ++/* ++ * Support routines. ++ * Move socket addresses back and forth across the kernel/user ++ * divide and look after the messy bits. ++ */ ++ ++#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - ++ 16 for IP, 16 for IPX, ++ 24 for IPv6, ++ about 80 for AX.25 ++ must be at least one bigger than ++ the AF_UNIX size (see net/unix/af_unix.c ++ :unix_mkname()). ++ */ ++ ++/** ++ * move_addr_to_kernel - copy a socket address into kernel space ++ * @uaddr: Address in user space ++ * @kaddr: Address in kernel space ++ * @ulen: Length in user space ++ * ++ * The address is copied into kernel space. If the provided address is ++ * too long an error code of -EINVAL is returned. If the copy gives ++ * invalid addresses -EFAULT is returned. On a success 0 is returned. ++ */ ++ ++int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr) ++{ ++ if (ulen < 0 || ulen > MAX_SOCK_ADDR) ++ return -EINVAL; ++ if (ulen == 0) ++ return 0; ++ if (copy_from_user(kaddr, uaddr, ulen)) ++ return -EFAULT; ++ return audit_sockaddr(ulen, kaddr); ++} ++ ++/** ++ * move_addr_to_user - copy an address to user space ++ * @kaddr: kernel space address ++ * @klen: length of address in kernel ++ * @uaddr: user space address ++ * @ulen: pointer to user length field ++ * ++ * The value pointed to by ulen on entry is the buffer length available. ++ * This is overwritten with the buffer space used. -EINVAL is returned ++ * if an overlong buffer is specified or a negative buffer size. -EFAULT ++ * is returned if either the buffer or the length field are not ++ * accessible. ++ * After copying the data up to the limit the user specifies, the true ++ * length of the data is written over the length limit the user ++ * specified. Zero is returned for a success. ++ */ ++ ++int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, ++ int __user *ulen) ++{ ++ int err; ++ int len; ++ ++ err = get_user(len, ulen); ++ if (err) ++ return err; ++ if (len > klen) ++ len = klen; ++ if (len < 0 || len > MAX_SOCK_ADDR) ++ return -EINVAL; ++ if (len) { ++ if (audit_sockaddr(klen, kaddr)) ++ return -ENOMEM; ++ if (copy_to_user(uaddr, kaddr, len)) ++ return -EFAULT; ++ } ++ /* ++ * "fromlen shall refer to the value before truncation.." 
++ * 1003.1g ++ */ ++ return __put_user(klen, ulen); ++} ++ ++#define SOCKFS_MAGIC 0x534F434B ++ ++static struct kmem_cache *sock_inode_cachep __read_mostly; ++ ++static struct inode *sock_alloc_inode(struct super_block *sb) ++{ ++ struct socket_alloc *ei; ++ ++ ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); ++ if (!ei) ++ return NULL; ++ init_waitqueue_head(&ei->socket.wait); ++ ++ ei->socket.fasync_list = NULL; ++ ei->socket.state = SS_UNCONNECTED; ++ ei->socket.flags = 0; ++ ei->socket.ops = NULL; ++ ei->socket.sk = NULL; ++ ei->socket.file = NULL; ++ ++ return &ei->vfs_inode; ++} ++ ++static void sock_destroy_inode(struct inode *inode) ++{ ++ kmem_cache_free(sock_inode_cachep, ++ container_of(inode, struct socket_alloc, vfs_inode)); ++} ++ ++static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) ++{ ++ struct socket_alloc *ei = (struct socket_alloc *)foo; ++ ++ inode_init_once(&ei->vfs_inode); ++} ++ ++static int init_inodecache(void) ++{ ++ sock_inode_cachep = kmem_cache_create("sock_inode_cache", ++ sizeof(struct socket_alloc), ++ 0, ++ (SLAB_HWCACHE_ALIGN | ++ SLAB_RECLAIM_ACCOUNT | ++ SLAB_MEM_SPREAD), ++ init_once, ++ NULL); ++ if (sock_inode_cachep == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++static struct super_operations sockfs_ops = { ++ .alloc_inode = sock_alloc_inode, ++ .destroy_inode =sock_destroy_inode, ++ .statfs = simple_statfs, ++}; ++ ++static int sockfs_get_sb(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data, ++ struct vfsmount *mnt) ++{ ++ return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, ++ mnt); ++} ++ ++static struct vfsmount *sock_mnt __read_mostly; ++ ++static struct file_system_type sock_fs_type = { ++ .name = "sockfs", ++ .get_sb = sockfs_get_sb, ++ .kill_sb = kill_anon_super, ++}; ++ ++static int sockfs_delete_dentry(struct dentry *dentry) ++{ ++ /* ++ * At creation time, we pretended this dentry was hashed ++ * (by clearing DCACHE_UNHASHED bit in d_flags) ++ * At delete time, we restore the truth : not hashed. ++ * (so that dput() can proceed correctly) ++ */ ++ dentry->d_flags |= DCACHE_UNHASHED; ++ return 0; ++} ++ ++/* ++ * sockfs_dname() is called from d_path(). ++ */ ++static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) ++{ ++ return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", ++ dentry->d_inode->i_ino); ++} ++ ++static struct dentry_operations sockfs_dentry_operations = { ++ .d_delete = sockfs_delete_dentry, ++ .d_dname = sockfs_dname, ++}; ++ ++/* ++ * Obtains the first available file descriptor and sets it up for use. ++ * ++ * These functions create file structures and maps them to fd space ++ * of the current process. On success it returns file descriptor ++ * and file struct implicitly stored in sock->file. ++ * Note that another thread may close file descriptor before we return ++ * from this function. We use the fact that now we do not refer ++ * to socket after mapping. If one day we will need it, this ++ * function will increment ref. count on file by 1. ++ * ++ * In any case returned fd MAY BE not valid! ++ * This race condition is unavoidable ++ * with shared fd spaces, we cannot solve it inside kernel, ++ * but we take care of internal coherence yet. 
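The pretend-hashed dentry trick and sockfs_dname() above are what make the /proc/<pid>/fd entry of a socket resolve to the synthetic name socket:[inode] rather than a real path. A quick way to observe this from user space (illustrative only, not part of the patch):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
    char path[64], target[64];
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    ssize_t n;

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
    n = readlink(path, target, sizeof(target) - 1);
    if (n < 0) {
        perror("readlink");
        return 1;
    }
    target[n] = '\0';
    printf("%s -> %s\n", path, target);   /* e.g. "socket:[12345]" */
    return 0;
}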
++ */ ++ ++static int sock_alloc_fd(struct file **filep) ++{ ++ int fd; ++ ++ fd = get_unused_fd(); ++ if (likely(fd >= 0)) { ++ struct file *file = get_empty_filp(); ++ ++ *filep = file; ++ if (unlikely(!file)) { ++ put_unused_fd(fd); ++ return -ENFILE; ++ } ++ } else ++ *filep = NULL; ++ return fd; ++} ++ ++static int sock_attach_fd(struct socket *sock, struct file *file) ++{ ++ struct qstr name = { .name = "" }; ++ ++ file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); ++ if (unlikely(!file->f_path.dentry)) ++ return -ENOMEM; ++ ++ file->f_path.dentry->d_op = &sockfs_dentry_operations; ++ /* ++ * We dont want to push this dentry into global dentry hash table. ++ * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED ++ * This permits a working /proc/$pid/fd/XXX on sockets ++ */ ++ file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED; ++ d_instantiate(file->f_path.dentry, SOCK_INODE(sock)); ++ file->f_path.mnt = mntget(sock_mnt); ++ file->f_mapping = file->f_path.dentry->d_inode->i_mapping; ++ ++ sock->file = file; ++ file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops; ++ file->f_mode = FMODE_READ | FMODE_WRITE; ++ file->f_flags = O_RDWR; ++ file->f_pos = 0; ++ file->private_data = sock; ++ ++ return 0; ++} ++ ++int sock_map_fd(struct socket *sock) ++{ ++ struct file *newfile; ++ int fd = sock_alloc_fd(&newfile); ++ ++ if (likely(fd >= 0)) { ++ int err = sock_attach_fd(sock, newfile); ++ ++ if (unlikely(err < 0)) { ++ put_filp(newfile); ++ put_unused_fd(fd); ++ return err; ++ } ++ fd_install(fd, newfile); ++ } ++ return fd; ++} ++ ++static struct socket *sock_from_file(struct file *file, int *err) ++{ ++ if (file->f_op == &socket_file_ops) ++ return file->private_data; /* set in sock_map_fd */ ++ ++ *err = -ENOTSOCK; ++ return NULL; ++} ++ ++/** ++ * sockfd_lookup - Go from a file number to its socket slot ++ * @fd: file handle ++ * @err: pointer to an error code return ++ * ++ * The file handle passed in is locked and the socket it is bound ++ * too is returned. If an error occurs the err pointer is overwritten ++ * with a negative errno code and NULL is returned. The function checks ++ * for both invalid handles and passing a handle which is not a socket. ++ * ++ * On a success the socket object pointer is returned. ++ */ ++ ++struct socket *sockfd_lookup(int fd, int *err) ++{ ++ struct file *file; ++ struct socket *sock; ++ ++ file = fget(fd); ++ if (!file) { ++ *err = -EBADF; ++ return NULL; ++ } ++ ++ sock = sock_from_file(file, err); ++ if (!sock) ++ fput(file); ++ return sock; ++} ++ ++static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) ++{ ++ struct file *file; ++ struct socket *sock; ++ ++ *err = -EBADF; ++ file = fget_light(fd, fput_needed); ++ if (file) { ++ sock = sock_from_file(file, err); ++ if (sock) ++ return sock; ++ fput_light(file, *fput_needed); ++ } ++ return NULL; ++} ++ ++/** ++ * sock_alloc - allocate a socket ++ * ++ * Allocate a new inode and socket object. The two are bound together ++ * and initialised. The socket is then returned. If we are out of inodes ++ * NULL is returned. 
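sock_from_file()/sockfd_lookup() above are the gatekeepers for every socket system call that starts from a plain file descriptor: if the fd's file operations are not socket_file_ops, the caller gets -ENOTSOCK. That is easy to provoke from user space (illustrative only, not part of the patch):

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_storage ss;
    socklen_t len = sizeof(ss);
    int fd = open("/dev/null", O_RDONLY);   /* any non-socket fd will do */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (getsockname(fd, (struct sockaddr *)&ss, &len) < 0)
        printf("getsockname: %s (expected ENOTSOCK)\n", strerror(errno));
    return 0;
}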
++ */ ++ ++static struct socket *sock_alloc(void) ++{ ++ struct inode *inode; ++ struct socket *sock; ++ ++ inode = new_inode(sock_mnt->mnt_sb); ++ if (!inode) ++ return NULL; ++ ++ sock = SOCKET_I(inode); ++ ++ inode->i_mode = S_IFSOCK | S_IRWXUGO; ++ inode->i_uid = current->fsuid; ++ inode->i_gid = current->fsgid; ++ ++ get_cpu_var(sockets_in_use)++; ++ put_cpu_var(sockets_in_use); ++ return sock; ++} ++ ++/* ++ * In theory you can't get an open on this inode, but /proc provides ++ * a back door. Remember to keep it shut otherwise you'll let the ++ * creepy crawlies in. ++ */ ++ ++static int sock_no_open(struct inode *irrelevant, struct file *dontcare) ++{ ++ return -ENXIO; ++} ++ ++const struct file_operations bad_sock_fops = { ++ .owner = THIS_MODULE, ++ .open = sock_no_open, ++}; ++ ++/** ++ * sock_release - close a socket ++ * @sock: socket to close ++ * ++ * The socket is released from the protocol stack if it has a release ++ * callback, and the inode is then released if the socket is bound to ++ * an inode not a file. ++ */ ++ ++void sock_release(struct socket *sock) ++{ ++ if (sock->ops) { ++ struct module *owner = sock->ops->owner; ++ ++ sock->ops->release(sock); ++ sock->ops = NULL; ++ module_put(owner); ++ } ++ ++ if (sock->fasync_list) ++ printk(KERN_ERR "sock_release: fasync list not empty!\n"); ++ ++ get_cpu_var(sockets_in_use)--; ++ put_cpu_var(sockets_in_use); ++ if (!sock->file) { ++ iput(SOCK_INODE(sock)); ++ return; ++ } ++ sock->file = NULL; ++} ++ ++static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, ++ struct msghdr *msg, size_t size) ++{ ++ struct sock_iocb *si = kiocb_to_siocb(iocb); ++ int err, len; ++ ++ si->sock = sock; ++ si->scm = NULL; ++ si->msg = msg; ++ si->size = size; ++ ++ err = security_socket_sendmsg(sock, msg, size); ++ if (err) ++ return err; ++ ++ len = sock->ops->sendmsg(iocb, sock, msg, size); ++ if (sock->sk) { ++ if (len == size) ++ vx_sock_send(sock->sk, size); ++ else ++ vx_sock_fail(sock->sk, size); ++ } ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%d", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, ++ (sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; ++} ++ ++int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) ++{ ++ struct kiocb iocb; ++ struct sock_iocb siocb; ++ int ret; ++ ++ init_sync_kiocb(&iocb, NULL); ++ iocb.private = &siocb; ++ ret = __sock_sendmsg(&iocb, sock, msg, size); ++ if (-EIOCBQUEUED == ret) ++ ret = wait_on_sync_kiocb(&iocb); ++ return ret; ++} ++ ++int kernel_sendmsg(struct socket *sock, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int result; ++ ++ set_fs(KERNEL_DS); ++ /* ++ * the following is safe, since for compiler definitions of kvec and ++ * iovec are identical, yielding the same in-core layout and alignment ++ */ ++ msg->msg_iov = (struct iovec *)vec; ++ msg->msg_iovlen = num; ++ result = sock_sendmsg(sock, msg, size); ++ set_fs(oldfs); ++ return result; ++} ++ ++/* ++ * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) ++ */ ++void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, ++ struct sk_buff *skb) ++{ ++ ktime_t kt = skb->tstamp; ++ ++ if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { ++ struct timeval tv; ++ /* Race occurred between timestamp enabling and packet ++ receiving. Fill in the current time for now. 
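__sock_recv_timestamp(), which begins just above, turns the skb timestamp into an SCM_TIMESTAMP (or SCM_TIMESTAMPNS) control message once SO_TIMESTAMP has been enabled on the socket. The sketch below, not part of the patch, enables the option on a loopback UDP socket, sends itself a datagram, and reads the timestamp back with recvmsg(2).

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
    int on = 1;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in addr;
    socklen_t alen = sizeof(addr);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family      = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port        = 0;                 /* let the kernel pick a port */

    if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        getsockname(fd, (struct sockaddr *)&addr, &alen) < 0 ||
        setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on)) < 0) {
        perror("setup");
        return 1;
    }
    sendto(fd, "x", 1, 0, (struct sockaddr *)&addr, sizeof(addr));

    char data[16], cbuf[64];
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
    };
    if (recvmsg(fd, &msg, 0) < 0) {
        perror("recvmsg");
        return 1;
    }

    struct cmsghdr *cm;
    for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
        if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMP) {
            struct timeval tv;
            memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
            printf("received at %ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
        }
    return 0;
}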
*/ ++ if (kt.tv64 == 0) ++ kt = ktime_get_real(); ++ skb->tstamp = kt; ++ tv = ktime_to_timeval(kt); ++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); ++ } else { ++ struct timespec ts; ++ /* Race occurred between timestamp enabling and packet ++ receiving. Fill in the current time for now. */ ++ if (kt.tv64 == 0) ++ kt = ktime_get_real(); ++ skb->tstamp = kt; ++ ts = ktime_to_timespec(kt); ++ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); ++ } ++} ++ ++EXPORT_SYMBOL_GPL(__sock_recv_timestamp); ++ ++static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, ++ struct msghdr *msg, size_t size, int flags) ++{ ++ int err, len; ++ struct sock_iocb *si = kiocb_to_siocb(iocb); ++ ++ si->sock = sock; ++ si->scm = NULL; ++ si->msg = msg; ++ si->size = size; ++ si->flags = flags; ++ ++ err = security_socket_recvmsg(sock, msg, size, flags); ++ if (err) ++ return err; ++ ++ len = sock->ops->recvmsg(iocb, sock, msg, size, flags); ++ if ((len >= 0) && sock->sk) ++ vx_sock_recv(sock->sk, len); ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, ++ (sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; ++} ++ ++int sock_recvmsg(struct socket *sock, struct msghdr *msg, ++ size_t size, int flags) ++{ ++ struct kiocb iocb; ++ struct sock_iocb siocb; ++ int ret; ++ ++ init_sync_kiocb(&iocb, NULL); ++ iocb.private = &siocb; ++ ret = __sock_recvmsg(&iocb, sock, msg, size, flags); ++ if (-EIOCBQUEUED == ret) ++ ret = wait_on_sync_kiocb(&iocb); ++ return ret; ++} ++ ++int kernel_recvmsg(struct socket *sock, struct msghdr *msg, ++ struct kvec *vec, size_t num, size_t size, int flags) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int result; ++ ++ set_fs(KERNEL_DS); ++ /* ++ * the following is safe, since for compiler definitions of kvec and ++ * iovec are identical, yielding the same in-core layout and alignment ++ */ ++ msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num; ++ result = sock_recvmsg(sock, msg, size, flags); ++ set_fs(oldfs); ++ return result; ++} ++ ++static void sock_aio_dtor(struct kiocb *iocb) ++{ ++ kfree(iocb->private); ++} ++ ++static ssize_t sock_sendpage(struct file *file, struct page *page, ++ int offset, size_t size, loff_t *ppos, int more) ++{ ++ struct socket *sock; ++ int flags; ++ ++ sock = file->private_data; ++ ++ flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; ++ if (more) ++ flags |= MSG_MORE; ++ ++ return sock->ops->sendpage(sock, page, offset, size, flags); ++} ++ ++static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, ++ struct sock_iocb *siocb) ++{ ++ if (!is_sync_kiocb(iocb)) { ++ siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); ++ if (!siocb) ++ return NULL; ++ iocb->ki_dtor = sock_aio_dtor; ++ } ++ ++ siocb->kiocb = iocb; ++ iocb->private = siocb; ++ return siocb; ++} ++ ++static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, ++ struct file *file, const struct iovec *iov, ++ unsigned long nr_segs) ++{ ++ struct socket *sock = file->private_data; ++ size_t size = 0; ++ int i; ++ ++ for (i = 0; i < nr_segs; i++) ++ size += iov[i].iov_len; ++ ++ msg->msg_name = NULL; ++ msg->msg_namelen = 0; ++ msg->msg_control = NULL; ++ msg->msg_controllen = 0; ++ msg->msg_iov = (struct iovec *)iov; ++ msg->msg_iovlen = nr_segs; ++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; ++ ++ return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); ++} ++ ++static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos) ++{ ++ struct sock_iocb siocb, *x; ++ ++ if (pos != 0) ++ return -ESPIPE; ++ ++ if (iocb->ki_left == 0) /* Match SYS5 behaviour */ ++ return 0; ++ ++ ++ x = alloc_sock_iocb(iocb, &siocb); ++ if (!x) ++ return -ENOMEM; ++ return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); ++} ++ ++static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, ++ struct file *file, const struct iovec *iov, ++ unsigned long nr_segs) ++{ ++ struct socket *sock = file->private_data; ++ size_t size = 0; ++ int i; ++ ++ for (i = 0; i < nr_segs; i++) ++ size += iov[i].iov_len; ++ ++ msg->msg_name = NULL; ++ msg->msg_namelen = 0; ++ msg->msg_control = NULL; ++ msg->msg_controllen = 0; ++ msg->msg_iov = (struct iovec *)iov; ++ msg->msg_iovlen = nr_segs; ++ msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; ++ if (sock->type == SOCK_SEQPACKET) ++ msg->msg_flags |= MSG_EOR; ++ ++ return __sock_sendmsg(iocb, sock, msg, size); ++} ++ ++static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, ++ unsigned long nr_segs, loff_t pos) ++{ ++ struct sock_iocb siocb, *x; ++ ++ if (pos != 0) ++ return -ESPIPE; ++ ++ x = alloc_sock_iocb(iocb, &siocb); ++ if (!x) ++ return -ENOMEM; ++ ++ return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); ++} ++ ++/* ++ * Atomic setting of ioctl hooks to avoid race ++ * with module unload. ++ */ ++ ++static DEFINE_MUTEX(br_ioctl_mutex); ++static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL; ++ ++void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) ++{ ++ mutex_lock(&br_ioctl_mutex); ++ br_ioctl_hook = hook; ++ mutex_unlock(&br_ioctl_mutex); ++} ++ ++EXPORT_SYMBOL(brioctl_set); ++ ++static DEFINE_MUTEX(vlan_ioctl_mutex); ++static int (*vlan_ioctl_hook) (struct net *, void __user *arg); ++ ++void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) ++{ ++ mutex_lock(&vlan_ioctl_mutex); ++ vlan_ioctl_hook = hook; ++ mutex_unlock(&vlan_ioctl_mutex); ++} ++ ++EXPORT_SYMBOL(vlan_ioctl_set); ++ ++static DEFINE_MUTEX(dlci_ioctl_mutex); ++static int (*dlci_ioctl_hook) (unsigned int, void __user *); ++ ++void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) ++{ ++ mutex_lock(&dlci_ioctl_mutex); ++ dlci_ioctl_hook = hook; ++ mutex_unlock(&dlci_ioctl_mutex); ++} ++ ++EXPORT_SYMBOL(dlci_ioctl_set); ++ ++/* ++ * With an ioctl, arg may well be a user mode pointer, but we don't know ++ * what to do with it - that's up to the protocol still. 
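sock_ioctl(), which follows, dispatches the device-private and wireless command ranges straight to dev_ioctl(), handles ownership and bridge/VLAN/DLCI commands itself, and for everything else asks the protocol via sock->ops->ioctl(), falling back to dev_ioctl() when that returns -ENOIOCTLCMD. That fallback is how ordinary interface queries work on any socket, as in this sketch (not part of the patch; the interface name "lo" is assumed to exist):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct ifreq ifr;

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);

    /* Not a socket-level command, so the protocol returns -ENOIOCTLCMD
     * and sock_ioctl() hands the request down to dev_ioctl(). */
    if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0)
        perror("SIOCGIFINDEX");
    else
        printf("lo has ifindex %d\n", ifr.ifr_ifindex);
    return 0;
}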
++ */ ++ ++static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct socket *sock; ++ struct sock *sk; ++ void __user *argp = (void __user *)arg; ++ int pid, err; ++ struct net *net; ++ ++ sock = file->private_data; ++ sk = sock->sk; ++ net = sk->sk_net; ++ if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { ++ err = dev_ioctl(net, cmd, argp); ++ } else ++#ifdef CONFIG_WIRELESS_EXT ++ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { ++ err = dev_ioctl(net, cmd, argp); ++ } else ++#endif /* CONFIG_WIRELESS_EXT */ ++ switch (cmd) { ++ case FIOSETOWN: ++ case SIOCSPGRP: ++ err = -EFAULT; ++ if (get_user(pid, (int __user *)argp)) ++ break; ++ err = f_setown(sock->file, pid, 1); ++ break; ++ case FIOGETOWN: ++ case SIOCGPGRP: ++ err = put_user(f_getown(sock->file), ++ (int __user *)argp); ++ break; ++ case SIOCGIFBR: ++ case SIOCSIFBR: ++ case SIOCBRADDBR: ++ case SIOCBRDELBR: ++ err = -ENOPKG; ++ if (!br_ioctl_hook) ++ request_module("bridge"); ++ ++ mutex_lock(&br_ioctl_mutex); ++ if (br_ioctl_hook) ++ err = br_ioctl_hook(net, cmd, argp); ++ mutex_unlock(&br_ioctl_mutex); ++ break; ++ case SIOCGIFVLAN: ++ case SIOCSIFVLAN: ++ err = -ENOPKG; ++ if (!vlan_ioctl_hook) ++ request_module("8021q"); ++ ++ mutex_lock(&vlan_ioctl_mutex); ++ if (vlan_ioctl_hook) ++ err = vlan_ioctl_hook(net, argp); ++ mutex_unlock(&vlan_ioctl_mutex); ++ break; ++ case SIOCADDDLCI: ++ case SIOCDELDLCI: ++ err = -ENOPKG; ++ if (!dlci_ioctl_hook) ++ request_module("dlci"); ++ ++ if (dlci_ioctl_hook) { ++ mutex_lock(&dlci_ioctl_mutex); ++ err = dlci_ioctl_hook(cmd, argp); ++ mutex_unlock(&dlci_ioctl_mutex); ++ } ++ break; ++ default: ++ err = sock->ops->ioctl(sock, cmd, arg); ++ ++ /* ++ * If this ioctl is unknown try to hand it down ++ * to the NIC driver. ++ */ ++ if (err == -ENOIOCTLCMD) ++ err = dev_ioctl(net, cmd, argp); ++ break; ++ } ++ return err; ++} ++ ++int sock_create_lite(int family, int type, int protocol, struct socket **res) ++{ ++ int err; ++ struct socket *sock = NULL; ++ ++ err = security_socket_create(family, type, protocol, 1); ++ if (err) ++ goto out; ++ ++ sock = sock_alloc(); ++ if (!sock) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ sock->type = type; ++ err = security_socket_post_create(sock, family, type, protocol, 1); ++ if (err) ++ goto out_release; ++ ++out: ++ *res = sock; ++ return err; ++out_release: ++ sock_release(sock); ++ sock = NULL; ++ goto out; ++} ++ ++/* No kernel lock held - perfect */ ++static unsigned int sock_poll(struct file *file, poll_table *wait) ++{ ++ struct socket *sock; ++ ++ /* ++ * We can't return errors to poll, so it's either yes or no. ++ */ ++ sock = file->private_data; ++ return sock->ops->poll(file, sock, wait); ++} ++ ++static int sock_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct socket *sock = file->private_data; ++ ++ return sock->ops->mmap(file, sock, vma); ++} ++ ++static int sock_close(struct inode *inode, struct file *filp) ++{ ++ /* ++ * It was possible the inode is NULL we were ++ * closing an unfinished socket. ++ */ ++ ++ if (!inode) { ++ printk(KERN_DEBUG "sock_close: NULL inode\n"); ++ return 0; ++ } ++ sock_fasync(-1, filp, 0); ++ sock_release(SOCKET_I(inode)); ++ return 0; ++} ++ ++/* ++ * Update the socket async list ++ * ++ * Fasync_list locking strategy. ++ * ++ * 1. fasync_list is modified only under process context socket lock ++ * i.e. under semaphore. ++ * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) ++ * or under socket lock. ++ * 3. 
fasync_list can be used from softirq context, so that ++ * modification under socket lock have to be enhanced with ++ * write_lock_bh(&sk->sk_callback_lock). ++ * --ANK (990710) ++ */ ++ ++static int sock_fasync(int fd, struct file *filp, int on) ++{ ++ struct fasync_struct *fa, *fna = NULL, **prev; ++ struct socket *sock; ++ struct sock *sk; ++ ++ if (on) { ++ fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); ++ if (fna == NULL) ++ return -ENOMEM; ++ } ++ ++ sock = filp->private_data; ++ ++ sk = sock->sk; ++ if (sk == NULL) { ++ kfree(fna); ++ return -EINVAL; ++ } ++ ++ lock_sock(sk); ++ ++ prev = &(sock->fasync_list); ++ ++ for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev) ++ if (fa->fa_file == filp) ++ break; ++ ++ if (on) { ++ if (fa != NULL) { ++ write_lock_bh(&sk->sk_callback_lock); ++ fa->fa_fd = fd; ++ write_unlock_bh(&sk->sk_callback_lock); ++ ++ kfree(fna); ++ goto out; ++ } ++ fna->fa_file = filp; ++ fna->fa_fd = fd; ++ fna->magic = FASYNC_MAGIC; ++ fna->fa_next = sock->fasync_list; ++ write_lock_bh(&sk->sk_callback_lock); ++ sock->fasync_list = fna; ++ write_unlock_bh(&sk->sk_callback_lock); ++ } else { ++ if (fa != NULL) { ++ write_lock_bh(&sk->sk_callback_lock); ++ *prev = fa->fa_next; ++ write_unlock_bh(&sk->sk_callback_lock); ++ kfree(fa); ++ } ++ } ++ ++out: ++ release_sock(sock->sk); ++ return 0; ++} ++ ++/* This function may be called only under socket lock or callback_lock */ ++ ++int sock_wake_async(struct socket *sock, int how, int band) ++{ ++ if (!sock || !sock->fasync_list) ++ return -1; ++ switch (how) { ++ case 1: ++ ++ if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) ++ break; ++ goto call_kill; ++ case 2: ++ if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) ++ break; ++ /* fall through */ ++ case 0: ++call_kill: ++ __kill_fasync(sock->fasync_list, SIGIO, band); ++ break; ++ case 3: ++ __kill_fasync(sock->fasync_list, SIGURG, band); ++ } ++ return 0; ++} ++ ++static int __sock_create(struct net *net, int family, int type, int protocol, ++ struct socket **res, int kern) ++{ ++ int err; ++ struct socket *sock; ++ const struct net_proto_family *pf; ++ ++ /* ++ * Check protocol is in range ++ */ ++ if (family < 0 || family >= NPROTO) ++ return -EAFNOSUPPORT; ++ if (type < 0 || type >= SOCK_MAX) ++ return -EINVAL; ++ ++ if (!nx_check(0, VS_ADMIN)) { ++ if (family == PF_INET && !current_nx_info_has_v4()) ++ return -EAFNOSUPPORT; ++ if (family == PF_INET6 && !current_nx_info_has_v6()) ++ return -EAFNOSUPPORT; ++ } ++ ++ /* Compatibility. ++ ++ This uglymoron is moved from INET layer to here to avoid ++ deadlock in module load. ++ */ ++ if (family == PF_INET && type == SOCK_PACKET) { ++ static int warned; ++ if (!warned) { ++ warned = 1; ++ printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", ++ current->comm); ++ } ++ family = PF_PACKET; ++ } ++ ++ err = security_socket_create(family, type, protocol, kern); ++ if (err) ++ return err; ++ ++ /* ++ * Allocate the socket and allow the family to set things up. if ++ * the protocol is 0, the family is instructed to select an appropriate ++ * default. ++ */ ++ sock = sock_alloc(); ++ if (!sock) { ++ if (net_ratelimit()) ++ printk(KERN_WARNING "socket: no more sockets\n"); ++ return -ENFILE; /* Not exactly a match, but its the ++ closest posix thing */ ++ } ++ ++ sock->type = type; ++ ++#if defined(CONFIG_KMOD) ++ /* Attempt to load a protocol module if the find failed. ++ * ++ * 12/09/1996 Marcin: But! 
this makes REALLY only sense, if the user ++ * requested real, full-featured networking support upon configuration. ++ * Otherwise module support will break! ++ */ ++ if (net_families[family] == NULL) ++ request_module("net-pf-%d", family); ++#endif ++ ++ rcu_read_lock(); ++ pf = rcu_dereference(net_families[family]); ++ err = -EAFNOSUPPORT; ++ if (!pf) ++ goto out_release; ++ ++ /* ++ * We will call the ->create function, that possibly is in a loadable ++ * module, so we have to bump that loadable module refcnt first. ++ */ ++ if (!try_module_get(pf->owner)) ++ goto out_release; ++ ++ /* Now protected by module ref count */ ++ rcu_read_unlock(); ++ ++ err = pf->create(net, sock, protocol); ++ if (err < 0) ++ goto out_module_put; ++ ++ /* ++ * Now to bump the refcnt of the [loadable] module that owns this ++ * socket at sock_release time we decrement its refcnt. ++ */ ++ if (!try_module_get(sock->ops->owner)) ++ goto out_module_busy; ++ ++ /* ++ * Now that we're done with the ->create function, the [loadable] ++ * module can have its refcnt decremented ++ */ ++ module_put(pf->owner); ++ err = security_socket_post_create(sock, family, type, protocol, kern); ++ if (err) ++ goto out_sock_release; ++ *res = sock; ++ ++ return 0; ++ ++out_module_busy: ++ err = -EAFNOSUPPORT; ++out_module_put: ++ sock->ops = NULL; ++ module_put(pf->owner); ++out_sock_release: ++ sock_release(sock); ++ return err; ++ ++out_release: ++ rcu_read_unlock(); ++ goto out_sock_release; ++} ++ ++int sock_create(int family, int type, int protocol, struct socket **res) ++{ ++ return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); ++} ++ ++int sock_create_kern(int family, int type, int protocol, struct socket **res) ++{ ++ return __sock_create(&init_net, family, type, protocol, res, 1); ++} ++ ++asmlinkage long sys_socket(int family, int type, int protocol) ++{ ++ int retval; ++ struct socket *sock; ++ ++ retval = sock_create(family, type, protocol, &sock); ++ if (retval < 0) ++ goto out; ++ ++ set_bit(SOCK_USER_SOCKET, &sock->flags); ++ retval = sock_map_fd(sock); ++ if (retval < 0) ++ goto out_release; ++ ++out: ++ /* It may be already another descriptor 8) Not kernel problem. */ ++ return retval; ++ ++out_release: ++ sock_release(sock); ++ return retval; ++} ++ ++/* ++ * Create a pair of connected sockets. ++ */ ++ ++asmlinkage long sys_socketpair(int family, int type, int protocol, ++ int __user *usockvec) ++{ ++ struct socket *sock1, *sock2; ++ int fd1, fd2, err; ++ struct file *newfile1, *newfile2; ++ ++ /* ++ * Obtain the first socket and check if the underlying protocol ++ * supports the socketpair call. 
++ */ ++ ++ err = sock_create(family, type, protocol, &sock1); ++ if (err < 0) ++ goto out; ++ set_bit(SOCK_USER_SOCKET, &sock1->flags); ++ ++ err = sock_create(family, type, protocol, &sock2); ++ if (err < 0) ++ goto out_release_1; ++ set_bit(SOCK_USER_SOCKET, &sock2->flags); ++ ++ err = sock1->ops->socketpair(sock1, sock2); ++ if (err < 0) ++ goto out_release_both; ++ ++ fd1 = sock_alloc_fd(&newfile1); ++ if (unlikely(fd1 < 0)) { ++ err = fd1; ++ goto out_release_both; ++ } ++ ++ fd2 = sock_alloc_fd(&newfile2); ++ if (unlikely(fd2 < 0)) { ++ err = fd2; ++ put_filp(newfile1); ++ put_unused_fd(fd1); ++ goto out_release_both; ++ } ++ ++ err = sock_attach_fd(sock1, newfile1); ++ if (unlikely(err < 0)) { ++ goto out_fd2; ++ } ++ ++ err = sock_attach_fd(sock2, newfile2); ++ if (unlikely(err < 0)) { ++ fput(newfile1); ++ goto out_fd1; ++ } ++ ++ err = audit_fd_pair(fd1, fd2); ++ if (err < 0) { ++ fput(newfile1); ++ fput(newfile2); ++ goto out_fd; ++ } ++ ++ fd_install(fd1, newfile1); ++ fd_install(fd2, newfile2); ++ /* fd1 and fd2 may be already another descriptors. ++ * Not kernel problem. ++ */ ++ ++ err = put_user(fd1, &usockvec[0]); ++ if (!err) ++ err = put_user(fd2, &usockvec[1]); ++ if (!err) ++ return 0; ++ ++ sys_close(fd2); ++ sys_close(fd1); ++ return err; ++ ++out_release_both: ++ sock_release(sock2); ++out_release_1: ++ sock_release(sock1); ++out: ++ return err; ++ ++out_fd2: ++ put_filp(newfile1); ++ sock_release(sock1); ++out_fd1: ++ put_filp(newfile2); ++ sock_release(sock2); ++out_fd: ++ put_unused_fd(fd1); ++ put_unused_fd(fd2); ++ goto out; ++} ++ ++/* ++ * Bind a name to a socket. Nothing much to do here since it's ++ * the protocol's responsibility to handle the local address. ++ * ++ * We move the socket address to kernel space before we call ++ * the protocol layer (having also checked the address is ok). ++ */ ++ ++asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) ++{ ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ int err, fput_needed; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock) { ++ err = move_addr_to_kernel(umyaddr, addrlen, address); ++ if (err >= 0) { ++ err = security_socket_bind(sock, ++ (struct sockaddr *)address, ++ addrlen); ++ if (!err) ++ err = sock->ops->bind(sock, ++ (struct sockaddr *) ++ address, addrlen); ++ } ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* ++ * Perform a listen. Basically, we allow the protocol to do anything ++ * necessary for a listen, and if that works, we mark the socket as ++ * ready for listening. ++ */ ++ ++asmlinkage long sys_listen(int fd, int backlog) ++{ ++ struct socket *sock; ++ int err, fput_needed; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock) { ++ struct net *net = sock->sk->sk_net; ++ if ((unsigned)backlog > net->sysctl_somaxconn) ++ backlog = net->sysctl_somaxconn; ++ ++ err = security_socket_listen(sock, backlog); ++ if (!err) ++ err = sock->ops->listen(sock, backlog); ++ ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* ++ * For accept, we attempt to create a new socket, set up the link ++ * with the client, wake up the client, then return the new ++ * connected fd. We collect the address of the connector in kernel ++ * space and move it to user at the very end. This is unclean because ++ * we open the socket then return an error. ++ * ++ * 1003.1g adds the ability to recvmsg() to query connection pending ++ * status to recvmsg. 
We need to add that support in a way thats ++ * clean when we restucture accept also. ++ */ ++ ++asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, ++ int __user *upeer_addrlen) ++{ ++ struct socket *sock, *newsock; ++ struct file *newfile; ++ int err, len, newfd, fput_needed; ++ char address[MAX_SOCK_ADDR]; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (!sock) ++ goto out; ++ ++ err = -ENFILE; ++ if (!(newsock = sock_alloc())) ++ goto out_put; ++ ++ newsock->type = sock->type; ++ newsock->ops = sock->ops; ++ ++ /* ++ * We don't need try_module_get here, as the listening socket (sock) ++ * has the protocol module (sock->ops->owner) held. ++ */ ++ __module_get(newsock->ops->owner); ++ ++ newfd = sock_alloc_fd(&newfile); ++ if (unlikely(newfd < 0)) { ++ err = newfd; ++ sock_release(newsock); ++ goto out_put; ++ } ++ ++ err = sock_attach_fd(newsock, newfile); ++ if (err < 0) ++ goto out_fd_simple; ++ ++ err = security_socket_accept(sock, newsock); ++ if (err) ++ goto out_fd; ++ ++ err = sock->ops->accept(sock, newsock, sock->file->f_flags); ++ if (err < 0) ++ goto out_fd; ++ ++ if (upeer_sockaddr) { ++ if (newsock->ops->getname(newsock, (struct sockaddr *)address, ++ &len, 2) < 0) { ++ err = -ECONNABORTED; ++ goto out_fd; ++ } ++ err = move_addr_to_user(address, len, upeer_sockaddr, ++ upeer_addrlen); ++ if (err < 0) ++ goto out_fd; ++ } ++ ++ /* File flags are not inherited via accept() unlike another OSes. */ ++ ++ fd_install(newfd, newfile); ++ err = newfd; ++ ++ security_socket_post_accept(sock, newsock); ++ ++out_put: ++ fput_light(sock->file, fput_needed); ++out: ++ return err; ++out_fd_simple: ++ sock_release(newsock); ++ put_filp(newfile); ++ put_unused_fd(newfd); ++ goto out_put; ++out_fd: ++ fput(newfile); ++ put_unused_fd(newfd); ++ goto out_put; ++} ++ ++/* ++ * Attempt to connect to a socket with the server address. The address ++ * is in user space so we verify it is OK and move it to kernel space. ++ * ++ * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to ++ * break bindings ++ * ++ * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and ++ * other SEQPACKET protocols that take time to connect() as it doesn't ++ * include the -EINPROGRESS status for such sockets. ++ */ ++ ++asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr, ++ int addrlen) ++{ ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ int err, fput_needed; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (!sock) ++ goto out; ++ err = move_addr_to_kernel(uservaddr, addrlen, address); ++ if (err < 0) ++ goto out_put; ++ ++ err = ++ security_socket_connect(sock, (struct sockaddr *)address, addrlen); ++ if (err) ++ goto out_put; ++ ++ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, ++ sock->file->f_flags); ++out_put: ++ fput_light(sock->file, fput_needed); ++out: ++ return err; ++} ++ ++/* ++ * Get the local address ('name') of a socket object. Move the obtained ++ * name to user space. 
++ */ ++ ++asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr, ++ int __user *usockaddr_len) ++{ ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ int len, err, fput_needed; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (!sock) ++ goto out; ++ ++ err = security_socket_getsockname(sock); ++ if (err) ++ goto out_put; ++ ++ err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0); ++ if (err) ++ goto out_put; ++ err = move_addr_to_user(address, len, usockaddr, usockaddr_len); ++ ++out_put: ++ fput_light(sock->file, fput_needed); ++out: ++ return err; ++} ++ ++/* ++ * Get the remote address ('name') of a socket object. Move the obtained ++ * name to user space. ++ */ ++ ++asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr, ++ int __user *usockaddr_len) ++{ ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ int len, err, fput_needed; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock != NULL) { ++ err = security_socket_getpeername(sock); ++ if (err) { ++ fput_light(sock->file, fput_needed); ++ return err; ++ } ++ ++ err = ++ sock->ops->getname(sock, (struct sockaddr *)address, &len, ++ 1); ++ if (!err) ++ err = move_addr_to_user(address, len, usockaddr, ++ usockaddr_len); ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* ++ * Send a datagram to a given address. We move the address into kernel ++ * space and check the user space data area is readable before invoking ++ * the protocol. ++ */ ++ ++asmlinkage long sys_sendto(int fd, void __user *buff, size_t len, ++ unsigned flags, struct sockaddr __user *addr, ++ int addr_len) ++{ ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ int err; ++ struct msghdr msg; ++ struct iovec iov; ++ int fput_needed; ++ struct file *sock_file; ++ ++ sock_file = fget_light(fd, &fput_needed); ++ err = -EBADF; ++ if (!sock_file) ++ goto out; ++ ++ sock = sock_from_file(sock_file, &err); ++ if (!sock) ++ goto out_put; ++ iov.iov_base = buff; ++ iov.iov_len = len; ++ msg.msg_name = NULL; ++ msg.msg_iov = &iov; ++ msg.msg_iovlen = 1; ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_namelen = 0; ++ if (addr) { ++ err = move_addr_to_kernel(addr, addr_len, address); ++ if (err < 0) ++ goto out_put; ++ msg.msg_name = address; ++ msg.msg_namelen = addr_len; ++ } ++ if (sock->file->f_flags & O_NONBLOCK) ++ flags |= MSG_DONTWAIT; ++ msg.msg_flags = flags; ++ err = sock_sendmsg(sock, &msg, len); ++ ++out_put: ++ fput_light(sock_file, fput_needed); ++out: ++ return err; ++} ++ ++/* ++ * Send a datagram down a socket. ++ */ ++ ++asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags) ++{ ++ return sys_sendto(fd, buff, len, flags, NULL, 0); ++} ++ ++/* ++ * Receive a frame from the socket and optionally record the address of the ++ * sender. We verify the buffers are writable and if needed move the ++ * sender address from kernel to user space. 
++ */ ++ ++asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size, ++ unsigned flags, struct sockaddr __user *addr, ++ int __user *addr_len) ++{ ++ struct socket *sock; ++ struct iovec iov; ++ struct msghdr msg; ++ char address[MAX_SOCK_ADDR]; ++ int err, err2; ++ struct file *sock_file; ++ int fput_needed; ++ ++ sock_file = fget_light(fd, &fput_needed); ++ err = -EBADF; ++ if (!sock_file) ++ goto out; ++ ++ sock = sock_from_file(sock_file, &err); ++ if (!sock) ++ goto out_put; ++ ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_iovlen = 1; ++ msg.msg_iov = &iov; ++ iov.iov_len = size; ++ iov.iov_base = ubuf; ++ msg.msg_name = address; ++ msg.msg_namelen = MAX_SOCK_ADDR; ++ if (sock->file->f_flags & O_NONBLOCK) ++ flags |= MSG_DONTWAIT; ++ err = sock_recvmsg(sock, &msg, size, flags); ++ ++ if (err >= 0 && addr != NULL) { ++ err2 = move_addr_to_user(address, msg.msg_namelen, addr, addr_len); ++ if (err2 < 0) ++ err = err2; ++ } ++out_put: ++ fput_light(sock_file, fput_needed); ++out: ++ return err; ++} ++ ++/* ++ * Receive a datagram from a socket. ++ */ ++ ++asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, ++ unsigned flags) ++{ ++ return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); ++} ++ ++/* ++ * Set a socket option. Because we don't know the option lengths we have ++ * to pass the user mode parameter for the protocols to sort out. ++ */ ++ ++asmlinkage long sys_setsockopt(int fd, int level, int optname, ++ char __user *optval, int optlen) ++{ ++ int err, fput_needed; ++ struct socket *sock; ++ ++ if (optlen < 0) ++ return -EINVAL; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock != NULL) { ++ err = security_socket_setsockopt(sock, level, optname); ++ if (err) ++ goto out_put; ++ ++ if (level == SOL_SOCKET) ++ err = ++ sock_setsockopt(sock, level, optname, optval, ++ optlen); ++ else ++ err = ++ sock->ops->setsockopt(sock, level, optname, optval, ++ optlen); ++out_put: ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* ++ * Get a socket option. Because we don't know the option lengths we have ++ * to pass a user mode parameter for the protocols to sort out. ++ */ ++ ++asmlinkage long sys_getsockopt(int fd, int level, int optname, ++ char __user *optval, int __user *optlen) ++{ ++ int err, fput_needed; ++ struct socket *sock; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock != NULL) { ++ err = security_socket_getsockopt(sock, level, optname); ++ if (err) ++ goto out_put; ++ ++ if (level == SOL_SOCKET) ++ err = ++ sock_getsockopt(sock, level, optname, optval, ++ optlen); ++ else ++ err = ++ sock->ops->getsockopt(sock, level, optname, optval, ++ optlen); ++out_put: ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* ++ * Shutdown a socket. ++ */ ++ ++asmlinkage long sys_shutdown(int fd, int how) ++{ ++ int err, fput_needed; ++ struct socket *sock; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (sock != NULL) { ++ err = security_socket_shutdown(sock, how); ++ if (!err) ++ err = sock->ops->shutdown(sock, how); ++ fput_light(sock->file, fput_needed); ++ } ++ return err; ++} ++ ++/* A couple of helpful macros for getting the address of the 32/64 bit ++ * fields which are the same type (int / unsigned) on our platforms. ++ */ ++#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) ++#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) ++#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) ++ ++/* ++ * BSD sendmsg interface ++ */ ++ ++asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) ++{ ++ struct compat_msghdr __user *msg_compat = ++ (struct compat_msghdr __user *)msg; ++ struct socket *sock; ++ char address[MAX_SOCK_ADDR]; ++ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; ++ unsigned char ctl[sizeof(struct cmsghdr) + 20] ++ __attribute__ ((aligned(sizeof(__kernel_size_t)))); ++ /* 20 is size of ipv6_pktinfo */ ++ unsigned char *ctl_buf = ctl; ++ struct msghdr msg_sys; ++ int err, ctl_len, iov_size, total_len; ++ int fput_needed; ++ ++ err = -EFAULT; ++ if (MSG_CMSG_COMPAT & flags) { ++ if (get_compat_msghdr(&msg_sys, msg_compat)) ++ return -EFAULT; ++ } ++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) ++ return -EFAULT; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (!sock) ++ goto out; ++ ++ /* do not move before msg_sys is valid */ ++ err = -EMSGSIZE; ++ if (msg_sys.msg_iovlen > UIO_MAXIOV) ++ goto out_put; ++ ++ /* Check whether to allocate the iovec area */ ++ err = -ENOMEM; ++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); ++ if (msg_sys.msg_iovlen > UIO_FASTIOV) { ++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); ++ if (!iov) ++ goto out_put; ++ } ++ ++ /* This will also move the address data into kernel space */ ++ if (MSG_CMSG_COMPAT & flags) { ++ err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ); ++ } else ++ err = verify_iovec(&msg_sys, iov, address, VERIFY_READ); ++ if (err < 0) ++ goto out_freeiov; ++ total_len = err; ++ ++ err = -ENOBUFS; ++ ++ if (msg_sys.msg_controllen > INT_MAX) ++ goto out_freeiov; ++ ctl_len = msg_sys.msg_controllen; ++ if ((MSG_CMSG_COMPAT & flags) && ctl_len) { ++ err = ++ cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, ++ sizeof(ctl)); ++ if (err) ++ goto out_freeiov; ++ ctl_buf = msg_sys.msg_control; ++ ctl_len = msg_sys.msg_controllen; ++ } else if (ctl_len) { ++ if (ctl_len > sizeof(ctl)) { ++ ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); ++ if (ctl_buf == NULL) ++ goto out_freeiov; ++ } ++ err = -EFAULT; ++ /* ++ * Careful! Before this, msg_sys.msg_control contains a user pointer. ++ * Afterwards, it will be a kernel pointer. Thus the compiler-assisted ++ * checking falls down on this. 
++ */ ++ if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, ++ ctl_len)) ++ goto out_freectl; ++ msg_sys.msg_control = ctl_buf; ++ } ++ msg_sys.msg_flags = flags; ++ ++ if (sock->file->f_flags & O_NONBLOCK) ++ msg_sys.msg_flags |= MSG_DONTWAIT; ++ err = sock_sendmsg(sock, &msg_sys, total_len); ++ ++out_freectl: ++ if (ctl_buf != ctl) ++ sock_kfree_s(sock->sk, ctl_buf, ctl_len); ++out_freeiov: ++ if (iov != iovstack) ++ sock_kfree_s(sock->sk, iov, iov_size); ++out_put: ++ fput_light(sock->file, fput_needed); ++out: ++ return err; ++} ++ ++/* ++ * BSD recvmsg interface ++ */ ++ ++asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, ++ unsigned int flags) ++{ ++ struct compat_msghdr __user *msg_compat = ++ (struct compat_msghdr __user *)msg; ++ struct socket *sock; ++ struct iovec iovstack[UIO_FASTIOV]; ++ struct iovec *iov = iovstack; ++ struct msghdr msg_sys; ++ unsigned long cmsg_ptr; ++ int err, iov_size, total_len, len; ++ int fput_needed; ++ ++ /* kernel mode address */ ++ char addr[MAX_SOCK_ADDR]; ++ ++ /* user mode address pointers */ ++ struct sockaddr __user *uaddr; ++ int __user *uaddr_len; ++ ++ if (MSG_CMSG_COMPAT & flags) { ++ if (get_compat_msghdr(&msg_sys, msg_compat)) ++ return -EFAULT; ++ } ++ else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) ++ return -EFAULT; ++ ++ sock = sockfd_lookup_light(fd, &err, &fput_needed); ++ if (!sock) ++ goto out; ++ ++ err = -EMSGSIZE; ++ if (msg_sys.msg_iovlen > UIO_MAXIOV) ++ goto out_put; ++ ++ /* Check whether to allocate the iovec area */ ++ err = -ENOMEM; ++ iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); ++ if (msg_sys.msg_iovlen > UIO_FASTIOV) { ++ iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); ++ if (!iov) ++ goto out_put; ++ } ++ ++ /* ++ * Save the user-mode address (verify_iovec will change the ++ * kernel msghdr to use the kernel address space) ++ */ ++ ++ uaddr = (void __user *)msg_sys.msg_name; ++ uaddr_len = COMPAT_NAMELEN(msg); ++ if (MSG_CMSG_COMPAT & flags) { ++ err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE); ++ } else ++ err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE); ++ if (err < 0) ++ goto out_freeiov; ++ total_len = err; ++ ++ cmsg_ptr = (unsigned long)msg_sys.msg_control; ++ msg_sys.msg_flags = 0; ++ if (MSG_CMSG_COMPAT & flags) ++ msg_sys.msg_flags = MSG_CMSG_COMPAT; ++ ++ if (sock->file->f_flags & O_NONBLOCK) ++ flags |= MSG_DONTWAIT; ++ err = sock_recvmsg(sock, &msg_sys, total_len, flags); ++ if (err < 0) ++ goto out_freeiov; ++ len = err; ++ ++ if (uaddr != NULL) { ++ err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, ++ uaddr_len); ++ if (err < 0) ++ goto out_freeiov; ++ } ++ err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT), ++ COMPAT_FLAGS(msg)); ++ if (err) ++ goto out_freeiov; ++ if (MSG_CMSG_COMPAT & flags) ++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, ++ &msg_compat->msg_controllen); ++ else ++ err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, ++ &msg->msg_controllen); ++ if (err) ++ goto out_freeiov; ++ err = len; ++ ++out_freeiov: ++ if (iov != iovstack) ++ sock_kfree_s(sock->sk, iov, iov_size); ++out_put: ++ fput_light(sock->file, fput_needed); ++out: ++ return err; ++} ++ ++#ifdef __ARCH_WANT_SYS_SOCKETCALL ++ ++/* Argument list sizes for sys_socketcall */ ++#define AL(x) ((x) * sizeof(unsigned long)) ++static const unsigned char nargs[18]={ ++ AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), ++ AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), ++ AL(6),AL(2),AL(5),AL(5),AL(3),AL(3) ++}; ++ ++#undef AL ++ ++/* ++ * 
System call vectors. ++ * ++ * Argument checking cleaned up. Saved 20% in size. ++ * This function doesn't need to set the kernel lock because ++ * it is set by the callees. ++ */ ++ ++asmlinkage long sys_socketcall(int call, unsigned long __user *args) ++{ ++ unsigned long a[6]; ++ unsigned long a0, a1; ++ int err; ++ ++ if (call < 1 || call > SYS_RECVMSG) ++ return -EINVAL; ++ ++ /* copy_from_user should be SMP safe. */ ++ if (copy_from_user(a, args, nargs[call])) ++ return -EFAULT; ++ ++ err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); ++ if (err) ++ return err; ++ ++ a0 = a[0]; ++ a1 = a[1]; ++ ++ switch (call) { ++ case SYS_SOCKET: ++ err = sys_socket(a0, a1, a[2]); ++ break; ++ case SYS_BIND: ++ err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); ++ break; ++ case SYS_CONNECT: ++ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); ++ break; ++ case SYS_LISTEN: ++ err = sys_listen(a0, a1); ++ break; ++ case SYS_ACCEPT: ++ err = ++ sys_accept(a0, (struct sockaddr __user *)a1, ++ (int __user *)a[2]); ++ break; ++ case SYS_GETSOCKNAME: ++ err = ++ sys_getsockname(a0, (struct sockaddr __user *)a1, ++ (int __user *)a[2]); ++ break; ++ case SYS_GETPEERNAME: ++ err = ++ sys_getpeername(a0, (struct sockaddr __user *)a1, ++ (int __user *)a[2]); ++ break; ++ case SYS_SOCKETPAIR: ++ err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); ++ break; ++ case SYS_SEND: ++ err = sys_send(a0, (void __user *)a1, a[2], a[3]); ++ break; ++ case SYS_SENDTO: ++ err = sys_sendto(a0, (void __user *)a1, a[2], a[3], ++ (struct sockaddr __user *)a[4], a[5]); ++ break; ++ case SYS_RECV: ++ err = sys_recv(a0, (void __user *)a1, a[2], a[3]); ++ break; ++ case SYS_RECVFROM: ++ err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], ++ (struct sockaddr __user *)a[4], ++ (int __user *)a[5]); ++ break; ++ case SYS_SHUTDOWN: ++ err = sys_shutdown(a0, a1); ++ break; ++ case SYS_SETSOCKOPT: ++ err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); ++ break; ++ case SYS_GETSOCKOPT: ++ err = ++ sys_getsockopt(a0, a1, a[2], (char __user *)a[3], ++ (int __user *)a[4]); ++ break; ++ case SYS_SENDMSG: ++ err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); ++ break; ++ case SYS_RECVMSG: ++ err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ return err; ++} ++ ++#endif /* __ARCH_WANT_SYS_SOCKETCALL */ ++ ++/** ++ * sock_register - add a socket protocol handler ++ * @ops: description of protocol ++ * ++ * This function is called by a protocol handler that wants to ++ * advertise its address family, and have it linked into the ++ * socket interface. The value ops->family coresponds to the ++ * socket system call protocol family. ++ */ ++int sock_register(const struct net_proto_family *ops) ++{ ++ int err; ++ ++ if (ops->family >= NPROTO) { ++ printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, ++ NPROTO); ++ return -ENOBUFS; ++ } ++ ++ spin_lock(&net_family_lock); ++ if (net_families[ops->family]) ++ err = -EEXIST; ++ else { ++ net_families[ops->family] = ops; ++ err = 0; ++ } ++ spin_unlock(&net_family_lock); ++ ++ printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); ++ return err; ++} ++ ++/** ++ * sock_unregister - remove a protocol handler ++ * @family: protocol family to remove ++ * ++ * This function is called by a protocol handler that wants to ++ * remove its address family, and have it unlinked from the ++ * new socket creation. 
++ * ++ * If protocol handler is a module, then it can use module reference ++ * counts to protect against new references. If protocol handler is not ++ * a module then it needs to provide its own protection in ++ * the ops->create routine. ++ */ ++void sock_unregister(int family) ++{ ++ BUG_ON(family < 0 || family >= NPROTO); ++ ++ spin_lock(&net_family_lock); ++ net_families[family] = NULL; ++ spin_unlock(&net_family_lock); ++ ++ synchronize_rcu(); ++ ++ printk(KERN_INFO "NET: Unregistered protocol family %d\n", family); ++} ++ ++static int sock_pernet_init(struct net *net) ++{ ++ net->sysctl_somaxconn = SOMAXCONN; ++ return 0; ++} ++ ++static struct pernet_operations sock_net_ops = { ++ .init = sock_pernet_init, ++}; ++ ++static int __init sock_init(void) ++{ ++ /* ++ * Initialize sock SLAB cache. ++ */ ++ ++ sk_init(); ++ ++ /* ++ * Initialize skbuff SLAB cache ++ */ ++ skb_init(); ++ ++ /* ++ * Initialize the protocols module. ++ */ ++ ++ init_inodecache(); ++ register_filesystem(&sock_fs_type); ++ sock_mnt = kern_mount(&sock_fs_type); ++ ++ /* The real protocol initialization is performed in later initcalls. ++ */ ++ ++#ifdef CONFIG_NETFILTER ++ netfilter_init(); ++#endif ++ ++ register_pernet_subsys(&sock_net_ops); ++ ++ return 0; ++} ++ ++core_initcall(sock_init); /* early initcall */ ++ ++#ifdef CONFIG_PROC_FS ++void socket_seq_show(struct seq_file *seq) ++{ ++ int cpu; ++ int counter = 0; ++ ++ for_each_possible_cpu(cpu) ++ counter += per_cpu(sockets_in_use, cpu); ++ ++ /* It can be negative, by the way. 8) */ ++ if (counter < 0) ++ counter = 0; ++ ++ seq_printf(seq, "sockets: used %d\n", counter); ++} ++#endif /* CONFIG_PROC_FS */ ++ ++#ifdef CONFIG_COMPAT ++static long compat_sock_ioctl(struct file *file, unsigned cmd, ++ unsigned long arg) ++{ ++ struct socket *sock = file->private_data; ++ int ret = -ENOIOCTLCMD; ++ ++ if (sock->ops->compat_ioctl) ++ ret = sock->ops->compat_ioctl(sock, cmd, arg); ++ ++ return ret; ++} ++#endif ++ ++int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) ++{ ++ return sock->ops->bind(sock, addr, addrlen); ++} ++ ++int kernel_listen(struct socket *sock, int backlog) ++{ ++ return sock->ops->listen(sock, backlog); ++} ++ ++int kernel_accept(struct socket *sock, struct socket **newsock, int flags) ++{ ++ struct sock *sk = sock->sk; ++ int err; ++ ++ err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, ++ newsock); ++ if (err < 0) ++ goto done; ++ ++ err = sock->ops->accept(sock, *newsock, flags); ++ if (err < 0) { ++ sock_release(*newsock); ++ goto done; ++ } ++ ++ (*newsock)->ops = sock->ops; ++ ++done: ++ return err; ++} ++ ++int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, ++ int flags) ++{ ++ return sock->ops->connect(sock, addr, addrlen, flags); ++} ++ ++int kernel_getsockname(struct socket *sock, struct sockaddr *addr, ++ int *addrlen) ++{ ++ return sock->ops->getname(sock, addr, addrlen, 0); ++} ++ ++int kernel_getpeername(struct socket *sock, struct sockaddr *addr, ++ int *addrlen) ++{ ++ return sock->ops->getname(sock, addr, addrlen, 1); ++} ++ ++int kernel_getsockopt(struct socket *sock, int level, int optname, ++ char *optval, int *optlen) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int err; ++ ++ set_fs(KERNEL_DS); ++ if (level == SOL_SOCKET) ++ err = sock_getsockopt(sock, level, optname, optval, optlen); ++ else ++ err = sock->ops->getsockopt(sock, level, optname, optval, ++ optlen); ++ set_fs(oldfs); ++ return err; ++} ++ ++int kernel_setsockopt(struct socket *sock, int level, 
int optname, ++ char *optval, int optlen) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int err; ++ ++ set_fs(KERNEL_DS); ++ if (level == SOL_SOCKET) ++ err = sock_setsockopt(sock, level, optname, optval, optlen); ++ else ++ err = sock->ops->setsockopt(sock, level, optname, optval, ++ optlen); ++ set_fs(oldfs); ++ return err; ++} ++ ++int kernel_sendpage(struct socket *sock, struct page *page, int offset, ++ size_t size, int flags) ++{ ++ if (sock->ops->sendpage) ++ return sock->ops->sendpage(sock, page, offset, size, flags); ++ ++ return sock_no_sendpage(sock, page, offset, size, flags); ++} ++ ++int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) ++{ ++ mm_segment_t oldfs = get_fs(); ++ int err; ++ ++ set_fs(KERNEL_DS); ++ err = sock->ops->ioctl(sock, cmd, arg); ++ set_fs(oldfs); ++ ++ return err; ++} ++ ++/* ABI emulation layers need these two */ ++EXPORT_SYMBOL(move_addr_to_kernel); ++EXPORT_SYMBOL(move_addr_to_user); ++EXPORT_SYMBOL(sock_create); ++EXPORT_SYMBOL(sock_create_kern); ++EXPORT_SYMBOL(sock_create_lite); ++EXPORT_SYMBOL(sock_map_fd); ++EXPORT_SYMBOL(sock_recvmsg); ++EXPORT_SYMBOL(sock_register); ++EXPORT_SYMBOL(sock_release); ++EXPORT_SYMBOL(sock_sendmsg); ++EXPORT_SYMBOL(sock_unregister); ++EXPORT_SYMBOL(sock_wake_async); ++EXPORT_SYMBOL(sockfd_lookup); ++EXPORT_SYMBOL(kernel_sendmsg); ++EXPORT_SYMBOL(kernel_recvmsg); ++EXPORT_SYMBOL(kernel_bind); ++EXPORT_SYMBOL(kernel_listen); ++EXPORT_SYMBOL(kernel_accept); ++EXPORT_SYMBOL(kernel_connect); ++EXPORT_SYMBOL(kernel_getsockname); ++EXPORT_SYMBOL(kernel_getpeername); ++EXPORT_SYMBOL(kernel_getsockopt); ++EXPORT_SYMBOL(kernel_setsockopt); ++EXPORT_SYMBOL(kernel_sendpage); ++EXPORT_SYMBOL(kernel_sock_ioctl);
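
For reference, a minimal sketch of how the kernel_* socket wrappers exported at the end of this file are typically used from kernel code. This is illustrative only and not part of the patch; the listening port, static variable name, and function name below are made up for the example, and error handling is kept to the bare minimum.

/*
 * Illustrative only -- not part of the patch. Shows the intended use of
 * sock_create_kern() and the kernel_bind()/kernel_listen() wrappers
 * exported from net/socket.c above.
 */
#include <linux/net.h>
#include <linux/in.h>
#include <net/sock.h>

static struct socket *example_listen_sock;

static int example_start_listener(void)
{
	struct sockaddr_in addr = {
		.sin_family      = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port        = htons(12345),	/* arbitrary example port */
	};
	int err;

	/* Per this patch, sock_create_kern() creates the socket in &init_net. */
	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP,
			       &example_listen_sock);
	if (err < 0)
		return err;

	err = kernel_bind(example_listen_sock,
			  (struct sockaddr *)&addr, sizeof(addr));
	if (!err)
		err = kernel_listen(example_listen_sock, 5);
	if (err < 0) {
		sock_release(example_listen_sock);
		example_listen_sock = NULL;
	}
	return err;
}
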