diff -Nurb linux-2.6.22-594/include/linux/vserver/network.h.orig.orig linux-2.6.22-595/include/linux/vserver/network.h.orig.orig --- linux-2.6.22-594/include/linux/vserver/network.h.orig.orig 2008-03-20 00:04:54.000000000 -0400 +++ linux-2.6.22-595/include/linux/vserver/network.h.orig.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,143 +0,0 @@ -#ifndef _VX_NETWORK_H -#define _VX_NETWORK_H - -#include - - -#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ - - -/* network flags */ - -#define NXF_INFO_PRIVATE 0x00000008 - -#define NXF_SINGLE_IP 0x00000100 -#define NXF_LBACK_REMAP 0x00000200 - -#define NXF_HIDE_NETIF 0x02000000 -#define NXF_HIDE_LBACK 0x04000000 - -#define NXF_STATE_SETUP (1ULL << 32) -#define NXF_STATE_ADMIN (1ULL << 34) - -#define NXF_SC_HELPER (1ULL << 36) -#define NXF_PERSISTENT (1ULL << 38) - -#define NXF_ONE_TIME (0x0005ULL << 32) - - -#define NXF_INIT_SET (__nxf_init_set()) - -static inline uint64_t __nxf_init_set(void) { - return NXF_STATE_ADMIN -#ifdef CONFIG_VSERVER_AUTO_LBACK - | NXF_LBACK_REMAP - | NXF_HIDE_LBACK -#endif -#ifdef CONFIG_VSERVER_AUTO_SINGLE - | NXF_SINGLE_IP -#endif - | NXF_HIDE_NETIF; -} - - -/* network caps */ - -#define NXC_RAW_ICMP 0x00000100 - - -/* address types */ - -#define NXA_TYPE_IPV4 0x0001 -#define NXA_TYPE_IPV6 0x0002 - -#define NXA_TYPE_NONE 0x0000 -#define NXA_TYPE_ANY 0x00FF - -#define NXA_TYPE_ADDR 0x0010 -#define NXA_TYPE_MASK 0x0020 -#define NXA_TYPE_RANGE 0x0040 - -#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE) - -#define NXA_MOD_BCAST 0x0100 -#define NXA_MOD_LBACK 0x0200 - -#define NXA_LOOPBACK 0x1000 - -#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK) -#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK) - -#ifdef __KERNEL__ - -#include -#include -#include -#include -#include -#include - -struct nx_addr_v4 { - struct nx_addr_v4 *next; - struct in_addr ip[2]; - struct in_addr mask; - uint16_t type; - uint16_t flags; -}; - -struct nx_addr_v6 { - struct nx_addr_v6 *next; - struct in6_addr ip; - struct in6_addr mask; - uint32_t prefix; - uint16_t type; - uint16_t flags; -}; - -struct nx_info { - struct hlist_node nx_hlist; /* linked list of nxinfos */ - nid_t nx_id; /* vnet id */ - atomic_t nx_usecnt; /* usage count */ - atomic_t nx_tasks; /* tasks count */ - int nx_state; /* context state */ - - uint64_t nx_flags; /* network flag word */ - uint64_t nx_ncaps; /* network capabilities */ - - struct in_addr v4_lback; /* Loopback address */ - struct in_addr v4_bcast; /* Broadcast address */ - struct nx_addr_v4 v4; /* First/Single ipv4 address */ -#ifdef CONFIG_IPV6 - struct nx_addr_v6 v6; /* First/Single ipv6 address */ -#endif - char nx_name[65]; /* network context name */ -}; - - -/* status flags */ - -#define NXS_HASHED 0x0001 -#define NXS_SHUTDOWN 0x0100 -#define NXS_RELEASED 0x8000 - -extern struct nx_info *lookup_nx_info(int); - -extern int get_nid_list(int, unsigned int *, int); -extern int nid_is_hashed(nid_t); - -extern int nx_migrate_task(struct task_struct *, struct nx_info *); - -extern long vs_net_change(struct nx_info *, unsigned int); - -struct sock; - - -#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE) -#ifdef CONFIG_IPV6 -#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE) -#else -#define NX_IPV6(n) (0) -#endif - -#endif /* __KERNEL__ */ -#endif /* _VX_NETWORK_H */ diff -Nurb linux-2.6.22-594/kernel/nsproxy.c.orig linux-2.6.22-595/kernel/nsproxy.c.orig --- linux-2.6.22-594/kernel/nsproxy.c.orig 2008-03-20 00:05:18.000000000 -0400 +++ linux-2.6.22-595/kernel/nsproxy.c.orig 
1969-12-31 19:00:00.000000000 -0500 @@ -1,264 +0,0 @@ -/* - * Copyright (C) 2006 IBM Corporation - * - * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * - * Jun 2006 - namespaces support - * OpenVZ, SWsoft Inc. - * Pavel Emelianov - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *nsproxy_cachep; - -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); - -void get_task_namespaces(struct task_struct *tsk) -{ - struct nsproxy *ns = tsk->nsproxy; - if (ns) { - get_nsproxy(ns); - } -} - -/* - * creates a copy of "orig" with refcount 1. - */ -static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) -{ - struct nsproxy *ns; - - ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) - atomic_set(&ns->count, 1); - vxdprintk(VXD_CBIT(space, 2), "clone_nsproxy(%p[%u] = %p[1]", - orig, atomic_read(&orig->count), ns); - atomic_inc(&vs_global_nsproxy); - return ns; -} - -/* - * Create new nsproxy and all of its the associated namespaces. - * Return the newly created nsproxy. Do not attach this to the task, - * leave it to the caller to do proper locking and attach it to task. - */ -static struct nsproxy *unshare_namespaces(int flags, struct nsproxy *orig, - struct fs_struct *new_fs) -{ - struct nsproxy *new_nsp; - int err = -ENOMEM; - - vxdprintk(VXD_CBIT(space, 4), - "unshare_namespaces(0x%08x,%p,%p)", - flags, orig, new_fs); - - new_nsp = clone_nsproxy(orig); - if (!new_nsp) - return ERR_PTR(-ENOMEM); - - new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) - goto out_ns; - - new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns); - if (IS_ERR(new_nsp->uts_ns)) - goto out_uts; - - new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns); - if (IS_ERR(new_nsp->ipc_ns)) - goto out_ipc; - - new_nsp->pid_ns = copy_pid_ns(flags, orig->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) - goto out_pid; - - new_nsp->user_ns = copy_user_ns(flags, orig->user_ns); - if (IS_ERR(new_nsp->user_ns)) - goto out_user; - - new_nsp->net_ns = copy_net_ns(flags, orig->net_ns); - if (IS_ERR(new_nsp->net_ns)) - goto out_net; - - return new_nsp; - -out_net: - if (new_nsp->user_ns) - put_user_ns(new_nsp->user_ns); - if (new_nsp->net_ns) - put_net(new_nsp->net_ns); -out_user: - if (new_nsp->pid_ns) - put_pid_ns(new_nsp->pid_ns); -out_pid: - if (new_nsp->ipc_ns) - put_ipc_ns(new_nsp->ipc_ns); -out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); -out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); -out_ns: - kmem_cache_free(nsproxy_cachep, new_nsp); - return ERR_PTR(err); -} - -static struct nsproxy *create_new_namespaces(unsigned long flags, struct task_struct *tsk, - struct fs_struct *new_fs) -{ - return unshare_namespaces(flags, tsk->nsproxy, new_fs); -} - -/* - * copies the nsproxy, setting refcount to 1, and grabbing a - * reference to all contained namespaces. - */ -struct nsproxy *copy_nsproxy(struct nsproxy *orig) -{ - struct nsproxy *ns = clone_nsproxy(orig); - - if (ns) { - if (ns->mnt_ns) - get_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - get_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - get_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - get_pid_ns(ns->pid_ns); - } - return ns; -} - -/* - * called from clone. This now handles copy for nsproxy and all - * namespaces therein. 
- */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) -{ - struct nsproxy *old_ns = tsk->nsproxy; - struct nsproxy *new_ns = NULL; - int err = 0; - - vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08x,%p[%p])", - flags, tsk, old_ns); - - if (!old_ns) - return 0; - - get_nsproxy(old_ns); - return 0; - - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) - return 0; - - #ifndef CONFIG_NET_NS - if (unshare_flags & CLONE_NEWNET) - return -EINVAL; - #endif - - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } - - new_ns = create_new_namespaces(flags, tsk, tsk->fs); - if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto out; - } - - err = ns_container_clone(tsk); - if (err) { - put_nsproxy(new_ns); - goto out; - } - - tsk->nsproxy = new_ns; - -out: - put_nsproxy(old_ns); - vxdprintk(VXD_CBIT(space, 3), - "copy_namespaces(0x%08x,%p[%p]) = %d [%p]", - flags, tsk, old_ns, err, new_ns); - return err; -} - -void free_nsproxy(struct nsproxy *ns) -{ - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - put_pid_ns(ns->pid_ns); - atomic_dec(&vs_global_nsproxy); - kfree(ns); -} - -/* - * Called from unshare. Unshare all the namespaces part of nsproxy. - * On success, returns the new nsproxy. - */ -int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) -{ - int err = 0; - - vxdprintk(VXD_CBIT(space, 4), - "unshare_nsproxy_namespaces(0x%08lx,[%p])", - unshare_flags, current->nsproxy); - - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWNET))) - return 0; - -#ifndef CONFIG_NET_NS - if (unshare_flags & CLONE_NEWNET) - return -EINVAL; -#endif - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); - if (IS_ERR(*new_nsp)) { - err = PTR_ERR(*new_nsp); - goto out; - } - - err = ns_container_clone(current); - if (err) - put_nsproxy(*new_nsp); - -out: - return err; -} - -static int __init nsproxy_cache_init(void) -{ - nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), - 0, SLAB_PANIC, NULL, NULL); - return 0; -} - -module_init(nsproxy_cache_init); diff -Nurb linux-2.6.22-594/kernel/user.c.orig linux-2.6.22-595/kernel/user.c.orig --- linux-2.6.22-594/kernel/user.c.orig 2008-03-20 00:05:18.000000000 -0400 +++ linux-2.6.22-595/kernel/user.c.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,227 +0,0 @@ -/* - * The "user cache". - * - * (C) Copyright 1991-2000 Linus Torvalds - * - * We have a per-user structure to keep track of how many - * processes, files etc the user has claimed, in order to be - * able to have per-user limits for system resources. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * UID task count cache, to get fast user lookup in "alloc_uid" - * when changing user ID's (ie setuid() and friends). - */ - -#define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(xid,uid) ((((uid) >> UIDHASH_BITS) + ((uid)^(xid))) & UIDHASH_MASK) -#define uidhashentry(ns, xid, uid) ((ns)->uidhash_table + __uidhashfn(xid, uid)) - -static struct kmem_cache *uid_cachep; -static struct list_head uidhash_table[UIDHASH_SZ]; - -/* - * The uidhash_lock is mostly taken from process context, but it is - * occasionally also taken from softirq/tasklet context, when - * task-structs get RCU-freed. 
Hence all locking must be softirq-safe. - * But free_uid() is also called with local interrupts disabled, and running - * local_bh_enable() with local interrupts disabled is an error - we'll run - * softirq callbacks, and they can unconditionally enable interrupts, and - * the caller of free_uid() didn't expect that.. - */ -static DEFINE_SPINLOCK(uidhash_lock); - -struct user_struct root_user = { - .__count = ATOMIC_INIT(1), - .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), - .sigpending = ATOMIC_INIT(0), - .mq_bytes = 0, - .locked_shm = 0, -#ifdef CONFIG_KEYS - .uid_keyring = &root_user_keyring, - .session_keyring = &root_session_keyring, -#endif -}; - -/* - * These routines must be called with the uidhash spinlock held! - */ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) -{ - list_add(&up->uidhash_list, hashent); -} - -static inline void uid_hash_remove(struct user_struct *up) -{ - list_del(&up->uidhash_list); -} - -static inline struct user_struct *uid_hash_find(xid_t xid, uid_t uid, struct list_head *hashent) -{ - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; - - user = list_entry(up, struct user_struct, uidhash_list); - - if(user->uid == uid && user->xid == xid) { - atomic_inc(&user->__count); - return user; - } - } - - return NULL; -} - -/* - * Locate the user_struct for the passed UID. If found, take a ref on it. The - * caller must undo that ref with free_uid(). - * - * If the user_struct could not be found, return NULL. - */ -struct user_struct *find_user(xid_t xid, uid_t uid) -{ - struct user_struct *ret; - unsigned long flags; - struct user_namespace *ns = current->nsproxy->user_ns; - - spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(xid, uid, uidhashentry(ns, xid, uid)); - spin_unlock_irqrestore(&uidhash_lock, flags); - return ret; -} - -void free_uid(struct user_struct *up) -{ - unsigned long flags; - - if (!up) - return; - - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - } else { - local_irq_restore(flags); - } -} - -struct user_struct * alloc_uid(xid_t xid, uid_t uid) -{ - struct user_namespace *ns = current->nsproxy->user_ns; - struct list_head *hashent = uidhashentry(ns,xid, uid); - struct user_struct *up; - - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(xid, uid, hashent); - spin_unlock_irq(&uidhash_lock); - - if (!up) { - struct user_struct *new; - - new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); - if (!new) - return NULL; - new->uid = uid; - new->xid = xid; - atomic_set(&new->__count, 1); - atomic_set(&new->processes, 0); - atomic_set(&new->files, 0); - atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY_USER - atomic_set(&new->inotify_watches, 0); - atomic_set(&new->inotify_devs, 0); -#endif - - new->mq_bytes = 0; - new->locked_shm = 0; - - if (alloc_uid_keyring(new, current) < 0) { - kmem_cache_free(uid_cachep, new); - return NULL; - } - - /* - * Before adding this, check whether we raced - * on adding the same user already.. 
- */ - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(xid, uid, hashent); - if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); - kmem_cache_free(uid_cachep, new); - } else { - uid_hash_insert(new, hashent); - up = new; - } - spin_unlock_irq(&uidhash_lock); - - } - return up; -} - -void switch_uid(struct user_struct *new_user) -{ - struct user_struct *old_user; - - /* What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM - */ - old_user = current->user; - atomic_inc(&new_user->processes); - atomic_dec(&old_user->processes); - switch_uid_keyring(new_user); - current->user = new_user; - - /* - * We need to synchronize with __sigqueue_alloc() - * doing a get_uid(p->user).. If that saw the old - * user value, we need to wait until it has exited - * its critical region before we can free the old - * structure. - */ - smp_mb(); - spin_unlock_wait(¤t->sighand->siglock); - - free_uid(old_user); - suid_keys(current); -} - - -static int __init uid_cache_init(void) -{ - int n; - - uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - - for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); - - /* Insert the root user immediately (init already runs as root) */ - spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0, 0)); - spin_unlock_irq(&uidhash_lock); - - return 0; -} - -module_init(uid_cache_init); diff -Nurb linux-2.6.22-594/kernel/vserver/context.c linux-2.6.22-595/kernel/vserver/context.c --- linux-2.6.22-594/kernel/vserver/context.c 2008-03-20 00:04:46.000000000 -0400 +++ linux-2.6.22-595/kernel/vserver/context.c 2008-03-20 00:13:22.000000000 -0400 @@ -589,13 +589,13 @@ struct nsproxy *old_nsp, *new_nsp; ret = unshare_nsproxy_namespaces( - CLONE_NEWUTS | CLONE_NEWIPC, + CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET, &new_nsp, NULL); if (ret) goto out; old_nsp = xchg(&p->nsproxy, new_nsp); - vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC); + vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); put_nsproxy(old_nsp); } } @@ -781,7 +781,7 @@ if (vs_state_change(new_vxi, VSC_STARTUP)) goto out; - ret = vx_migrate_task(current, new_vxi, (!data)); + ret = vx_migrate_task(current, new_vxi, 1 /*(!data) Hack no. 
1 - Sapan*/); if (ret) goto out; diff -Nurb linux-2.6.22-594/kernel/vserver/context.c.orig linux-2.6.22-595/kernel/vserver/context.c.orig --- linux-2.6.22-594/kernel/vserver/context.c.orig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.22-595/kernel/vserver/context.c.orig 2008-03-20 00:04:46.000000000 -0400 @@ -0,0 +1,966 @@ +/* + * linux/kernel/vserver/context.c + * + * Virtual Server: Context Support + * + * Copyright (C) 2003-2007 Herbert Pötzl + * + * V0.01 context helper + * V0.02 vx_ctx_kill syscall command + * V0.03 replaced context_info calls + * V0.04 redesign of struct (de)alloc + * V0.05 rlimit basic implementation + * V0.06 task_xid and info commands + * V0.07 context flags and caps + * V0.08 switch to RCU based hash + * V0.09 revert to non RCU for now + * V0.10 and back to working RCU hash + * V0.11 and back to locking again + * V0.12 referenced context store + * V0.13 separate per cpu data + * V0.14 changed vcmds to vxi arg + * V0.15 added context stat + * V0.16 have __create claim() the vxi + * V0.17 removed older and legacy stuff + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cvirt_init.h" +#include "cacct_init.h" +#include "limit_init.h" +#include "sched_init.h" + + +atomic_t vx_global_ctotal = ATOMIC_INIT(0); +atomic_t vx_global_cactive = ATOMIC_INIT(0); + + +/* now inactive context structures */ + +static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT; + +static spinlock_t vx_info_inactive_lock = SPIN_LOCK_UNLOCKED; + + +/* __alloc_vx_info() + + * allocate an initialized vx_info struct + * doesn't make it visible (hash) */ + +static struct vx_info *__alloc_vx_info(xid_t xid) +{ + struct vx_info *new = NULL; + int cpu; + + vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid); + + /* would this benefit from a slab cache? 
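+	 * (a dedicated kmem_cache created once at init time would be the
+	 * usual alternative to the kmalloc + memset pair below)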
*/ + new = kmalloc(sizeof(struct vx_info), GFP_KERNEL); + if (!new) + return 0; + + memset(new, 0, sizeof(struct vx_info)); +#ifdef CONFIG_SMP + new->ptr_pc = alloc_percpu(struct _vx_info_pc); + if (!new->ptr_pc) + goto error; +#endif + new->vx_id = xid; + INIT_HLIST_NODE(&new->vx_hlist); + atomic_set(&new->vx_usecnt, 0); + atomic_set(&new->vx_tasks, 0); + new->vx_parent = NULL; + new->vx_state = 0; + init_waitqueue_head(&new->vx_wait); + + /* prepare reaper */ + get_task_struct(init_pid_ns.child_reaper); + new->vx_reaper = init_pid_ns.child_reaper; + new->vx_badness_bias = 0; + + /* rest of init goes here */ + vx_info_init_limit(&new->limit); + vx_info_init_sched(&new->sched); + vx_info_init_cvirt(&new->cvirt); + vx_info_init_cacct(&new->cacct); + + /* per cpu data structures */ + for_each_possible_cpu(cpu) { + vx_info_init_sched_pc( + &vx_per_cpu(new, sched_pc, cpu), cpu); + vx_info_init_cvirt_pc( + &vx_per_cpu(new, cvirt_pc, cpu), cpu); + } + + new->vx_flags = VXF_INIT_SET; + new->vx_bcaps = CAP_INIT_EFF_SET; + new->vx_ccaps = 0; + new->vx_cap_bset = cap_bset; + + new->reboot_cmd = 0; + new->exit_code = 0; + + new->vx_nsproxy = copy_nsproxy(current->nsproxy); + + vxdprintk(VXD_CBIT(xid, 0), + "alloc_vx_info(%d) = %p", xid, new); + vxh_alloc_vx_info(new); + atomic_inc(&vx_global_ctotal); + return new; +#ifdef CONFIG_SMP +error: + kfree(new); + return 0; +#endif +} + +/* __dealloc_vx_info() + + * final disposal of vx_info */ + +static void __dealloc_vx_info(struct vx_info *vxi) +{ + int cpu; + + vxdprintk(VXD_CBIT(xid, 0), + "dealloc_vx_info(%p)", vxi); + vxh_dealloc_vx_info(vxi); + + vxi->vx_id = -1; + + vx_info_exit_limit(&vxi->limit); + vx_info_exit_sched(&vxi->sched); + vx_info_exit_cvirt(&vxi->cvirt); + vx_info_exit_cacct(&vxi->cacct); + + for_each_possible_cpu(cpu) { + vx_info_exit_sched_pc( + &vx_per_cpu(vxi, sched_pc, cpu), cpu); + vx_info_exit_cvirt_pc( + &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); + } + + vxi->vx_state |= VXS_RELEASED; + +#ifdef CONFIG_SMP + free_percpu(vxi->ptr_pc); +#endif + kfree(vxi); + atomic_dec(&vx_global_ctotal); +} + +static void __shutdown_vx_info(struct vx_info *vxi) +{ + struct nsproxy *nsproxy; + struct fs_struct *fs; + + might_sleep(); + + vxi->vx_state |= VXS_SHUTDOWN; + vs_state_change(vxi, VSC_SHUTDOWN); + + nsproxy = xchg(&vxi->vx_nsproxy, NULL); + fs = xchg(&vxi->vx_fs, NULL); + + if (nsproxy) + put_nsproxy(nsproxy); + if (fs) + put_fs_struct(fs); +} + +/* exported stuff */ + +void free_vx_info(struct vx_info *vxi) +{ + unsigned long flags; + + /* check for reference counts first */ + BUG_ON(atomic_read(&vxi->vx_usecnt)); + BUG_ON(atomic_read(&vxi->vx_tasks)); + + /* context must not be hashed */ + BUG_ON(vx_info_state(vxi, VXS_HASHED)); + + /* context shutdown is mandatory */ + BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN)); + + BUG_ON(vxi->vx_nsproxy); + BUG_ON(vxi->vx_fs); + + spin_lock_irqsave(&vx_info_inactive_lock, flags); + hlist_del(&vxi->vx_hlist); + spin_unlock_irqrestore(&vx_info_inactive_lock, flags); + + __dealloc_vx_info(vxi); +} + + +/* hash table for vx_info hash */ + +#define VX_HASH_SIZE 13 + +static struct hlist_head vx_info_hash[VX_HASH_SIZE] = + { [0 ... 
VX_HASH_SIZE-1] = HLIST_HEAD_INIT }; + +static spinlock_t vx_info_hash_lock = SPIN_LOCK_UNLOCKED; + + +static inline unsigned int __hashval(xid_t xid) +{ + return (xid % VX_HASH_SIZE); +} + + + +/* __hash_vx_info() + + * add the vxi to the global hash table + * requires the hash_lock to be held */ + +static inline void __hash_vx_info(struct vx_info *vxi) +{ + struct hlist_head *head; + + vxd_assert_lock(&vx_info_hash_lock); + vxdprintk(VXD_CBIT(xid, 4), + "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); + vxh_hash_vx_info(vxi); + + /* context must not be hashed */ + BUG_ON(vx_info_state(vxi, VXS_HASHED)); + + vxi->vx_state |= VXS_HASHED; + head = &vx_info_hash[__hashval(vxi->vx_id)]; + hlist_add_head(&vxi->vx_hlist, head); + atomic_inc(&vx_global_cactive); +} + +/* __unhash_vx_info() + + * remove the vxi from the global hash table + * requires the hash_lock to be held */ + +static inline void __unhash_vx_info(struct vx_info *vxi) +{ + unsigned long flags; + + vxd_assert_lock(&vx_info_hash_lock); + vxdprintk(VXD_CBIT(xid, 4), + "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id, + atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks)); + vxh_unhash_vx_info(vxi); + + /* context must be hashed */ + BUG_ON(!vx_info_state(vxi, VXS_HASHED)); + /* but without tasks */ + BUG_ON(atomic_read(&vxi->vx_tasks)); + + vxi->vx_state &= ~VXS_HASHED; + hlist_del_init(&vxi->vx_hlist); + spin_lock_irqsave(&vx_info_inactive_lock, flags); + hlist_add_head(&vxi->vx_hlist, &vx_info_inactive); + spin_unlock_irqrestore(&vx_info_inactive_lock, flags); + atomic_dec(&vx_global_cactive); +} + + +/* __lookup_vx_info() + + * requires the hash_lock to be held + * doesn't increment the vx_refcnt */ + +static inline struct vx_info *__lookup_vx_info(xid_t xid) +{ + struct hlist_head *head = &vx_info_hash[__hashval(xid)]; + struct hlist_node *pos; + struct vx_info *vxi; + + vxd_assert_lock(&vx_info_hash_lock); + hlist_for_each(pos, head) { + vxi = hlist_entry(pos, struct vx_info, vx_hlist); + + if (vxi->vx_id == xid) + goto found; + } + vxi = NULL; +found: + vxdprintk(VXD_CBIT(xid, 0), + "__lookup_vx_info(#%u): %p[#%u]", + xid, vxi, vxi ? vxi->vx_id : 0); + vxh_lookup_vx_info(vxi, xid); + return vxi; +} + + +/* __create_vx_info() + + * create the requested context + * get(), claim() and hash it */ + +static struct vx_info *__create_vx_info(int id) +{ + struct vx_info *new, *vxi = NULL; + + vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id); + + if (!(new = __alloc_vx_info(id))) + return ERR_PTR(-ENOMEM); + + /* required to make dynamic xids unique */ + spin_lock(&vx_info_hash_lock); + + /* static context requested */ + if ((vxi = __lookup_vx_info(id))) { + vxdprintk(VXD_CBIT(xid, 0), + "create_vx_info(%d) = %p (already there)", id, vxi); + if (vx_info_flags(vxi, VXF_STATE_SETUP, 0)) + vxi = ERR_PTR(-EBUSY); + else + vxi = ERR_PTR(-EEXIST); + goto out_unlock; + } + /* new context */ + vxdprintk(VXD_CBIT(xid, 0), + "create_vx_info(%d) = %p (new)", id, new); + claim_vx_info(new, NULL); + __hash_vx_info(get_vx_info(new)); + vxi = new, new = NULL; + +out_unlock: + spin_unlock(&vx_info_hash_lock); + vxh_create_vx_info(IS_ERR(vxi) ? 
NULL : vxi, id); + if (new) + __dealloc_vx_info(new); + return vxi; +} + + +/* exported stuff */ + + +void unhash_vx_info(struct vx_info *vxi) +{ + __shutdown_vx_info(vxi); + spin_lock(&vx_info_hash_lock); + __unhash_vx_info(vxi); + spin_unlock(&vx_info_hash_lock); + __wakeup_vx_info(vxi); +} + + +/* lookup_vx_info() + + * search for a vx_info and get() it + * negative id means current */ + +struct vx_info *lookup_vx_info(int id) +{ + struct vx_info *vxi = NULL; + + if (id < 0) { + vxi = get_vx_info(current->vx_info); + } else if (id > 1) { + spin_lock(&vx_info_hash_lock); + vxi = get_vx_info(__lookup_vx_info(id)); + spin_unlock(&vx_info_hash_lock); + } + return vxi; +} + +/* xid_is_hashed() + + * verify that xid is still hashed */ + +int xid_is_hashed(xid_t xid) +{ + int hashed; + + spin_lock(&vx_info_hash_lock); + hashed = (__lookup_vx_info(xid) != NULL); + spin_unlock(&vx_info_hash_lock); + return hashed; +} + +#ifdef CONFIG_PROC_FS + +/* get_xid_list() + + * get a subset of hashed xids for proc + * assumes size is at least one */ + +int get_xid_list(int index, unsigned int *xids, int size) +{ + int hindex, nr_xids = 0; + + /* only show current and children */ + if (!vx_check(0, VS_ADMIN | VS_WATCH)) { + if (index > 0) + return 0; + xids[nr_xids] = vx_current_xid(); + return 1; + } + + for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) { + struct hlist_head *head = &vx_info_hash[hindex]; + struct hlist_node *pos; + + spin_lock(&vx_info_hash_lock); + hlist_for_each(pos, head) { + struct vx_info *vxi; + + if (--index > 0) + continue; + + vxi = hlist_entry(pos, struct vx_info, vx_hlist); + xids[nr_xids] = vxi->vx_id; + if (++nr_xids >= size) { + spin_unlock(&vx_info_hash_lock); + goto out; + } + } + /* keep the lock time short */ + spin_unlock(&vx_info_hash_lock); + } +out: + return nr_xids; +} +#endif + +#ifdef CONFIG_VSERVER_DEBUG + +void dump_vx_info_inactive(int level) +{ + struct hlist_node *entry, *next; + + hlist_for_each_safe(entry, next, &vx_info_inactive) { + struct vx_info *vxi = + list_entry(entry, struct vx_info, vx_hlist); + + dump_vx_info(vxi, level); + } +} + +#endif + +int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) +{ + struct user_struct *new_user, *old_user; + + if (!p || !vxi) + BUG(); + + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) + return -EACCES; + + new_user = alloc_uid(vxi->vx_id, p->uid); + if (!new_user) + return -ENOMEM; + + old_user = p->user; + if (new_user != old_user) { + atomic_inc(&new_user->processes); + atomic_dec(&old_user->processes); + p->user = new_user; + } + free_uid(old_user); + return 0; +} + +void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p) +{ + p->cap_effective &= vxi->vx_cap_bset; + p->cap_inheritable &= vxi->vx_cap_bset; + p->cap_permitted &= vxi->vx_cap_bset; +} + + +#include + +static int vx_openfd_task(struct task_struct *tsk) +{ + struct files_struct *files = tsk->files; + struct fdtable *fdt; + const unsigned long *bptr; + int count, total; + + /* no rcu_read_lock() because of spin_lock() */ + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + bptr = fdt->open_fds->fds_bits; + count = fdt->max_fds / (sizeof(unsigned long) * 8); + for (total = 0; count > 0; count--) { + if (*bptr) + total += hweight_long(*bptr); + bptr++; + } + spin_unlock(&files->file_lock); + return total; +} + + +/* for *space compatibility */ + +asmlinkage long sys_unshare(unsigned long); + +/* + * migrate task to new context + * gets vxi, puts old_vxi on change + * optionally unshares namespaces (hack) + */ + +int 
vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare) +{ + struct vx_info *old_vxi; + int ret = 0; + + if (!p || !vxi) + BUG(); + + vxdprintk(VXD_CBIT(xid, 5), + "vx_migrate_task(%p,%p[#%d.%d])", p, vxi, + vxi->vx_id, atomic_read(&vxi->vx_usecnt)); + + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) && + !vx_info_flags(vxi, VXF_STATE_SETUP, 0)) + return -EACCES; + + if (vx_info_state(vxi, VXS_SHUTDOWN)) + return -EFAULT; + + old_vxi = task_get_vx_info(p); + if (old_vxi == vxi) + goto out; + + if (!(ret = vx_migrate_user(p, vxi))) { + int openfd; + + task_lock(p); + openfd = vx_openfd_task(p); + + if (old_vxi) { + atomic_dec(&old_vxi->cvirt.nr_threads); + atomic_dec(&old_vxi->cvirt.nr_running); + __rlim_dec(&old_vxi->limit, RLIMIT_NPROC); + /* FIXME: what about the struct files here? */ + __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd); + /* account for the executable */ + __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY); + } + atomic_inc(&vxi->cvirt.nr_threads); + atomic_inc(&vxi->cvirt.nr_running); + __rlim_inc(&vxi->limit, RLIMIT_NPROC); + /* FIXME: what about the struct files here? */ + __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd); + /* account for the executable */ + __rlim_inc(&vxi->limit, VLIMIT_DENTRY); + + if (old_vxi) { + release_vx_info(old_vxi, p); + clr_vx_info(&p->vx_info); + } + claim_vx_info(vxi, p); + set_vx_info(&p->vx_info, vxi); + p->xid = vxi->vx_id; + + vxdprintk(VXD_CBIT(xid, 5), + "moved task %p into vxi:%p[#%d]", + p, vxi, vxi->vx_id); + + vx_mask_cap_bset(vxi, p); + task_unlock(p); + + /* hack for *spaces to provide compatibility */ + if (unshare) { + struct nsproxy *old_nsp, *new_nsp; + + ret = unshare_nsproxy_namespaces( + CLONE_NEWUTS | CLONE_NEWIPC, + &new_nsp, NULL); + if (ret) + goto out; + + old_nsp = xchg(&p->nsproxy, new_nsp); + vx_set_space(vxi, CLONE_NEWUTS | CLONE_NEWIPC); + put_nsproxy(old_nsp); + } + } +out: + put_vx_info(old_vxi); + return ret; +} + +int vx_set_reaper(struct vx_info *vxi, struct task_struct *p) +{ + struct task_struct *old_reaper; + + if (!vxi) + return -EINVAL; + + vxdprintk(VXD_CBIT(xid, 6), + "vx_set_reaper(%p[#%d],%p[#%d,%d])", + vxi, vxi->vx_id, p, p->xid, p->pid); + + old_reaper = vxi->vx_reaper; + if (old_reaper == p) + return 0; + + /* set new child reaper */ + get_task_struct(p); + vxi->vx_reaper = p; + put_task_struct(old_reaper); + return 0; +} + +int vx_set_init(struct vx_info *vxi, struct task_struct *p) +{ + if (!vxi) + return -EINVAL; + + vxdprintk(VXD_CBIT(xid, 6), + "vx_set_init(%p[#%d],%p[#%d,%d,%d])", + vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); + + vxi->vx_flags &= ~VXF_STATE_INIT; + vxi->vx_initpid = p->tgid; + return 0; +} + +void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_exit_init(%p[#%d],%p[#%d,%d,%d])", + vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); + + vxi->exit_code = code; + vxi->vx_initpid = 0; +} + + +void vx_set_persistent(struct vx_info *vxi) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id); + + get_vx_info(vxi); + claim_vx_info(vxi, NULL); +} + +void vx_clear_persistent(struct vx_info *vxi) +{ + vxdprintk(VXD_CBIT(xid, 6), + "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id); + + release_vx_info(vxi, NULL); + put_vx_info(vxi); +} + +void vx_update_persistent(struct vx_info *vxi) +{ + if (vx_info_flags(vxi, VXF_PERSISTENT, 0)) + vx_set_persistent(vxi); + else + vx_clear_persistent(vxi); +} + + +/* task must be current or locked */ + +void exit_vx_info(struct task_struct *p, int code) +{ + 
struct vx_info *vxi = p->vx_info; + + if (vxi) { + atomic_dec(&vxi->cvirt.nr_threads); + vx_nproc_dec(p); + + vxi->exit_code = code; + release_vx_info(vxi, p); + } +} + +void exit_vx_info_early(struct task_struct *p, int code) +{ + struct vx_info *vxi = p->vx_info; + + if (vxi) { + if (vxi->vx_initpid == p->tgid) + vx_exit_init(vxi, p, code); + if (vxi->vx_reaper == p) + vx_set_reaper(vxi, init_pid_ns.child_reaper); + } +} + + +/* vserver syscall commands below here */ + +/* taks xid and vx_info functions */ + +#include + + +int vc_task_xid(uint32_t id) +{ + xid_t xid; + + if (id) { + struct task_struct *tsk; + + read_lock(&tasklist_lock); + tsk = find_task_by_real_pid(id); + xid = (tsk) ? tsk->xid : -ESRCH; + read_unlock(&tasklist_lock); + } else + xid = vx_current_xid(); + return xid; +} + + +int vc_vx_info(struct vx_info *vxi, void __user *data) +{ + struct vcmd_vx_info_v0 vc_data; + + vc_data.xid = vxi->vx_id; + vc_data.initpid = vxi->vx_initpid; + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +int vc_ctx_stat(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_stat_v0 vc_data; + + vc_data.usecnt = atomic_read(&vxi->vx_usecnt); + vc_data.tasks = atomic_read(&vxi->vx_tasks); + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +/* context functions */ + +int vc_ctx_create(uint32_t xid, void __user *data) +{ + struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET }; + struct vx_info *new_vxi; + int ret; + + if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + if ((xid > MAX_S_CONTEXT) || (xid < 2)) + return -EINVAL; + + new_vxi = __create_vx_info(xid); + if (IS_ERR(new_vxi)) + return PTR_ERR(new_vxi); + + /* initial flags */ + new_vxi->vx_flags = vc_data.flagword; + + ret = -ENOEXEC; + if (vs_state_change(new_vxi, VSC_STARTUP)) + goto out; + + ret = vx_migrate_task(current, new_vxi, (!data)); + if (ret) + goto out; + + /* return context id on success */ + ret = new_vxi->vx_id; + + /* get a reference for persistent contexts */ + if ((vc_data.flagword & VXF_PERSISTENT)) + vx_set_persistent(new_vxi); +out: + release_vx_info(new_vxi, NULL); + put_vx_info(new_vxi); + return ret; +} + + +int vc_ctx_migrate(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_migrate vc_data = { .flagword = 0 }; + int ret; + + if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + ret = vx_migrate_task(current, vxi, 0); + if (ret) + return ret; + if (vc_data.flagword & VXM_SET_INIT) + ret = vx_set_init(vxi, current); + if (ret) + return ret; + if (vc_data.flagword & VXM_SET_REAPER) + ret = vx_set_reaper(vxi, current); + return ret; +} + + +int vc_get_cflags(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_flags_v0 vc_data; + + vc_data.flagword = vxi->vx_flags; + + /* special STATE flag handling */ + vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME); + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_cflags(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_flags_v0 vc_data; + uint64_t mask, trigger; + + if (copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + /* special STATE flag handling */ + mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); + trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); + + if (vxi == current->vx_info) { + if (trigger & VXF_STATE_SETUP) + vx_mask_cap_bset(vxi, current); + if (trigger & 
VXF_STATE_INIT) { + int ret; + + ret = vx_set_init(vxi, current); + if (ret) + return ret; + ret = vx_set_reaper(vxi, current); + if (ret) + return ret; + } + } + + vxi->vx_flags = vs_mask_flags(vxi->vx_flags, + vc_data.flagword, mask); + if (trigger & VXF_PERSISTENT) + vx_update_persistent(vxi); + + return 0; +} + +static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps) +{ + if (bcaps) + *bcaps = vxi->vx_bcaps; + if (ccaps) + *ccaps = vxi->vx_ccaps; + + return 0; +} + +int vc_get_ccaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_caps_v1 vc_data; + int ret; + + ret = do_get_caps(vxi, NULL, &vc_data.ccaps); + if (ret) + return ret; + vc_data.cmask = ~0ULL; + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +static int do_set_caps(struct vx_info *vxi, + uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask) +{ + vxi->vx_bcaps = vs_mask_flags(vxi->vx_bcaps, bcaps, bmask); + vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask); + + return 0; +} + +int vc_set_ccaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_ctx_caps_v1 vc_data; + + if (copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask); +} + +int vc_get_bcaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_bcaps vc_data; + int ret; + + ret = do_get_caps(vxi, &vc_data.bcaps, NULL); + if (ret) + return ret; + vc_data.bmask = ~0ULL; + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_bcaps(struct vx_info *vxi, void __user *data) +{ + struct vcmd_bcaps vc_data; + + if (copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0); +} + + +int vc_get_badness(struct vx_info *vxi, void __user *data) +{ + struct vcmd_badness_v0 vc_data; + + vc_data.bias = vxi->vx_badness_bias; + + if (copy_to_user(data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_badness(struct vx_info *vxi, void __user *data) +{ + struct vcmd_badness_v0 vc_data; + + if (copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi->vx_badness_bias = vc_data.bias; + return 0; +} + +#include + +EXPORT_SYMBOL_GPL(free_vx_info); + diff -Nurb linux-2.6.22-594/kernel/vserver/space.c linux-2.6.22-595/kernel/vserver/space.c --- linux-2.6.22-594/kernel/vserver/space.c 2008-03-20 00:05:21.000000000 -0400 +++ linux-2.6.22-595/kernel/vserver/space.c 2008-03-20 00:08:28.000000000 -0400 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ struct mnt_namespace *old_ns; struct uts_namespace *old_uts; struct ipc_namespace *old_ipc; + struct net *old_net; struct nsproxy *nsproxy; nsproxy = copy_nsproxy(old_nsproxy); @@ -85,12 +87,26 @@ } else old_ipc = NULL; + if (mask & CLONE_NEWNET) { + old_net = nsproxy->net_ns; + nsproxy->net_ns = new_nsproxy->net_ns; + if (nsproxy->net_ns) { + get_net(nsproxy->net_ns); + printk(KERN_ALERT "Cloning network namespace\n"); + } + } else + old_net = NULL; + + if (old_ns) put_mnt_ns(old_ns); if (old_uts) put_uts_ns(old_uts); if (old_ipc) put_ipc_ns(old_ipc); + if (old_net) + put_net(old_net); + out: return nsproxy; } @@ -251,6 +267,7 @@ int vc_enter_space(struct vx_info *vxi, void __user *data) { + /* Ask dhozac how to pass this flag from user space - Sapan*/ struct vcmd_space_mask vc_data = { .mask = 0 }; if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) diff -Nurb 
linux-2.6.22-594/kernel/vserver/space.c.orig linux-2.6.22-595/kernel/vserver/space.c.orig --- linux-2.6.22-594/kernel/vserver/space.c.orig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.22-595/kernel/vserver/space.c.orig 2008-03-20 00:05:28.000000000 -0400 @@ -0,0 +1,295 @@ +/* + * linux/kernel/vserver/space.c + * + * Virtual Server: Context Space Support + * + * Copyright (C) 2003-2007 Herbert Pötzl + * + * V0.01 broken out from context.c 0.07 + * V0.02 added task locking for namespace + * V0.03 broken out vx_enter_namespace + * V0.04 added *space support and commands + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + + +atomic_t vs_global_nsproxy = ATOMIC_INIT(0); +atomic_t vs_global_fs = ATOMIC_INIT(0); +atomic_t vs_global_mnt_ns = ATOMIC_INIT(0); +atomic_t vs_global_uts_ns = ATOMIC_INIT(0); +atomic_t vs_global_ipc_ns = ATOMIC_INIT(0); + + +/* namespace functions */ + +#include + +const struct vcmd_space_mask space_mask = { + .mask = CLONE_NEWNS | + CLONE_NEWUTS | + CLONE_NEWIPC | + CLONE_FS | + CLONE_NEWNET +}; + + +/* + * build a new nsproxy mix + * assumes that both proxies are 'const' + * does not touch nsproxy refcounts + * will hold a reference on the result. + */ + +struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy, + struct nsproxy *new_nsproxy, unsigned long mask) +{ + struct mnt_namespace *old_ns; + struct uts_namespace *old_uts; + struct ipc_namespace *old_ipc; + struct net *old_net; + struct nsproxy *nsproxy; + + nsproxy = copy_nsproxy(old_nsproxy); + if (!nsproxy) + goto out; + + if (mask & CLONE_NEWNS) { + old_ns = nsproxy->mnt_ns; + nsproxy->mnt_ns = new_nsproxy->mnt_ns; + if (nsproxy->mnt_ns) + get_mnt_ns(nsproxy->mnt_ns); + } else + old_ns = NULL; + + if (mask & CLONE_NEWUTS) { + old_uts = nsproxy->uts_ns; + nsproxy->uts_ns = new_nsproxy->uts_ns; + if (nsproxy->uts_ns) + get_uts_ns(nsproxy->uts_ns); + } else + old_uts = NULL; + + if (mask & CLONE_NEWIPC) { + old_ipc = nsproxy->ipc_ns; + nsproxy->ipc_ns = new_nsproxy->ipc_ns; + if (nsproxy->ipc_ns) + get_ipc_ns(nsproxy->ipc_ns); + } else + old_ipc = NULL; + + if (mask & CLONE_NEWNET) { + old_net = nsproxy->net_ns; + nsproxy->net_ns = new_nsproxy->net_ns; + if (nsproxy->net_ns) { + get_net(nsproxy->net_ns); + printk(KERN_ALERT "Cloning network namespace\n"); + } + } else + old_net = NULL; + + + if (old_ns) + put_mnt_ns(old_ns); + if (old_uts) + put_uts_ns(old_uts); + if (old_ipc) + put_ipc_ns(old_ipc); + if (old_net) + put_net(old_net); + +out: + return nsproxy; +} + + +/* + * merge two nsproxy structs into a new one. + * will hold a reference on the result. + */ + +static inline +struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old, + struct nsproxy *proxy, unsigned long mask) +{ + struct nsproxy null_proxy = { .mnt_ns = NULL }; + + if (!proxy) + return NULL; + + if (mask) { + /* vs_mix_nsproxy returns with reference */ + return vs_mix_nsproxy(old ? old : &null_proxy, + proxy, mask); + } + get_nsproxy(proxy); + return proxy; +} + +/* + * merge two fs structs into a new one. + * will take a reference on the result. 
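+ * (with CLONE_FS in the mask the given fs_struct is duplicated via
+ * copy_fs_struct(); without it the caller's existing fs_struct is
+ * reused and its reference count bumped)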
+ */ + +static inline +struct fs_struct *__vs_merge_fs(struct fs_struct *old, + struct fs_struct *fs, unsigned long mask) +{ + if (!(mask & CLONE_FS)) { + if (old) + atomic_inc(&old->count); + return old; + } + + if (!fs) + return NULL; + + return copy_fs_struct(fs); +} + + +int vx_enter_space(struct vx_info *vxi, unsigned long mask) +{ + struct nsproxy *proxy, *proxy_cur, *proxy_new; + struct fs_struct *fs, *fs_cur, *fs_new; + int ret; + + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) + return -EACCES; + + if (!mask) + mask = vxi->vx_nsmask; + + if ((mask & vxi->vx_nsmask) != mask) + return -EINVAL; + + proxy = vxi->vx_nsproxy; + fs = vxi->vx_fs; + + task_lock(current); + fs_cur = current->fs; + atomic_inc(&fs_cur->count); + proxy_cur = current->nsproxy; + get_nsproxy(proxy_cur); + task_unlock(current); + + fs_new = __vs_merge_fs(fs_cur, fs, mask); + if (IS_ERR(fs_new)) { + ret = PTR_ERR(fs_new); + goto out_put; + } + + proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask); + if (IS_ERR(proxy_new)) { + ret = PTR_ERR(proxy_new); + goto out_put_fs; + } + + fs_new = xchg(¤t->fs, fs_new); + proxy_new = xchg(¤t->nsproxy, proxy_new); + ret = 0; + + if (proxy_new) + put_nsproxy(proxy_new); +out_put_fs: + if (fs_new) + put_fs_struct(fs_new); +out_put: + if (proxy_cur) + put_nsproxy(proxy_cur); + if (fs_cur) + put_fs_struct(fs_cur); + return ret; +} + + +int vx_set_space(struct vx_info *vxi, unsigned long mask) +{ + struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new; + struct fs_struct *fs_vxi, *fs_cur, *fs_new; + int ret; + + if (!mask) + mask = space_mask.mask; + + if ((mask & space_mask.mask) != mask) + return -EINVAL; + + proxy_vxi = vxi->vx_nsproxy; + fs_vxi = vxi->vx_fs; + + task_lock(current); + fs_cur = current->fs; + atomic_inc(&fs_cur->count); + proxy_cur = current->nsproxy; + get_nsproxy(proxy_cur); + task_unlock(current); + + fs_new = __vs_merge_fs(fs_vxi, fs_cur, mask); + if (IS_ERR(fs_new)) { + ret = PTR_ERR(fs_new); + goto out_put; + } + + proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask); + if (IS_ERR(proxy_new)) { + ret = PTR_ERR(proxy_new); + goto out_put_fs; + } + + fs_new = xchg(&vxi->vx_fs, fs_new); + proxy_new = xchg(&vxi->vx_nsproxy, proxy_new); + vxi->vx_nsmask |= mask; + ret = 0; + + if (proxy_new) + put_nsproxy(proxy_new); +out_put_fs: + if (fs_new) + put_fs_struct(fs_new); +out_put: + if (proxy_cur) + put_nsproxy(proxy_cur); + if (fs_cur) + put_fs_struct(fs_cur); + return ret; +} + + +int vc_enter_space(struct vx_info *vxi, void __user *data) +{ + /* Ask dhozac how to pass this flag from user space - Sapan*/ + struct vcmd_space_mask vc_data = { .mask = CLONE_NEWNET }; + + if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return vx_enter_space(vxi, vc_data.mask); +} + +int vc_set_space(struct vx_info *vxi, void __user *data) +{ + struct vcmd_space_mask vc_data = { .mask = 0 }; + + if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return vx_set_space(vxi, vc_data.mask); +} + +int vc_get_space_mask(struct vx_info *vxi, void __user *data) +{ + if (copy_to_user(data, &space_mask, sizeof(space_mask))) + return -EFAULT; + return 0; +} + diff -Nurb linux-2.6.22-594/net/core/net_namespace.c linux-2.6.22-595/net/core/net_namespace.c --- linux-2.6.22-594/net/core/net_namespace.c 2008-03-20 00:05:18.000000000 -0400 +++ linux-2.6.22-595/net/core/net_namespace.c 2008-03-20 00:14:56.000000000 -0400 @@ -112,10 +112,12 @@ ops = list_entry(ptr, struct pernet_operations, list); if (ops->init) { error = 
ops->init(net);
-			if (error < 0)
+			if (error < 0) {
+				printk(KERN_ALERT "Error setting up netns: %p\n", ops->init);
 				goto out_undo;
 			}
 		}
+	}
 out:
 	return error;
 out_undo:
diff -Nurb linux-2.6.22-594/net/core/net_namespace.c.orig linux-2.6.22-595/net/core/net_namespace.c.orig
--- linux-2.6.22-594/net/core/net_namespace.c.orig	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.22-595/net/core/net_namespace.c.orig	2008-03-20 00:05:18.000000000 -0400
@@ -0,0 +1,332 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Our network namespace constructor/destructor lists
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+static DEFINE_MUTEX(net_mutex);
+
+static DEFINE_MUTEX(net_list_mutex);
+LIST_HEAD(net_namespace_list);
+
+static struct kmem_cache *net_cachep;
+
+struct net init_net;
+EXPORT_SYMBOL_GPL(init_net);
+
+void net_lock(void)
+{
+	mutex_lock(&net_list_mutex);
+}
+
+void net_unlock(void)
+{
+	mutex_unlock(&net_list_mutex);
+}
+
+static struct net *net_alloc(void)
+{
+	return kmem_cache_alloc(net_cachep, GFP_KERNEL);
+}
+
+static void net_free(struct net *net)
+{
+	if (!net)
+		return;
+
+	if (unlikely(atomic_read(&net->use_count) != 0)) {
+		printk(KERN_EMERG "network namespace not free! Usage: %d\n",
+			atomic_read(&net->use_count));
+		return;
+	}
+
+	kmem_cache_free(net_cachep, net);
+}
+
+static void cleanup_net(struct work_struct *work)
+{
+	struct pernet_operations *ops;
+	struct list_head *ptr;
+	struct net *net;
+
+	net = container_of(work, struct net, work);
+
+	mutex_lock(&net_mutex);
+
+	/* Don't let anyone else find us. */
+	net_lock();
+	list_del(&net->list);
+	net_unlock();
+
+	/* Run all of the network namespace exit methods */
+	list_for_each_prev(ptr, &pernet_list) {
+		ops = list_entry(ptr, struct pernet_operations, list);
+		if (ops->exit)
+			ops->exit(net);
+	}
+
+	mutex_unlock(&net_mutex);
+
+	/* Ensure there are no outstanding rcu callbacks using this
+	 * network namespace.
+	 */
+	rcu_barrier();
+
+	/* Finally it is safe to free my network namespace structure */
+	net_free(net);
+}
+
+
+void __put_net(struct net *net)
+{
+	/* Cleanup the network namespace in process context */
+	INIT_WORK(&net->work, cleanup_net);
+	schedule_work(&net->work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static int setup_net(struct net *net)
+{
+	/* Must be called with net_mutex held */
+	struct pernet_operations *ops;
+	struct list_head *ptr;
+	int error;
+
+	memset(net, 0, sizeof(struct net));
+	atomic_set(&net->count, 1);
+	atomic_set(&net->use_count, 0);
+
+	error = 0;
+	list_for_each(ptr, &pernet_list) {
+		ops = list_entry(ptr, struct pernet_operations, list);
+		if (ops->init) {
+			error = ops->init(net);
+			if (error < 0)
+				goto out_undo;
+		}
+	}
+out:
+	return error;
+out_undo:
+	/* Walk through the list backwards calling the exit functions
+	 * for the pernet modules whose init functions did not fail.
+	 */
+	for (ptr = ptr->prev; ptr != &pernet_list; ptr = ptr->prev) {
+		ops = list_entry(ptr, struct pernet_operations, list);
+		if (ops->exit)
+			ops->exit(net);
+	}
+	goto out;
+}
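+
+/*
+ * Usage sketch (hypothetical names): a subsystem hands setup_net and
+ * cleanup_net an init/exit pair through struct pernet_operations:
+ *
+ *	static int my_net_init(struct net *net)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	static void my_net_exit(struct net *net)
+ *	{
+ *	}
+ *
+ *	static struct pernet_operations my_net_ops = {
+ *		.init = my_net_init,
+ *		.exit = my_net_exit,
+ *	};
+ *
+ *	register_pernet_subsys(&my_net_ops);
+ *
+ * setup_net then invokes my_net_init for each new namespace, and
+ * cleanup_net invokes my_net_exit in reverse registration order.
+ */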
+
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+	struct net *new_net = NULL;
+	int err;
+
+	get_net(old_net);
+
+	if (!(flags & CLONE_NEWNET))
+		return old_net;
+
+	err = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	err = -ENOMEM;
+	new_net = net_alloc();
+	if (!new_net)
+		goto out;
+
+	mutex_lock(&net_mutex);
+	err = setup_net(new_net);
+	if (err)
+		goto out_unlock;
+
+	net_lock();
+	list_add_tail(&new_net->list, &net_namespace_list);
+	net_unlock();
+
+
+out_unlock:
+	mutex_unlock(&net_mutex);
+out:
+	put_net(old_net);
+	if (err) {
+		net_free(new_net);
+		new_net = ERR_PTR(err);
+	}
+	return new_net;
+}
+
+static int __init net_ns_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
+	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+					SMP_CACHE_BYTES,
+					SLAB_PANIC, NULL, NULL);
+	mutex_lock(&net_mutex);
+	err = setup_net(&init_net);
+
+	net_lock();
+	list_add_tail(&init_net.list, &net_namespace_list);
+	net_unlock();
+
+	mutex_unlock(&net_mutex);
+	if (err)
+		panic("Could not setup the initial network namespace");
+
+	return 0;
+}
+
+pure_initcall(net_ns_init);
+
+static int register_pernet_operations(struct list_head *list,
+				      struct pernet_operations *ops)
+{
+	struct net *net, *undo_net;
+	int error;
+
+	error = 0;
+	list_add_tail(&ops->list, list);
+	for_each_net(net) {
+		if (ops->init) {
+			error = ops->init(net);
+			if (error)
+				goto out_undo;
+		}
+	}
+out:
+	return error;
+
+out_undo:
+	/* If I have an error cleanup all namespaces I initialized */
+	list_del(&ops->list);
+	for_each_net(undo_net) {
+		if (undo_net == net)
+			goto undone;
+		if (ops->exit)
+			ops->exit(undo_net);
+	}
+undone:
+	goto out;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+	struct net *net;
+
+	list_del(&ops->list);
+	for_each_net(net)
+		if (ops->exit)
+			ops->exit(net);
+}
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace, allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+	int error;
+	mutex_lock(&net_mutex);
+	error = register_pernet_operations(first_device, ops);
+	mutex_unlock(&net_mutex);
+	return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed.  In
+ * addition run the exit method for all existing network
+ * namespaces.
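+ *
+ * (A caller pairs this with the earlier register call; for the
+ * hypothetical my_net_ops from the sketch above, module unload
+ * would call unregister_pernet_subsys(&my_net_ops).)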
+ */ +void unregister_pernet_subsys(struct pernet_operations *module) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(module); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destoryed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); diff -Nurb linux-2.6.22-594/net/ipv4/af_inet.c.orig linux-2.6.22-595/net/ipv4/af_inet.c.orig --- linux-2.6.22-594/net/ipv4/af_inet.c.orig 2008-03-20 00:05:18.000000000 -0400 +++ linux-2.6.22-595/net/ipv4/af_inet.c.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,1522 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * PF_INET protocol family socket handler. - * - * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ - * - * Authors: Ross Biro - * Fred N. van Kempen, - * Florian La Roche, - * Alan Cox, - * - * Changes (see also sock.c) - * - * piggy, - * Karl Knutson : Socket protocol table - * A.N.Kuznetsov : Socket death error in accept(). - * John Richardson : Fix non blocking error in connect() - * so sockets that fail to connect - * don't return -EINPROGRESS. - * Alan Cox : Asynchronous I/O support - * Alan Cox : Keep correct socket pointer on sock - * structures - * when accept() ed - * Alan Cox : Semantics of SO_LINGER aren't state - * moved to close when you look carefully. - * With this fixed and the accept bug fixed - * some RPC stuff seems happier. - * Niibe Yutaka : 4.4BSD style write async I/O - * Alan Cox, - * Tony Gale : Fixed reuse semantics. - * Alan Cox : bind() shouldn't abort existing but dead - * sockets. Stops FTP netin:.. I hope. - * Alan Cox : bind() works correctly for RAW sockets. - * Note that FreeBSD at least was broken - * in this respect so be careful with - * compatibility tests... 
- * Alan Cox : routing cache support - * Alan Cox : memzero the socket structure for - * compactness. - * Matt Day : nonblock connect error handler - * Alan Cox : Allow large numbers of pending sockets - * (eg for big web sites), but only if - * specifically application requested. - * Alan Cox : New buffering throughout IP. Used - * dumbly. - * Alan Cox : New buffering now used smartly. - * Alan Cox : BSD rather than common sense - * interpretation of listen. - * Germano Caronni : Assorted small races. - * Alan Cox : sendmsg/recvmsg basic support. - * Alan Cox : Only sendmsg/recvmsg now supported. - * Alan Cox : Locked down bind (see security list). - * Alan Cox : Loosened bind a little. - * Mike McLagan : ADD/DEL DLCI Ioctls - * Willy Konynenberg : Transparent proxying support. - * David S. Miller : New socket lookup architecture. - * Some other random speedups. - * Cyrus Durgin : Cleaned up file for kmod hacks. - * Andi Kleen : Fix inet_stream_connect TCP race. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_IP_MROUTE -#include -#endif -#include - -DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; - -extern void ip_mc_drop_socket(struct sock *sk); - -/* The inetsw table contains everything that inet_create needs to - * build a new socket. - */ -static struct list_head inetsw[SOCK_MAX]; -static DEFINE_SPINLOCK(inetsw_lock); - -/* New destruction routine */ - -void inet_sock_destruct(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_error_queue); - - if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { - printk("Attempt to release TCP socket in state %d %p\n", - sk->sk_state, sk); - return; - } - if (!sock_flag(sk, SOCK_DEAD)) { - printk("Attempt to release alive inet socket %p\n", sk); - return; - } - - BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); - BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); - BUG_TRAP(!sk->sk_wmem_queued); - BUG_TRAP(!sk->sk_forward_alloc); - - kfree(inet->opt); - dst_release(sk->sk_dst_cache); - sk_refcnt_debug_dec(sk); -} - -/* - * The routines beyond this point handle the behaviour of an AF_INET - * socket object. Mostly it punts to the subprotocols of IP to do - * the work. - */ - -/* - * Automatically bind an unbound socket. - */ - -static int inet_autobind(struct sock *sk) -{ - struct inet_sock *inet; - /* We may need to bind the socket. */ - lock_sock(sk); - inet = inet_sk(sk); - if (!inet->num) { - if (sk->sk_prot->get_port(sk, 0)) { - release_sock(sk); - return -EAGAIN; - } - inet->sport = htons(inet->num); - sk->sk_xid = vx_current_xid(); - sk->sk_nid = nx_current_nid(); - } - release_sock(sk); - return 0; -} - -/* - * Move a socket into listening state. 
- */ -int inet_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) - goto out; - - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. - */ - if (old_state != TCP_LISTEN) { - err = inet_csk_listen_start(sk, backlog); - if (err) - goto out; - } - sk->sk_max_ack_backlog = backlog; - err = 0; - -out: - release_sock(sk); - return err; -} - -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -/* - * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock - */ -void build_ehash_secret(void) -{ - u32 rnd; - do { - get_random_bytes(&rnd, sizeof(rnd)); - } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); -} -EXPORT_SYMBOL(build_ehash_secret); - -/* - * Create an inet socket. - */ - -static int inet_create(struct socket *sock, int protocol) -{ - struct sock *sk; - struct list_head *p; - struct inet_protosw *answer; - struct inet_sock *inet; - struct proto *answer_prot; - unsigned char answer_flags; - char answer_no_check; - int try_loading_module = 0; - int err; - - if (sock->type != SOCK_RAW && - sock->type != SOCK_DGRAM && - !inet_ehash_secret) - build_ehash_secret(); - - sock->state = SS_UNCONNECTED; - - /* Look for the requested type/protocol pair. */ - answer = NULL; -lookup_protocol: - err = -ESOCKTNOSUPPORT; - rcu_read_lock(); - list_for_each_rcu(p, &inetsw[sock->type]) { - answer = list_entry(p, struct inet_protosw, list); - - /* Check the non-wild match. */ - if (protocol == answer->protocol) { - if (protocol != IPPROTO_IP) - break; - } else { - /* Check for the two wild cases. */ - if (IPPROTO_IP == protocol) { - protocol = answer->protocol; - break; - } - if (IPPROTO_IP == answer->protocol) - break; - } - err = -EPROTONOSUPPORT; - answer = NULL; - } - - if (unlikely(answer == NULL)) { - if (try_loading_module < 2) { - rcu_read_unlock(); - /* - * Be more specific, e.g. net-pf-2-proto-132-type-1 - * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) - */ - if (++try_loading_module == 1) - request_module("net-pf-%d-proto-%d-type-%d", - PF_INET, protocol, sock->type); - /* - * Fall back to generic, e.g. 
net-pf-2-proto-132 - * (net-pf-PF_INET-proto-IPPROTO_SCTP) - */ - else - request_module("net-pf-%d-proto-%d", - PF_INET, protocol); - goto lookup_protocol; - } else - goto out_rcu_unlock; - } - - err = -EPERM; - if ((protocol == IPPROTO_ICMP) && - nx_capable(answer->capability, NXC_RAW_ICMP)) - goto override; - if (sock->type == SOCK_RAW && - nx_capable(answer->capability, NXC_RAW_SOCKET)) - goto override; - if (answer->capability > 0 && !capable(answer->capability)) - goto out_rcu_unlock; -override: - sock->ops = answer->ops; - answer_prot = answer->prot; - answer_no_check = answer->no_check; - answer_flags = answer->flags; - rcu_read_unlock(); - - BUG_TRAP(answer_prot->slab != NULL); - - err = -ENOBUFS; - sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); - if (sk == NULL) - goto out; - - err = 0; - sk->sk_no_check = answer_no_check; - if (INET_PROTOSW_REUSE & answer_flags) - sk->sk_reuse = 1; - - inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; - - if (SOCK_RAW == sock->type) { - inet->num = protocol; - if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; - } - - if (ipv4_config.no_pmtu_disc) - inet->pmtudisc = IP_PMTUDISC_DONT; - else - inet->pmtudisc = IP_PMTUDISC_WANT; - - inet->id = 0; - - sock_init_data(sock, sk); - - sk->sk_destruct = inet_sock_destruct; - sk->sk_family = PF_INET; - sk->sk_protocol = protocol; - sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; - - inet->uc_ttl = -1; - inet->mc_loop = 1; - inet->mc_ttl = 1; - inet->mc_index = 0; - inet->mc_list = NULL; - - sk_refcnt_debug_inc(sk); - - if (inet->num) { - /* It assumes that any protocol which allows - * the user to assign a number at socket - * creation time automatically - * shares. - */ - inet->sport = htons(inet->num); - /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); - } - - if (sk->sk_prot->init) { - err = sk->sk_prot->init(sk); - if (err) - sk_common_release(sk); - } -out: - return err; -out_rcu_unlock: - rcu_read_unlock(); - goto out; -} - - -/* - * The peer socket should always be NULL (or else). When we call this - * function we are destroying the object and from then on nobody - * should refer to it. - */ -int inet_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (sk) { - long timeout; - - /* Applications forget to leave groups before exiting */ - ip_mc_drop_socket(sk); - - /* If linger is set, we don't return until the close - * is complete. Otherwise we return immediately. The - * actually closing is done the same either way. - * - * If the close is due to the process exiting, we never - * linger.. - */ - timeout = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; - sock->sk = NULL; - sk->sk_prot->close(sk, timeout); - } - return 0; -} - -/* It is off by default, see below. */ -int sysctl_ip_nonlocal_bind __read_mostly; - -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; - struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct nx_v4_sock_addr nsa; - unsigned short snum; - int chk_addr_ret; - int err; - - /* If the socket has its own bind function then use it. 
(RAW) */ - if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; - } - err = -EINVAL; - if (addr_len < sizeof(struct sockaddr_in)) - goto out; - - err = v4_map_sock_addr(inet, addr, &nsa); - if (err) - goto out; - - chk_addr_ret = inet_addr_type(nsa.saddr); - - /* Not specified by any standard per-se, however it breaks too - * many applications when removed. It is unfortunate since - * allowing applications to make a non-local bind solves - * several problems with systems using dynamic addressing. - * (ie. your servers still start up even if your ISDN link - * is temporarily down) - */ - err = -EADDRNOTAVAIL; - if (!sysctl_ip_nonlocal_bind && - !inet->freebind && - nsa.saddr != INADDR_ANY && - chk_addr_ret != RTN_LOCAL && - chk_addr_ret != RTN_MULTICAST && - chk_addr_ret != RTN_BROADCAST) - goto out; - - snum = ntohs(addr->sin_port); - err = -EACCES; - if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) - goto out; - - /* We keep a pair of addresses. rcv_saddr is the one - * used by hash lookups, and saddr is used for transmit. - * - * In the BSD API these are the same except where it - * would be illegal to use them (multicast/broadcast) in - * which case the sending device address is used. - */ - lock_sock(sk); - - /* Check these errors (active socket, double bind). */ - err = -EINVAL; - if (sk->sk_state != TCP_CLOSE || inet->num) - goto out_release_sock; - - v4_set_sock_addr(inet, &nsa); - if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) - inet->saddr = 0; /* Use device */ - - /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { - inet->saddr = inet->rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; - } - - if (inet->rcv_saddr) - sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (snum) - sk->sk_userlocks |= SOCK_BINDPORT_LOCK; - inet->sport = htons(inet->num); - inet->daddr = 0; - inet->dport = 0; - sk_dst_reset(sk); - err = 0; -out_release_sock: - release_sock(sk); -out: - return err; -} - -int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, - int addr_len, int flags) -{ - struct sock *sk = sock->sk; - - if (uaddr->sa_family == AF_UNSPEC) - return sk->sk_prot->disconnect(sk, flags); - - if (!inet_sk(sk)->num && inet_autobind(sk)) - return -EAGAIN; - return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); -} - -static long inet_wait_for_connect(struct sock *sk, long timeo) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - - /* Basic assumption: if someone sets sk->sk_err, he _must_ - * change state of the socket from TCP_SYN_*. - * Connect() does not allow to get error notifications - * without closing the socket. - */ - while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - release_sock(sk); - timeo = schedule_timeout(timeo); - lock_sock(sk); - if (signal_pending(current) || !timeo) - break; - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - } - finish_wait(sk->sk_sleep, &wait); - return timeo; -} - -/* - * Connect to a remote host. There is regrettably still a little - * TCP 'magic' in here. - */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - struct sock *sk = sock->sk; - int err; - long timeo; - - lock_sock(sk); - - if (uaddr->sa_family == AF_UNSPEC) { - err = sk->sk_prot->disconnect(sk, flags); - sock->state = err ? 
SS_DISCONNECTING : SS_UNCONNECTED; - goto out; - } - - switch (sock->state) { - default: - err = -EINVAL; - goto out; - case SS_CONNECTED: - err = -EISCONN; - goto out; - case SS_CONNECTING: - err = -EALREADY; - /* Fall out of switch with err, set for this state */ - break; - case SS_UNCONNECTED: - err = -EISCONN; - if (sk->sk_state != TCP_CLOSE) - goto out; - - err = sk->sk_prot->connect(sk, uaddr, addr_len); - if (err < 0) - goto out; - - sock->state = SS_CONNECTING; - - /* Just entered SS_CONNECTING state; the only - * difference is that return value in non-blocking - * case is EINPROGRESS, rather than EALREADY. - */ - err = -EINPROGRESS; - break; - } - - timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - /* Error code is set above */ - if (!timeo || !inet_wait_for_connect(sk, timeo)) - goto out; - - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out; - } - - /* Connection was closed by RST, timeout, ICMP error - * or another process disconnected us. - */ - if (sk->sk_state == TCP_CLOSE) - goto sock_error; - - /* sk->sk_err may be not zero now, if RECVERR was ordered by user - * and error was received after socket entered established state. - * Hence, it is handled normally after connect() return successfully. - */ - - sock->state = SS_CONNECTED; - err = 0; -out: - release_sock(sk); - return err; - -sock_error: - err = sock_error(sk) ? : -ECONNABORTED; - sock->state = SS_UNCONNECTED; - if (sk->sk_prot->disconnect(sk, flags)) - sock->state = SS_DISCONNECTING; - goto out; -} - -/* - * Accept a pending connection. The TCP layer now gives BSD semantics. - */ - -int inet_accept(struct socket *sock, struct socket *newsock, int flags) -{ - struct sock *sk1 = sock->sk; - int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); - - if (!sk2) - goto do_err; - - lock_sock(sk2); - - BUG_TRAP((1 << sk2->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); - - sock_graft(sk2, newsock); - - newsock->state = SS_CONNECTED; - err = 0; - release_sock(sk2); -do_err: - return err; -} - - -/* - * This does both peername and sockname. - */ -int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) -{ - struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - - sin->sin_family = AF_INET; - if (peer) { - if (!inet->dport || - (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && - peer == 1)) - return -ENOTCONN; - sin->sin_port = inet->dport; - sin->sin_addr.s_addr = - nx_map_sock_lback(sk->sk_nx_info, inet->daddr); - } else { - __be32 addr = inet->rcv_saddr; - if (!addr) - addr = inet->saddr; - addr = nx_map_sock_lback(sk->sk_nx_info, addr); - sin->sin_port = inet->sport; - sin->sin_addr.s_addr = addr; - } - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *uaddr_len = sizeof(*sin); - return 0; -} - -int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size) -{ - struct sock *sk = sock->sk; - - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && inet_autobind(sk)) - return -EAGAIN; - - return sk->sk_prot->sendmsg(iocb, sk, msg, size); -} - - -static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - struct sock *sk = sock->sk; - - /* We may need to bind the socket. 
*/ - if (!inet_sk(sk)->num && inet_autobind(sk)) - return -EAGAIN; - - if (sk->sk_prot->sendpage) - return sk->sk_prot->sendpage(sk, page, offset, size, flags); - return sock_no_sendpage(sock, page, offset, size, flags); -} - - -int inet_shutdown(struct socket *sock, int how) -{ - struct sock *sk = sock->sk; - int err = 0; - - /* This should really check to make sure - * the socket is a TCP socket. (WHY AC...) - */ - how++; /* maps 0->1 has the advantage of making bit 1 rcvs and - 1->2 bit 2 snds. - 2->3 */ - if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ - return -EINVAL; - - lock_sock(sk); - if (sock->state == SS_CONNECTING) { - if ((1 << sk->sk_state) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) - sock->state = SS_DISCONNECTING; - else - sock->state = SS_CONNECTED; - } - - switch (sk->sk_state) { - case TCP_CLOSE: - err = -ENOTCONN; - /* Hack to wake up other listeners, who can poll for - POLLHUP, even on eg. unconnected UDP sockets -- RR */ - default: - sk->sk_shutdown |= how; - if (sk->sk_prot->shutdown) - sk->sk_prot->shutdown(sk, how); - break; - - /* Remaining two branches are temporary solution for missing - * close() in multithreaded environment. It is _not_ a good idea, - * but we have no choice until close() is repaired at VFS level. - */ - case TCP_LISTEN: - if (!(how & RCV_SHUTDOWN)) - break; - /* Fall through */ - case TCP_SYN_SENT: - err = sk->sk_prot->disconnect(sk, O_NONBLOCK); - sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - break; - } - - /* Wake up anyone sleeping in poll. */ - sk->sk_state_change(sk); - release_sock(sk); - return err; -} - -/* - * ioctl() calls you can issue on an INET socket. Most of these are - * device configuration and stuff and very rarely used. Some ioctls - * pass on to the socket itself. - * - * NOTE: I like the idea of a module for the config stuff. ie ifconfig - * loads the devconfigure module does its configuring and unloads it. - * There's a good 20K of config code hanging around the kernel. 
- */ - -int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - struct sock *sk = sock->sk; - int err = 0; - - switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; - case SIOCADDRT: - case SIOCDELRT: - case SIOCRTMSG: - err = ip_rt_ioctl(cmd, (void __user *)arg); - break; - case SIOCDARP: - case SIOCGARP: - case SIOCSARP: - err = arp_ioctl(cmd, (void __user *)arg); - break; - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - case SIOCSIFPFLAGS: - case SIOCGIFPFLAGS: - case SIOCSIFFLAGS: - err = devinet_ioctl(cmd, (void __user *)arg); - break; - default: - if (sk->sk_prot->ioctl) - err = sk->sk_prot->ioctl(sk, cmd, arg); - else - err = -ENOIOCTLCMD; - break; - } - return err; -} - -const struct proto_ops inet_stream_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - .poll = tcp_poll, - .ioctl = inet_ioctl, - .listen = inet_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = tcp_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, - .sendpage = tcp_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif -}; - -const struct proto_ops inet_dgram_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_dgram_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = inet_getname, - .poll = udp_poll, - .ioctl = inet_ioctl, - .listen = sock_no_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, - .sendpage = inet_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif -}; - -/* - * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll - */ -static const struct proto_ops inet_sockraw_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_dgram_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = inet_getname, - .poll = datagram_poll, - .ioctl = inet_ioctl, - .listen = sock_no_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, - .sendpage = inet_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif -}; - -static struct net_proto_family inet_family_ops = { - .family = PF_INET, - .create = inet_create, - .owner = THIS_MODULE, -}; - -/* Upon startup we insert all the elements in inetsw_array[] into - * the linked list inetsw. 
- */ -static struct inet_protosw inetsw_array[] = -{ - { - .type = SOCK_STREAM, - .protocol = IPPROTO_TCP, - .prot = &tcp_prot, - .ops = &inet_stream_ops, - .capability = -1, - .no_check = 0, - .flags = INET_PROTOSW_PERMANENT | - INET_PROTOSW_ICSK, - }, - - { - .type = SOCK_DGRAM, - .protocol = IPPROTO_UDP, - .prot = &udp_prot, - .ops = &inet_dgram_ops, - .capability = -1, - .no_check = UDP_CSUM_DEFAULT, - .flags = INET_PROTOSW_PERMANENT, - }, - - - { - .type = SOCK_RAW, - .protocol = IPPROTO_IP, /* wild card */ - .prot = &raw_prot, - .ops = &inet_sockraw_ops, - .capability = CAP_NET_RAW, - .no_check = UDP_CSUM_DEFAULT, - .flags = INET_PROTOSW_REUSE, - } -}; - -#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) - -void inet_register_protosw(struct inet_protosw *p) -{ - struct list_head *lh; - struct inet_protosw *answer; - int protocol = p->protocol; - struct list_head *last_perm; - - spin_lock_bh(&inetsw_lock); - - if (p->type >= SOCK_MAX) - goto out_illegal; - - /* If we are trying to override a permanent protocol, bail. */ - answer = NULL; - last_perm = &inetsw[p->type]; - list_for_each(lh, &inetsw[p->type]) { - answer = list_entry(lh, struct inet_protosw, list); - - /* Check only the non-wild match. */ - if (INET_PROTOSW_PERMANENT & answer->flags) { - if (protocol == answer->protocol) - break; - last_perm = lh; - } - - answer = NULL; - } - if (answer) - goto out_permanent; - - /* Add the new entry after the last permanent entry if any, so that - * the new entry does not override a permanent entry when matched with - * a wild-card protocol. But it is allowed to override any existing - * non-permanent entry. This means that when we remove this entry, the - * system automatically returns to the old behavior. - */ - list_add_rcu(&p->list, last_perm); -out: - spin_unlock_bh(&inetsw_lock); - - synchronize_net(); - - return; - -out_permanent: - printk(KERN_ERR "Attempt to override permanent protocol %d.\n", - protocol); - goto out; - -out_illegal: - printk(KERN_ERR - "Ignoring attempt to register invalid socket type %d.\n", - p->type); - goto out; -} - -void inet_unregister_protosw(struct inet_protosw *p) -{ - if (INET_PROTOSW_PERMANENT & p->flags) { - printk(KERN_ERR - "Attempt to unregister permanent protocol %d.\n", - p->protocol); - } else { - spin_lock_bh(&inetsw_lock); - list_del_rcu(&p->list); - spin_unlock_bh(&inetsw_lock); - - synchronize_net(); - } -} - -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - -static int inet_sk_reselect_saddr(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; - __be32 old_saddr = inet->saddr; - __be32 new_saddr; - __be32 daddr = inet->daddr; - - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - sk->sk_protocol, - inet->sport, inet->dport, sk, 0); - if (err) - return err; - - sk_setup_caps(sk, &rt->u.dst); - - new_saddr = rt->rt_src; - - if (new_saddr == old_saddr) - return 0; - - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "%s(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - __FUNCTION__, - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); - } - - inet->saddr = inet->rcv_saddr = new_saddr; - - /* - * XXX The only one ugly spot where we need to - * XXX really change the sockets identity after - * XXX it has entered the hashes. 
-DaveM - * - * Besides that, it does not check for connection - * uniqueness. Wait for troubles. - */ - __sk_prot_rehash(sk); - return 0; -} - -int inet_sk_rebuild_header(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); - __be32 daddr; - int err; - - /* Route is OK, nothing to do. */ - if (rt) - return 0; - - /* Reroute. */ - daddr = inet->daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; -{ - struct flowi fl = { - .oif = sk->sk_bound_dev_if, - .nl_u = { - .ip4_u = { - .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk), - }, - }, - .proto = sk->sk_protocol, - .uli_u = { - .ports = { - .sport = inet->sport, - .dport = inet->dport, - }, - }, - }; - - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 0); -} - if (!err) - sk_setup_caps(sk, &rt->u.dst); - else { - /* Routing failed... */ - sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme - */ - if (!sysctl_ip_dynaddr || - sk->sk_state != TCP_SYN_SENT || - (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || - (err = inet_sk_reselect_saddr(sk)) != 0) - sk->sk_err_soft = -err; - } - - return err; -} - -EXPORT_SYMBOL(inet_sk_rebuild_header); - -static int inet_gso_send_check(struct sk_buff *skb) -{ - struct iphdr *iph; - struct net_protocol *ops; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - - __skb_pull(skb, ihl); - skb_reset_transport_header(skb); - iph = ip_hdr(skb); - proto = iph->protocol & (MAX_INET_PROTOS - 1); - err = -EPROTONOSUPPORT; - - rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (likely(ops && ops->gso_send_check)) - err = ops->gso_send_check(skb); - rcu_read_unlock(); - -out: - return err; -} - -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct iphdr *iph; - struct net_protocol *ops; - int proto; - int ihl; - int id; - - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - 0))) - goto out; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - - __skb_pull(skb, ihl); - skb_reset_transport_header(skb); - iph = ip_hdr(skb); - id = ntohs(iph->id); - proto = iph->protocol & (MAX_INET_PROTOS - 1); - segs = ERR_PTR(-EPROTONOSUPPORT); - - rcu_read_lock(); - ops = rcu_dereference(inet_protos[proto]); - if (likely(ops && ops->gso_segment)) - segs = ops->gso_segment(skb, features); - rcu_read_unlock(); - - if (!segs || unlikely(IS_ERR(segs))) - goto out; - - skb = segs; - do { - iph = ip_hdr(skb); - iph->id = htons(id++); - iph->tot_len = htons(skb->len - skb->mac_len); - iph->check = 0; - iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); - } while ((skb = skb->next)); - -out: - return segs; -} - -unsigned long snmp_fold_field(void *mib[], int offt) -{ - unsigned long res = 0; - int i; - - for_each_possible_cpu(i) { - res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); - } - return res; -} 
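
[Aside on the per-CPU SNMP scheme: snmp_fold_field() above and
snmp_mib_init() below implement a two-copy MIB layout. Each statistic is
allocated twice per CPU (mib[0] for process/BH-disabled context, mib[1]
for softirq context), so writers never need atomic operations and a
reader simply sums both copies across all CPUs. A small user-space model,
with plain arrays standing in for the per-CPU allocations and all names
invented for illustration:

#include <stdio.h>

#define NCPUS   4
#define NFIELDS 8		/* stand-in for the MIB field count */

/* [copy][cpu][field]: copy 0 ~ process context, copy 1 ~ softirq */
static unsigned long mib[2][NCPUS][NFIELDS];

static unsigned long fold_field(int field)
{
	unsigned long res = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		res += mib[0][cpu][field];
		res += mib[1][cpu][field];
	}
	return res;
}

int main(void)
{
	mib[0][1][3] = 3;	/* CPU 1 bumps field 3 in process context */
	mib[1][2][3] = 4;	/* CPU 2 bumps field 3 in softirq context */
	printf("%lu\n", fold_field(3));	/* prints 7 */
	return 0;
}

The kernel version differs only in that the two copies live in per-CPU
memory obtained from __alloc_percpu() and are reached via per_cpu_ptr().]
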
-EXPORT_SYMBOL_GPL(snmp_fold_field); - -int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) -{ - BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize); - if (!ptr[0]) - goto err0; - ptr[1] = __alloc_percpu(mibsize); - if (!ptr[1]) - goto err1; - return 0; -err1: - free_percpu(ptr[0]); - ptr[0] = NULL; -err0: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - -void snmp_mib_free(void *ptr[2]) -{ - BUG_ON(ptr == NULL); - free_percpu(ptr[0]); - free_percpu(ptr[1]); - ptr[0] = ptr[1] = NULL; -} -EXPORT_SYMBOL_GPL(snmp_mib_free); - -#ifdef CONFIG_IP_MULTICAST -static struct net_protocol igmp_protocol = { - .handler = igmp_rcv, -}; -#endif - -static struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .no_policy = 1, -}; - -static struct net_protocol udp_protocol = { - .handler = udp_rcv, - .err_handler = udp_err, - .no_policy = 1, -}; - -static struct net_protocol icmp_protocol = { - .handler = icmp_rcv, -}; - -static int __init init_ipv4_mibs(void) -{ - if (snmp_mib_init((void **)net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) - goto err_net_mib; - if (snmp_mib_init((void **)ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) - goto err_ip_mib; - if (snmp_mib_init((void **)icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) - goto err_icmp_mib; - if (snmp_mib_init((void **)tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) - goto err_tcp_mib; - if (snmp_mib_init((void **)udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) - goto err_udp_mib; - if (snmp_mib_init((void **)udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) - goto err_udplite_mib; - - tcp_mib_init(); - - return 0; - -err_udplite_mib: - snmp_mib_free((void **)udp_statistics); -err_udp_mib: - snmp_mib_free((void **)tcp_statistics); -err_tcp_mib: - snmp_mib_free((void **)icmp_statistics); -err_icmp_mib: - snmp_mib_free((void **)ip_statistics); -err_ip_mib: - snmp_mib_free((void **)net_statistics); -err_net_mib: - return -ENOMEM; -} - -static int ipv4_proc_init(void); - -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, - .gso_send_check = inet_gso_send_check, - .gso_segment = inet_gso_segment, -}; - -static int __init inet_init(void) -{ - struct sk_buff *dummy_skb; - struct inet_protosw *q; - struct list_head *r; - int rc = -EINVAL; - - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); - - rc = proto_register(&tcp_prot, 1); - if (rc) - goto out; - - rc = proto_register(&udp_prot, 1); - if (rc) - goto out_unregister_tcp_proto; - - rc = proto_register(&raw_prot, 1); - if (rc) - goto out_unregister_udp_proto; - - /* - * Tell SOCKET that we are alive... - */ - - (void)sock_register(&inet_family_ops); - - /* - * Add all the base protocols. 
- */ - - if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) - printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); - if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) - printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); - if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) - printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); -#ifdef CONFIG_IP_MULTICAST - if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) - printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); -#endif - - /* Register the socket-side information for inet_create. */ - for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) - INIT_LIST_HEAD(r); - - for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) - inet_register_protosw(q); - - /* - * Set the ARP module up - */ - - arp_init(); - - /* - * Set the IP module up - */ - - ip_init(); - - tcp_v4_init(&inet_family_ops); - - /* Setup TCP slab cache for open requests. */ - tcp_init(); - - /* Add UDP-Lite (RFC 3828) */ - udplite4_register(); - - /* - * Set the ICMP layer up - */ - - icmp_init(&inet_family_ops); - - /* - * Initialise the multicast router - */ -#if defined(CONFIG_IP_MROUTE) - ip_mr_init(); -#endif - /* - * Initialise per-cpu ipv4 mibs - */ - - if (init_ipv4_mibs()) - printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; - - ipv4_proc_init(); - - ipfrag_init(); - - dev_add_pack(&ip_packet_type); - - rc = 0; -out: - return rc; -out_unregister_udp_proto: - proto_unregister(&udp_prot); -out_unregister_tcp_proto: - proto_unregister(&tcp_prot); - goto out; -} - -fs_initcall(inet_init); - -/* ------------------------------------------------------------------------ */ - -#ifdef CONFIG_PROC_FS -static int __init ipv4_proc_init(void) -{ - int rc = 0; - - if (raw_proc_init()) - goto out_raw; - if (tcp4_proc_init()) - goto out_tcp; - if (udp4_proc_init()) - goto out_udp; - if (fib_proc_init()) - goto out_fib; - if (ip_misc_proc_init()) - goto out_misc; -out: - return rc; -out_misc: - fib_proc_exit(); -out_fib: - udp4_proc_exit(); -out_udp: - tcp4_proc_exit(); -out_tcp: - raw_proc_exit(); -out_raw: - rc = -ENOMEM; - goto out; -} - -#else /* CONFIG_PROC_FS */ -static int __init ipv4_proc_init(void) -{ - return 0; -} -#endif /* CONFIG_PROC_FS */ - -MODULE_ALIAS_NETPROTO(PF_INET); - -EXPORT_SYMBOL(inet_accept); -EXPORT_SYMBOL(inet_bind); -EXPORT_SYMBOL(inet_dgram_connect); -EXPORT_SYMBOL(inet_dgram_ops); -EXPORT_SYMBOL(inet_getname); -EXPORT_SYMBOL(inet_ioctl); -EXPORT_SYMBOL(inet_listen); -EXPORT_SYMBOL(inet_register_protosw); -EXPORT_SYMBOL(inet_release); -EXPORT_SYMBOL(inet_sendmsg); -EXPORT_SYMBOL(inet_shutdown); -EXPORT_SYMBOL(inet_sock_destruct); -EXPORT_SYMBOL(inet_stream_connect); -EXPORT_SYMBOL(inet_stream_ops); -EXPORT_SYMBOL(inet_unregister_protosw); -EXPORT_SYMBOL(net_statistics); -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); diff -Nurb linux-2.6.22-594/net/netfilter/xt_MARK.c.orig linux-2.6.22-595/net/netfilter/xt_MARK.c.orig --- linux-2.6.22-594/net/netfilter/xt_MARK.c.orig 2008-03-20 00:05:19.000000000 -0400 +++ linux-2.6.22-595/net/netfilter/xt_MARK.c.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,283 +0,0 @@ -/* This is a module which is used for setting the NFMARK field of an skb. */ - -/* (C) 1999-2001 Marc Boucher - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Marc Boucher "); -MODULE_DESCRIPTION("ip[6]tables MARK modification module"); -MODULE_ALIAS("ipt_MARK"); -MODULE_ALIAS("ip6t_MARK"); - -static inline u_int16_t -get_dst_port(struct nf_conntrack_tuple *tuple) -{ - switch (tuple->dst.protonum) { - case IPPROTO_GRE: - /* XXX Truncate 32-bit GRE key to 16 bits */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) - return tuple->dst.u.gre.key; -#else - return htons(ntohl(tuple->dst.u.gre.key)); -#endif - case IPPROTO_ICMP: - /* Bind on ICMP echo ID */ - return tuple->src.u.icmp.id; - case IPPROTO_TCP: - return tuple->dst.u.tcp.port; - case IPPROTO_UDP: - return tuple->dst.u.udp.port; - default: - return tuple->dst.u.all; - } -} - -static inline u_int16_t -get_src_port(struct nf_conntrack_tuple *tuple) -{ - switch (tuple->dst.protonum) { - case IPPROTO_GRE: - /* XXX Truncate 32-bit GRE key to 16 bits */ - return htons(ntohl(tuple->src.u.gre.key)); - case IPPROTO_ICMP: - /* Bind on ICMP echo ID */ - return tuple->src.u.icmp.id; - case IPPROTO_TCP: - return tuple->src.u.tcp.port; - case IPPROTO_UDP: - return tuple->src.u.udp.port; - default: - return tuple->src.u.all; - } -} - -static unsigned int -target_v0(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct xt_mark_target_info *markinfo = targinfo; - - (*pskb)->mark = markinfo->mark; - return XT_CONTINUE; -} - -static unsigned int -target_v1(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct xt_mark_target_info_v1 *markinfo = targinfo; - int mark = -1; - - switch (markinfo->mode) { - case XT_MARK_SET: - mark = markinfo->mark; - break; - - case XT_MARK_AND: - mark = (*pskb)->mark & markinfo->mark; - break; - - case XT_MARK_OR: - mark = (*pskb)->mark | markinfo->mark; - break; - - case XT_MARK_COPYXID: { - enum ip_conntrack_info ctinfo; - struct sock *connection_sk=NULL; - int dif; - - struct nf_conn *ct = nf_ct_get((*pskb), &ctinfo); - extern struct inet_hashinfo tcp_hashinfo; - enum ip_conntrack_dir dir; - if (!ct) - break; - - dir = CTINFO2DIR(ctinfo); - u_int32_t src_ip = ct->tuplehash[dir].tuple.src.u3.ip; - u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple); - u_int16_t proto = ct->tuplehash[dir].tuple.dst.protonum; - - u_int32_t ip; - u_int16_t port; - - dif = ((struct rtable *)(*pskb)->dst)->rt_iif; - ip = ct->tuplehash[dir].tuple.dst.u3.ip; - port = get_dst_port(&ct->tuplehash[dir].tuple); - - if (proto == 1 || proto == 17) { - if (((*pskb)->mark!=-1) && (*pskb)->mark) - ct->xid[0]=(*pskb)->mark; - if (ct->xid[0]) - mark = ct->xid[0]; - - } - else if (proto == 6) { - if ((*pskb)->sk) - connection_sk = (*pskb)->sk; - else { - connection_sk = inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif); - } - - if (connection_sk) { - connection_sk->sk_peercred.gid = connection_sk->sk_peercred.uid = ct->xid[dir]; - ct->xid[!dir]=connection_sk->sk_xid; - if (connection_sk->sk_xid != 0) - mark = connection_sk->sk_xid; - if (connection_sk != (*pskb)->sk) - sock_put(connection_sk); - } - break; - } - } - } - - if (mark != -1) - (*pskb)->mark = mark; - return XT_CONTINUE; -} - - -static int -checkentry_v0(const char *tablename, - const void *entry, - const struct 
xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct xt_mark_target_info *markinfo = targinfo; - - if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); - return 0; - } - return 1; -} - -static int -checkentry_v1(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - struct xt_mark_target_info_v1 *markinfo = targinfo; - - if (markinfo->mode != XT_MARK_SET - && markinfo->mode != XT_MARK_AND - && markinfo->mode != XT_MARK_OR - && markinfo->mode != XT_MARK_COPYXID) { - printk(KERN_WARNING "MARK: unknown mode %u\n", - markinfo->mode); - return 0; - } - if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); - return 0; - } - return 1; -} - -#ifdef CONFIG_COMPAT -struct compat_xt_mark_target_info_v1 { - compat_ulong_t mark; - u_int8_t mode; - u_int8_t __pad1; - u_int16_t __pad2; -}; - -static void compat_from_user_v1(void *dst, void *src) -{ - struct compat_xt_mark_target_info_v1 *cm = src; - struct xt_mark_target_info_v1 m = { - .mark = cm->mark, - .mode = cm->mode, - }; - memcpy(dst, &m, sizeof(m)); -} - -static int compat_to_user_v1(void __user *dst, void *src) -{ - struct xt_mark_target_info_v1 *m = src; - struct compat_xt_mark_target_info_v1 cm = { - .mark = m->mark, - .mode = m->mode, - }; - return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target xt_mark_target[] = { - { - .name = "MARK", - .family = AF_INET, - .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, - .targetsize = sizeof(struct xt_mark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = AF_INET, - .revision = 1, - .checkentry = checkentry_v1, - .target = target_v1, - .targetsize = sizeof(struct xt_mark_target_info_v1), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_target_info_v1), - .compat_from_user = compat_from_user_v1, - .compat_to_user = compat_to_user_v1, -#endif - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = AF_INET6, - .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, - .targetsize = sizeof(struct xt_mark_target_info), - .table = "mangle", - .me = THIS_MODULE, - }, -}; - -static int __init xt_mark_init(void) -{ - return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); -} - -static void __exit xt_mark_fini(void) -{ - xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); -} - -module_init(xt_mark_init); -module_exit(xt_mark_fini); diff -Nurb linux-2.6.22-594/net/packet/af_packet.c.orig linux-2.6.22-595/net/packet/af_packet.c.orig --- linux-2.6.22-594/net/packet/af_packet.c.orig 2008-03-20 00:05:19.000000000 -0400 +++ linux-2.6.22-595/net/packet/af_packet.c.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,1989 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * PACKET - implements raw packet sockets. - * - * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $ - * - * Authors: Ross Biro - * Fred N. van Kempen, - * Alan Cox, - * - * Fixes: - * Alan Cox : verify_area() now used correctly - * Alan Cox : new skbuff lists, look ma no backlogs! - * Alan Cox : tidied skbuff lists. - * Alan Cox : Now uses generic datagram routines I - * added. 
Also fixed the peek/read crash - * from all old Linux datagram code. - * Alan Cox : Uses the improved datagram code. - * Alan Cox : Added NULL's for socket options. - * Alan Cox : Re-commented the code. - * Alan Cox : Use new kernel side addressing - * Rob Janssen : Correct MTU usage. - * Dave Platt : Counter leaks caused by incorrect - * interrupt locking and some slightly - * dubious gcc output. Can you read - * compiler: it said _VOLATILE_ - * Richard Kooijman : Timestamp fixes. - * Alan Cox : New buffers. Use sk->mac.raw. - * Alan Cox : sendmsg/recvmsg support. - * Alan Cox : Protocol setting support - * Alexey Kuznetsov : Untied from IPv4 stack. - * Cyrus Durgin : Fixed kerneld for kmod. - * Michal Ostrowski : Module initialization cleanup. - * Ulises Alonso : Frame number limit removal and - * packet_set_ring memory leak. - * Eric Biederman : Allow for > 8 byte hardware addresses. - * The convention is that longer addresses - * will simply extend the hardware address - * byte arrays at the end of sockaddr_ll - * and packet_mreq. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_INET -#include -#endif - -/* - Assumptions: - - if device has no dev->hard_header routine, it adds and removes ll header - inside itself. In this case ll header is invisible outside of device, - but higher levels still should reserve dev->hard_header_len. - Some devices are enough clever to reallocate skb, when header - will not fit to reserved space (tunnel), another ones are silly - (PPP). - - packet socket receives packets with pulled ll header, - so that SOCK_RAW should push it back. - -On receive: ------------ - -Incoming, dev->hard_header!=NULL - mac_header -> ll header - data -> data - -Outgoing, dev->hard_header!=NULL - mac_header -> ll header - data -> ll header - -Incoming, dev->hard_header==NULL - mac_header -> UNKNOWN position. It is very likely, that it points to ll - header. PPP makes it, that is wrong, because introduce - assymetry between rx and tx paths. - data -> data - -Outgoing, dev->hard_header==NULL - mac_header -> data. ll header is still not built! - data -> data - -Resume - If dev->hard_header==NULL we are unlikely to restore sensible ll header. - - -On transmit: ------------- - -dev->hard_header != NULL - mac_header -> ll header - data -> ll header - -dev->hard_header == NULL (ll header is added by device, we cannot control it) - mac_header -> data - data -> data - - We should set nh.raw on output to correct posistion, - packet classifier depends on it. - */ - -/* List of all packet sockets. */ -static HLIST_HEAD(packet_sklist); -static DEFINE_RWLOCK(packet_sklist_lock); - -static atomic_t packet_socks_nr; - - -/* Private packet socket structures. */ - -struct packet_mclist -{ - struct packet_mclist *next; - int ifindex; - int count; - unsigned short type; - unsigned short alen; - unsigned char addr[MAX_ADDR_LEN]; -}; -/* identical to struct packet_mreq except it has - * a longer address field. 
- */ -struct packet_mreq_max -{ - int mr_ifindex; - unsigned short mr_type; - unsigned short mr_alen; - unsigned char mr_address[MAX_ADDR_LEN]; -}; - -#ifdef CONFIG_PACKET_MMAP -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); -#endif - -static void packet_flush_mclist(struct sock *sk); - -struct packet_sock { - /* struct sock has to be the first member of packet_sock */ - struct sock sk; - struct tpacket_stats stats; -#ifdef CONFIG_PACKET_MMAP - char * *pg_vec; - unsigned int head; - unsigned int frames_per_block; - unsigned int frame_size; - unsigned int frame_max; - int copy_thresh; -#endif - struct packet_type prot_hook; - spinlock_t bind_lock; - unsigned int running:1, /* prot_hook is attached*/ - auxdata:1, - origdev:1; - int ifindex; /* bound device */ - __be16 num; - struct packet_mclist *mclist; -#ifdef CONFIG_PACKET_MMAP - atomic_t mapped; - unsigned int pg_vec_order; - unsigned int pg_vec_pages; - unsigned int pg_vec_len; -#endif -}; - -struct packet_skb_cb { - unsigned int origlen; - union { - struct sockaddr_pkt pkt; - struct sockaddr_ll ll; - } sa; -}; - -#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) - -#ifdef CONFIG_PACKET_MMAP - -static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position) -{ - unsigned int pg_vec_pos, frame_offset; - - pg_vec_pos = position / po->frames_per_block; - frame_offset = position % po->frames_per_block; - - return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); -} -#endif - -static inline struct packet_sock *pkt_sk(struct sock *sk) -{ - return (struct packet_sock *)sk; -} - -static void packet_sock_destruct(struct sock *sk) -{ - BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); - BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); - - if (!sock_flag(sk, SOCK_DEAD)) { - printk("Attempt to release alive packet socket: %p\n", sk); - return; - } - - atomic_dec(&packet_socks_nr); -#ifdef PACKET_REFCNT_DEBUG - printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr)); -#endif -} - - -static const struct proto_ops packet_ops; - -static const struct proto_ops packet_ops_spkt; - -static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - struct sock *sk; - struct sockaddr_pkt *spkt; - - /* - * When we registered the protocol we saved the socket in the data - * field for just this event. - */ - - sk = pt->af_packet_priv; - - /* - * Yank back the headers [hope the device set this - * right or kerboom...] - * - * Incoming packets have ll header pulled, - * push it back. - * - * For outgoing ones skb->data == skb_mac_header(skb) - * so that this procedure is noop. - */ - - if (skb->pkt_type == PACKET_LOOPBACK) - goto out; - - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto oom; - - /* drop any routing info */ - dst_release(skb->dst); - skb->dst = NULL; - - /* drop conntrack reference */ - nf_reset(skb); - - spkt = &PACKET_SKB_CB(skb)->sa.pkt; - - skb_push(skb, skb->data - skb_mac_header(skb)); - - /* - * The SOCK_PACKET socket receives _all_ frames. - */ - - spkt->spkt_family = dev->type; - strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); - spkt->spkt_protocol = skb->protocol; - - /* - * Charge the memory to the socket. This is done specifically - * to prevent sockets using all the memory up. 
- */ - - if (sock_queue_rcv_skb(sk,skb) == 0) - return 0; - -out: - kfree_skb(skb); -oom: - return 0; -} - - -/* - * Output a raw packet to a device layer. This bypasses all the other - * protocol layers and you must therefore supply it with a complete frame - */ - -static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) -{ - struct sock *sk = sock->sk; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; - struct sk_buff *skb; - struct net_device *dev; - __be16 proto=0; - int err; - - if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) - return -EPERM; - - /* - * Get and verify the address. - */ - - if (saddr) - { - if (msg->msg_namelen < sizeof(struct sockaddr)) - return(-EINVAL); - if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) - proto=saddr->spkt_protocol; - } - else - return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ - - /* - * Find the device first to size check it - */ - - saddr->spkt_device[13] = 0; - dev = dev_get_by_name(saddr->spkt_device); - err = -ENODEV; - if (dev == NULL) - goto out_unlock; - - err = -ENETDOWN; - if (!(dev->flags & IFF_UP)) - goto out_unlock; - - /* - * You may not queue a frame bigger than the mtu. This is the lowest level - * raw protocol and you must do your own fragmentation at this level. - */ - - err = -EMSGSIZE; - if (len > dev->mtu + dev->hard_header_len) - goto out_unlock; - - err = -ENOBUFS; - skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL); - - /* - * If the write buffer is full, then tough. At this level the user gets to - * deal with the problem - do your own algorithmic backoffs. That's far - * more flexible. - */ - - if (skb == NULL) - goto out_unlock; - - /* - * Fill it in - */ - - /* FIXME: Save some space for broken drivers that write a - * hard header at transmission time by themselves. PPP is the - * notable one here. This should really be fixed at the driver level. - */ - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb_reset_network_header(skb); - - /* Try to align data part correctly */ - if (dev->hard_header) { - skb->data -= dev->hard_header_len; - skb->tail -= dev->hard_header_len; - if (len < dev->hard_header_len) - skb_reset_network_header(skb); - } - - /* Returns -EFAULT on error */ - err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - skb->protocol = proto; - skb->dev = dev; - skb->priority = sk->sk_priority; - if (err) - goto out_free; - - /* - * Now send it - */ - - dev_queue_xmit(skb); - dev_put(dev); - return(len); - -out_free: - kfree_skb(skb); -out_unlock: - if (dev) - dev_put(dev); - return err; -} - -static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, - unsigned int res) -{ - struct sk_filter *filter; - int tag = skb->skb_tag; - - if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) - return 0; - - rcu_read_lock_bh(); - filter = rcu_dereference(sk->sk_filter); - if (filter != NULL) - res = sk_run_filter(skb, filter->insns, filter->len); - rcu_read_unlock_bh(); - - return res; -} - -/* - This function makes lazy skb cloning in hope that most of packets - are discarded by BPF. - - Note tricky part: we DO mangle shared skb! skb->data, skb->len - and skb->cb are mangled. It works because (and until) packets - falling here are owned by current CPU. Output packets are cloned - by dev_queue_xmit_nit(), input packets are processed by net_bh - sequencially, so that if we return skb to original state on exit, - we will not harm anyone. 
- */ - -static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - struct sock *sk; - struct sockaddr_ll *sll; - struct packet_sock *po; - u8 * skb_head = skb->data; - int skb_len = skb->len; - unsigned int snaplen, res; - - if (skb->pkt_type == PACKET_LOOPBACK) - goto drop; - - sk = pt->af_packet_priv; - po = pkt_sk(sk); - - skb->dev = dev; - - if (dev->hard_header) { - /* The device has an explicit notion of ll header, - exported to higher levels. - - Otherwise, the device hides datails of it frame - structure, so that corresponding packet head - never delivered to user. - */ - if (sk->sk_type != SOCK_DGRAM) - skb_push(skb, skb->data - skb_mac_header(skb)); - else if (skb->pkt_type == PACKET_OUTGOING) { - /* Special case: outgoing packets have ll header at head */ - skb_pull(skb, skb_network_offset(skb)); - } - } - - snaplen = skb->len; - - res = run_filter(skb, sk, snaplen); - if (!res) - goto drop_n_restore; - if (snaplen > res) - snaplen = res; - - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned)sk->sk_rcvbuf) - goto drop_n_acct; - - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - if (nskb == NULL) - goto drop_n_acct; - - if (skb_head != skb->data) { - skb->data = skb_head; - skb->len = skb_len; - } - kfree_skb(skb); - skb = nskb; - } - - BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > - sizeof(skb->cb)); - - sll = &PACKET_SKB_CB(skb)->sa.ll; - sll->sll_family = AF_PACKET; - sll->sll_hatype = dev->type; - sll->sll_protocol = skb->protocol; - sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) - sll->sll_ifindex = orig_dev->ifindex; - else - sll->sll_ifindex = dev->ifindex; - sll->sll_halen = 0; - - if (dev->hard_header_parse) - sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); - - PACKET_SKB_CB(skb)->origlen = skb->len; - - if (pskb_trim(skb, snaplen)) - goto drop_n_acct; - - skb_set_owner_r(skb, sk); - skb->dev = NULL; - dst_release(skb->dst); - skb->dst = NULL; - - /* drop conntrack reference */ - nf_reset(skb); - - spin_lock(&sk->sk_receive_queue.lock); - po->stats.tp_packets++; - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock(&sk->sk_receive_queue.lock); - sk->sk_data_ready(sk, skb->len); - return 0; - -drop_n_acct: - spin_lock(&sk->sk_receive_queue.lock); - po->stats.tp_drops++; - spin_unlock(&sk->sk_receive_queue.lock); - -drop_n_restore: - if (skb_head != skb->data && skb_shared(skb)) { - skb->data = skb_head; - skb->len = skb_len; - } -drop: - kfree_skb(skb); - return 0; -} - -#ifdef CONFIG_PACKET_MMAP -static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - struct sock *sk; - struct packet_sock *po; - struct sockaddr_ll *sll; - struct tpacket_hdr *h; - u8 * skb_head = skb->data; - int skb_len = skb->len; - unsigned int snaplen, res; - unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; - unsigned short macoff, netoff; - struct sk_buff *copy_skb = NULL; - struct timeval tv; - - if (skb->pkt_type == PACKET_LOOPBACK) - goto drop; - - sk = pt->af_packet_priv; - po = pkt_sk(sk); - - if (dev->hard_header) { - if (sk->sk_type != SOCK_DGRAM) - skb_push(skb, skb->data - skb_mac_header(skb)); - else if (skb->pkt_type == PACKET_OUTGOING) { - /* Special case: outgoing packets have ll header at head */ - skb_pull(skb, skb_network_offset(skb)); - } - } - - if (skb->ip_summed == CHECKSUM_PARTIAL) - status |= 
TP_STATUS_CSUMNOTREADY; - - snaplen = skb->len; - - res = run_filter(skb, sk, snaplen); - if (!res) - goto drop_n_restore; - if (snaplen > res) - snaplen = res; - - if (sk->sk_type == SOCK_DGRAM) { - macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; - } else { - unsigned maclen = skb_network_offset(skb); - netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen)); - macoff = netoff - maclen; - } - - if (macoff + snaplen > po->frame_size) { - if (po->copy_thresh && - atomic_read(&sk->sk_rmem_alloc) + skb->truesize < - (unsigned)sk->sk_rcvbuf) { - if (skb_shared(skb)) { - copy_skb = skb_clone(skb, GFP_ATOMIC); - } else { - copy_skb = skb_get(skb); - skb_head = skb->data; - } - if (copy_skb) - skb_set_owner_r(copy_skb, sk); - } - snaplen = po->frame_size - macoff; - if ((int)snaplen < 0) - snaplen = 0; - } - - spin_lock(&sk->sk_receive_queue.lock); - h = packet_lookup_frame(po, po->head); - - if (h->tp_status) - goto ring_is_full; - po->head = po->head != po->frame_max ? po->head+1 : 0; - po->stats.tp_packets++; - if (copy_skb) { - status |= TP_STATUS_COPY; - __skb_queue_tail(&sk->sk_receive_queue, copy_skb); - } - if (!po->stats.tp_drops) - status &= ~TP_STATUS_LOSING; - spin_unlock(&sk->sk_receive_queue.lock); - - skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen); - - h->tp_len = skb->len; - h->tp_snaplen = snaplen; - h->tp_mac = macoff; - h->tp_net = netoff; - if (skb->tstamp.tv64 == 0) { - __net_timestamp(skb); - sock_enable_timestamp(sk); - } - tv = ktime_to_timeval(skb->tstamp); - h->tp_sec = tv.tv_sec; - h->tp_usec = tv.tv_usec; - - sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); - sll->sll_halen = 0; - if (dev->hard_header_parse) - sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); - sll->sll_family = AF_PACKET; - sll->sll_hatype = dev->type; - sll->sll_protocol = skb->protocol; - sll->sll_pkttype = skb->pkt_type; - if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST) - sll->sll_ifindex = orig_dev->ifindex; - else - sll->sll_ifindex = dev->ifindex; - - h->tp_status = status; - smp_mb(); - - { - struct page *p_start, *p_end; - u8 *h_end = (u8 *)h + macoff + snaplen - 1; - - p_start = virt_to_page(h); - p_end = virt_to_page(h_end); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } - } - - sk->sk_data_ready(sk, 0); - -drop_n_restore: - if (skb_head != skb->data && skb_shared(skb)) { - skb->data = skb_head; - skb->len = skb_len; - } -drop: - kfree_skb(skb); - return 0; - -ring_is_full: - po->stats.tp_drops++; - spin_unlock(&sk->sk_receive_queue.lock); - - sk->sk_data_ready(sk, 0); - if (copy_skb) - kfree_skb(copy_skb); - goto drop_n_restore; -} - -#endif - - -static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) -{ - struct sock *sk = sock->sk; - struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; - struct sk_buff *skb; - struct net_device *dev; - __be16 proto; - unsigned char *addr; - int ifindex, err, reserve = 0; - - if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND)) - return -EPERM; - - /* - * Get and verify the address. 
- */ - - if (saddr == NULL) { - struct packet_sock *po = pkt_sk(sk); - - ifindex = po->ifindex; - proto = po->num; - addr = NULL; - } else { - err = -EINVAL; - if (msg->msg_namelen < sizeof(struct sockaddr_ll)) - goto out; - if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) - goto out; - ifindex = saddr->sll_ifindex; - proto = saddr->sll_protocol; - addr = saddr->sll_addr; - } - - - dev = dev_get_by_index(ifindex); - err = -ENXIO; - if (dev == NULL) - goto out_unlock; - if (sock->type == SOCK_RAW) - reserve = dev->hard_header_len; - - err = -ENETDOWN; - if (!(dev->flags & IFF_UP)) - goto out_unlock; - - err = -EMSGSIZE; - if (len > dev->mtu+reserve) - goto out_unlock; - - skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), - msg->msg_flags & MSG_DONTWAIT, &err); - if (skb==NULL) - goto out_unlock; - - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb_reset_network_header(skb); - - if (dev->hard_header) { - int res; - err = -EINVAL; - res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); - if (sock->type != SOCK_DGRAM) { - skb_reset_tail_pointer(skb); - skb->len = 0; - } else if (res < 0) - goto out_free; - } - - /* Returns -EFAULT on error */ - err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - if (err) - goto out_free; - - skb->protocol = proto; - skb->dev = dev; - skb->priority = sk->sk_priority; - - /* - * Now send it - */ - - err = dev_queue_xmit(skb); - if (err > 0 && (err = net_xmit_errno(err)) != 0) - goto out_unlock; - - dev_put(dev); - - return(len); - -out_free: - kfree_skb(skb); -out_unlock: - if (dev) - dev_put(dev); -out: - return err; -} - -/* - * Close a PACKET socket. This is fairly simple. We immediately go - * to 'closed' state and remove our protocol entry in the device list. - */ - -static int packet_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct packet_sock *po; - - if (!sk) - return 0; - - po = pkt_sk(sk); - - write_lock_bh(&packet_sklist_lock); - sk_del_node_init(sk); - write_unlock_bh(&packet_sklist_lock); - - /* - * Unhook packet receive handler. - */ - - if (po->running) { - /* - * Remove the protocol hook - */ - dev_remove_pack(&po->prot_hook); - po->running = 0; - po->num = 0; - __sock_put(sk); - } - - packet_flush_mclist(sk); - -#ifdef CONFIG_PACKET_MMAP - if (po->pg_vec) { - struct tpacket_req req; - memset(&req, 0, sizeof(req)); - packet_set_ring(sk, &req, 1); - } -#endif - - /* - * Now the socket is dead. No more input will appear. - */ - - sock_orphan(sk); - sock->sk = NULL; - - /* Purge queues */ - - skb_queue_purge(&sk->sk_receive_queue); - - sock_put(sk); - return 0; -} - -/* - * Attach a packet hook. - */ - -static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) -{ - struct packet_sock *po = pkt_sk(sk); - /* - * Detach an existing hook if present. - */ - - lock_sock(sk); - - spin_lock(&po->bind_lock); - if (po->running) { - __sock_put(sk); - po->running = 0; - po->num = 0; - spin_unlock(&po->bind_lock); - dev_remove_pack(&po->prot_hook); - spin_lock(&po->bind_lock); - } - - po->num = protocol; - po->prot_hook.type = protocol; - po->prot_hook.dev = dev; - - po->ifindex = dev ? 
dev->ifindex : 0; - - if (protocol == 0) - goto out_unlock; - - if (dev) { - if (dev->flags&IFF_UP) { - dev_add_pack(&po->prot_hook); - sock_hold(sk); - po->running = 1; - } else { - sk->sk_err = ENETDOWN; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); - } - } else { - dev_add_pack(&po->prot_hook); - sock_hold(sk); - po->running = 1; - } - -out_unlock: - spin_unlock(&po->bind_lock); - release_sock(sk); - return 0; -} - -/* - * Bind a packet socket to a device - */ - -static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk=sock->sk; - char name[15]; - struct net_device *dev; - int err = -ENODEV; - - /* - * Check legality - */ - - if (addr_len != sizeof(struct sockaddr)) - return -EINVAL; - strlcpy(name,uaddr->sa_data,sizeof(name)); - - dev = dev_get_by_name(name); - if (dev) { - err = packet_do_bind(sk, dev, pkt_sk(sk)->num); - dev_put(dev); - } - return err; -} - -static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; - struct sock *sk=sock->sk; - struct net_device *dev = NULL; - int err; - - - /* - * Check legality - */ - - if (addr_len < sizeof(struct sockaddr_ll)) - return -EINVAL; - if (sll->sll_family != AF_PACKET) - return -EINVAL; - - if (sll->sll_ifindex) { - err = -ENODEV; - dev = dev_get_by_index(sll->sll_ifindex); - if (dev == NULL) - goto out; - } - err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); - if (dev) - dev_put(dev); - -out: - return err; -} - -static struct proto packet_proto = { - .name = "PACKET", - .owner = THIS_MODULE, - .obj_size = sizeof(struct packet_sock), -}; - -/* - * Create a packet of type SOCK_PACKET. - */ - -static int packet_create(struct socket *sock, int protocol) -{ - struct sock *sk; - struct packet_sock *po; - __be16 proto = (__force __be16)protocol; /* weird, but documented */ - int err; - - if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET)) - return -EPERM; - if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && - sock->type != SOCK_PACKET) - return -ESOCKTNOSUPPORT; - - sock->state = SS_UNCONNECTED; - - err = -ENOBUFS; - sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); - if (sk == NULL) - goto out; - - sock->ops = &packet_ops; - if (sock->type == SOCK_PACKET) - sock->ops = &packet_ops_spkt; - - sock_init_data(sock, sk); - - po = pkt_sk(sk); - sk->sk_family = PF_PACKET; - po->num = proto; - - sk->sk_destruct = packet_sock_destruct; - atomic_inc(&packet_socks_nr); - - /* - * Attach a protocol block - */ - - spin_lock_init(&po->bind_lock); - po->prot_hook.func = packet_rcv; - - if (sock->type == SOCK_PACKET) - po->prot_hook.func = packet_rcv_spkt; - - po->prot_hook.af_packet_priv = sk; - - if (proto) { - po->prot_hook.type = proto; - dev_add_pack(&po->prot_hook); - sock_hold(sk); - po->running = 1; - } - - write_lock_bh(&packet_sklist_lock); - sk_add_node(sk, &packet_sklist); - write_unlock_bh(&packet_sklist_lock); - return(0); -out: - return err; -} - -/* - * Pull a packet from our receive queue and hand it to the user. - * If necessary we block. - */ - -static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) -{ - struct sock *sk = sock->sk; - struct sk_buff *skb; - int copied, err; - struct sockaddr_ll *sll; - - err = -EINVAL; - if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) - goto out; - -#if 0 - /* What error should we return now? EUNATTACH? 
*/ - if (pkt_sk(sk)->ifindex < 0) - return -ENODEV; -#endif - - /* - * Call the generic datagram receiver. This handles all sorts - * of horrible races and re-entrancy so we can forget about it - * in the protocol layers. - * - * Now it will return ENETDOWN, if device have just gone down, - * but then it will block. - */ - - skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); - - /* - * An error occurred so return it. Because skb_recv_datagram() - * handles the blocking we don't see and worry about blocking - * retries. - */ - - if (skb == NULL) - goto out; - - /* - * If the address length field is there to be filled in, we fill - * it in now. - */ - - sll = &PACKET_SKB_CB(skb)->sa.ll; - if (sock->type == SOCK_PACKET) - msg->msg_namelen = sizeof(struct sockaddr_pkt); - else - msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); - - /* - * You lose any data beyond the buffer you gave. If it worries a - * user program they can ask the device for its MTU anyway. - */ - - copied = skb->len; - if (copied > len) - { - copied=len; - msg->msg_flags|=MSG_TRUNC; - } - - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); - if (err) - goto out_free; - - sock_recv_timestamp(msg, sk, skb); - - if (msg->msg_name) - memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, - msg->msg_namelen); - - if (pkt_sk(sk)->auxdata) { - struct tpacket_auxdata aux; - - aux.tp_status = TP_STATUS_USER; - if (skb->ip_summed == CHECKSUM_PARTIAL) - aux.tp_status |= TP_STATUS_CSUMNOTREADY; - aux.tp_len = PACKET_SKB_CB(skb)->origlen; - aux.tp_snaplen = skb->len; - aux.tp_mac = 0; - aux.tp_net = skb_network_offset(skb); - - put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); - } - - /* - * Free or return the buffer as appropriate. Again this - * hides all the races and re-entrancy issues from us. - */ - err = (flags&MSG_TRUNC) ? 
skb->len : copied; - -out_free: - skb_free_datagram(sk, skb); -out: - return err; -} - -static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) -{ - struct net_device *dev; - struct sock *sk = sock->sk; - - if (peer) - return -EOPNOTSUPP; - - uaddr->sa_family = AF_PACKET; - dev = dev_get_by_index(pkt_sk(sk)->ifindex); - if (dev) { - strlcpy(uaddr->sa_data, dev->name, 15); - dev_put(dev); - } else - memset(uaddr->sa_data, 0, 14); - *uaddr_len = sizeof(*uaddr); - - return 0; -} - -static int packet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) -{ - struct net_device *dev; - struct sock *sk = sock->sk; - struct packet_sock *po = pkt_sk(sk); - struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; - - if (peer) - return -EOPNOTSUPP; - - sll->sll_family = AF_PACKET; - sll->sll_ifindex = po->ifindex; - sll->sll_protocol = po->num; - dev = dev_get_by_index(po->ifindex); - if (dev) { - sll->sll_hatype = dev->type; - sll->sll_halen = dev->addr_len; - memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); - dev_put(dev); - } else { - sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ - sll->sll_halen = 0; - } - *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; - - return 0; -} - -static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) -{ - switch (i->type) { - case PACKET_MR_MULTICAST: - if (what > 0) - dev_mc_add(dev, i->addr, i->alen, 0); - else - dev_mc_delete(dev, i->addr, i->alen, 0); - break; - case PACKET_MR_PROMISC: - dev_set_promiscuity(dev, what); - break; - case PACKET_MR_ALLMULTI: - dev_set_allmulti(dev, what); - break; - default:; - } -} - -static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) -{ - for ( ; i; i=i->next) { - if (i->ifindex == dev->ifindex) - packet_dev_mc(dev, i, what); - } -} - -static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) -{ - struct packet_sock *po = pkt_sk(sk); - struct packet_mclist *ml, *i; - struct net_device *dev; - int err; - - rtnl_lock(); - - err = -ENODEV; - dev = __dev_get_by_index(mreq->mr_ifindex); - if (!dev) - goto done; - - err = -EINVAL; - if (mreq->mr_alen > dev->addr_len) - goto done; - - err = -ENOBUFS; - i = kmalloc(sizeof(*i), GFP_KERNEL); - if (i == NULL) - goto done; - - err = 0; - for (ml = po->mclist; ml; ml = ml->next) { - if (ml->ifindex == mreq->mr_ifindex && - ml->type == mreq->mr_type && - ml->alen == mreq->mr_alen && - memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { - ml->count++; - /* Free the new element ... 
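- * because a matching entry already exists and its refcount was
- * bumped just above. A minimal userspace sketch of the request this
- * services ("fd" and "ifindex" are illustrative):
- *
- *	struct packet_mreq mr = { 0 };
- *	mr.mr_ifindex = ifindex;
- *	mr.mr_type    = PACKET_MR_PROMISC;
- *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
- *	           &mr, sizeof(mr));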
*/ - kfree(i); - goto done; - } - } - - i->type = mreq->mr_type; - i->ifindex = mreq->mr_ifindex; - i->alen = mreq->mr_alen; - memcpy(i->addr, mreq->mr_address, i->alen); - i->count = 1; - i->next = po->mclist; - po->mclist = i; - packet_dev_mc(dev, i, +1); - -done: - rtnl_unlock(); - return err; -} - -static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) -{ - struct packet_mclist *ml, **mlp; - - rtnl_lock(); - - for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { - if (ml->ifindex == mreq->mr_ifindex && - ml->type == mreq->mr_type && - ml->alen == mreq->mr_alen && - memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { - if (--ml->count == 0) { - struct net_device *dev; - *mlp = ml->next; - dev = dev_get_by_index(ml->ifindex); - if (dev) { - packet_dev_mc(dev, ml, -1); - dev_put(dev); - } - kfree(ml); - } - rtnl_unlock(); - return 0; - } - } - rtnl_unlock(); - return -EADDRNOTAVAIL; -} - -static void packet_flush_mclist(struct sock *sk) -{ - struct packet_sock *po = pkt_sk(sk); - struct packet_mclist *ml; - - if (!po->mclist) - return; - - rtnl_lock(); - while ((ml = po->mclist) != NULL) { - struct net_device *dev; - - po->mclist = ml->next; - if ((dev = dev_get_by_index(ml->ifindex)) != NULL) { - packet_dev_mc(dev, ml, -1); - dev_put(dev); - } - kfree(ml); - } - rtnl_unlock(); -} - -static int -packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) -{ - struct sock *sk = sock->sk; - struct packet_sock *po = pkt_sk(sk); - int ret; - - if (level != SOL_PACKET) - return -ENOPROTOOPT; - - switch(optname) { - case PACKET_ADD_MEMBERSHIP: - case PACKET_DROP_MEMBERSHIP: - { - struct packet_mreq_max mreq; - int len = optlen; - memset(&mreq, 0, sizeof(mreq)); - if (len < sizeof(struct packet_mreq)) - return -EINVAL; - if (len > sizeof(mreq)) - len = sizeof(mreq); - if (copy_from_user(&mreq,optval,len)) - return -EFAULT; - if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) - return -EINVAL; - if (optname == PACKET_ADD_MEMBERSHIP) - ret = packet_mc_add(sk, &mreq); - else - ret = packet_mc_drop(sk, &mreq); - return ret; - } - -#ifdef CONFIG_PACKET_MMAP - case PACKET_RX_RING: - { - struct tpacket_req req; - - if (optlen<sizeof(req)) - return -EINVAL; - if (copy_from_user(&req,optval,sizeof(req))) - return -EFAULT; - return packet_set_ring(sk, &req, 0); - } - case PACKET_COPY_THRESH: - { - int val; - - if (optlen!=sizeof(val)) - return -EINVAL; - if (copy_from_user(&val,optval,sizeof(val))) - return -EFAULT; - - pkt_sk(sk)->copy_thresh = val; - return 0; - } -#endif - case PACKET_AUXDATA: - { - int val; - - if (optlen < sizeof(val)) - return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) - return -EFAULT; - - po->auxdata = !!val; - return 0; - } - case PACKET_ORIGDEV: - { - int val; - - if (optlen < sizeof(val)) - return -EINVAL; - if (copy_from_user(&val, optval, sizeof(val))) - return -EFAULT; - - po->origdev = !!val; - return 0; - } - default: - return -ENOPROTOOPT; - } -} - -static int packet_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - int len; - int val; - struct sock *sk = sock->sk; - struct packet_sock *po = pkt_sk(sk); - void *data; - struct tpacket_stats st; - - if (level != SOL_PACKET) - return -ENOPROTOOPT; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < 0) - return -EINVAL; - - switch(optname) { - case PACKET_STATISTICS: - if (len > sizeof(struct tpacket_stats)) - len = sizeof(struct tpacket_stats); - spin_lock_bh(&sk->sk_receive_queue.lock); - st = po->stats; - memset(&po->stats, 0, sizeof(st)); - spin_unlock_bh(&sk->sk_receive_queue.lock); - st.tp_packets += st.tp_drops; - - data = &st; - break; - case PACKET_AUXDATA: - if (len > sizeof(int)) - len = sizeof(int); - val = po->auxdata; - - data = &val; - break; - 
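- /*
-  * A minimal userspace sketch for the PACKET_STATISTICS case above
-  * ("fd" is an illustrative, already-open PF_PACKET socket). Note
-  * that the kernel zeroes its counters on each read and folds drops
-  * into tp_packets:
-  *
-  *	struct tpacket_stats st;
-  *	socklen_t len = sizeof(st);
-  *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
-  */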
case PACKET_ORIGDEV: - if (len > sizeof(int)) - len = sizeof(int); - val = po->origdev; - - data = &val; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen)) - return -EFAULT; - if (copy_to_user(optval, data, len)) - return -EFAULT; - return 0; -} - - -static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) -{ - struct sock *sk; - struct hlist_node *node; - struct net_device *dev = data; - - read_lock(&packet_sklist_lock); - sk_for_each(sk, node, &packet_sklist) { - struct packet_sock *po = pkt_sk(sk); - - switch (msg) { - case NETDEV_UNREGISTER: - if (po->mclist) - packet_dev_mclist(dev, po->mclist, -1); - /* fallthrough */ - - case NETDEV_DOWN: - if (dev->ifindex == po->ifindex) { - spin_lock(&po->bind_lock); - if (po->running) { - __dev_remove_pack(&po->prot_hook); - __sock_put(sk); - po->running = 0; - sk->sk_err = ENETDOWN; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); - } - if (msg == NETDEV_UNREGISTER) { - po->ifindex = -1; - po->prot_hook.dev = NULL; - } - spin_unlock(&po->bind_lock); - } - break; - case NETDEV_UP: - spin_lock(&po->bind_lock); - if (dev->ifindex == po->ifindex && po->num && - !po->running) { - dev_add_pack(&po->prot_hook); - sock_hold(sk); - po->running = 1; - } - spin_unlock(&po->bind_lock); - break; - } - } - read_unlock(&packet_sklist_lock); - return NOTIFY_DONE; -} - - -static int packet_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) -{ - struct sock *sk = sock->sk; - - switch(cmd) { - case SIOCOUTQ: - { - int amount = atomic_read(&sk->sk_wmem_alloc); - return put_user(amount, (int __user *)arg); - } - case SIOCINQ: - { - struct sk_buff *skb; - int amount = 0; - - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb) - amount = skb->len; - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - -#ifdef CONFIG_INET - case SIOCADDRT: - case SIOCDELRT: - case SIOCDARP: - case SIOCGARP: - case SIOCSARP: - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - case SIOCSIFFLAGS: - return inet_dgram_ops.ioctl(sock, cmd, arg); -#endif - - default: - return -ENOIOCTLCMD; - } - return 0; -} - -#ifndef CONFIG_PACKET_MMAP -#define packet_mmap sock_no_mmap -#define packet_poll datagram_poll -#else - -static unsigned int packet_poll(struct file * file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct packet_sock *po = pkt_sk(sk); - unsigned int mask = datagram_poll(file, sock, wait); - - spin_lock_bh(&sk->sk_receive_queue.lock); - if (po->pg_vec) { - unsigned last = po->head ? po->head-1 : po->frame_max; - struct tpacket_hdr *h; - - h = packet_lookup_frame(po, last); - - if (h->tp_status) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - return mask; -} - - -/* Dirty? Well, I still did not learn better way to account - * for user mmaps. 
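- *
- * The counter pairs with a userspace mapping of the ring buffer; a
- * minimal sketch, assuming "fd" already has a ring attached via the
- * PACKET_RX_RING setsockopt with request "req" (both illustrative):
- *
- *	size_t sz = (size_t)req.tp_block_size * req.tp_block_nr;
- *	void *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
- *	                  MAP_SHARED, fd, 0);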
- */ - -static void packet_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket * sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_inc(&pkt_sk(sk)->mapped); -} - -static void packet_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket * sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_dec(&pkt_sk(sk)->mapped); -} - -static struct vm_operations_struct packet_mmap_ops = { - .open = packet_mm_open, - .close =packet_mm_close, -}; - -static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) -{ - return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); -} - -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) -{ - int i; - - for (i = 0; i < len; i++) { - if (likely(pg_vec[i])) - free_pages((unsigned long) pg_vec[i], order); - } - kfree(pg_vec); -} - -static inline char *alloc_one_pg_vec_page(unsigned long order) -{ - return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - order); -} - -static char **alloc_pg_vec(struct tpacket_req *req, int order) -{ - unsigned int block_nr = req->tp_block_nr; - char **pg_vec; - int i; - - pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); - if (unlikely(!pg_vec)) - goto out; - - for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (unlikely(!pg_vec[i])) - goto out_free_pgvec; - } - -out: - return pg_vec; - -out_free_pgvec: - free_pg_vec(pg_vec, order, block_nr); - pg_vec = NULL; - goto out; -} - -static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) -{ - char **pg_vec = NULL; - struct packet_sock *po = pkt_sk(sk); - int was_running, order = 0; - __be16 num; - int err = 0; - - if (req->tp_block_nr) { - int i, l; - - /* Sanity tests and some calculations */ - - if (unlikely(po->pg_vec)) - return -EBUSY; - - if (unlikely((int)req->tp_block_size <= 0)) - return -EINVAL; - if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) - return -EINVAL; - if (unlikely(req->tp_frame_size < TPACKET_HDRLEN)) - return -EINVAL; - if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) - return -EINVAL; - - po->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (unlikely(po->frames_per_block <= 0)) - return -EINVAL; - if (unlikely((po->frames_per_block * req->tp_block_nr) != - req->tp_frame_nr)) - return -EINVAL; - - err = -ENOMEM; - order = get_order(req->tp_block_size); - pg_vec = alloc_pg_vec(req, order); - if (unlikely(!pg_vec)) - goto out; - - l = 0; - for (i = 0; i < req->tp_block_nr; i++) { - char *ptr = pg_vec[i]; - struct tpacket_hdr *header; - int k; - - for (k = 0; k < po->frames_per_block; k++) { - header = (struct tpacket_hdr *) ptr; - header->tp_status = TP_STATUS_KERNEL; - ptr += req->tp_frame_size; - } - } - /* Done */ - } else { - if (unlikely(req->tp_frame_nr)) - return -EINVAL; - } - - lock_sock(sk); - - /* Detach socket from network */ - spin_lock(&po->bind_lock); - was_running = po->running; - num = po->num; - if (was_running) { - __dev_remove_pack(&po->prot_hook); - po->num = 0; - po->running = 0; - __sock_put(sk); - } - spin_unlock(&po->bind_lock); - - synchronize_net(); - - err = -EBUSY; - if (closing || atomic_read(&po->mapped) == 0) { - err = 0; -#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) - - spin_lock_bh(&sk->sk_receive_queue.lock); - pg_vec = XC(po->pg_vec, pg_vec); - po->frame_max = (req->tp_frame_nr - 1); - po->head = 0; - po->frame_size = 
req->tp_frame_size; - spin_unlock_bh(&sk->sk_receive_queue.lock); - - order = XC(po->pg_vec_order, order); - req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); - - po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; - po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv; - skb_queue_purge(&sk->sk_receive_queue); -#undef XC - if (atomic_read(&po->mapped)) - printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); - } - - spin_lock(&po->bind_lock); - if (was_running && !po->running) { - sock_hold(sk); - po->running = 1; - po->num = num; - dev_add_pack(&po->prot_hook); - } - spin_unlock(&po->bind_lock); - - release_sock(sk); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->tp_block_nr); -out: - return err; -} - -static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) -{ - struct sock *sk = sock->sk; - struct packet_sock *po = pkt_sk(sk); - unsigned long size; - unsigned long start; - int err = -EINVAL; - int i; - - if (vma->vm_pgoff) - return -EINVAL; - - size = vma->vm_end - vma->vm_start; - - lock_sock(sk); - if (po->pg_vec == NULL) - goto out; - if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) - goto out; - - start = vma->vm_start; - for (i = 0; i < po->pg_vec_len; i++) { - struct page *page = virt_to_page(po->pg_vec[i]); - int pg_num; - - for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { - err = vm_insert_page(vma, start, page); - if (unlikely(err)) - goto out; - start += PAGE_SIZE; - } - } - atomic_inc(&po->mapped); - vma->vm_ops = &packet_mmap_ops; - err = 0; - -out: - release_sock(sk); - return err; -} -#endif - - -static const struct proto_ops packet_ops_spkt = { - .family = PF_PACKET, - .owner = THIS_MODULE, - .release = packet_release, - .bind = packet_bind_spkt, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = packet_getname_spkt, - .poll = datagram_poll, - .ioctl = packet_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, - .sendmsg = packet_sendmsg_spkt, - .recvmsg = packet_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -static const struct proto_ops packet_ops = { - .family = PF_PACKET, - .owner = THIS_MODULE, - .release = packet_release, - .bind = packet_bind, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = packet_getname, - .poll = packet_poll, - .ioctl = packet_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = packet_setsockopt, - .getsockopt = packet_getsockopt, - .sendmsg = packet_sendmsg, - .recvmsg = packet_recvmsg, - .mmap = packet_mmap, - .sendpage = sock_no_sendpage, -}; - -static struct net_proto_family packet_family_ops = { - .family = PF_PACKET, - .create = packet_create, - .owner = THIS_MODULE, -}; - -static struct notifier_block packet_netdev_notifier = { - .notifier_call =packet_notifier, -}; - -#ifdef CONFIG_PROC_FS -static inline struct sock *packet_seq_idx(loff_t off) -{ - struct sock *s; - struct hlist_node *node; - - sk_for_each(s, node, &packet_sklist) { - if (!off--) - return s; - } - return NULL; -} - -static void *packet_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock(&packet_sklist_lock); - return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return (v == SEQ_START_TOKEN) - ? 
sk_head(&packet_sklist) - : sk_next((struct sock*)v) ; -} - -static void packet_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock(&packet_sklist_lock); -} - -static int packet_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); - else { - struct sock *s = v; - const struct packet_sock *po = pkt_sk(s); - - seq_printf(seq, - "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", - s, - atomic_read(&s->sk_refcnt), - s->sk_type, - ntohs(po->num), - po->ifindex, - po->running, - atomic_read(&s->sk_rmem_alloc), - sock_i_uid(s), - sock_i_ino(s) ); - } - - return 0; -} - -static struct seq_operations packet_seq_ops = { - .start = packet_seq_start, - .next = packet_seq_next, - .stop = packet_seq_stop, - .show = packet_seq_show, -}; - -static int packet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &packet_seq_ops); -} - -static const struct file_operations packet_seq_fops = { - .owner = THIS_MODULE, - .open = packet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - -static void __exit packet_exit(void) -{ - proc_net_remove("packet"); - unregister_netdevice_notifier(&packet_netdev_notifier); - sock_unregister(PF_PACKET); - proto_unregister(&packet_proto); -} - -static int __init packet_init(void) -{ - int rc = proto_register(&packet_proto, 0); - - if (rc != 0) - goto out; - - sock_register(&packet_family_ops); - register_netdevice_notifier(&packet_netdev_notifier); - proc_net_fops_create("packet", 0, &packet_seq_fops); -out: - return rc; -} - -module_init(packet_init); -module_exit(packet_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NETPROTO(PF_PACKET); diff -Nurb linux-2.6.22-594/net/socket.c linux-2.6.22-595/net/socket.c --- linux-2.6.22-594/net/socket.c 2008-03-20 00:05:19.000000000 -0400 +++ linux-2.6.22-595/net/socket.c 2008-03-20 00:14:03.000000000 -0400 @@ -1122,12 +1122,17 @@ if (type < 0 || type >= SOCK_MAX) return -EINVAL; + /* + * Hack no. 2 - Sapan + * Clean this up later + * if (!nx_check(0, VS_ADMIN)) { if (family == PF_INET && !current_nx_info_has_v4()) return -EAFNOSUPPORT; if (family == PF_INET6 && !current_nx_info_has_v6()) return -EAFNOSUPPORT; } + */ /* Compatibility. diff -Nurb linux-2.6.22-594/net/socket.c.orig linux-2.6.22-595/net/socket.c.orig --- linux-2.6.22-594/net/socket.c.orig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.22-595/net/socket.c.orig 2008-03-20 00:05:19.000000000 -0400 @@ -0,0 +1,2400 @@ +/* + * NET An implementation of the SOCKET network access protocol. + * + * Version: @(#)socket.c 1.1.93 18/02/95 + * + * Authors: Orest Zborowski, + * Ross Biro + * Fred N. van Kempen, + * + * Fixes: + * Anonymous : NOTSOCK/BADF cleanup. Error fix in + * shutdown() + * Alan Cox : verify_area() fixes + * Alan Cox : Removed DDI + * Jonathan Kamens : SOCK_DGRAM reconnect bug + * Alan Cox : Moved a load of checks to the very + * top level. + * Alan Cox : Move address structures to/from user + * mode above the protocol layers. + * Rob Janssen : Allow 0 length sends. + * Alan Cox : Asynchronous I/O support (cribbed from the + * tty drivers). + * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) + * Jeff Uphoff : Made max number of sockets command-line + * configurable. + * Matti Aarnio : Made the number of sockets dynamic, + * to be allocated when needed, and mr. + * Uphoff's max is used as max to be + * allowed to allocate. + * Linus : Argh. 
removed all the socket allocation + * altogether: it's in the inode now. + * Alan Cox : Made sock_alloc()/sock_release() public + * for NetROM and future kernel nfsd type + * stuff. + * Alan Cox : sendmsg/recvmsg basics. + * Tom Dyas : Export net symbols. + * Marcin Dalecki : Fixed problems with CONFIG_NET="n". + * Alan Cox : Added thread locking to sys_* calls + * for sockets. May have errors at the + * moment. + * Kevin Buhr : Fixed the dumb errors in the above. + * Andi Kleen : Some small cleanups, optimizations, + * and fixed a copy_from_user() bug. + * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) + * Tigran Aivazian : Made listen(2) backlog sanity checks + * protocol-independent + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * This module is effectively the top level interface to the BSD socket + * paradigm. + * + * Based upon Swansea University Computer Society NET3.039 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare); +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static int sock_mmap(struct file *file, struct vm_area_struct *vma); + +static int sock_close(struct inode *inode, struct file *file); +static unsigned int sock_poll(struct file *file, + struct poll_table_struct *wait); +static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +static long compat_sock_ioctl(struct file *file, + unsigned int cmd, unsigned long arg); +#endif +static int sock_fasync(int fd, struct file *filp, int on); +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more); + +/* + * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear + * in the operation structures but are done directly via the socketcall() multiplexor. + */ + +static const struct file_operations socket_file_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .aio_read = sock_aio_read, + .aio_write = sock_aio_write, + .poll = sock_poll, + .unlocked_ioctl = sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_sock_ioctl, +#endif + .mmap = sock_mmap, + .open = sock_no_open, /* special open code to disallow open via /proc */ + .release = sock_close, + .fasync = sock_fasync, + .sendpage = sock_sendpage, + .splice_write = generic_splice_sendpage, +}; + +/* + * The protocol list. Each protocol is registered in here. + */ + +static DEFINE_SPINLOCK(net_family_lock); +static const struct net_proto_family *net_families[NPROTO] __read_mostly; + +/* + * Statistics counters of the socket lists + */ + +static DEFINE_PER_CPU(int, sockets_in_use) = 0; + +/* + * Support routines. 
+ * Move socket addresses back and forth across the kernel/user + * divide and look after the messy bits. + */ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). + */ + +/** + * move_addr_to_kernel - copy a socket address into kernel space + * @uaddr: Address in user space + * @kaddr: Address in kernel space + * @ulen: Length in user space + * + * The address is copied into kernel space. If the provided address is + * too long an error code of -EINVAL is returned. If the copy gives + * invalid addresses -EFAULT is returned. On a success 0 is returned. + */ + +int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr) +{ + if (ulen < 0 || ulen > MAX_SOCK_ADDR) + return -EINVAL; + if (ulen == 0) + return 0; + if (copy_from_user(kaddr, uaddr, ulen)) + return -EFAULT; + return audit_sockaddr(ulen, kaddr); +} + +/** + * move_addr_to_user - copy an address to user space + * @kaddr: kernel space address + * @klen: length of address in kernel + * @uaddr: user space address + * @ulen: pointer to user length field + * + * The value pointed to by ulen on entry is the buffer length available. + * This is overwritten with the buffer space used. -EINVAL is returned + * if an overlong buffer is specified or a negative buffer size. -EFAULT + * is returned if either the buffer or the length field are not + * accessible. + * After copying the data up to the limit the user specifies, the true + * length of the data is written over the length limit the user + * specified. Zero is returned for a success. + */ + +int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, + int __user *ulen) +{ + int err; + int len; + + err = get_user(len, ulen); + if (err) + return err; + if (len > klen) + len = klen; + if (len < 0 || len > MAX_SOCK_ADDR) + return -EINVAL; + if (len) { + if (audit_sockaddr(klen, kaddr)) + return -ENOMEM; + if (copy_to_user(uaddr, kaddr, len)) + return -EFAULT; + } + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + return __put_user(klen, ulen); +} + +#define SOCKFS_MAGIC 0x534F434B + +static struct kmem_cache *sock_inode_cachep __read_mostly; + +static struct inode *sock_alloc_inode(struct super_block *sb) +{ + struct socket_alloc *ei; + + ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + init_waitqueue_head(&ei->socket.wait); + + ei->socket.fasync_list = NULL; + ei->socket.state = SS_UNCONNECTED; + ei->socket.flags = 0; + ei->socket.ops = NULL; + ei->socket.sk = NULL; + ei->socket.file = NULL; + + return &ei->vfs_inode; +} + +static void sock_destroy_inode(struct inode *inode) +{ + kmem_cache_free(sock_inode_cachep, + container_of(inode, struct socket_alloc, vfs_inode)); +} + +static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) +{ + struct socket_alloc *ei = (struct socket_alloc *)foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + sock_inode_cachep = kmem_cache_create("sock_inode_cache", + sizeof(struct socket_alloc), + 0, + (SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + init_once, + NULL); + if (sock_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static struct super_operations sockfs_ops = { + .alloc_inode = sock_alloc_inode, + .destroy_inode =sock_destroy_inode, + .statfs = simple_statfs, +}; + +static int sockfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, + mnt); +} + +static struct vfsmount *sock_mnt __read_mostly; + +static struct file_system_type sock_fs_type = { + .name = "sockfs", + .get_sb = sockfs_get_sb, + .kill_sb = kill_anon_super, +}; + +static int sockfs_delete_dentry(struct dentry *dentry) +{ + /* + * At creation time, we pretended this dentry was hashed + * (by clearing DCACHE_UNHASHED bit in d_flags) + * At delete time, we restore the truth : not hashed. + * (so that dput() can proceed correctly) + */ + dentry->d_flags |= DCACHE_UNHASHED; + return 0; +} + +/* + * sockfs_dname() is called from d_path(). + */ +static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", + dentry->d_inode->i_ino); +} + +static struct dentry_operations sockfs_dentry_operations = { + .d_delete = sockfs_delete_dentry, + .d_dname = sockfs_dname, +}; + +/* + * Obtains the first available file descriptor and sets it up for use. + * + * These functions create file structures and maps them to fd space + * of the current process. On success it returns file descriptor + * and file struct implicitly stored in sock->file. + * Note that another thread may close file descriptor before we return + * from this function. We use the fact that now we do not refer + * to socket after mapping. If one day we will need it, this + * function will increment ref. count on file by 1. + * + * In any case returned fd MAY BE not valid! + * This race condition is unavoidable + * with shared fd spaces, we cannot solve it inside kernel, + * but we take care of internal coherence yet. 
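+ *
+ * A sketch of the typical in-kernel call sequence, mirroring what
+ * sys_socket() does further down in this file:
+ *
+ *	err = sock_create(family, type, protocol, &sock);
+ *	if (!err)
+ *		err = sock_map_fd(sock);
+ *
+ * where a negative result is an errno and anything else is the
+ * newly installed descriptor.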
+ */ + +static int sock_alloc_fd(struct file **filep) +{ + int fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + struct file *file = get_empty_filp(); + + *filep = file; + if (unlikely(!file)) { + put_unused_fd(fd); + return -ENFILE; + } + } else + *filep = NULL; + return fd; +} + +static int sock_attach_fd(struct socket *sock, struct file *file) +{ + struct qstr name = { .name = "" }; + + file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); + if (unlikely(!file->f_path.dentry)) + return -ENOMEM; + + file->f_path.dentry->d_op = &sockfs_dentry_operations; + /* + * We dont want to push this dentry into global dentry hash table. + * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED + * This permits a working /proc/$pid/fd/XXX on sockets + */ + file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED; + d_instantiate(file->f_path.dentry, SOCK_INODE(sock)); + file->f_path.mnt = mntget(sock_mnt); + file->f_mapping = file->f_path.dentry->d_inode->i_mapping; + + sock->file = file; + file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops; + file->f_mode = FMODE_READ | FMODE_WRITE; + file->f_flags = O_RDWR; + file->f_pos = 0; + file->private_data = sock; + + return 0; +} + +int sock_map_fd(struct socket *sock) +{ + struct file *newfile; + int fd = sock_alloc_fd(&newfile); + + if (likely(fd >= 0)) { + int err = sock_attach_fd(sock, newfile); + + if (unlikely(err < 0)) { + put_filp(newfile); + put_unused_fd(fd); + return err; + } + fd_install(fd, newfile); + } + return fd; +} + +static struct socket *sock_from_file(struct file *file, int *err) +{ + if (file->f_op == &socket_file_ops) + return file->private_data; /* set in sock_map_fd */ + + *err = -ENOTSOCK; + return NULL; +} + +/** + * sockfd_lookup - Go from a file number to its socket slot + * @fd: file handle + * @err: pointer to an error code return + * + * The file handle passed in is locked and the socket it is bound + * too is returned. If an error occurs the err pointer is overwritten + * with a negative errno code and NULL is returned. The function checks + * for both invalid handles and passing a handle which is not a socket. + * + * On a success the socket object pointer is returned. + */ + +struct socket *sockfd_lookup(int fd, int *err) +{ + struct file *file; + struct socket *sock; + + file = fget(fd); + if (!file) { + *err = -EBADF; + return NULL; + } + + sock = sock_from_file(file, err); + if (!sock) + fput(file); + return sock; +} + +static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) +{ + struct file *file; + struct socket *sock; + + *err = -EBADF; + file = fget_light(fd, fput_needed); + if (file) { + sock = sock_from_file(file, err); + if (sock) + return sock; + fput_light(file, *fput_needed); + } + return NULL; +} + +/** + * sock_alloc - allocate a socket + * + * Allocate a new inode and socket object. The two are bound together + * and initialised. The socket is then returned. If we are out of inodes + * NULL is returned. + */ + +static struct socket *sock_alloc(void) +{ + struct inode *inode; + struct socket *sock; + + inode = new_inode(sock_mnt->mnt_sb); + if (!inode) + return NULL; + + sock = SOCKET_I(inode); + + inode->i_mode = S_IFSOCK | S_IRWXUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + get_cpu_var(sockets_in_use)++; + put_cpu_var(sockets_in_use); + return sock; +} + +/* + * In theory you can't get an open on this inode, but /proc provides + * a back door. Remember to keep it shut otherwise you'll let the + * creepy crawlies in. 
+ */ + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + +const struct file_operations bad_sock_fops = { + .owner = THIS_MODULE, + .open = sock_no_open, +}; + +/** + * sock_release - close a socket + * @sock: socket to close + * + * The socket is released from the protocol stack if it has a release + * callback, and the inode is then released if the socket is bound to + * an inode not a file. + */ + +void sock_release(struct socket *sock) +{ + if (sock->ops) { + struct module *owner = sock->ops->owner; + + sock->ops->release(sock); + sock->ops = NULL; + module_put(owner); + } + + if (sock->fasync_list) + printk(KERN_ERR "sock_release: fasync list not empty!\n"); + + get_cpu_var(sockets_in_use)--; + put_cpu_var(sockets_in_use); + if (!sock->file) { + iput(SOCK_INODE(sock)); + return; + } + sock->file = NULL; +} + +static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock_iocb *si = kiocb_to_siocb(iocb); + int err, len; + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + + err = security_socket_sendmsg(sock, msg, size); + if (err) + return err; + + len = sock->ops->sendmsg(iocb, sock, msg, size); + if (sock->sk) { + if (len == size) + vx_sock_send(sock->sk, size); + else + vx_sock_fail(sock->sk, size); + } + vxdprintk(VXD_CBIT(net, 7), + "__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%d", + sock, sock->sk, + (sock->sk)?sock->sk->sk_nx_info:0, + (sock->sk)?sock->sk->sk_vx_info:0, + (sock->sk)?sock->sk->sk_xid:0, + (sock->sk)?sock->sk->sk_nid:0, + (unsigned int)size, len); + return len; +} + +int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_sendmsg(&iocb, sock, msg, size); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +int kernel_sendmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec; + msg->msg_iovlen = num; + result = sock_sendmsg(sock, msg, size); + set_fs(oldfs); + return result; +} + +/* + * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) + */ +void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + ktime_t kt = skb->tstamp; + + if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { + struct timeval tv; + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. */ + if (kt.tv64 == 0) + kt = ktime_get_real(); + skb->tstamp = kt; + tv = ktime_to_timeval(kt); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); + } else { + struct timespec ts; + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. 
*/ + if (kt.tv64 == 0) + kt = ktime_get_real(); + skb->tstamp = kt; + ts = ktime_to_timespec(kt); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); + } +} + +EXPORT_SYMBOL_GPL(__sock_recv_timestamp); + +static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + int err, len; + struct sock_iocb *si = kiocb_to_siocb(iocb); + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + si->flags = flags; + + err = security_socket_recvmsg(sock, msg, size, flags); + if (err) + return err; + + len = sock->ops->recvmsg(iocb, sock, msg, size, flags); + if ((len >= 0) && sock->sk) + vx_sock_recv(sock->sk, len); + vxdprintk(VXD_CBIT(net, 7), + "__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d", + sock, sock->sk, + (sock->sk)?sock->sk->sk_nx_info:0, + (sock->sk)?sock->sk->sk_vx_info:0, + (sock->sk)?sock->sk->sk_xid:0, + (sock->sk)?sock->sk->sk_nid:0, + (unsigned int)size, len); + return len; +} + +int sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_recvmsg(&iocb, sock, msg, size, flags); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +int kernel_recvmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size, int flags) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num; + result = sock_recvmsg(sock, msg, size, flags); + set_fs(oldfs); + return result; +} + +static void sock_aio_dtor(struct kiocb *iocb) +{ + kfree(iocb->private); +} + +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) +{ + struct socket *sock; + int flags; + + sock = file->private_data; + + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; + iocb->ki_dtor = sock_aio_dtor; + } + + siocb->kiocb = iocb; + iocb->private = siocb; + return siocb; +} + +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, const struct iovec *iov, + unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; + + for (i = 0; i < nr_segs; i++) + size += iov[i].iov_len; + + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *)iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct sock_iocb siocb, *x; + + if (pos != 0) + return -ESPIPE; + + if (iocb->ki_left == 0) /* Match SYS5 behaviour */ + return 0; + + + x = alloc_sock_iocb(iocb, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); +} + +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, const struct iovec *iov, + unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; + + for (i = 0; i < nr_segs; i++) + size += iov[i].iov_len; + + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *)iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; + + return __sock_sendmsg(iocb, sock, msg, size); +} + +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct sock_iocb siocb, *x; + + if (pos != 0) + return -ESPIPE; + + x = alloc_sock_iocb(iocb, &siocb); + if (!x) + return -ENOMEM; + + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); +} + +/* + * Atomic setting of ioctl hooks to avoid race + * with module unload. + */ + +static DEFINE_MUTEX(br_ioctl_mutex); +static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL; + +void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) +{ + mutex_lock(&br_ioctl_mutex); + br_ioctl_hook = hook; + mutex_unlock(&br_ioctl_mutex); +} + +EXPORT_SYMBOL(brioctl_set); + +static DEFINE_MUTEX(vlan_ioctl_mutex); +static int (*vlan_ioctl_hook) (struct net *, void __user *arg); + +void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) +{ + mutex_lock(&vlan_ioctl_mutex); + vlan_ioctl_hook = hook; + mutex_unlock(&vlan_ioctl_mutex); +} + +EXPORT_SYMBOL(vlan_ioctl_set); + +static DEFINE_MUTEX(dlci_ioctl_mutex); +static int (*dlci_ioctl_hook) (unsigned int, void __user *); + +void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) +{ + mutex_lock(&dlci_ioctl_mutex); + dlci_ioctl_hook = hook; + mutex_unlock(&dlci_ioctl_mutex); +} + +EXPORT_SYMBOL(dlci_ioctl_set); + +/* + * With an ioctl, arg may well be a user mode pointer, but we don't know + * what to do with it - that's up to the protocol still. 
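+ *
+ * The bridge, VLAN and DLCI cases below route through the hook
+ * pointers registered by the setters above. A sketch of the
+ * registration side (pattern only; "br_init" and the handler name
+ * are illustrative, not quoted from the bridge module):
+ *
+ *	static int __init br_init(void)
+ *	{
+ *		brioctl_set(br_ioctl_handler);
+ *		return 0;
+ *	}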
+ */ + +static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct socket *sock; + struct sock *sk; + void __user *argp = (void __user *)arg; + int pid, err; + struct net *net; + + sock = file->private_data; + sk = sock->sk; + net = sk->sk_net; + if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { + err = dev_ioctl(net, cmd, argp); + } else +#ifdef CONFIG_WIRELESS_EXT + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + err = dev_ioctl(net, cmd, argp); + } else +#endif /* CONFIG_WIRELESS_EXT */ + switch (cmd) { + case FIOSETOWN: + case SIOCSPGRP: + err = -EFAULT; + if (get_user(pid, (int __user *)argp)) + break; + err = f_setown(sock->file, pid, 1); + break; + case FIOGETOWN: + case SIOCGPGRP: + err = put_user(f_getown(sock->file), + (int __user *)argp); + break; + case SIOCGIFBR: + case SIOCSIFBR: + case SIOCBRADDBR: + case SIOCBRDELBR: + err = -ENOPKG; + if (!br_ioctl_hook) + request_module("bridge"); + + mutex_lock(&br_ioctl_mutex); + if (br_ioctl_hook) + err = br_ioctl_hook(net, cmd, argp); + mutex_unlock(&br_ioctl_mutex); + break; + case SIOCGIFVLAN: + case SIOCSIFVLAN: + err = -ENOPKG; + if (!vlan_ioctl_hook) + request_module("8021q"); + + mutex_lock(&vlan_ioctl_mutex); + if (vlan_ioctl_hook) + err = vlan_ioctl_hook(net, argp); + mutex_unlock(&vlan_ioctl_mutex); + break; + case SIOCADDDLCI: + case SIOCDELDLCI: + err = -ENOPKG; + if (!dlci_ioctl_hook) + request_module("dlci"); + + if (dlci_ioctl_hook) { + mutex_lock(&dlci_ioctl_mutex); + err = dlci_ioctl_hook(cmd, argp); + mutex_unlock(&dlci_ioctl_mutex); + } + break; + default: + err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(net, cmd, argp); + break; + } + return err; +} + +int sock_create_lite(int family, int type, int protocol, struct socket **res) +{ + int err; + struct socket *sock = NULL; + + err = security_socket_create(family, type, protocol, 1); + if (err) + goto out; + + sock = sock_alloc(); + if (!sock) { + err = -ENOMEM; + goto out; + } + + sock->type = type; + err = security_socket_post_create(sock, family, type, protocol, 1); + if (err) + goto out_release; + +out: + *res = sock; + return err; +out_release: + sock_release(sock); + sock = NULL; + goto out; +} + +/* No kernel lock held - perfect */ +static unsigned int sock_poll(struct file *file, poll_table *wait) +{ + struct socket *sock; + + /* + * We can't return errors to poll, so it's either yes or no. + */ + sock = file->private_data; + return sock->ops->poll(file, sock, wait); +} + +static int sock_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct socket *sock = file->private_data; + + return sock->ops->mmap(file, sock, vma); +} + +static int sock_close(struct inode *inode, struct file *filp) +{ + /* + * It was possible the inode is NULL we were + * closing an unfinished socket. + */ + + if (!inode) { + printk(KERN_DEBUG "sock_close: NULL inode\n"); + return 0; + } + sock_fasync(-1, filp, 0); + sock_release(SOCKET_I(inode)); + return 0; +} + +/* + * Update the socket async list + * + * Fasync_list locking strategy. + * + * 1. fasync_list is modified only under process context socket lock + * i.e. under semaphore. + * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) + * or under socket lock. + * 3. fasync_list can be used from softirq context, so that + * modification under socket lock have to be enhanced with + * write_lock_bh(&sk->sk_callback_lock). 
+ * --ANK (990710) + */ + +static int sock_fasync(int fd, struct file *filp, int on) +{ + struct fasync_struct *fa, *fna = NULL, **prev; + struct socket *sock; + struct sock *sk; + + if (on) { + fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); + if (fna == NULL) + return -ENOMEM; + } + + sock = filp->private_data; + + sk = sock->sk; + if (sk == NULL) { + kfree(fna); + return -EINVAL; + } + + lock_sock(sk); + + prev = &(sock->fasync_list); + + for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev) + if (fa->fa_file == filp) + break; + + if (on) { + if (fa != NULL) { + write_lock_bh(&sk->sk_callback_lock); + fa->fa_fd = fd; + write_unlock_bh(&sk->sk_callback_lock); + + kfree(fna); + goto out; + } + fna->fa_file = filp; + fna->fa_fd = fd; + fna->magic = FASYNC_MAGIC; + fna->fa_next = sock->fasync_list; + write_lock_bh(&sk->sk_callback_lock); + sock->fasync_list = fna; + write_unlock_bh(&sk->sk_callback_lock); + } else { + if (fa != NULL) { + write_lock_bh(&sk->sk_callback_lock); + *prev = fa->fa_next; + write_unlock_bh(&sk->sk_callback_lock); + kfree(fa); + } + } + +out: + release_sock(sock->sk); + return 0; +} + +/* This function may be called only under socket lock or callback_lock */ + +int sock_wake_async(struct socket *sock, int how, int band) +{ + if (!sock || !sock->fasync_list) + return -1; + switch (how) { + case 1: + + if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + break; + goto call_kill; + case 2: + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + break; + /* fall through */ + case 0: +call_kill: + __kill_fasync(sock->fasync_list, SIGIO, band); + break; + case 3: + __kill_fasync(sock->fasync_list, SIGURG, band); + } + return 0; +} + +static int __sock_create(struct net *net, int family, int type, int protocol, + struct socket **res, int kern) +{ + int err; + struct socket *sock; + const struct net_proto_family *pf; + + /* + * Check protocol is in range + */ + if (family < 0 || family >= NPROTO) + return -EAFNOSUPPORT; + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + + if (!nx_check(0, VS_ADMIN)) { + if (family == PF_INET && !current_nx_info_has_v4()) + return -EAFNOSUPPORT; + if (family == PF_INET6 && !current_nx_info_has_v6()) + return -EAFNOSUPPORT; + } + + /* Compatibility. + + This uglymoron is moved from INET layer to here to avoid + deadlock in module load. + */ + if (family == PF_INET && type == SOCK_PACKET) { + static int warned; + if (!warned) { + warned = 1; + printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", + current->comm); + } + family = PF_PACKET; + } + + err = security_socket_create(family, type, protocol, kern); + if (err) + return err; + + /* + * Allocate the socket and allow the family to set things up. if + * the protocol is 0, the family is instructed to select an appropriate + * default. + */ + sock = sock_alloc(); + if (!sock) { + if (net_ratelimit()) + printk(KERN_WARNING "socket: no more sockets\n"); + return -ENFILE; /* Not exactly a match, but its the + closest posix thing */ + } + + sock->type = type; + +#if defined(CONFIG_KMOD) + /* Attempt to load a protocol module if the find failed. + * + * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user + * requested real, full-featured networking support upon configuration. + * Otherwise module support will break! 
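+ *
+ * Protocol modules advertise themselves for this request with a
+ * module alias; af_packet, for example, declares
+ *
+ *	MODULE_ALIAS_NETPROTO(PF_PACKET);
+ *
+ * which expands to the "net-pf-17" alias that the request_module()
+ * call below asks for.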
+ */ + if (net_families[family] == NULL) + request_module("net-pf-%d", family); +#endif + + rcu_read_lock(); + pf = rcu_dereference(net_families[family]); + err = -EAFNOSUPPORT; + if (!pf) + goto out_release; + + /* + * We will call the ->create function, that possibly is in a loadable + * module, so we have to bump that loadable module refcnt first. + */ + if (!try_module_get(pf->owner)) + goto out_release; + + /* Now protected by module ref count */ + rcu_read_unlock(); + + err = pf->create(net, sock, protocol); + if (err < 0) + goto out_module_put; + + /* + * Now to bump the refcnt of the [loadable] module that owns this + * socket at sock_release time we decrement its refcnt. + */ + if (!try_module_get(sock->ops->owner)) + goto out_module_busy; + + /* + * Now that we're done with the ->create function, the [loadable] + * module can have its refcnt decremented + */ + module_put(pf->owner); + err = security_socket_post_create(sock, family, type, protocol, kern); + if (err) + goto out_sock_release; + *res = sock; + + return 0; + +out_module_busy: + err = -EAFNOSUPPORT; +out_module_put: + sock->ops = NULL; + module_put(pf->owner); +out_sock_release: + sock_release(sock); + return err; + +out_release: + rcu_read_unlock(); + goto out_sock_release; +} + +int sock_create(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); +} + +int sock_create_kern(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(&init_net, family, type, protocol, res, 1); +} + +asmlinkage long sys_socket(int family, int type, int protocol) +{ + int retval; + struct socket *sock; + + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + goto out; + + set_bit(SOCK_USER_SOCKET, &sock->flags); + retval = sock_map_fd(sock); + if (retval < 0) + goto out_release; + +out: + /* It may be already another descriptor 8) Not kernel problem. */ + return retval; + +out_release: + sock_release(sock); + return retval; +} + +/* + * Create a pair of connected sockets. + */ + +asmlinkage long sys_socketpair(int family, int type, int protocol, + int __user *usockvec) +{ + struct socket *sock1, *sock2; + int fd1, fd2, err; + struct file *newfile1, *newfile2; + + /* + * Obtain the first socket and check if the underlying protocol + * supports the socketpair call. + */ + + err = sock_create(family, type, protocol, &sock1); + if (err < 0) + goto out; + set_bit(SOCK_USER_SOCKET, &sock1->flags); + + err = sock_create(family, type, protocol, &sock2); + if (err < 0) + goto out_release_1; + set_bit(SOCK_USER_SOCKET, &sock2->flags); + + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) + goto out_release_both; + + fd1 = sock_alloc_fd(&newfile1); + if (unlikely(fd1 < 0)) { + err = fd1; + goto out_release_both; + } + + fd2 = sock_alloc_fd(&newfile2); + if (unlikely(fd2 < 0)) { + err = fd2; + put_filp(newfile1); + put_unused_fd(fd1); + goto out_release_both; + } + + err = sock_attach_fd(sock1, newfile1); + if (unlikely(err < 0)) { + goto out_fd2; + } + + err = sock_attach_fd(sock2, newfile2); + if (unlikely(err < 0)) { + fput(newfile1); + goto out_fd1; + } + + err = audit_fd_pair(fd1, fd2); + if (err < 0) { + fput(newfile1); + fput(newfile2); + goto out_fd; + } + + fd_install(fd1, newfile1); + fd_install(fd2, newfile2); + /* fd1 and fd2 may be already another descriptors. + * Not kernel problem. 
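+ *
+ * A minimal userspace sketch of the call being implemented here:
+ *
+ *	int sv[2];
+ *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
+ *		write(sv[0], "ping", 4);
+ *		... a read on sv[1] sees the data ...
+ *	}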
+ */ + + err = put_user(fd1, &usockvec[0]); + if (!err) + err = put_user(fd2, &usockvec[1]); + if (!err) + return 0; + + sys_close(fd2); + sys_close(fd1); + return err; + +out_release_both: + sock_release(sock2); +out_release_1: + sock_release(sock1); +out: + return err; + +out_fd2: + put_filp(newfile1); + sock_release(sock1); +out_fd1: + put_filp(newfile2); + sock_release(sock2); +out_fd: + put_unused_fd(fd1); + put_unused_fd(fd2); + goto out; +} + +/* + * Bind a name to a socket. Nothing much to do here since it's + * the protocol's responsibility to handle the local address. + * + * We move the socket address to kernel space before we call + * the protocol layer (having also checked the address is ok). + */ + +asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock) { + err = move_addr_to_kernel(umyaddr, addrlen, address); + if (err >= 0) { + err = security_socket_bind(sock, + (struct sockaddr *)address, + addrlen); + if (!err) + err = sock->ops->bind(sock, + (struct sockaddr *) + address, addrlen); + } + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Perform a listen. Basically, we allow the protocol to do anything + * necessary for a listen, and if that works, we mark the socket as + * ready for listening. + */ + +asmlinkage long sys_listen(int fd, int backlog) +{ + struct socket *sock; + int err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock) { + struct net *net = sock->sk->sk_net; + if ((unsigned)backlog > net->sysctl_somaxconn) + backlog = net->sysctl_somaxconn; + + err = security_socket_listen(sock, backlog); + if (!err) + err = sock->ops->listen(sock, backlog); + + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * For accept, we attempt to create a new socket, set up the link + * with the client, wake up the client, then return the new + * connected fd. We collect the address of the connector in kernel + * space and move it to user at the very end. This is unclean because + * we open the socket then return an error. + * + * 1003.1g adds the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restucture accept also. + */ + +asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen) +{ + struct socket *sock, *newsock; + struct file *newfile; + int err, len, newfd, fput_needed; + char address[MAX_SOCK_ADDR]; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + err = -ENFILE; + if (!(newsock = sock_alloc())) + goto out_put; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + /* + * We don't need try_module_get here, as the listening socket (sock) + * has the protocol module (sock->ops->owner) held. 
+ */
+ __module_get(newsock->ops->owner);
+
+ newfd = sock_alloc_fd(&newfile);
+ if (unlikely(newfd < 0)) {
+ err = newfd;
+ sock_release(newsock);
+ goto out_put;
+ }
+
+ err = sock_attach_fd(newsock, newfile);
+ if (err < 0)
+ goto out_fd_simple;
+
+ err = security_socket_accept(sock, newsock);
+ if (err)
+ goto out_fd;
+
+ err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+ if (err < 0)
+ goto out_fd;
+
+ if (upeer_sockaddr) {
+ if (newsock->ops->getname(newsock, (struct sockaddr *)address,
+ &len, 2) < 0) {
+ err = -ECONNABORTED;
+ goto out_fd;
+ }
+ err = move_addr_to_user(address, len, upeer_sockaddr,
+ upeer_addrlen);
+ if (err < 0)
+ goto out_fd;
+ }
+
+ /* File flags are not inherited via accept(), unlike other OSes. */
+
+ fd_install(newfd, newfile);
+ err = newfd;
+
+ security_socket_post_accept(sock, newsock);
+
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+out_fd_simple:
+ sock_release(newsock);
+ put_filp(newfile);
+ put_unused_fd(newfd);
+ goto out_put;
+out_fd:
+ fput(newfile);
+ put_unused_fd(newfd);
+ goto out_put;
+}
+
+/*
+ * Attempt to connect to a socket with the server address. The address
+ * is in user space so we verify it is OK and move it to kernel space.
+ *
+ * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
+ * break bindings.
+ *
+ * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
+ * other SEQPACKET protocols that take time to connect() as it doesn't
+ * include the -EINPROGRESS status for such sockets.
+ */
+
+asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
+ int addrlen)
+{
+ struct socket *sock;
+ char address[MAX_SOCK_ADDR];
+ int err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+ err = move_addr_to_kernel(uservaddr, addrlen, address);
+ if (err < 0)
+ goto out_put;
+
+ err =
+ security_socket_connect(sock, (struct sockaddr *)address, addrlen);
+ if (err)
+ goto out_put;
+
+ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
+ sock->file->f_flags);
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Get the local address ('name') of a socket object. Move the obtained
+ * name to user space.
+ */
+
+asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr,
+ int __user *usockaddr_len)
+{
+ struct socket *sock;
+ char address[MAX_SOCK_ADDR];
+ int len, err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ err = security_socket_getsockname(sock);
+ if (err)
+ goto out_put;
+
+ err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0);
+ if (err)
+ goto out_put;
+ err = move_addr_to_user(address, len, usockaddr, usockaddr_len);
+
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Get the remote address ('name') of a socket object. Move the obtained
+ * name to user space.
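+ *
+ * Typical userspace counterpart (an illustrative sketch only):
+ *
+ *	struct sockaddr_storage ss;
+ *	socklen_t sslen = sizeof(ss);
+ *	if (getpeername(fd, (struct sockaddr *)&ss, &sslen) == 0)
+ *		... ss now holds the remote endpoint, sslen its length ...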
+ */ + +asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int len, err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_getpeername(sock); + if (err) { + fput_light(sock->file, fput_needed); + return err; + } + + err = + sock->ops->getname(sock, (struct sockaddr *)address, &len, + 1); + if (!err) + err = move_addr_to_user(address, len, usockaddr, + usockaddr_len); + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Send a datagram to a given address. We move the address into kernel + * space and check the user space data area is readable before invoking + * the protocol. + */ + +asmlinkage long sys_sendto(int fd, void __user *buff, size_t len, + unsigned flags, struct sockaddr __user *addr, + int addr_len) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int err; + struct msghdr msg; + struct iovec iov; + int fput_needed; + struct file *sock_file; + + sock_file = fget_light(fd, &fput_needed); + err = -EBADF; + if (!sock_file) + goto out; + + sock = sock_from_file(sock_file, &err); + if (!sock) + goto out_put; + iov.iov_base = buff; + iov.iov_len = len; + msg.msg_name = NULL; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + if (addr) { + err = move_addr_to_kernel(addr, addr_len, address); + if (err < 0) + goto out_put; + msg.msg_name = address; + msg.msg_namelen = addr_len; + } + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg, len); + +out_put: + fput_light(sock_file, fput_needed); +out: + return err; +} + +/* + * Send a datagram down a socket. + */ + +asmlinkage long sys_send(int fd, void __user *buff, size_t len, unsigned flags) +{ + return sys_sendto(fd, buff, len, flags, NULL, 0); +} + +/* + * Receive a frame from the socket and optionally record the address of the + * sender. We verify the buffers are writable and if needed move the + * sender address from kernel to user space. + */ + +asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size, + unsigned flags, struct sockaddr __user *addr, + int __user *addr_len) +{ + struct socket *sock; + struct iovec iov; + struct msghdr msg; + char address[MAX_SOCK_ADDR]; + int err, err2; + struct file *sock_file; + int fput_needed; + + sock_file = fget_light(fd, &fput_needed); + err = -EBADF; + if (!sock_file) + goto out; + + sock = sock_from_file(sock_file, &err); + if (!sock) + goto out_put; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = ubuf; + msg.msg_name = address; + msg.msg_namelen = MAX_SOCK_ADDR; + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err = sock_recvmsg(sock, &msg, size, flags); + + if (err >= 0 && addr != NULL) { + err2 = move_addr_to_user(address, msg.msg_namelen, addr, addr_len); + if (err2 < 0) + err = err2; + } +out_put: + fput_light(sock_file, fput_needed); +out: + return err; +} + +/* + * Receive a datagram from a socket. + */ + +asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, + unsigned flags) +{ + return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); +} + +/* + * Set a socket option. Because we don't know the option lengths we have + * to pass the user mode parameter for the protocols to sort out. 
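+ *
+ * E.g. from userspace (illustrative only):
+ *
+ *	int one = 1;
+ *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+ *
+ * SOL_SOCKET options are handled generically by sock_setsockopt();
+ * everything else goes to the protocol's ->setsockopt() method.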
+ */ + +asmlinkage long sys_setsockopt(int fd, int level, int optname, + char __user *optval, int optlen) +{ + int err, fput_needed; + struct socket *sock; + + if (optlen < 0) + return -EINVAL; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_setsockopt(sock, level, optname); + if (err) + goto out_put; + + if (level == SOL_SOCKET) + err = + sock_setsockopt(sock, level, optname, optval, + optlen); + else + err = + sock->ops->setsockopt(sock, level, optname, optval, + optlen); +out_put: + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Get a socket option. Because we don't know the option lengths we have + * to pass a user mode parameter for the protocols to sort out. + */ + +asmlinkage long sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err, fput_needed; + struct socket *sock; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_getsockopt(sock, level, optname); + if (err) + goto out_put; + + if (level == SOL_SOCKET) + err = + sock_getsockopt(sock, level, optname, optval, + optlen); + else + err = + sock->ops->getsockopt(sock, level, optname, optval, + optlen); +out_put: + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Shutdown a socket. + */ + +asmlinkage long sys_shutdown(int fd, int how) +{ + int err, fput_needed; + struct socket *sock; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + fput_light(sock->file, fput_needed); + } + return err; +} + +/* A couple of helpful macros for getting the address of the 32/64 bit + * fields which are the same type (int / unsigned) on our platforms. + */ +#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) +#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) +#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) + +/* + * BSD sendmsg interface + */ + +asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) +{ + struct compat_msghdr __user *msg_compat = + (struct compat_msghdr __user *)msg; + struct socket *sock; + char address[MAX_SOCK_ADDR]; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + unsigned char ctl[sizeof(struct cmsghdr) + 20] + __attribute__ ((aligned(sizeof(__kernel_size_t)))); + /* 20 is size of ipv6_pktinfo */ + unsigned char *ctl_buf = ctl; + struct msghdr msg_sys; + int err, ctl_len, iov_size, total_len; + int fput_needed; + + err = -EFAULT; + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(&msg_sys, msg_compat)) + return -EFAULT; + } + else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) + return -EFAULT; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + /* do not move before msg_sys is valid */ + err = -EMSGSIZE; + if (msg_sys.msg_iovlen > UIO_MAXIOV) + goto out_put; + + /* Check whether to allocate the iovec area */ + err = -ENOMEM; + iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); + if (msg_sys.msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out_put; + } + + /* This will also move the address data into kernel space */ + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ); + } else + err = verify_iovec(&msg_sys, iov, address, VERIFY_READ); + if (err < 0) + goto out_freeiov; + total_len = err; + + err = -ENOBUFS; + + if (msg_sys.msg_controllen > INT_MAX) + goto out_freeiov; + ctl_len = msg_sys.msg_controllen; + if ((MSG_CMSG_COMPAT & flags) && ctl_len) { + err = + cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, + sizeof(ctl)); + if (err) + goto out_freeiov; + ctl_buf = msg_sys.msg_control; + ctl_len = msg_sys.msg_controllen; + } else if (ctl_len) { + if (ctl_len > sizeof(ctl)) { + ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); + if (ctl_buf == NULL) + goto out_freeiov; + } + err = -EFAULT; + /* + * Careful! Before this, msg_sys.msg_control contains a user pointer. + * Afterwards, it will be a kernel pointer. Thus the compiler-assisted + * checking falls down on this. 
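+ *
+ * (The "compiler-assisted checking" is presumably sparse's __user
+ * address-space annotation: msg_control is a plain kernel pointer in
+ * struct msghdr, so we must cast through (void __user *) once below,
+ * after which sparse can no longer tell the two spaces apart here.)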
+ */ + if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, + ctl_len)) + goto out_freectl; + msg_sys.msg_control = ctl_buf; + } + msg_sys.msg_flags = flags; + + if (sock->file->f_flags & O_NONBLOCK) + msg_sys.msg_flags |= MSG_DONTWAIT; + err = sock_sendmsg(sock, &msg_sys, total_len); + +out_freectl: + if (ctl_buf != ctl) + sock_kfree_s(sock->sk, ctl_buf, ctl_len); +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * BSD recvmsg interface + */ + +asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, + unsigned int flags) +{ + struct compat_msghdr __user *msg_compat = + (struct compat_msghdr __user *)msg; + struct socket *sock; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct msghdr msg_sys; + unsigned long cmsg_ptr; + int err, iov_size, total_len, len; + int fput_needed; + + /* kernel mode address */ + char addr[MAX_SOCK_ADDR]; + + /* user mode address pointers */ + struct sockaddr __user *uaddr; + int __user *uaddr_len; + + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(&msg_sys, msg_compat)) + return -EFAULT; + } + else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) + return -EFAULT; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + err = -EMSGSIZE; + if (msg_sys.msg_iovlen > UIO_MAXIOV) + goto out_put; + + /* Check whether to allocate the iovec area */ + err = -ENOMEM; + iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); + if (msg_sys.msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out_put; + } + + /* + * Save the user-mode address (verify_iovec will change the + * kernel msghdr to use the kernel address space) + */ + + uaddr = (void __user *)msg_sys.msg_name; + uaddr_len = COMPAT_NAMELEN(msg); + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE); + } else + err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE); + if (err < 0) + goto out_freeiov; + total_len = err; + + cmsg_ptr = (unsigned long)msg_sys.msg_control; + msg_sys.msg_flags = 0; + if (MSG_CMSG_COMPAT & flags) + msg_sys.msg_flags = MSG_CMSG_COMPAT; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err = sock_recvmsg(sock, &msg_sys, total_len, flags); + if (err < 0) + goto out_freeiov; + len = err; + + if (uaddr != NULL) { + err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, + uaddr_len); + if (err < 0) + goto out_freeiov; + } + err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT), + COMPAT_FLAGS(msg)); + if (err) + goto out_freeiov; + if (MSG_CMSG_COMPAT & flags) + err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, + &msg_compat->msg_controllen); + else + err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, + &msg->msg_controllen); + if (err) + goto out_freeiov; + err = len; + +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +} + +#ifdef __ARCH_WANT_SYS_SOCKETCALL + +/* Argument list sizes for sys_socketcall */ +#define AL(x) ((x) * sizeof(unsigned long)) +static const unsigned char nargs[18]={ + AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), + AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), + AL(6),AL(2),AL(5),AL(5),AL(3),AL(3) +}; + +#undef AL + +/* + * System call vectors. + * + * Argument checking cleaned up. Saved 20% in size. 
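+ *
+ * Worked example of the nargs[] table above (no new behaviour):
+ * SYS_BIND is 2 and nargs[2] == AL(3) == 3 * sizeof(unsigned long),
+ * so a bind(2) issued via socketcall copies exactly three argument
+ * words from userspace: fd, umyaddr and addrlen.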
+ * This function doesn't need to take the kernel lock because
+ * it is taken by the callees as needed.
+ */
+
+asmlinkage long sys_socketcall(int call, unsigned long __user *args)
+{
+ unsigned long a[6];
+ unsigned long a0, a1;
+ int err;
+
+ if (call < 1 || call > SYS_RECVMSG)
+ return -EINVAL;
+
+ /* copy_from_user should be SMP safe. */
+ if (copy_from_user(a, args, nargs[call]))
+ return -EFAULT;
+
+ err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
+ if (err)
+ return err;
+
+ a0 = a[0];
+ a1 = a[1];
+
+ switch (call) {
+ case SYS_SOCKET:
+ err = sys_socket(a0, a1, a[2]);
+ break;
+ case SYS_BIND:
+ err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
+ break;
+ case SYS_CONNECT:
+ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
+ break;
+ case SYS_LISTEN:
+ err = sys_listen(a0, a1);
+ break;
+ case SYS_ACCEPT:
+ err =
+ sys_accept(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2]);
+ break;
+ case SYS_GETSOCKNAME:
+ err =
+ sys_getsockname(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2]);
+ break;
+ case SYS_GETPEERNAME:
+ err =
+ sys_getpeername(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2]);
+ break;
+ case SYS_SOCKETPAIR:
+ err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
+ break;
+ case SYS_SEND:
+ err = sys_send(a0, (void __user *)a1, a[2], a[3]);
+ break;
+ case SYS_SENDTO:
+ err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
+ (struct sockaddr __user *)a[4], a[5]);
+ break;
+ case SYS_RECV:
+ err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
+ break;
+ case SYS_RECVFROM:
+ err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
+ (struct sockaddr __user *)a[4],
+ (int __user *)a[5]);
+ break;
+ case SYS_SHUTDOWN:
+ err = sys_shutdown(a0, a1);
+ break;
+ case SYS_SETSOCKOPT:
+ err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
+ break;
+ case SYS_GETSOCKOPT:
+ err =
+ sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
+ (int __user *)a[4]);
+ break;
+ case SYS_SENDMSG:
+ err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
+ break;
+ case SYS_RECVMSG:
+ err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+#endif /* __ARCH_WANT_SYS_SOCKETCALL */
+
+/**
+ * sock_register - add a socket protocol handler
+ * @ops: description of protocol
+ *
+ * This function is called by a protocol handler that wants to
+ * advertise its address family, and have it linked into the
+ * socket interface. The value ops->family corresponds to the
+ * socket system call protocol family.
+ */
+int sock_register(const struct net_proto_family *ops)
+{
+ int err;
+
+ if (ops->family >= NPROTO) {
+ printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
+ NPROTO);
+ return -ENOBUFS;
+ }
+
+ spin_lock(&net_family_lock);
+ if (net_families[ops->family])
+ err = -EEXIST;
+ else {
+ net_families[ops->family] = ops;
+ err = 0;
+ }
+ spin_unlock(&net_family_lock);
+
+ printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
+ return err;
+}
+
+/**
+ * sock_unregister - remove a protocol handler
+ * @family: protocol family to remove
+ *
+ * This function is called by a protocol handler that wants to
+ * remove its address family, and have it unlinked from
+ * new socket creation.
+ *
+ * If the protocol handler is a module, then it can use module reference
+ * counts to protect against new references. If the protocol handler is
+ * not a module, then it needs to provide its own protection in
+ * the ops->create routine.
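+ *
+ * A typical caller (sketch with a made-up protocol module; the
+ * PF_MYPROTO family and myproto_proto object are hypothetical):
+ *
+ *	static void __exit myproto_exit(void)
+ *	{
+ *		sock_unregister(PF_MYPROTO);
+ *		proto_unregister(&myproto_proto);
+ *	}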
+ */ +void sock_unregister(int family) +{ + BUG_ON(family < 0 || family >= NPROTO); + + spin_lock(&net_family_lock); + net_families[family] = NULL; + spin_unlock(&net_family_lock); + + synchronize_rcu(); + + printk(KERN_INFO "NET: Unregistered protocol family %d\n", family); +} + +static int sock_pernet_init(struct net *net) +{ + net->sysctl_somaxconn = SOMAXCONN; + return 0; +} + +static struct pernet_operations sock_net_ops = { + .init = sock_pernet_init, +}; + +static int __init sock_init(void) +{ + /* + * Initialize sock SLAB cache. + */ + + sk_init(); + + /* + * Initialize skbuff SLAB cache + */ + skb_init(); + + /* + * Initialize the protocols module. + */ + + init_inodecache(); + register_filesystem(&sock_fs_type); + sock_mnt = kern_mount(&sock_fs_type); + + /* The real protocol initialization is performed in later initcalls. + */ + +#ifdef CONFIG_NETFILTER + netfilter_init(); +#endif + + register_pernet_subsys(&sock_net_ops); + + return 0; +} + +core_initcall(sock_init); /* early initcall */ + +#ifdef CONFIG_PROC_FS +void socket_seq_show(struct seq_file *seq) +{ + int cpu; + int counter = 0; + + for_each_possible_cpu(cpu) + counter += per_cpu(sockets_in_use, cpu); + + /* It can be negative, by the way. 8) */ + if (counter < 0) + counter = 0; + + seq_printf(seq, "sockets: used %d\n", counter); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_COMPAT +static long compat_sock_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + struct socket *sock = file->private_data; + int ret = -ENOIOCTLCMD; + + if (sock->ops->compat_ioctl) + ret = sock->ops->compat_ioctl(sock, cmd, arg); + + return ret; +} +#endif + +int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +{ + return sock->ops->bind(sock, addr, addrlen); +} + +int kernel_listen(struct socket *sock, int backlog) +{ + return sock->ops->listen(sock, backlog); +} + +int kernel_accept(struct socket *sock, struct socket **newsock, int flags) +{ + struct sock *sk = sock->sk; + int err; + + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); + if (err < 0) + goto done; + + err = sock->ops->accept(sock, *newsock, flags); + if (err < 0) { + sock_release(*newsock); + goto done; + } + + (*newsock)->ops = sock->ops; + +done: + return err; +} + +int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, + int flags) +{ + return sock->ops->connect(sock, addr, addrlen, flags); +} + +int kernel_getsockname(struct socket *sock, struct sockaddr *addr, + int *addrlen) +{ + return sock->ops->getname(sock, addr, addrlen, 0); +} + +int kernel_getpeername(struct socket *sock, struct sockaddr *addr, + int *addrlen) +{ + return sock->ops->getname(sock, addr, addrlen, 1); +} + +int kernel_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + mm_segment_t oldfs = get_fs(); + int err; + + set_fs(KERNEL_DS); + if (level == SOL_SOCKET) + err = sock_getsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->getsockopt(sock, level, optname, optval, + optlen); + set_fs(oldfs); + return err; +} + +int kernel_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + mm_segment_t oldfs = get_fs(); + int err; + + set_fs(KERNEL_DS); + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, optval, + optlen); + set_fs(oldfs); + return err; +} + +int kernel_sendpage(struct socket *sock, struct page *page, int offset, + size_t 
size, int flags) +{ + if (sock->ops->sendpage) + return sock->ops->sendpage(sock, page, offset, size, flags); + + return sock_no_sendpage(sock, page, offset, size, flags); +} + +int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) +{ + mm_segment_t oldfs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, cmd, arg); + set_fs(oldfs); + + return err; +} + +/* ABI emulation layers need these two */ +EXPORT_SYMBOL(move_addr_to_kernel); +EXPORT_SYMBOL(move_addr_to_user); +EXPORT_SYMBOL(sock_create); +EXPORT_SYMBOL(sock_create_kern); +EXPORT_SYMBOL(sock_create_lite); +EXPORT_SYMBOL(sock_map_fd); +EXPORT_SYMBOL(sock_recvmsg); +EXPORT_SYMBOL(sock_register); +EXPORT_SYMBOL(sock_release); +EXPORT_SYMBOL(sock_sendmsg); +EXPORT_SYMBOL(sock_unregister); +EXPORT_SYMBOL(sock_wake_async); +EXPORT_SYMBOL(sockfd_lookup); +EXPORT_SYMBOL(kernel_sendmsg); +EXPORT_SYMBOL(kernel_recvmsg); +EXPORT_SYMBOL(kernel_bind); +EXPORT_SYMBOL(kernel_listen); +EXPORT_SYMBOL(kernel_accept); +EXPORT_SYMBOL(kernel_connect); +EXPORT_SYMBOL(kernel_getsockname); +EXPORT_SYMBOL(kernel_getpeername); +EXPORT_SYMBOL(kernel_getsockopt); +EXPORT_SYMBOL(kernel_setsockopt); +EXPORT_SYMBOL(kernel_sendpage); +EXPORT_SYMBOL(kernel_sock_ioctl);
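+
+/*
+ * Usage sketch for the kernel_* wrappers above (illustrative only;
+ * assumes a hypothetical caller wanting an in-kernel TCP listener,
+ * error handling abbreviated):
+ *
+ *	struct socket *srv, *conn;
+ *	struct sockaddr_in sin = {
+ *		.sin_family = AF_INET,
+ *		.sin_port = htons(5555),
+ *		.sin_addr.s_addr = htonl(INADDR_ANY),
+ *	};
+ *
+ *	sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &srv);
+ *	kernel_bind(srv, (struct sockaddr *)&sin, sizeof(sin));
+ *	kernel_listen(srv, 5);
+ *	kernel_accept(srv, &conn, 0);
+ */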