/*
 * VServer IP isolation.
 *
 * This file implements netfilter hooks and AF_INET socket function
 * overrides.
 *
 * Mark Huang
 * Copyright (C) 2004 The Trustees of Princeton University
 *
 * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/pkt_sched.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>

#include "vnet_config.h"
#include "vnet.h"
#include "vnet_dbg.h"
#include "vnet_compat.h"

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX

#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>

static inline void
vnet_timewait_put(struct sock *sk)
{
	inet_twsk_put((struct inet_timewait_sock *) sk);
}

static inline struct sock *
vnet_tcp_lookup(u32 src_ip, u16 src_port, u32 ip, u16 port, int dif)
{
	return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
}

static inline int
vnet_iif(const struct sk_buff *skb)
{
	return inet_iif(skb);
}
#endif

#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12)
#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX

static inline void
vnet_timewait_put(struct sock *sk)
{
	/* net/tcp.h */
	tcp_tw_put((struct tcp_tw_bucket *) sk);
}

static inline struct sock *
vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
	extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int);
	return tcp_v4_lookup(saddr, sport, daddr, dport, dif);
}

/* Same as tcp_v4_iif() in net/ipv4/tcp_ipv4.c */
static inline int
vnet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *) skb->dst)->rt_iif;
}
#endif

#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
#warning DEMUX FUNCTIONALITY NOT SUPPORTED
#endif

int vnet_verbose = 1;

/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2,
 * etc. so that we can represent multiple bandwidth limits. The 1:1
 * subclass has children named 1:1000, 1:1001, etc., one for each
 * context (up to 4096). Similarly, the 1:2 subclass has children
 * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents
 * the node bandwidth cap and 1:1000 represents the root context's
 * share of it.
 */
int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000);
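/*
 * Worked example of the class encoding above (illustrative values):
 * TC_H_MAKE(1 << 16, 0x1000) yields the handle 0x00011000, which tc
 * prints as class 1:1000. vnet_hook() later ORs the context ID into
 * the minor number, so a connection from context 0x7b classified into
 * subclass 1:2 (minor base 0x2000) lands in class 1:207b.
 */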
#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \
			    (1 << NF_IP_LOCAL_OUT) | \
			    (1 << NF_IP_POST_ROUTING))

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
/* Standard entry. */
struct ipt_standard {
	struct ipt_entry entry;
	struct ipt_standard_target target;
};

struct ipt_error_target {
	struct ipt_entry_target target;
	char errorname[IPT_FUNCTION_MAXNAMELEN];
};

struct ipt_error {
	struct ipt_entry entry;
	struct ipt_error_target target;
};
#endif

static struct {
	struct ipt_replace repl;
	struct ipt_standard entries[3];
	struct ipt_error term;
} initial_table __initdata = {
	.repl = {
		.name = "vnet",
		.valid_hooks = FILTER_VALID_HOOKS,
		.num_entries = 4,
		.size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
		.hook_entry = {
			[NF_IP_LOCAL_IN] = 0,
			[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
			[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2,
		},
		.underflow = {
			[NF_IP_LOCAL_IN] = 0,
			[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
			[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2,
		},
	},
	.entries = {
		/* LOCAL_IN: currently unused */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
		/* LOCAL_OUT: used for logging */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
		/* POST_ROUTING: used for priority classification */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
	},
	/* ERROR */
	.term = {
		.entry = {
			.target_offset = sizeof(struct ipt_entry),
			.next_offset = sizeof(struct ipt_error),
		},
		.target = {
			.target = {
				.u = {
					.user = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
						.name = IPT_ERROR_TARGET,
					},
				},
			},
			.errorname = "ERROR",
		},
	},
};

static struct ipt_table vnet_table = {
	.name = "vnet",
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
	.table = &initial_table.repl,
#endif
	.valid_hooks = FILTER_VALID_HOOKS,
	.lock = RW_LOCK_UNLOCKED,
	.me = THIS_MODULE,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
	.af = AF_INET,
#endif
};

static inline u_int16_t
get_dst_port(struct ip_conntrack_tuple *tuple)
{
	switch (tuple->dst.protonum) {
	case IPPROTO_GRE:
		/* XXX Truncate 32-bit GRE key to 16 bits */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
		return tuple->dst.u.gre.key;
#else
		return htons(ntohl(tuple->dst.u.gre.key));
#endif
	case IPPROTO_ICMP:
		/* Bind on ICMP echo ID */
		return tuple->src.u.icmp.id;
	case IPPROTO_TCP:
		return tuple->dst.u.tcp.port;
	case IPPROTO_UDP:
		return tuple->dst.u.udp.port;
	default:
		return tuple->dst.u.all;
	}
}

static inline u_int16_t
get_src_port(struct ip_conntrack_tuple *tuple)
{
	switch (tuple->dst.protonum) {
	case IPPROTO_GRE:
		/* XXX Truncate 32-bit GRE key to 16 bits */
		return htons(ntohl(tuple->src.u.gre.key));
	case IPPROTO_ICMP:
		/* Bind on ICMP echo ID */
		return tuple->src.u.icmp.id;
	case IPPROTO_TCP:
		return tuple->src.u.tcp.port;
	case IPPROTO_UDP:
		return tuple->src.u.udp.port;
	default:
		return tuple->src.u.all;
	}
}
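/*
 * Overview of the hook below: vnet_hook() is registered at
 * NF_IP_LOCAL_IN and NF_IP_LOCAL_OUT only; the POSTROUTING chain of
 * the vnet table is traversed indirectly via ipt_do_table() for
 * classification. Inbound, it maps each tracked connection to the
 * context (xid) that bound the destination port and marks the packet
 * accordingly. Outbound, it verifies that the sender owns the source
 * address and port, then classifies the connection once.
 */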
static unsigned int
vnet_hook(unsigned int hook,
	  struct sk_buff **pskb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	u_int8_t protocol;
	u_int32_t ip;
	u_int16_t port;
	struct bind_key *key;
	xid_t xid;
	unsigned int verdict;
	int priority;
	struct sock *sk;
	int need_to_free_sk = 0;

	ct = ip_conntrack_get(*pskb, &ctinfo);
	dir = CTINFO2DIR(ctinfo);

	/* Default to marking the packet with the root context ID */
	xid = 0;

	switch (hook) {
	case NF_IP_LOCAL_IN:
		/* Multicast to 224.0.0.1 is one example */
		if (!ct)
			break;

		/* Determine if the packet is destined for a bound port */
		protocol = ct->tuplehash[dir].tuple.dst.protonum;
		assert(ctinfo == IP_CT_RELATED ||
		       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
		       protocol == (*pskb)->nh.iph->protocol);
		ip = ct->tuplehash[dir].tuple.dst.ip;
		port = get_dst_port(&ct->tuplehash[dir].tuple);

		/* Check if the port is bound */
		key = bind_get(protocol, ip, port, NULL);
		if (key && key->sk != NULL) {
			/* A new or established connection to a bound port */
			sk = key->sk;

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			/* If the bound socket is a real TCP socket, then the context that
			 * bound the port could have re-assigned an established connection
			 * socket to another context. See if this is the case.
			 */
			if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) {
				struct sock *tcp_sk;
				u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip;
				u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);

				tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb));
				if (tcp_sk) {
					if (tcp_sk->sk_state == TCP_TIME_WAIT) {
						sock_put(tcp_sk);
					} else {
						dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
						    get_sk_xid(tcp_sk),
						    NIPQUAD(src_ip), ntohs(src_port),
						    NIPQUAD(ip), ntohs(port));
						/* Remember to sock_put()! */
						sk = tcp_sk;
						need_to_free_sk = 1;
					}
				}
			}
#endif

			/* Indicate to the stack that the packet was "expected", so that it does
			 * not generate a TCP RST or ICMP Unreachable message. This requires a
			 * kernel patch.
			 */
			if (sk->sk_type == SOCK_RAW)
				(*pskb)->sk = sk;

			assert(sk);
			xid = get_sk_xid(sk);

			/* Steal the reply end of the connection */
			if (get_ct_xid(ct, !dir) != xid) {
				dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n",
				    xid, key ? "" : "un", print_protocol(protocol),
				    NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[!dir].tuple.dst.u.all),
				    get_ct_xid(ct, !dir));
				set_ct_xid(ct, !dir, xid);
			}

			/* Store the owner (if any) of the other side of the connection (if
			 * localhost) in the peercred struct.
			 */
			sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir);

			if (ctinfo == IP_CT_NEW) {
				dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n",
				    print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid);
			}

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			if (need_to_free_sk) {
				/*
				if (sk->sk_state == TCP_TIME_WAIT)
					vnet_timewait_put(sk);
				else
				*/
				sock_put(sk);
				need_to_free_sk = 0;
			}
#endif

			bind_put(key);
		} else if ((int) get_ct_xid(ct, !dir) == -1) {
			/* A new connection to an unbound port */
			if (ctinfo == IP_CT_NEW) {
				dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n",
				    print_protocol(protocol), NIPQUAD(ip), ntohs(port));
			}
		} else {
			/* A new or established connection to an unbound port that could be
			 * associated with an active socket ("could be" because the socket
			 * could be closed and the connection in a WAIT state). In any case,
			 * give it to the last owner of the connection.
			 */
			xid = get_ct_xid(ct, !dir);
		}
		break;

	case NF_IP_LOCAL_OUT:
		/* Get the context ID of the sender */
		assert((*pskb)->sk);
		xid = get_sk_xid((*pskb)->sk);

		/* Default class */
		priority = vnet_root_class;

		if (ct) {
			protocol = ct->tuplehash[dir].tuple.dst.protonum;
			assert(ctinfo == IP_CT_RELATED ||
			       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
			       protocol == (*pskb)->nh.iph->protocol);
			ip = ct->tuplehash[dir].tuple.src.ip;
			assert(ctinfo == IP_CT_RELATED ||
			       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
			       ip == __constant_htonl(INADDR_ANY) ||
			       ip == (*pskb)->nh.iph->saddr);
			port = get_src_port(&ct->tuplehash[dir].tuple);
		} else {
			protocol = port = 0;
		}

		if (xid) {
			/* Multicast to 224.0.0.1 is one example */
			if (!ct) {
				dbg("vnet_out:%d: dropping untrackable IP packet\n", xid);
				return NF_DROP;
			}

			/* XXX Is this guaranteed? */
			if ((*pskb)->len < sizeof(struct iphdr)) {
				dbg("vnet_out:%d: dropping runt IP packet\n", xid);
				return NF_DROP;
			}

			/* Check source IP address */
			if (inet_addr_type(ip) != RTN_LOCAL) {
				dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n",
				    xid, NIPQUAD(ip));
				return NF_DROP;
			}

			/* Sending of ICMP error messages not allowed */
			if (protocol == IPPROTO_ICMP) {
				struct icmphdr *icmph = (struct icmphdr *)
					((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4));

				if ((unsigned char *) &icmph[1] > (*pskb)->tail) {
					dbg("vnet_out:%d: dropping runt ICMP packet\n", xid);
					return NF_DROP;
				}

				switch (icmph->type) {
				case ICMP_ECHOREPLY:
				case ICMP_ECHO:
				case ICMP_TIMESTAMP:
				case ICMP_TIMESTAMPREPLY:
				case ICMP_INFO_REQUEST:
				case ICMP_INFO_REPLY:
				case ICMP_ADDRESS:
				case ICMP_ADDRESSREPLY:
					/* Guaranteed by icmp_pkt_to_tuple() */
					assert(port == icmph->un.echo.id);
					break;
				default:
					dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid);
					return NF_DROP;
				}
			}
		} else {
			/* Let root send anything it wants */
		}

		if (ct) {
			/* Check if the port is bound by someone else */
			key = bind_get(protocol, ip, port, NULL);
		} else {
			assert(xid == 0);
			key = NULL;
		}

		if (key && key->sk != NULL) {
			/* A new or established connection from a bound port */
			assert(ct);
			sk = key->sk;

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			/* If the bound socket is a real TCP socket, then the context that
			 * bound the port could have re-assigned an established connection
			 * socket to the sender's context. See if this is the case.
			 */
			if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM &&
			    get_sk_xid(sk) != xid) {
				struct sock *tcp_sk;
				u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip;
				u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple);

				tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb));
				if (tcp_sk) {
					if (tcp_sk->sk_state == TCP_TIME_WAIT) {
						sock_put(tcp_sk);
						/* vnet_timewait_put(tcp_sk); */
					} else {
						/* Remember to sock_put()! */
						sk = tcp_sk;
						need_to_free_sk = 1;
					}
				}
			}
#endif

			verdict = NF_ACCEPT;

			/* Stealing connections from established sockets is not allowed */
			assert(sk);
			if (get_sk_xid(sk) != xid) {
				if (xid) {
					dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n",
					    xid, print_protocol(protocol),
					    NIPQUAD(ip), ntohs(port), get_sk_xid(sk));
					verdict = NF_DROP;
				} else {
					/* Let root send whatever it wants but do not steal the packet or
					 * connection. Kernel sockets owned by root may send packets on
					 * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or
					 * TIME_WAIT).
					 */
					xid = get_sk_xid(sk);
				}
			}

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			if (need_to_free_sk) {
				/*
				if (sk->sk_state == TCP_TIME_WAIT)
					vnet_timewait_put(sk);
				else
				*/
				sock_put(sk);
				need_to_free_sk = 0;
			}
#endif

			bind_put(key);

			if (verdict == NF_DROP)
				goto done;
		} else {
			/* A new or established or untrackable connection from an unbound port */

			/* Reserved ports must be bound. Usually only root is capable of
			 * CAP_NET_BIND_SERVICE.
			 */
			if (xid &&
			    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) &&
			    ntohs(port) < PROT_SOCK) {
				assert(ct);
				dbg("vnet_out:%d: %s port %u is reserved\n",
				    xid, print_protocol(protocol), ntohs(port));
				return NF_DROP;
			}
		}

		if (ct) {
			/* Steal the connection */
			if (get_ct_xid(ct, dir) != xid) {
				dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n",
				    xid, key ? "" : "un", print_protocol(protocol),
				    NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[dir].tuple.dst.u.all),
				    get_ct_xid(ct, dir));
				set_ct_xid(ct, dir, xid);
			}

			/* Classify traffic once per connection */
			if (ct->priority == (u_int32_t) -1) {
				/* The POSTROUTING chain should classify packets into a minor subclass
				 * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet
				 * MARK early so that rules can take xid into account.
				 */
				set_skb_xid(*pskb, xid);
				(*pskb)->priority = priority;
				(void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL);
				priority = (*pskb)->priority | xid;
				dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n",
				    xid, NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[dir].tuple.dst.u.all),
				    TC_H_MAJ(priority) >> 16, TC_H_MIN(priority));
				ct->priority = priority;
			} else
				priority = ct->priority;
		} else {
			assert(xid == 0);
		}

		/* Set class */
		(*pskb)->priority = priority;
		break;
	default:
		/* Huh? */
		assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT);
		break;
	}

	/* Mark packet */
	set_skb_xid(*pskb, xid);

#ifdef VNET_DEBUG
	if (vnet_verbose >= 3) {
		if (ct)
			print_conntrack(ct, ctinfo, hook);
		if (vnet_verbose >= 4)
			print_packet(*pskb);
	}
#endif

get_verdict:
	verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL);

	/* Pass to network taps */
	if (verdict == NF_ACCEPT)
		verdict = packet_hook(*pskb, hook);

done:
	return verdict;
}

static struct nf_hook_ops vnet_ops[] = {
	{
		.hook = vnet_hook,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
		.owner = THIS_MODULE,
#endif
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_IN,
		.priority = NF_IP_PRI_LAST,
	},
	{
		.hook = vnet_hook,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
		.owner = THIS_MODULE,
#endif
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_OUT,
		.priority = NF_IP_PRI_LAST,
	},
};

/* Exported by net/ipv4/af_inet.c */
extern struct net_proto_family inet_family_ops;
extern struct proto_ops inet_stream_ops;
extern struct proto_ops inet_dgram_ops;
extern struct proto_ops inet_sockraw_ops;
extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags);
extern int inet_listen(struct socket *sock, int backlog);
extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
			      int addr_len, int flags);
extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size);
extern int inet_release(struct socket *sock);

/* Exported by net/ipv4/tcp_ipv4.c */
extern struct proto tcp_prot;
extern int tcp_port_rover;
extern int sysctl_local_port_range[2];

/* Exported by net/ipv4/udp.c */
extern struct proto udp_prot;
extern int udp_port_rover;

/* Functions that are not exported */
static int (*inet_create)(struct socket *sock, int protocol);
static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page,
				int offset, size_t size, int flags);
static void (*tcp_v4_hash)(struct sock *sk);
static void (*tcp_v4_unhash)(struct sock *sk);
static void (*udp_v4_hash)(struct sock *sk);
static void (*udp_v4_unhash)(struct sock *sk);

static int
vnet_inet_create(struct socket *sock, int protocol)
{
	int ret;

	if (sock->type == SOCK_RAW) {
		/* Temporarily give CAP_NET_RAW to root VServer accounts */
		if (current->euid)
			return -EPERM;
		cap_raise(current->cap_effective, CAP_NET_RAW);
	}

	ret = inet_create(sock, protocol);

	if (sock->type == SOCK_RAW)
		cap_lower(current->cap_effective, CAP_NET_RAW);

	if (ret)
		return ret;

	if (sock->type == SOCK_RAW) {
		struct sock *sk = sock->sk;
		struct inet_opt *inet = inet_sk(sk);

		/* Usually redundant and unused */
		assert(inet->sport == htons(inet->num));

		/* So we can track double raw binds */
		inet->sport = 0;
	}

	return ret;
}
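/*
 * Effect of vnet_inet_create() from userspace (illustrative): inside
 * a VServer context whose root account lacks CAP_NET_RAW,
 *
 *	fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
 *
 * succeeds for euid 0 because the capability is raised only for the
 * duration of inet_create(). Clearing inet->sport afterwards lets
 * vnet_inet_bind() detect a second bind of the same raw socket.
 */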
/* Make sure our bind table gets updated whenever the stack decides to
 * unhash or rehash a socket.
 */
static void
vnet_inet_unhash(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
	if (key) {
		dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
		bind_del(key);
		bind_put(key);
	}

	if (sk->sk_protocol == IPPROTO_TCP)
		tcp_v4_unhash(sk);
	else if (sk->sk_protocol == IPPROTO_UDP)
		udp_v4_unhash(sk);
}

static void
vnet_inet_hash(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);

	if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) {
		dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
	}

	if (sk->sk_protocol == IPPROTO_TCP)
		tcp_v4_hash(sk);
	else if (sk->sk_protocol == IPPROTO_UDP)
		udp_v4_hash(sk);
}

/* Port reservation */
static int
vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct inet_opt *inet = inet_sk(sk);
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct bind_key *key;
	int ret;

	/* Bind socket */
	if ((ret = inet_bind(sock, uaddr, addr_len)))
		return ret;

	lock_sock(sk);

	/* Backward compatibility with safe raw sockets */
	if (sock->type == SOCK_RAW) {
		/* Runt sockaddr */
		if (addr_len < sizeof(struct sockaddr_in))
			ret = -EINVAL;
		/* Non-local bind */
		else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) &&
			 inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL)
			ret = -EINVAL;
		/* Unspecified port */
		else if (!sin->sin_port)
			ret = -EINVAL;
		/* Reserved port */
		else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) &&
			 ntohs(sin->sin_port) < PROT_SOCK &&
			 !capable(CAP_NET_BIND_SERVICE))
			ret = -EACCES;
		/* Double bind */
		else if (inet->sport)
			ret = -EINVAL;
		if (ret)
			goto done;
		inet->saddr = sin->sin_addr.s_addr;
		inet->sport = sin->sin_port;
	}

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL);
	if (key) {
		/*
		 * If we are root or own the already bound socket, and
		 * SO_REUSEADDR has been set on both.
		 */
		if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) &&
		    key->sk->sk_reuse && sk->sk_reuse) {
			if (key->ip == __constant_htonl(INADDR_ANY)) {
				/* Keep the current bind key */
				bind_put(key);
				goto done;
			} else if (inet->saddr == __constant_htonl(INADDR_ANY)) {
				/* Consider the port to be bound to this socket now */
				bind_del(key);
			}
		}
		bind_put(key);
	}

	if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) {
		dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
	}

done:
	release_sock(sk);
	return ret;
}
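/*
 * Example of the SO_REUSEADDR handling in vnet_inet_bind() above
 * (illustrative): if a context already holds 0.0.0.0:5000 and then
 * binds 10.0.0.1:5000, both with SO_REUSEADDR, the wildcard key stays
 * the owner and no new key is added. In the reverse order, the
 * 10.0.0.1:5000 key is deleted and the new wildcard socket takes
 * ownership of the port.
 */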
/* Override TCP and UDP port rovers since they do not know about raw
 * socket binds.
 */
static int
vnet_autobind(struct sock *sk)
{
	int (*get_port)(struct sock *, unsigned short);
	int low = sysctl_local_port_range[0];
	int high = sysctl_local_port_range[1];
	int remaining = (high - low) + 1;
	int port;
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	/* Must be locked */
	assert(sock_owned_by_user(sk));

	/* Already bound to a port */
	if (inet->num)
		return 0;

	if (sk->sk_protocol == IPPROTO_TCP) {
		get_port = tcp_prot.get_port;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
		/* Approximate the tcp_v4_get_port() strategy */
		port = tcp_port_rover + 1;
#else
		/* Approximate the inet_csk_get_port() strategy */
		port = net_random() % (high - low) + low;
#endif
	} else if (sk->sk_protocol == IPPROTO_UDP) {
		get_port = udp_prot.get_port;
		port = udp_port_rover;
	} else if (sk->sk_prot->get_port) {
		err("vnet_get_port:%d: %s unhandled\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol));
		if (sk->sk_prot->get_port(sk, 0))
			return -EAGAIN;
		inet->sport = htons(inet->num);
		return 0;
	} else {
		return 0;
	}

	dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n",
	    get_sk_xid(sk), print_protocol(sk->sk_protocol),
	    NIPQUAD(inet->saddr), low, high);

	/* Find a free port by linear search. Note that the standard
	 * udp_v4_get_port() function attempts to pick a port that
	 * keeps its hash tables balanced. If the UDP hash table keeps
	 * getting bombed, we should try implementing this strategy
	 * here.
	 */
	do {
		if (port < low || port > high)
			port = low;

		/* XXX We could probably try something more clever
		 * like checking to see if the bound socket is a
		 * regular TCP socket owned by the same context (or we
		 * are root) and, if so, letting tcp_v4_get_port()
		 * apply its fast reuse logic to determine if the port
		 * can be reused.
		 */
		if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) {
			dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n",
			    get_sk_xid(sk), print_protocol(sk->sk_protocol),
			    NIPQUAD(inet->saddr), port);
			goto next;
		}

		if (get_port(sk, port)) {
			/* Can happen if we are unloaded when there are active sockets */
			dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n",
			    get_sk_xid(sk), print_protocol(sk->sk_protocol),
			    NIPQUAD(inet->saddr), port);
			key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk);
			assert(key);
			bind_del(key);
			bind_put(key);
		} else {
			assert(port == inet->num);
			inet->sport = htons(inet->num);
			break;
		}

	next:
		port++;
	} while (--remaining > 0);

	if (sk->sk_protocol == IPPROTO_UDP)
		udp_port_rover = port;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
	else if (sk->sk_protocol == IPPROTO_TCP)
		tcp_port_rover = port;
#endif

	if (remaining <= 0) {
		err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), low, high);
		return -EAGAIN;
	} else {
		dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), port);
		return 0;
	}
}
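/*
 * Sketch of the resulting autobind behaviour (assuming the default
 * net.ipv4.ip_local_port_range of 32768-61000): a UDP socket that
 * sends without an explicit bind starts at udp_port_rover, takes the
 * first port accepted by both bind_add() and the stack's get_port()
 * callback, and leaves the rover pointing at that port for the next
 * search.
 */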
static int
vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			 int addr_len, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_stream_connect() */
	if (uaddr->sa_family != AF_UNSPEC &&
	    sock->state == SS_UNCONNECTED &&
	    sk->sk_state == TCP_CLOSE) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_stream_connect(sock, uaddr, addr_len, flags);
}

static int
vnet_inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_listen() */
	if (sock->type == SOCK_STREAM &&
	    sock->state == SS_UNCONNECTED &&
	    sk->sk_state == TCP_CLOSE) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_listen(sock, backlog);
}

static int
vnet_inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_dgram_connect() */
	if (uaddr->sa_family != AF_UNSPEC) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_dgram_connect(sock, uaddr, addr_len, flags);
}

static int
vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock,
		  struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* We may need to bind the socket. */
	if (!inet_sk(sk)->num && vnet_autobind(sk)) {
		release_sock(sk);
		return -EAGAIN;
	}

	release_sock(sk);

	return inet_sendmsg(iocb, sock, msg, size);
}

static ssize_t
vnet_inet_sendpage(struct socket *sock, struct page *page,
		   int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* We may need to bind the socket. */
	if (!inet_sk(sk)->num && vnet_autobind(sk)) {
		release_sock(sk);
		return -EAGAIN;
	}

	release_sock(sk);

	return inet_sendpage(sock, page, offset, size, flags);
}

static int
vnet_inet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	/* Partial socket created by accept() */
	if (!sk)
		goto done;

	lock_sock(sk);

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
	if (key) {
		dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
		bind_del(key);
		bind_put(key);
	}

	release_sock(sk);

done:
	return inet_release(sock);
}

/* Sanity check */
#define override_op(op, from, to) \
	do { assert((op) == (from)); (op) = (to); } while (0)
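/*
 * Illustrative expansion: override_op(tcp_prot.hash, tcp_v4_hash,
 * vnet_inet_hash) becomes
 *
 *	do { assert(tcp_prot.hash == tcp_v4_hash);
 *	     tcp_prot.hash = vnet_inet_hash; } while (0);
 *
 * so vnet_init() refuses (by assertion) to install an override if
 * some other code has already redirected the operation.
 */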
static int __init
vnet_init(void)
{
	int ret;

	/* Initialize bind table */
	ret = bind_init();
	if (ret < 0)
		return ret;

	/* Register /proc entries */
	ret = proc_init();
	if (ret < 0)
		goto cleanup_bind;

	/* Register dummy netdevice */
	ret = packet_init();
	if (ret < 0)
		goto cleanup_proc;

	/* Register tap netdevice */
	ret = tun_init();
	if (ret < 0)
		goto cleanup_packet;

	/* Get pointers to unexported functions */
	inet_create = inet_family_ops.create;
	inet_sendpage = inet_dgram_ops.sendpage;
	tcp_v4_hash = tcp_prot.hash;
	tcp_v4_unhash = tcp_prot.unhash;
	udp_v4_hash = udp_prot.hash;
	udp_v4_unhash = udp_prot.unhash;

	/* Override PF_INET socket operations */
	override_op(inet_family_ops.create, inet_create, vnet_inet_create);
	override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect);
	override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen);
	override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_stream_ops.release, inet_release, vnet_inet_release);
	override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
	override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
	override_op(inet_dgram_ops.release, inet_release, vnet_inet_release);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
	override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
	override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release);
#endif
	override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash);
	override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash);
	override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash);
	override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash);

	/* Register table */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
	ret = ipt_register_table(&vnet_table, &initial_table.repl);
#else
	ret = ipt_register_table(&vnet_table);
#endif
	if (ret < 0)
		goto cleanup_override;

	/* Register hooks */
	ret = nf_register_hook(&vnet_ops[0]);
	if (ret < 0)
		goto cleanup_table;
	ret = nf_register_hook(&vnet_ops[1]);
	if (ret < 0)
		goto cleanup_hook0;

	/* Enable any runtime kernel support for VNET */
	vnet_active = 1;

	/* Print banner */
	printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n");

	return ret;

cleanup_hook0:
	nf_unregister_hook(&vnet_ops[0]);
cleanup_table:
	ipt_unregister_table(&vnet_table);
cleanup_override:
	inet_family_ops.create = inet_create;
	inet_stream_ops.bind = inet_bind;
	inet_stream_ops.connect = inet_stream_connect;
	inet_stream_ops.listen = inet_listen;
	inet_stream_ops.sendmsg = inet_sendmsg;
	inet_stream_ops.release = inet_release;
	inet_dgram_ops.bind = inet_bind;
	inet_dgram_ops.connect = inet_dgram_connect;
	inet_dgram_ops.sendmsg = inet_sendmsg;
	inet_dgram_ops.sendpage = inet_sendpage;
	inet_dgram_ops.release = inet_release;
	/* Also restore the remaining overrides installed above */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	inet_sockraw_ops.bind = inet_bind;
	inet_sockraw_ops.connect = inet_dgram_connect;
	inet_sockraw_ops.sendmsg = inet_sendmsg;
	inet_sockraw_ops.sendpage = inet_sendpage;
	inet_sockraw_ops.release = inet_release;
#endif
	tcp_prot.hash = tcp_v4_hash;
	tcp_prot.unhash = tcp_v4_unhash;
	udp_prot.hash = udp_v4_hash;
	udp_prot.unhash = udp_v4_unhash;
	tun_cleanup();
cleanup_packet:
	packet_cleanup();
cleanup_proc:
	proc_cleanup();
cleanup_bind:
	bind_cleanup();
	return ret;
}
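/*
 * Usage sketch (assumed command line, following the POSTROUTING
 * comment in vnet_hook()): after loading the module, per-subclass
 * classification rules are installed in the vnet table, e.g.
 *
 *	iptables -t vnet -A POSTROUTING -m mark --mark 0x7b \
 *		-j CLASSIFY --set-class 1:2000
 *
 * vnet_hook() then ORs the xid into the minor number, placing the
 * connection in class 1:207b.
 */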
static void __exit
vnet_exit(void)
{
	unsigned int i;

	/* Print banner */
	printk("VNET: exiting\n");

	/* Disable any runtime kernel support for VNET */
	vnet_active = 0;

	/* Stop handling packets first */
	for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++)
		nf_unregister_hook(&vnet_ops[i]);
	ipt_unregister_table(&vnet_table);

	/* Stop handling PF_INET socket operations */
	override_op(inet_family_ops.create, vnet_inet_create, inet_create);
	override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect);
	override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen);
	override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_stream_ops.release, vnet_inet_release, inet_release);
	override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
	override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
	override_op(inet_dgram_ops.release, vnet_inet_release, inet_release);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
	override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
	override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release);
#endif
	override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash);
	override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash);
	override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash);
	override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash);

	/* Disable tap netdevice */
	tun_cleanup();

	/* Disable vnet netdevice and stop handling PF_PACKET sockets */
	packet_cleanup();

	/* Unregister /proc handlers */
	proc_cleanup();

	/* Clean up bind table (must be after nf_unregister_hook()) */
	bind_cleanup();
}

module_init(vnet_init);
module_exit(vnet_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mark Huang");
MODULE_DESCRIPTION("VServer IP isolation");