+++ /dev/null
-/*
- * VServer IP isolation.
- *
- * This file implements netfilter hooks and AF_INET socket function
- * overrides.
- *
- * Mark Huang <mlhuang@cs.princeton.edu>
- * Copyright (C) 2004 The Trustees of Princeton University
- *
- * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $
- */
-
-#include <linux/version.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/pkt_sched.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-#include "vnet_config.h"
-#include "vnet.h"
-#include "vnet_dbg.h"
-#include "vnet_compat.h"
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
-
-#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-
-#include <net/inet_hashtables.h>
-
-/* Drop a reference through inet_twsk_put(), treating sk as an
- * inet_timewait_sock. */
-static inline void
-vnet_timewait_put(struct sock *sk)
-{
-    struct inet_timewait_sock *tw = (struct inet_timewait_sock *) sk;
-
-    inet_twsk_put(tw);
-}
-
-/* Look the given 4-tuple and interface up in tcp_hashinfo; the result
- * of inet_lookup() is returned unchanged. */
-static inline struct sock*
-vnet_tcp_lookup(u32 src_ip, u16 src_port,
-                u32 ip, u16 port, int dif)
-{
-    struct sock *match;
-
-    match = inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
-    return match;
-}
-
-/* Version wrapper: on this kernel the value comes from inet_iif(). */
-static inline int vnet_iif(const struct sk_buff *skb)
-{
-    int ifindex = inet_iif(skb);
-
-    return ifindex;
-}
-#endif
-
-#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12)
-
-#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-
-/* Drop a reference through tcp_tw_put() (declared in net/tcp.h),
- * treating sk as a tcp_tw_bucket. */
-static inline void
-vnet_timewait_put(struct sock *sk)
-{
-    struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *) sk;
-
-    tcp_tw_put(tw);
-}
-
-/* Look the given 4-tuple and interface up with tcp_v4_lookup(), which is
- * declared locally here; its result is returned unchanged. */
-static inline struct sock*
-vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
-{
-    extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int);
-    struct sock *match;
-
-    match = tcp_v4_lookup(saddr, sport, daddr, dport, dif);
-    return match;
-}
-
-/* Same as tcp_v4_iif() in net/ipv4/tcp_ipv4.c: read rt_iif from the
- * routing entry attached to the skb. */
-static inline int vnet_iif(const struct sk_buff *skb)
-{
-    const struct rtable *rt = (struct rtable *) skb->dst;
-
-    return rt->rt_iif;
-}
-#endif
-
-#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-#warning DEMUX FUNCTIONALITY NOT SUPPORTED
-#endif
-
-/* Debug verbosity. When VNET_DEBUG is defined, vnet_hook() prints the
- * conntrack entry at level >= 3 and additionally the packet at level >= 4. */
-int vnet_verbose = 1;
-
-/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2,
- * etc. so that we can represent multiple bandwidth limits. The 1:1
- * subclass has children named 1:1000, 1:1001, etc., one for each
- * context (up to 4096). Similarly, the 1:2 subclass has children
- * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents
- * the node bandwidth cap and 1:1000 represents the root context's
- * share of it. */
-/* vnet_hook() uses this value as the default skb priority on LOCAL_OUT. */
-int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000);
-
-/* Bitmask of the netfilter hooks for which the "vnet" table has chains;
- * used as valid_hooks in both initial_table and vnet_table. */
-#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \
- (1 << NF_IP_LOCAL_OUT) | \
- (1 << NF_IP_POST_ROUTING))
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
-
-/* Local definitions, compiled only for kernels older than 2.6.11 (see the
- * enclosing #if); presumably newer kernels provide them -- TODO confirm. */
-
-/* Standard entry. */
-struct ipt_standard
-{
- struct ipt_entry entry;
- struct ipt_standard_target target;
-};
-
-/* Target carrying an error name; used by struct ipt_error below. */
-struct ipt_error_target
-{
- struct ipt_entry_target target;
- char errorname[IPT_FUNCTION_MAXNAMELEN];
-};
-
-/* Error entry: a rule entry followed by an error target. */
-struct ipt_error
-{
- struct ipt_entry entry;
- struct ipt_error_target target;
-};
-
-#endif
-
-/* Initial ruleset for the "vnet" table: one standard entry per valid hook
- * (LOCAL_IN, LOCAL_OUT, POST_ROUTING) followed by a terminating ERROR
- * entry. Marked __initdata; it is handed to ipt_register_table() either
- * directly (>= 2.6.11) or through vnet_table.table (older kernels). */
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} initial_table __initdata =
-{
- .repl =
- {
- .name = "vnet",
- .valid_hooks = FILTER_VALID_HOOKS,
- /* 3 standard entries + 1 error entry */
- .num_entries = 4,
- .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- /* Byte offsets of each hook's entry within the entries blob */
- .hook_entry = { [NF_IP_LOCAL_IN] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, },
- /* Each chain holds a single entry, so underflow == hook_entry */
- .underflow = { [NF_IP_LOCAL_IN] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, },
- },
-
- .entries =
- {
- /* All three entries use verdict -NF_ACCEPT - 1; presumably the
- * ip_tables encoding of an ACCEPT policy -- TODO confirm. */
-
- /* LOCAL_IN: currently unused */
- { .entry = { .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_standard), },
- .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
- .verdict = -NF_ACCEPT - 1, },
- },
-
- /* LOCAL_OUT: used for logging */
- { .entry = { .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_standard), },
- .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
- .verdict = -NF_ACCEPT - 1, },
- },
-
- /* POST_ROUTING: used for priority classification */
- { .entry = { .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_standard), },
- .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
- .verdict = -NF_ACCEPT - 1, },
- },
- },
-
- /* ERROR */
- .term =
- {
- .entry = { .target_offset = sizeof(struct ipt_entry),
- .next_offset = sizeof(struct ipt_error), },
- .target = { .target = { .u = { .user = { .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
- .name = IPT_ERROR_TARGET, }, }, },
- .errorname = "ERROR", },
- },
-};
-
-/* The "vnet" iptables table. vnet_hook() traverses it with ipt_do_table();
- * vnet_init()/vnet_exit() register and unregister it. The set of fields
- * initialized depends on the kernel version. */
-static struct ipt_table vnet_table = {
- .name = "vnet",
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
- /* Older kernels take the initial ruleset through the table itself */
- .table = &initial_table.repl,
-#endif
- .valid_hooks = FILTER_VALID_HOOKS,
- .lock = RW_LOCK_UNLOCKED,
- .me = THIS_MODULE,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
- .af = AF_INET,
-#endif
-};
-
-/* Return the 16-bit value that stands in for the destination "port" of a
- * conntrack tuple, chosen by the tuple's protocol number:
- *   TCP/UDP - the destination port
- *   ICMP    - the echo ID (stored on the source side of the tuple)
- *   GRE     - the destination key (truncated on kernels before 2.6.11)
- *   other   - the generic dst.u.all field
- */
-static inline u_int16_t
-get_dst_port(struct ip_conntrack_tuple *tuple)
-{
-    u_int16_t port;
-
-    switch (tuple->dst.protonum) {
-    case IPPROTO_TCP:
-        port = tuple->dst.u.tcp.port;
-        break;
-    case IPPROTO_UDP:
-        port = tuple->dst.u.udp.port;
-        break;
-    case IPPROTO_ICMP:
-        /* Bind on ICMP echo ID */
-        port = tuple->src.u.icmp.id;
-        break;
-    case IPPROTO_GRE:
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
-        port = tuple->dst.u.gre.key;
-#else
-        /* XXX Truncate 32-bit GRE key to 16 bits */
-        port = htons(ntohl(tuple->dst.u.gre.key));
-#endif
-        break;
-    default:
-        port = tuple->dst.u.all;
-        break;
-    }
-
-    return port;
-}
-
-static inline u_int16_t
-get_src_port(struct ip_conntrack_tuple *tuple)
-{
- switch (tuple->dst.protonum) {
- case IPPROTO_GRE:
- /* XXX Truncate 32-bit GRE key to 16 bits */
- return htons(ntohl(tuple->src.u.gre.key));
- case IPPROTO_ICMP:
- /* Bind on ICMP echo ID */
- return tuple->src.u.icmp.id;
- case IPPROTO_TCP:
- return tuple->src.u.tcp.port;
- case IPPROTO_UDP:
- return tuple->src.u.udp.port;
- default:
- return tuple->src.u.all;
- }
-}
-
-
-
-/*
- * Netfilter hook registered for both NF_IP_LOCAL_IN and NF_IP_LOCAL_OUT
- * (see vnet_ops).
- *
- * LOCAL_IN:  picks the context ID (xid) that should own the packet by
- *            looking the conntrack destination up in the bind table, and
- *            records that xid on the reply direction of the connection.
- * LOCAL_OUT: takes the xid from the sending socket, applies the checks for
- *            non-root contexts (trackable, local source address, no ICMP
- *            errors, no reserved or foreign-bound ports), records the xid
- *            on the connection and sets the skb priority class.
- *
- * On both paths the skb is then marked with the xid, run through the
- * "vnet" table and, if accepted, passed to packet_hook().
- *
- * @hook:  hook number the packet is traversing
- * @pskb:  the packet
- * @in:    input device, forwarded to ipt_do_table()
- * @out:   output device, forwarded to ipt_do_table()
- * @okfn:  unused
- *
- * Returns an NF_* verdict.
- *
- * Every non-NULL key returned by bind_get() is released with bind_put(),
- * including a key whose sk is NULL.
- */
-static unsigned int
-vnet_hook(unsigned int hook,
-          struct sk_buff **pskb,
-          const struct net_device *in,
-          const struct net_device *out,
-          int (*okfn)(struct sk_buff *))
-{
-    struct ip_conntrack *ct;
-    enum ip_conntrack_info ctinfo;
-    enum ip_conntrack_dir dir;
-    u_int8_t protocol;
-    u_int32_t ip;
-    u_int16_t port;
-    struct bind_key *key;
-    xid_t xid;
-    unsigned int verdict;
-    int priority;
-    struct sock *sk;
-    int need_to_free_sk = 0;
-
-    ct = ip_conntrack_get(*pskb, &ctinfo);
-    dir = CTINFO2DIR(ctinfo);
-
-    /* Default to marking packet with root context ID */
-    xid = 0;
-
-    switch (hook) {
-
-    case NF_IP_LOCAL_IN:
-        /* Multicast to 224.0.0.1 is one example */
-        if (!ct)
-            break;
-
-        /* Determine if the packet is destined for a bound port */
-        protocol = ct->tuplehash[dir].tuple.dst.protonum;
-        assert(ctinfo == IP_CT_RELATED ||
-               ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
-               protocol == (*pskb)->nh.iph->protocol);
-        ip = ct->tuplehash[dir].tuple.dst.ip;
-        port = get_dst_port(&ct->tuplehash[dir].tuple);
-
-        /* Check if the port is bound */
-        key = bind_get(protocol, ip, port, NULL);
-
-        if (key && key->sk != NULL) {
-
-            /* A new or established connection to a bound port */
-            sk = key->sk;
-
-#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-            /* If the bound socket is a real TCP socket, then the context that
-             * bound the port could have re-assigned an established connection
-             * socket to another context. See if this is the case.
-             */
-            if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) {
-                struct sock *tcp_sk;
-                u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip;
-                u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);
-
-                tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb));
-                if (tcp_sk) {
-                    if (tcp_sk->sk_state == TCP_TIME_WAIT) {
-                        /* NOTE(review): a TIME_WAIT entry is released with
-                         * sock_put() rather than vnet_timewait_put() --
-                         * confirm this is safe on the target kernels. */
-                        sock_put(tcp_sk);
-                    } else {
-                        dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
-                            get_sk_xid(tcp_sk), NIPQUAD(src_ip), ntohs(src_port), NIPQUAD(ip), ntohs(port));
-                        /* Reference is dropped below via need_to_free_sk */
-                        sk = tcp_sk;
-                        need_to_free_sk = 1;
-                    }
-                }
-            }
-#endif
-
-            assert(sk);
-
-            /* Indicate to the stack that the packet was "expected", so that it does
-             * not generate a TCP RST or ICMP Unreachable message. This requires a
-             * kernel patch.
-             */
-            if (sk->sk_type == SOCK_RAW)
-                (*pskb)->sk = sk;
-
-            xid = get_sk_xid(sk);
-
-            /* Steal the reply end of the connection */
-            if (get_ct_xid(ct, !dir) != xid) {
-                dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid,
-                    key ? "" : "un", print_protocol(protocol),
-                    NIPQUAD(ip), ntohs(port),
-                    NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), ntohs(ct->tuplehash[!dir].tuple.dst.u.all),
-                    get_ct_xid(ct, !dir));
-                set_ct_xid(ct, !dir, xid);
-            }
-
-            /* Store the owner (if any) of the other side of the connection (if
-             * localhost) in the peercred struct.
-             */
-            sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir);
-
-            if (ctinfo == IP_CT_NEW) {
-                dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n",
-                    print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid);
-            }
-
-#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-            if (need_to_free_sk) {
-                sock_put(sk);
-                need_to_free_sk = 0;
-            }
-#endif
-            bind_put(key);
-
-        } else {
-            /* A key with no socket attached still holds a reference */
-            if (key)
-                bind_put(key);
-
-            if ((int) get_ct_xid(ct, !dir) == -1) {
-                /* A new connection to an unbound port */
-                if (ctinfo == IP_CT_NEW) {
-                    dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n",
-                        print_protocol(protocol), NIPQUAD(ip), ntohs(port));
-                }
-            } else {
-                /* A new or established connection to an unbound port that could be
-                 * associated with an active socket ("could be" because the socket
-                 * could be closed and the connection in a WAIT state). In any case,
-                 * give it to the last owner of the connection.
-                 */
-                xid = get_ct_xid(ct, !dir);
-            }
-        }
-
-        break;
-
-    case NF_IP_LOCAL_OUT:
-        /* Get the context ID of the sender */
-        assert((*pskb)->sk);
-        xid = get_sk_xid((*pskb)->sk);
-
-        /* Default class */
-        priority = vnet_root_class;
-
-        if (ct) {
-            protocol = ct->tuplehash[dir].tuple.dst.protonum;
-            assert(ctinfo == IP_CT_RELATED ||
-                   ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
-                   protocol == (*pskb)->nh.iph->protocol);
-            ip = ct->tuplehash[dir].tuple.src.ip;
-            assert(ctinfo == IP_CT_RELATED ||
-                   ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
-                   ip == __constant_htonl(INADDR_ANY) || ip == (*pskb)->nh.iph->saddr);
-            port = get_src_port(&ct->tuplehash[dir].tuple);
-        } else {
-            protocol = port = 0;
-            /* Never read without ct; set so it is not left indeterminate */
-            ip = 0;
-        }
-
-        if (xid) {
-            /* Multicast to 224.0.0.1 is one example */
-            if (!ct) {
-                dbg("vnet_out:%d: dropping untrackable IP packet\n", xid);
-                return NF_DROP;
-            }
-
-            /* XXX Is this guaranteed? */
-            if ((*pskb)->len < sizeof(struct iphdr)) {
-                dbg("vnet_out:%d: dropping runt IP packet\n", xid);
-                return NF_DROP;
-            }
-
-            /* Check source IP address */
-            if (inet_addr_type(ip) != RTN_LOCAL) {
-                dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n", xid,
-                    NIPQUAD(ip));
-                return NF_DROP;
-            }
-
-            /* Sending of ICMP error messages not allowed */
-            if (protocol == IPPROTO_ICMP) {
-                struct icmphdr *icmph = (struct icmphdr *)((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4));
-
-                if ((unsigned char *) &icmph[1] > (*pskb)->tail) {
-                    dbg("vnet_out:%d: dropping runt ICMP packet\n", xid);
-                    return NF_DROP;
-                }
-
-                switch (icmph->type) {
-                case ICMP_ECHOREPLY:
-                case ICMP_ECHO:
-                case ICMP_TIMESTAMP:
-                case ICMP_TIMESTAMPREPLY:
-                case ICMP_INFO_REQUEST:
-                case ICMP_INFO_REPLY:
-                case ICMP_ADDRESS:
-                case ICMP_ADDRESSREPLY:
-                    /* Guaranteed by icmp_pkt_to_tuple() */
-                    assert(port == icmph->un.echo.id);
-                    break;
-                default:
-                    dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid);
-                    return NF_DROP;
-                }
-            }
-        } else {
-            /* Let root send anything it wants */
-        }
-
-        if (ct) {
-            /* Check if the port is bound by someone else */
-            key = bind_get(protocol, ip, port, NULL);
-        } else {
-            assert(xid == 0);
-            key = NULL;
-        }
-
-        if (key && key->sk != NULL) {
-            /* A new or established connection from a bound port */
-            assert(ct);
-
-            sk = key->sk;
-
-#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-            /* If the bound socket is a real TCP socket, then the context that
-             * bound the port could have re-assigned an established connection
-             * socket to the sender's context. See if this is the case.
-             */
-            if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM && get_sk_xid(sk) != xid) {
-                struct sock *tcp_sk;
-                u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip;
-                u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple);
-
-                tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb));
-                if (tcp_sk) {
-                    if (tcp_sk->sk_state == TCP_TIME_WAIT) {
-                        /* NOTE(review): see the matching note on the
-                         * LOCAL_IN path about sock_put() on TIME_WAIT. */
-                        sock_put(tcp_sk);
-                    } else {
-                        /* Reference is dropped below via need_to_free_sk */
-                        need_to_free_sk = 1;
-                        sk = tcp_sk;
-                    }
-                }
-            }
-#endif
-
-            verdict = NF_ACCEPT;
-
-            /* Stealing connections from established sockets is not allowed */
-            assert(sk);
-            if (get_sk_xid(sk) != xid) {
-                if (xid) {
-                    dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n", xid,
-                        print_protocol(protocol), NIPQUAD(ip), ntohs(port), get_sk_xid(sk));
-                    verdict = NF_DROP;
-                } else {
-                    /* Let root send whatever it wants but do not steal the packet or
-                     * connection. Kernel sockets owned by root may send packets on
-                     * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or
-                     * TIME_WAIT).
-                     */
-                    xid = get_sk_xid(sk);
-                }
-            }
-
-#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
-            if (need_to_free_sk) {
-                sock_put(sk);
-                need_to_free_sk = 0;
-            }
-#endif
-            bind_put(key);
-
-            if (verdict == NF_DROP)
-                goto done;
-        } else {
-            /* A new or established or untrackable connection from an unbound port */
-
-            /* A key with no socket attached still holds a reference. The
-             * pointer itself is only compared against NULL afterwards. */
-            if (key)
-                bind_put(key);
-
-            /* Reserved ports must be bound. Usually only root is capable of
-             * CAP_NET_BIND_SERVICE.
-             */
-            if (xid &&
-                (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) &&
-                ntohs(port) < PROT_SOCK) {
-                assert(ct);
-                dbg("vnet_out:%d: %s port %u is reserved\n", xid,
-                    print_protocol(protocol), ntohs(port));
-                return NF_DROP;
-            }
-        }
-
-        if (ct) {
-            /* Steal the connection */
-            if (get_ct_xid(ct, dir) != xid) {
-                dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid,
-                    key ? "" : "un", print_protocol(protocol),
-                    NIPQUAD(ip), ntohs(port),
-                    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all),
-                    get_ct_xid(ct, dir));
-                set_ct_xid(ct, dir, xid);
-            }
-
-            /* Classify traffic once per connection */
-            if (ct->priority == (u_int32_t) -1) {
-                /* The POSTROUTING chain should classify packets into a minor subclass
-                 * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet
-                 * MARK early so that rules can take xid into account. */
-                set_skb_xid(*pskb, xid);
-                (*pskb)->priority = priority;
-                (void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL);
-                priority = (*pskb)->priority | xid;
-                dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n", xid,
-                    NIPQUAD(ip), ntohs(port),
-                    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all),
-                    TC_H_MAJ(priority) >> 16, TC_H_MIN(priority));
-                ct->priority = priority;
-            } else
-                priority = ct->priority;
-        } else {
-            assert(xid == 0);
-        }
-
-        /* Set class */
-        (*pskb)->priority = priority;
-
-        break;
-
-    default:
-        /* Huh? */
-        assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT);
-        break;
-    }
-
-    /* Mark packet */
-    set_skb_xid(*pskb, xid);
-
-#ifdef VNET_DEBUG
-    if (vnet_verbose >= 3) {
-        if (ct)
-            print_conntrack(ct, ctinfo, hook);
-        if (vnet_verbose >= 4)
-            print_packet(*pskb);
-    }
-#endif
-
-    verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL);
-
-    /* Pass to network taps */
-    if (verdict == NF_ACCEPT)
-        verdict = packet_hook(*pskb, hook);
-
- done:
-    return verdict;
-}
-
-/* Netfilter registrations: vnet_hook() is attached to LOCAL_IN ([0]) and
- * LOCAL_OUT ([1]) for PF_INET at priority NF_IP_PRI_LAST. vnet_init()
- * registers the entries individually; vnet_exit() unregisters them all. */
-static struct nf_hook_ops vnet_ops[] = {
- {
- .hook = vnet_hook,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
- .owner = THIS_MODULE,
-#endif
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = NF_IP_PRI_LAST,
- },
- {
- .hook = vnet_hook,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
- .owner = THIS_MODULE,
-#endif
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
- .priority = NF_IP_PRI_LAST,
- },
-};
-
-/* The ops tables and functions declared here are the ones vnet_init()
- * overrides and vnet_exit() restores. */
-
-/* Exported by net/ipv4/af_inet.c */
-extern struct net_proto_family inet_family_ops;
-extern struct proto_ops inet_stream_ops;
-extern struct proto_ops inet_dgram_ops;
-extern struct proto_ops inet_sockraw_ops;
-extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
-extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags);
-extern int inet_listen(struct socket *sock, int backlog);
-extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
- int addr_len, int flags);
-extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size);
-extern int inet_release(struct socket *sock);
-
-/* Exported by net/ipv4/tcp_ipv4.c */
-extern struct proto tcp_prot;
-extern int tcp_port_rover;
-extern int sysctl_local_port_range[2];
-
-/* Exported by net/ipv4/udp.c */
-extern struct proto udp_prot;
-extern int udp_port_rover;
-
-/* Functions that are not exported */
-/* vnet_init() fills these in from the ops tables (inet_family_ops.create,
- * inet_dgram_ops.sendpage, tcp_prot/udp_prot hash and unhash) before
- * installing the overrides, so the originals can still be called and
- * later restored. */
-static int (*inet_create)(struct socket *sock, int protocol);
-static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags);
-static void (*tcp_v4_hash)(struct sock *sk);
-static void (*tcp_v4_unhash)(struct sock *sk);
-static void (*udp_v4_hash)(struct sock *sk);
-static void (*udp_v4_unhash)(struct sock *sk);
-
-/*
- * Replacement for inet_family_ops.create.
- *
- * For SOCK_RAW sockets the caller must have euid 0 (-EPERM otherwise);
- * CAP_NET_RAW is then raised in the effective set around the call to the
- * original inet_create(). The capability is lowered again only if this
- * function raised it, so a caller that already held CAP_NET_RAW keeps it.
- * After a successful raw create, inet->sport is cleared so that
- * vnet_inet_bind() can detect double binds.
- *
- * Returns 0 on success or the negative errno from inet_create().
- */
-static int
-vnet_inet_create(struct socket *sock, int protocol)
-{
-    int raised = 0;
-    int ret;
-
-    if (sock->type == SOCK_RAW) {
-        /* Temporarily give CAP_NET_RAW to root VServer accounts */
-        if (current->euid)
-            return -EPERM;
-        if (!cap_raised(current->cap_effective, CAP_NET_RAW)) {
-            cap_raise(current->cap_effective, CAP_NET_RAW);
-            raised = 1;
-        }
-    }
-
-    ret = inet_create(sock, protocol);
-
-    /* Undo only what was granted above */
-    if (raised)
-        cap_lower(current->cap_effective, CAP_NET_RAW);
-    if (ret)
-        return ret;
-
-    if (sock->type == SOCK_RAW) {
-        struct sock *sk = sock->sk;
-        struct inet_opt *inet = inet_sk(sk);
-        /* Usually redundant and unused */
-        assert(inet->sport == htons(inet->num));
-        /* So we can track double raw binds */
-        inet->sport = 0;
-    }
-
-    return ret;
-}
-
-/* Installed as tcp_prot.unhash and udp_prot.unhash so that the bind table
- * follows the stack whenever it unhashes a socket: the socket's own bind
- * entry (if any) is deleted, then the saved protocol unhash routine runs.
- * Protocols other than TCP and UDP are not forwarded anywhere.
- */
-static void
-vnet_inet_unhash(struct sock *sk)
-{
-    struct inet_opt *inet = inet_sk(sk);
-    struct bind_key *key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
-
-    if (key != NULL) {
-        dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
-            print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
-        bind_del(key);
-        bind_put(key);
-    }
-
-    switch (sk->sk_protocol) {
-    case IPPROTO_TCP:
-        tcp_v4_unhash(sk);
-        break;
-    case IPPROTO_UDP:
-        udp_v4_unhash(sk);
-        break;
-    default:
-        break;
-    }
-}
-
-/* Installed as tcp_prot.hash and udp_prot.hash: try to record the socket's
- * address/port in the bind table (a failure is ignored), then run the
- * saved protocol hash routine. Protocols other than TCP and UDP are not
- * forwarded anywhere.
- */
-static void
-vnet_inet_hash(struct sock *sk)
-{
-    struct inet_opt *inet = inet_sk(sk);
-    int added = (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0);
-
-    if (added) {
-        dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
-            print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
-    }
-
-    switch (sk->sk_protocol) {
-    case IPPROTO_TCP:
-        tcp_v4_hash(sk);
-        break;
-    case IPPROTO_UDP:
-        udp_v4_hash(sk);
-        break;
-    default:
-        break;
-    }
-}
-
-/* Port reservation */
-/*
- * Replacement for the bind op of the stream, dgram and (>= 2.6.10) raw
- * proto_ops. The stack's inet_bind() runs first; on success the binding is
- * then recorded in the vnet bind table under the socket lock.
- *
- * For SOCK_RAW sockets the address is validated here and stored in
- * inet->saddr / inet->sport, since raw sockets have no port of their own.
- *
- * Returns 0 on success, the error from inet_bind(), -EINVAL / -EACCES from
- * the raw-socket checks, or the error from bind_add().
- *
- * NOTE(review): when a check or bind_add() fails after inet_bind() has
- * succeeded, nothing here undoes the stack-level bind -- confirm intended.
- */
-static int
-vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sock *sk = sock->sk;
- struct inet_opt *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
- struct bind_key *key;
- int ret;
-
- /* Bind socket */
- if ((ret = inet_bind(sock, uaddr, addr_len)))
- return ret;
-
- lock_sock(sk);
-
- /* Backward compatibility with safe raw sockets */
- if (sock->type == SOCK_RAW) {
- /* Runt sockaddr */
- if (addr_len < sizeof(struct sockaddr_in))
- ret = -EINVAL;
- /* Non-local bind */
- else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) && inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL)
- ret = -EINVAL;
- /* Unspecified port */
- else if (!sin->sin_port)
- ret = -EINVAL;
- /* Reserved port */
- else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) &&
- ntohs(sin->sin_port) < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
- ret = -EACCES;
- /* Double bind */
- /* (vnet_inet_create() zeroes sport, so non-zero means already bound) */
- else if (inet->sport)
- ret = -EINVAL;
- if (ret)
- goto done;
- inet->saddr = sin->sin_addr.s_addr;
- inet->sport = sin->sin_port;
- }
-
- /* Look for an existing entry for this protocol/address/port */
- key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL);
- if (key) {
- /*
- * If we are root or own the already bound socket, and
- * SO_REUSEADDR has been set on both.
- */
- /* NOTE(review): key->sk is dereferenced without a NULL check here,
- * whereas vnet_hook() tests key->sk != NULL -- confirm it cannot be
- * NULL on this path. */
- if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) &&
- key->sk->sk_reuse && sk->sk_reuse) {
- if (key->ip == __constant_htonl(INADDR_ANY)) {
- /* Keep the current bind key */
- bind_put(key);
- goto done;
- } else if (inet->saddr == __constant_htonl(INADDR_ANY)) {
- /* Consider the port to be bound to this socket now */
- bind_del(key);
- }
- }
- bind_put(key);
- }
-
- /* Record the binding; a non-zero result becomes the return value */
- if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) {
- dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
- print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
- }
-
- done:
- release_sock(sk);
- return ret;
-}
-
-/* Override TCP and UDP port rovers since they do not know about raw
- * socket binds.
- *
- * Picks a local port for an unbound socket. The caller must hold the
- * socket lock. For TCP and UDP the local port range is walked linearly
- * from a starting point; each candidate is first reserved in the bind
- * table and then offered to the protocol's get_port(). Other protocols
- * fall back to their own get_port(sk, 0), if they have one.
- *
- * Returns 0 if the socket is (or already was) bound, -EAGAIN if no port
- * could be obtained.
- */
-static int
-vnet_autobind(struct sock *sk)
-{
-    int (*get_port)(struct sock *, unsigned short);
-    int low = sysctl_local_port_range[0];
-    int high = sysctl_local_port_range[1];
-    int remaining = (high - low) + 1;
-    int port;
-    struct inet_opt *inet = inet_sk(sk);
-    struct bind_key *key;
-
-    /* Must be locked */
-    assert(sock_owned_by_user(sk));
-
-    /* Already bound to a port */
-    if (inet->num)
-        return 0;
-
-    if (sk->sk_protocol == IPPROTO_TCP) {
-        get_port = tcp_prot.get_port;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
-        /* Approximate the tcp_v4_get_port() strategy */
-        port = tcp_port_rover + 1;
-#else
-        /* Approximate the inet_csk_get_port() strategy. A range holding a
-         * single port (high == low) must not reach the modulo, which
-         * would otherwise divide by zero. */
-        port = low;
-        if (high > low)
-            port += net_random() % (high - low);
-#endif
-    } else if (sk->sk_protocol == IPPROTO_UDP) {
-        get_port = udp_prot.get_port;
-        port = udp_port_rover;
-    } else if (sk->sk_prot->get_port) {
-        err("vnet_get_port:%d: %s unhandled\n", get_sk_xid(sk),
-            print_protocol(sk->sk_protocol));
-        if (sk->sk_prot->get_port(sk, 0))
-            return -EAGAIN;
-        inet->sport = htons(inet->num);
-        return 0;
-    } else {
-        return 0;
-    }
-
-    dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk),
-        print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high);
-
-    /* Find a free port by linear search. Note that the standard
-     * udp_v4_get_port() function attempts to pick a port that
-     * keeps its hash tables balanced. If the UDP hash table keeps
-     * getting bombed, we should try implementing this strategy
-     * here.
-     */
-    do {
-        /* Wrap back into the range */
-        if (port < low || port > high)
-            port = low;
-
-        /* XXX We could probably try something more clever
-         * like checking to see if the bound socket is a
-         * regular TCP socket owned by the same context (or we
-         * are root) and, if so, letting tcp_v4_get_port()
-         * apply its fast reuse logic to determine if the port
-         * can be reused.
-         */
-        if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) {
-            dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n", get_sk_xid(sk),
-                print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
-            goto next;
-        }
-
-        if (get_port(sk, port)) {
-            /* Can happen if we are unloaded when there are active sockets */
-            dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
-                print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
-            /* Back out the reservation made just above */
-            key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk);
-            assert(key);
-            bind_del(key);
-            bind_put(key);
-        } else {
-            assert(port == inet->num);
-            inet->sport = htons(inet->num);
-            break;
-        }
- next:
-        port++;
-    } while (--remaining > 0);
-
-    /* Remember where the search stopped for the next caller */
-    if (sk->sk_protocol == IPPROTO_UDP)
-        udp_port_rover = port;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
-    else if (sk->sk_protocol == IPPROTO_TCP)
-        tcp_port_rover = port;
-#endif
-
-    /* The success path breaks out before the decrement, so remaining is
-     * still positive there; it reaches zero only when every try failed. */
-    if (remaining <= 0) {
-        err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk),
-            print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high);
-        return -EAGAIN;
-    } else {
-        dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
-            print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
-        return 0;
-    }
-}
-
-/*
- * connect op for stream sockets. If the socket would be connected from
- * the unconnected/closed state and has no local port yet, one is chosen
- * with vnet_autobind() under the socket lock; the call is then handed to
- * inet_stream_connect(). Returns -EAGAIN if autobinding fails.
- */
-static int
-vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-                         int addr_len, int flags)
-{
-    struct sock *sk = sock->sk;
-    int bind_failed = 0;
-
-    lock_sock(sk);
-    /* Duplicates checks in inet_stream_connect() */
-    if (uaddr->sa_family != AF_UNSPEC &&
-        sock->state == SS_UNCONNECTED &&
-        sk->sk_state == TCP_CLOSE)
-        bind_failed = !inet_sk(sk)->num && vnet_autobind(sk);
-    release_sock(sk);
-
-    if (bind_failed)
-        return -EAGAIN;
-
-    return inet_stream_connect(sock, uaddr, addr_len, flags);
-}
-
-/*
- * listen op for stream sockets. An unconnected, closed SOCK_STREAM socket
- * without a local port is autobound through vnet_autobind() under the
- * socket lock before inet_listen() is called. Returns -EAGAIN if
- * autobinding fails.
- */
-static int
-vnet_inet_listen(struct socket *sock, int backlog)
-{
-    struct sock *sk = sock->sk;
-    int bind_failed = 0;
-
-    lock_sock(sk);
-    /* Duplicates checks in inet_listen() */
-    if (sock->type == SOCK_STREAM &&
-        sock->state == SS_UNCONNECTED &&
-        sk->sk_state == TCP_CLOSE)
-        bind_failed = !inet_sk(sk)->num && vnet_autobind(sk);
-    release_sock(sk);
-
-    if (bind_failed)
-        return -EAGAIN;
-
-    return inet_listen(sock, backlog);
-}
-
-/*
- * connect op for datagram (and, on >= 2.6.10, raw) sockets. Unless the
- * address family is AF_UNSPEC, a socket without a local port is autobound
- * through vnet_autobind() under the socket lock before
- * inet_dgram_connect() is called. Returns -EAGAIN if autobinding fails.
- */
-static int
-vnet_inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
-                        int addr_len, int flags)
-{
-    struct sock *sk = sock->sk;
-    int bind_failed = 0;
-
-    lock_sock(sk);
-    /* Duplicates checks in inet_dgram_connect() */
-    if (uaddr->sa_family != AF_UNSPEC)
-        bind_failed = !inet_sk(sk)->num && vnet_autobind(sk);
-    release_sock(sk);
-
-    if (bind_failed)
-        return -EAGAIN;
-
-    return inet_dgram_connect(sock, uaddr, addr_len, flags);
-}
-
-/*
- * sendmsg op. A socket without a local port is autobound through
- * vnet_autobind() under the socket lock before the message is passed to
- * inet_sendmsg(). Returns -EAGAIN if autobinding fails.
- */
-static int
-vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
-                  size_t size)
-{
-    struct sock *sk = sock->sk;
-    int bind_failed;
-
-    lock_sock(sk);
-    /* We may need to bind the socket. */
-    bind_failed = !inet_sk(sk)->num && vnet_autobind(sk);
-    release_sock(sk);
-
-    if (bind_failed)
-        return -EAGAIN;
-
-    return inet_sendmsg(iocb, sock, msg, size);
-}
-
-/*
- * sendpage op. A socket without a local port is autobound through
- * vnet_autobind() under the socket lock before the page is passed to the
- * saved inet_sendpage pointer. Returns -EAGAIN if autobinding fails.
- */
-static ssize_t
-vnet_inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
-{
-    struct sock *sk = sock->sk;
-    int bind_failed;
-
-    lock_sock(sk);
-    /* We may need to bind the socket. */
-    bind_failed = !inet_sk(sk)->num && vnet_autobind(sk);
-    release_sock(sk);
-
-    if (bind_failed)
-        return -EAGAIN;
-
-    return inet_sendpage(sock, page, offset, size, flags);
-}
-
-/*
- * release op. Deletes this socket's own entry from the bind table (if it
- * has one) under the socket lock, then calls inet_release(). A socket
- * whose sock->sk is NULL goes straight to inet_release().
- *
- * inet_sk() is applied only once sk is known to be non-NULL.
- */
-static int
-vnet_inet_release(struct socket *sock)
-{
-    struct sock *sk = sock->sk;
-
-    /* sk is NULL for a partial socket created by accept() */
-    if (sk) {
-        struct inet_opt *inet = inet_sk(sk);
-        struct bind_key *key;
-
-        lock_sock(sk);
-
-        key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
-        if (key) {
-            dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
-                print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
-            bind_del(key);
-            bind_put(key);
-        }
-
-        release_sock(sk);
-    }
-
-    return inet_release(sock);
-}
-
-/* Sanity check */
-/* Replace the function pointer `op` with `to`, first asserting that it
- * currently equals `from`. Used to install the overrides in vnet_init()
- * and to put the originals back in vnet_exit(). */
-#define override_op(op, from, to) do { assert((op) == (from)); (op) = (to); } while (0)
-
-/*
- * Module initialization. In order: bind table, /proc entries, the packet
- * and tun devices, the PF_INET operation overrides, the "vnet" iptables
- * table and finally the two netfilter hooks.
- *
- * On failure everything set up so far is undone in reverse order and the
- * negative error code is returned. That includes every overridden
- * pointer (the raw-socket ops and the tcp_prot/udp_prot hash and unhash
- * hooks among them), so no kernel table keeps pointing into this module
- * after a failed load.
- */
-static int __init
-vnet_init(void)
-{
-    int ret;
-
-    /* Initialize bind table */
-    ret = bind_init();
-    if (ret < 0)
-        return ret;
-
-    /* Register /proc entries */
-    ret = proc_init();
-    if (ret < 0)
-        goto cleanup_bind;
-
-    /* Register dummy netdevice */
-    ret = packet_init();
-    if (ret < 0)
-        goto cleanup_proc;
-
-    /* Register tap netdevice */
-    ret = tun_init();
-    if (ret < 0)
-        goto cleanup_packet;
-
-    /* Get pointers to unexported functions */
-    inet_create = inet_family_ops.create;
-    inet_sendpage = inet_dgram_ops.sendpage;
-    tcp_v4_hash = tcp_prot.hash;
-    tcp_v4_unhash = tcp_prot.unhash;
-    udp_v4_hash = udp_prot.hash;
-    udp_v4_unhash = udp_prot.unhash;
-
-    /* Override PF_INET socket operations */
-    override_op(inet_family_ops.create, inet_create, vnet_inet_create);
-    override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind);
-    override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect);
-    override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen);
-    override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
-    override_op(inet_stream_ops.release, inet_release, vnet_inet_release);
-    override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind);
-    override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
-    override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
-    override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
-    override_op(inet_dgram_ops.release, inet_release, vnet_inet_release);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
-    override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind);
-    override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
-    override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
-    override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
-    override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release);
-#endif
-    override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash);
-    override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash);
-    override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash);
-    override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash);
-
-    /* Register table */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
-    ret = ipt_register_table(&vnet_table, &initial_table.repl);
-#else
-    ret = ipt_register_table(&vnet_table);
-#endif
-    if (ret < 0)
-        goto cleanup_override;
-
-    /* Register hooks */
-    ret = nf_register_hook(&vnet_ops[0]);
-    if (ret < 0)
-        goto cleanup_table;
-
-    ret = nf_register_hook(&vnet_ops[1]);
-    if (ret < 0)
-        goto cleanup_hook0;
-
-    /* Enables any runtime kernel support for VNET */
-    vnet_active = 1;
-
-    /* Print banner */
-    printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n");
-
-    return ret;
-
- cleanup_hook0:
-    nf_unregister_hook(&vnet_ops[0]);
- cleanup_table:
-    ipt_unregister_table(&vnet_table);
- cleanup_override:
-    /* Put back every pointer replaced above */
-    inet_family_ops.create = inet_create;
-    inet_stream_ops.bind = inet_bind;
-    inet_stream_ops.connect = inet_stream_connect;
-    inet_stream_ops.listen = inet_listen;
-    inet_stream_ops.sendmsg = inet_sendmsg;
-    inet_stream_ops.release = inet_release;
-    inet_dgram_ops.bind = inet_bind;
-    inet_dgram_ops.connect = inet_dgram_connect;
-    inet_dgram_ops.sendmsg = inet_sendmsg;
-    inet_dgram_ops.sendpage = inet_sendpage;
-    inet_dgram_ops.release = inet_release;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
-    inet_sockraw_ops.bind = inet_bind;
-    inet_sockraw_ops.connect = inet_dgram_connect;
-    inet_sockraw_ops.sendmsg = inet_sendmsg;
-    inet_sockraw_ops.sendpage = inet_sendpage;
-    inet_sockraw_ops.release = inet_release;
-#endif
-    tcp_prot.hash = tcp_v4_hash;
-    tcp_prot.unhash = tcp_v4_unhash;
-    udp_prot.hash = udp_v4_hash;
-    udp_prot.unhash = udp_v4_unhash;
-    tun_cleanup();
- cleanup_packet:
-    packet_cleanup();
- cleanup_proc:
-    proc_cleanup();
- cleanup_bind:
-    bind_cleanup();
-
-    return ret;
-}
-
-/*
- * Module teardown, mirroring vnet_init() in reverse: stop packet
- * processing (hooks, then table), restore every overridden PF_INET
- * operation, then shut down the tun and packet devices, the /proc
- * entries and finally the bind table.
- */
-static void __exit
-vnet_exit(void)
-{
-    const unsigned int n_hooks = sizeof(vnet_ops)/sizeof(struct nf_hook_ops);
-    unsigned int i;
-
-    /* Banner */
-    printk("VNET: exiting\n");
-
-    /* Turn off runtime kernel support for VNET */
-    vnet_active = 0;
-
-    /* Packets first: unhook from netfilter, then drop the table */
-    i = 0;
-    while (i < n_hooks) {
-        nf_unregister_hook(&vnet_ops[i]);
-        i++;
-    }
-
-    ipt_unregister_table(&vnet_table);
-
-    /* Hand the PF_INET socket operations back to the stack */
-    override_op(inet_family_ops.create, vnet_inet_create, inet_create);
-    override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind);
-    override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect);
-    override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen);
-    override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
-    override_op(inet_stream_ops.release, vnet_inet_release, inet_release);
-    override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind);
-    override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
-    override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
-    override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
-    override_op(inet_dgram_ops.release, vnet_inet_release, inet_release);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
-    override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind);
-    override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
-    override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
-    override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
-    override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release);
-#endif
-    override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash);
-    override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash);
-    override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash);
-    override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash);
-
-    /* Tap netdevice */
-    tun_cleanup();
-
-    /* vnet netdevice and PF_PACKET socket handling */
-    packet_cleanup();
-
-    /* /proc handlers */
-    proc_cleanup();
-
-    /* Bind table last (must be after nf_unregister_hook()) */
-    bind_cleanup();
-}
-
-/* Module entry and exit points, followed by the module metadata. */
-module_init(vnet_init);
-module_exit(vnet_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Mark Huang <mlhuang@cs.princeton.edu>");
-MODULE_DESCRIPTION("VServer IP isolation");