/*
 * VServer IP isolation.
 *
 * This file implements netfilter hooks and AF_INET socket function
 * overrides.
 *
 * Mark Huang
 * Copyright (C) 2004 The Trustees of Princeton University
 *
 * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/pkt_sched.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>

#include "vnet_config.h"
#include "vnet.h"
#include "vnet_dbg.h"
#include "vnet_compat.h"

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX

#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>

static inline void
vnet_timewait_put(struct sock *sk)
{
	inet_twsk_put((struct inet_timewait_sock *) sk);
}

static inline struct sock *
vnet_tcp_lookup(u32 src_ip, u16 src_port, u32 ip, u16 port, int dif)
{
	return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
}

static inline int
vnet_iif(const struct sk_buff *skb)
{
	return inet_iif(skb);
}
#endif

#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12)
#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX

static inline void
vnet_timewait_put(struct sock *sk)
{
	/* net/tcp.h */
	tcp_tw_put((struct tcp_tw_bucket *) sk);
}

static inline struct sock *
vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
	extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int);
	return tcp_v4_lookup(saddr, sport, daddr, dport, dif);
}

/* Same as tcp_v4_iif() in net/ipv4/tcp_ipv4.c */
static inline int
vnet_iif(const struct sk_buff *skb)
{
	return ((struct rtable *) skb->dst)->rt_iif;
}
#endif

#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
#warning DEMUX FUNCTIONALITY NOT SUPPORTED
#endif

int vnet_verbose = 1;

/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2,
 * etc. so that we can represent multiple bandwidth limits. The 1:1
 * subclass has children named 1:1000, 1:1001, etc., one for each
 * context (up to 4096). Similarly, the 1:2 subclass has children
 * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents
 * the node bandwidth cap and 1:1000 represents the root context's
 * share of it.
 */
int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000);
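/*
 * Worked example of the class encoding above (illustrative values):
 * TC_H_MAKE(1 << 16, 0x1000) yields the handle 0x00011000, which tc
 * prints as class 1:1000. vnet_hook() later ORs the context ID into
 * the minor number, so a connection from context 0x7b classified into
 * subclass 1:2 (minor base 0x2000) lands in class 1:207b.
 */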
#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \
			    (1 << NF_IP_LOCAL_OUT) | \
			    (1 << NF_IP_POST_ROUTING))

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
/* Standard entry. */
struct ipt_standard {
	struct ipt_entry entry;
	struct ipt_standard_target target;
};

struct ipt_error_target {
	struct ipt_entry_target target;
	char errorname[IPT_FUNCTION_MAXNAMELEN];
};

struct ipt_error {
	struct ipt_entry entry;
	struct ipt_error_target target;
};
#endif

static struct {
	struct ipt_replace repl;
	struct ipt_standard entries[3];
	struct ipt_error term;
} initial_table __initdata = {
	.repl = {
		.name = "vnet",
		.valid_hooks = FILTER_VALID_HOOKS,
		.num_entries = 4,
		.size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
		.hook_entry = {
			[NF_IP_LOCAL_IN] = 0,
			[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
			[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2,
		},
		.underflow = {
			[NF_IP_LOCAL_IN] = 0,
			[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
			[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2,
		},
	},
	.entries = {
		/* LOCAL_IN: currently unused */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
		/* LOCAL_OUT: used for logging */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
		/* POST_ROUTING: used for priority classification */
		{
			.entry = {
				.target_offset = sizeof(struct ipt_entry),
				.next_offset = sizeof(struct ipt_standard),
			},
			.target = {
				.target = {
					.u = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
					},
				},
				.verdict = -NF_ACCEPT - 1,
			},
		},
	},
	/* ERROR */
	.term = {
		.entry = {
			.target_offset = sizeof(struct ipt_entry),
			.next_offset = sizeof(struct ipt_error),
		},
		.target = {
			.target = {
				.u = {
					.user = {
						.target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
						.name = IPT_ERROR_TARGET,
					},
				},
			},
			.errorname = "ERROR",
		},
	},
};

static struct ipt_table vnet_table = {
	.name = "vnet",
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
	.table = &initial_table.repl,
#endif
	.valid_hooks = FILTER_VALID_HOOKS,
	.lock = RW_LOCK_UNLOCKED,
	.me = THIS_MODULE,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
	.af = AF_INET,
#endif
};

static inline u_int16_t
get_dst_port(struct ip_conntrack_tuple *tuple)
{
	switch (tuple->dst.protonum) {
	case IPPROTO_GRE:
		/* XXX Truncate 32-bit GRE key to 16 bits */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
		return tuple->dst.u.gre.key;
#else
		return htons(ntohl(tuple->dst.u.gre.key));
#endif
	case IPPROTO_ICMP:
		/* Bind on ICMP echo ID */
		return tuple->src.u.icmp.id;
	case IPPROTO_TCP:
		return tuple->dst.u.tcp.port;
	case IPPROTO_UDP:
		return tuple->dst.u.udp.port;
	default:
		return tuple->dst.u.all;
	}
}

static inline u_int16_t
get_src_port(struct ip_conntrack_tuple *tuple)
{
	switch (tuple->dst.protonum) {
	case IPPROTO_GRE:
		/* XXX Truncate 32-bit GRE key to 16 bits */
		return htons(ntohl(tuple->src.u.gre.key));
	case IPPROTO_ICMP:
		/* Bind on ICMP echo ID */
		return tuple->src.u.icmp.id;
	case IPPROTO_TCP:
		return tuple->src.u.tcp.port;
	case IPPROTO_UDP:
		return tuple->src.u.udp.port;
	default:
		return tuple->src.u.all;
	}
}
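/*
 * Overview of the hook below: vnet_hook() is registered at
 * NF_IP_LOCAL_IN and NF_IP_LOCAL_OUT only; the POSTROUTING chain of
 * the vnet table is traversed indirectly via ipt_do_table() for
 * classification. Inbound, it maps each tracked connection to the
 * context (xid) that bound the destination port and marks the packet
 * accordingly. Outbound, it verifies that the sender owns the source
 * address and port, then classifies the connection once.
 */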
static unsigned int
vnet_hook(unsigned int hook,
	  struct sk_buff **pskb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	u_int8_t protocol;
	u_int32_t ip;
	u_int16_t port;
	struct bind_key *key;
	xid_t xid;
	unsigned int verdict;
	int priority;
	struct sock *sk;
	int need_to_free_sk = 0;

	ct = ip_conntrack_get(*pskb, &ctinfo);
	dir = CTINFO2DIR(ctinfo);

	/* Default to marking the packet with the root context ID */
	xid = 0;

	switch (hook) {
	case NF_IP_LOCAL_IN:
		/* Multicast to 224.0.0.1 is one example */
		if (!ct)
			break;

		/* Determine if the packet is destined for a bound port */
		protocol = ct->tuplehash[dir].tuple.dst.protonum;
		assert(ctinfo == IP_CT_RELATED ||
		       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
		       protocol == (*pskb)->nh.iph->protocol);
		ip = ct->tuplehash[dir].tuple.dst.ip;
		port = get_dst_port(&ct->tuplehash[dir].tuple);

		/* Check if the port is bound */
		key = bind_get(protocol, ip, port, NULL);
		if (key && key->sk != NULL) {
			/* A new or established connection to a bound port */
			sk = key->sk;

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			/* If the bound socket is a real TCP socket, then the context that
			 * bound the port could have re-assigned an established connection
			 * socket to another context. See if this is the case.
			 */
			if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) {
				struct sock *tcp_sk;
				u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip;
				u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);

				tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb));
				if (tcp_sk) {
					if (tcp_sk->sk_state == TCP_TIME_WAIT) {
						sock_put(tcp_sk);
					} else {
						dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
						    get_sk_xid(tcp_sk),
						    NIPQUAD(src_ip), ntohs(src_port),
						    NIPQUAD(ip), ntohs(port));
						/* Remember to sock_put()! */
						sk = tcp_sk;
						need_to_free_sk = 1;
					}
				}
			}
#endif

			/* Indicate to the stack that the packet was "expected", so that it does
			 * not generate a TCP RST or ICMP Unreachable message. This requires a
			 * kernel patch.
			 */
			if (sk->sk_type == SOCK_RAW)
				(*pskb)->sk = sk;

			assert(sk);
			xid = get_sk_xid(sk);

			/* Steal the reply end of the connection */
			if (get_ct_xid(ct, !dir) != xid) {
				dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n",
				    xid, key ? "" : "un", print_protocol(protocol),
				    NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[!dir].tuple.dst.u.all),
				    get_ct_xid(ct, !dir));
				set_ct_xid(ct, !dir, xid);
			}

			/* Store the owner (if any) of the other side of the connection (if
			 * localhost) in the peercred struct.
			 */
			sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir);

			if (ctinfo == IP_CT_NEW) {
				dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n",
				    print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid);
			}

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			if (need_to_free_sk) {
				/*
				if (sk->sk_state == TCP_TIME_WAIT)
					vnet_timewait_put(sk);
				else
				*/
				sock_put(sk);
				need_to_free_sk = 0;
			}
#endif

			bind_put(key);
		} else if ((int) get_ct_xid(ct, !dir) == -1) {
			/* A new connection to an unbound port */
			if (ctinfo == IP_CT_NEW) {
				dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n",
				    print_protocol(protocol), NIPQUAD(ip), ntohs(port));
			}
		} else {
			/* A new or established connection to an unbound port that could be
			 * associated with an active socket ("could be" because the socket
			 * could be closed and the connection in a WAIT state). In any case,
			 * give it to the last owner of the connection.
			 */
			xid = get_ct_xid(ct, !dir);
		}
		break;

	case NF_IP_LOCAL_OUT:
		/* Get the context ID of the sender */
		assert((*pskb)->sk);
		xid = get_sk_xid((*pskb)->sk);

		/* Default class */
		priority = vnet_root_class;

		if (ct) {
			protocol = ct->tuplehash[dir].tuple.dst.protonum;
			assert(ctinfo == IP_CT_RELATED ||
			       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
			       protocol == (*pskb)->nh.iph->protocol);
			ip = ct->tuplehash[dir].tuple.src.ip;
			assert(ctinfo == IP_CT_RELATED ||
			       ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
			       ip == __constant_htonl(INADDR_ANY) ||
			       ip == (*pskb)->nh.iph->saddr);
			port = get_src_port(&ct->tuplehash[dir].tuple);
		} else {
			protocol = port = 0;
		}

		if (xid) {
			/* Multicast to 224.0.0.1 is one example */
			if (!ct) {
				dbg("vnet_out:%d: dropping untrackable IP packet\n", xid);
				return NF_DROP;
			}

			/* XXX Is this guaranteed? */
			if ((*pskb)->len < sizeof(struct iphdr)) {
				dbg("vnet_out:%d: dropping runt IP packet\n", xid);
				return NF_DROP;
			}

			/* Check source IP address */
			if (inet_addr_type(ip) != RTN_LOCAL) {
				dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n",
				    xid, NIPQUAD(ip));
				return NF_DROP;
			}

			/* Sending of ICMP error messages not allowed */
			if (protocol == IPPROTO_ICMP) {
				struct icmphdr *icmph = (struct icmphdr *)
					((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4));

				if ((unsigned char *) &icmph[1] > (*pskb)->tail) {
					dbg("vnet_out:%d: dropping runt ICMP packet\n", xid);
					return NF_DROP;
				}

				switch (icmph->type) {
				case ICMP_ECHOREPLY:
				case ICMP_ECHO:
				case ICMP_TIMESTAMP:
				case ICMP_TIMESTAMPREPLY:
				case ICMP_INFO_REQUEST:
				case ICMP_INFO_REPLY:
				case ICMP_ADDRESS:
				case ICMP_ADDRESSREPLY:
					/* Guaranteed by icmp_pkt_to_tuple() */
					assert(port == icmph->un.echo.id);
					break;
				default:
					dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid);
					return NF_DROP;
				}
			}
		} else {
			/* Let root send anything it wants */
		}

		if (ct) {
			/* Check if the port is bound by someone else */
			key = bind_get(protocol, ip, port, NULL);
		} else {
			assert(xid == 0);
			key = NULL;
		}

		if (key && key->sk != NULL) {
			/* A new or established connection from a bound port */
			assert(ct);
			sk = key->sk;

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			/* If the bound socket is a real TCP socket, then the context that
			 * bound the port could have re-assigned an established connection
			 * socket to the sender's context. See if this is the case.
			 */
			if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM &&
			    get_sk_xid(sk) != xid) {
				struct sock *tcp_sk;
				u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip;
				u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple);

				tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb));
				if (tcp_sk) {
					if (tcp_sk->sk_state == TCP_TIME_WAIT) {
						sock_put(tcp_sk);
						/* vnet_timewait_put(tcp_sk); */
					} else {
						/* Remember to sock_put()! */
						sk = tcp_sk;
						need_to_free_sk = 1;
					}
				}
			}
#endif

			verdict = NF_ACCEPT;

			/* Stealing connections from established sockets is not allowed */
			assert(sk);
			if (get_sk_xid(sk) != xid) {
				if (xid) {
					dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n",
					    xid, print_protocol(protocol),
					    NIPQUAD(ip), ntohs(port), get_sk_xid(sk));
					verdict = NF_DROP;
				} else {
					/* Let root send whatever it wants but do not steal the packet or
					 * connection. Kernel sockets owned by root may send packets on
					 * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or
					 * TIME_WAIT).
					 */
					xid = get_sk_xid(sk);
				}
			}

#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
			if (need_to_free_sk) {
				/*
				if (sk->sk_state == TCP_TIME_WAIT)
					vnet_timewait_put(sk);
				else
				*/
				sock_put(sk);
				need_to_free_sk = 0;
			}
#endif

			bind_put(key);

			if (verdict == NF_DROP)
				goto done;
		} else {
			/* A new or established or untrackable connection from an unbound port */

			/* Reserved ports must be bound. Usually only root is capable of
			 * CAP_NET_BIND_SERVICE.
			 */
			if (xid &&
			    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) &&
			    ntohs(port) < PROT_SOCK) {
				assert(ct);
				dbg("vnet_out:%d: %s port %u is reserved\n",
				    xid, print_protocol(protocol), ntohs(port));
				return NF_DROP;
			}
		}

		if (ct) {
			/* Steal the connection */
			if (get_ct_xid(ct, dir) != xid) {
				dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n",
				    xid, key ? "" : "un", print_protocol(protocol),
				    NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[dir].tuple.dst.u.all),
				    get_ct_xid(ct, dir));
				set_ct_xid(ct, dir, xid);
			}

			/* Classify traffic once per connection */
			if (ct->priority == (u_int32_t) -1) {
				/* The POSTROUTING chain should classify packets into a minor subclass
				 * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet
				 * MARK early so that rules can take xid into account.
				 */
				set_skb_xid(*pskb, xid);
				(*pskb)->priority = priority;
				(void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL);
				priority = (*pskb)->priority | xid;
				dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n",
				    xid, NIPQUAD(ip), ntohs(port),
				    NIPQUAD(ct->tuplehash[dir].tuple.dst.ip),
				    ntohs(ct->tuplehash[dir].tuple.dst.u.all),
				    TC_H_MAJ(priority) >> 16, TC_H_MIN(priority));
				ct->priority = priority;
			} else
				priority = ct->priority;
		} else {
			assert(xid == 0);
		}

		/* Set class */
		(*pskb)->priority = priority;
		break;
	default:
		/* Huh? */
		assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT);
		break;
	}

	/* Mark packet */
	set_skb_xid(*pskb, xid);

#ifdef VNET_DEBUG
	if (vnet_verbose >= 3) {
		if (ct)
			print_conntrack(ct, ctinfo, hook);
		if (vnet_verbose >= 4)
			print_packet(*pskb);
	}
#endif

get_verdict:
	verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL);

	/* Pass to network taps */
	if (verdict == NF_ACCEPT)
		verdict = packet_hook(*pskb, hook);

done:
	return verdict;
}

static struct nf_hook_ops vnet_ops[] = {
	{
		.hook = vnet_hook,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
		.owner = THIS_MODULE,
#endif
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_IN,
		.priority = NF_IP_PRI_LAST,
	},
	{
		.hook = vnet_hook,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
		.owner = THIS_MODULE,
#endif
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_OUT,
		.priority = NF_IP_PRI_LAST,
	},
};

/* Exported by net/ipv4/af_inet.c */
extern struct net_proto_family inet_family_ops;
extern struct proto_ops inet_stream_ops;
extern struct proto_ops inet_dgram_ops;
extern struct proto_ops inet_sockraw_ops;
extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags);
extern int inet_listen(struct socket *sock, int backlog);
extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
			      int addr_len, int flags);
extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size);
extern int inet_release(struct socket *sock);

/* Exported by net/ipv4/tcp_ipv4.c */
extern struct proto tcp_prot;
extern int tcp_port_rover;
extern int sysctl_local_port_range[2];

/* Exported by net/ipv4/udp.c */
extern struct proto udp_prot;
extern int udp_port_rover;

/* Functions that are not exported */
static int (*inet_create)(struct socket *sock, int protocol);
static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page,
				int offset, size_t size, int flags);
static void (*tcp_v4_hash)(struct sock *sk);
static void (*tcp_v4_unhash)(struct sock *sk);
static void (*udp_v4_hash)(struct sock *sk);
static void (*udp_v4_unhash)(struct sock *sk);

static int
vnet_inet_create(struct socket *sock, int protocol)
{
	int ret;

	if (sock->type == SOCK_RAW) {
		/* Temporarily give CAP_NET_RAW to root VServer accounts */
		if (current->euid)
			return -EPERM;
		cap_raise(current->cap_effective, CAP_NET_RAW);
	}

	ret = inet_create(sock, protocol);

	if (sock->type == SOCK_RAW)
		cap_lower(current->cap_effective, CAP_NET_RAW);

	if (ret)
		return ret;

	if (sock->type == SOCK_RAW) {
		struct sock *sk = sock->sk;
		struct inet_opt *inet = inet_sk(sk);

		/* Usually redundant and unused */
		assert(inet->sport == htons(inet->num));

		/* So we can track double raw binds */
		inet->sport = 0;
	}

	return ret;
}
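/*
 * Effect of vnet_inet_create() from userspace (illustrative): inside
 * a VServer context whose root account lacks CAP_NET_RAW,
 *
 *	fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
 *
 * succeeds for euid 0 because the capability is raised only for the
 * duration of inet_create(). Clearing inet->sport afterwards lets
 * vnet_inet_bind() detect a second bind of the same raw socket.
 */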
/* Make sure our bind table gets updated whenever the stack decides to
 * unhash or rehash a socket.
 */
static void
vnet_inet_unhash(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
	if (key) {
		dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
		bind_del(key);
		bind_put(key);
	}

	if (sk->sk_protocol == IPPROTO_TCP)
		tcp_v4_unhash(sk);
	else if (sk->sk_protocol == IPPROTO_UDP)
		udp_v4_unhash(sk);
}

static void
vnet_inet_hash(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);

	if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) {
		dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
	}

	if (sk->sk_protocol == IPPROTO_TCP)
		tcp_v4_hash(sk);
	else if (sk->sk_protocol == IPPROTO_UDP)
		udp_v4_hash(sk);
}

/* Port reservation */
static int
vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct inet_opt *inet = inet_sk(sk);
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct bind_key *key;
	int ret;

	/* Bind socket */
	if ((ret = inet_bind(sock, uaddr, addr_len)))
		return ret;

	lock_sock(sk);

	/* Backward compatibility with safe raw sockets */
	if (sock->type == SOCK_RAW) {
		/* Runt sockaddr */
		if (addr_len < sizeof(struct sockaddr_in))
			ret = -EINVAL;
		/* Non-local bind */
		else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) &&
			 inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL)
			ret = -EINVAL;
		/* Unspecified port */
		else if (!sin->sin_port)
			ret = -EINVAL;
		/* Reserved port */
		else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) &&
			 ntohs(sin->sin_port) < PROT_SOCK &&
			 !capable(CAP_NET_BIND_SERVICE))
			ret = -EACCES;
		/* Double bind */
		else if (inet->sport)
			ret = -EINVAL;
		if (ret)
			goto done;
		inet->saddr = sin->sin_addr.s_addr;
		inet->sport = sin->sin_port;
	}

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL);
	if (key) {
		/*
		 * If we are root or own the already bound socket, and
		 * SO_REUSEADDR has been set on both.
		 */
		if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) &&
		    key->sk->sk_reuse && sk->sk_reuse) {
			if (key->ip == __constant_htonl(INADDR_ANY)) {
				/* Keep the current bind key */
				bind_put(key);
				goto done;
			} else if (inet->saddr == __constant_htonl(INADDR_ANY)) {
				/* Consider the port to be bound to this socket now */
				bind_del(key);
			}
		}
		bind_put(key);
	}

	if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) {
		dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
	}

done:
	release_sock(sk);
	return ret;
}
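/*
 * Example of the SO_REUSEADDR handling in vnet_inet_bind() above
 * (illustrative): if a context already holds 0.0.0.0:5000 and then
 * binds 10.0.0.1:5000, both with SO_REUSEADDR, the wildcard key stays
 * the owner and no new key is added. In the reverse order, the
 * 10.0.0.1:5000 key is deleted and the new wildcard socket takes
 * ownership of the port.
 */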
/* Override TCP and UDP port rovers since they do not know about raw
 * socket binds.
 */
static int
vnet_autobind(struct sock *sk)
{
	int (*get_port)(struct sock *, unsigned short);
	int low = sysctl_local_port_range[0];
	int high = sysctl_local_port_range[1];
	int remaining = (high - low) + 1;
	int port;
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	/* Must be locked */
	assert(sock_owned_by_user(sk));

	/* Already bound to a port */
	if (inet->num)
		return 0;

	if (sk->sk_protocol == IPPROTO_TCP) {
		get_port = tcp_prot.get_port;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
		/* Approximate the tcp_v4_get_port() strategy */
		port = tcp_port_rover + 1;
#else
		/* Approximate the inet_csk_get_port() strategy */
		port = net_random() % (high - low) + low;
#endif
	} else if (sk->sk_protocol == IPPROTO_UDP) {
		get_port = udp_prot.get_port;
		port = udp_port_rover;
	} else if (sk->sk_prot->get_port) {
		err("vnet_get_port:%d: %s unhandled\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol));
		if (sk->sk_prot->get_port(sk, 0))
			return -EAGAIN;
		inet->sport = htons(inet->num);
		return 0;
	} else {
		return 0;
	}

	dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n",
	    get_sk_xid(sk), print_protocol(sk->sk_protocol),
	    NIPQUAD(inet->saddr), low, high);

	/* Find a free port by linear search. Note that the standard
	 * udp_v4_get_port() function attempts to pick a port that
	 * keeps its hash tables balanced. If the UDP hash table keeps
	 * getting bombed, we should try implementing this strategy
	 * here.
	 */
	do {
		if (port < low || port > high)
			port = low;

		/* XXX We could probably try something more clever
		 * like checking to see if the bound socket is a
		 * regular TCP socket owned by the same context (or we
		 * are root) and, if so, letting tcp_v4_get_port()
		 * apply its fast reuse logic to determine if the port
		 * can be reused.
		 */
		if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) {
			dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n",
			    get_sk_xid(sk), print_protocol(sk->sk_protocol),
			    NIPQUAD(inet->saddr), port);
			goto next;
		}

		if (get_port(sk, port)) {
			/* Can happen if we are unloaded when there are active sockets */
			dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n",
			    get_sk_xid(sk), print_protocol(sk->sk_protocol),
			    NIPQUAD(inet->saddr), port);
			key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk);
			assert(key);
			bind_del(key);
			bind_put(key);
		} else {
			assert(port == inet->num);
			inet->sport = htons(inet->num);
			break;
		}

	next:
		port++;
	} while (--remaining > 0);

	if (sk->sk_protocol == IPPROTO_UDP)
		udp_port_rover = port;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
	else if (sk->sk_protocol == IPPROTO_TCP)
		tcp_port_rover = port;
#endif

	if (remaining <= 0) {
		err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), low, high);
		return -EAGAIN;
	} else {
		dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), port);
		return 0;
	}
}
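/*
 * Sketch of the resulting autobind behaviour (assuming the default
 * net.ipv4.ip_local_port_range of 32768-61000): a UDP socket that
 * sends without an explicit bind starts at udp_port_rover, takes the
 * first port accepted by both bind_add() and the stack's get_port()
 * callback, and leaves the rover pointing at that port for the next
 * search.
 */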
static int
vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			 int addr_len, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_stream_connect() */
	if (uaddr->sa_family != AF_UNSPEC &&
	    sock->state == SS_UNCONNECTED &&
	    sk->sk_state == TCP_CLOSE) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_stream_connect(sock, uaddr, addr_len, flags);
}

static int
vnet_inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_listen() */
	if (sock->type == SOCK_STREAM &&
	    sock->state == SS_UNCONNECTED &&
	    sk->sk_state == TCP_CLOSE) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_listen(sock, backlog);
}

static int
vnet_inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* Duplicates checks in inet_dgram_connect() */
	if (uaddr->sa_family != AF_UNSPEC) {
		/* We may need to bind the socket. */
		if (!inet_sk(sk)->num && vnet_autobind(sk)) {
			release_sock(sk);
			return -EAGAIN;
		}
	}

	release_sock(sk);

	return inet_dgram_connect(sock, uaddr, addr_len, flags);
}

static int
vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock,
		  struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* We may need to bind the socket. */
	if (!inet_sk(sk)->num && vnet_autobind(sk)) {
		release_sock(sk);
		return -EAGAIN;
	}

	release_sock(sk);

	return inet_sendmsg(iocb, sock, msg, size);
}

static ssize_t
vnet_inet_sendpage(struct socket *sock, struct page *page,
		   int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);

	/* We may need to bind the socket. */
	if (!inet_sk(sk)->num && vnet_autobind(sk)) {
		release_sock(sk);
		return -EAGAIN;
	}

	release_sock(sk);

	return inet_sendpage(sock, page, offset, size, flags);
}

static int
vnet_inet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct inet_opt *inet = inet_sk(sk);
	struct bind_key *key;

	/* Partial socket created by accept() */
	if (!sk)
		goto done;

	lock_sock(sk);

	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
	if (key) {
		dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n",
		    get_sk_xid(sk), print_protocol(sk->sk_protocol),
		    NIPQUAD(inet->saddr), ntohs(inet->sport));
		bind_del(key);
		bind_put(key);
	}

	release_sock(sk);

done:
	return inet_release(sock);
}

/* Sanity check */
#define override_op(op, from, to) \
	do { assert((op) == (from)); (op) = (to); } while (0)
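/*
 * Illustrative expansion: override_op(tcp_prot.hash, tcp_v4_hash,
 * vnet_inet_hash) becomes
 *
 *	do { assert(tcp_prot.hash == tcp_v4_hash);
 *	     tcp_prot.hash = vnet_inet_hash; } while (0);
 *
 * so vnet_init() refuses (by assertion) to install an override if
 * some other code has already redirected the operation.
 */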
static int __init
vnet_init(void)
{
	int ret;

	/* Initialize bind table */
	ret = bind_init();
	if (ret < 0)
		return ret;

	/* Register /proc entries */
	ret = proc_init();
	if (ret < 0)
		goto cleanup_bind;

	/* Register dummy netdevice */
	ret = packet_init();
	if (ret < 0)
		goto cleanup_proc;

	/* Register tap netdevice */
	ret = tun_init();
	if (ret < 0)
		goto cleanup_packet;

	/* Get pointers to unexported functions */
	inet_create = inet_family_ops.create;
	inet_sendpage = inet_dgram_ops.sendpage;
	tcp_v4_hash = tcp_prot.hash;
	tcp_v4_unhash = tcp_prot.unhash;
	udp_v4_hash = udp_prot.hash;
	udp_v4_unhash = udp_prot.unhash;

	/* Override PF_INET socket operations */
	override_op(inet_family_ops.create, inet_create, vnet_inet_create);
	override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect);
	override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen);
	override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_stream_ops.release, inet_release, vnet_inet_release);
	override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
	override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
	override_op(inet_dgram_ops.release, inet_release, vnet_inet_release);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind);
	override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
	override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
	override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
	override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release);
#endif
	override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash);
	override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash);
	override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash);
	override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash);

	/* Register table */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
	ret = ipt_register_table(&vnet_table, &initial_table.repl);
#else
	ret = ipt_register_table(&vnet_table);
#endif
	if (ret < 0)
		goto cleanup_override;

	/* Register hooks */
	ret = nf_register_hook(&vnet_ops[0]);
	if (ret < 0)
		goto cleanup_table;
	ret = nf_register_hook(&vnet_ops[1]);
	if (ret < 0)
		goto cleanup_hook0;

	/* Enable any runtime kernel support for VNET */
	vnet_active = 1;

	/* Print banner */
	printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n");

	return ret;

cleanup_hook0:
	nf_unregister_hook(&vnet_ops[0]);
cleanup_table:
	ipt_unregister_table(&vnet_table);
cleanup_override:
	inet_family_ops.create = inet_create;
	inet_stream_ops.bind = inet_bind;
	inet_stream_ops.connect = inet_stream_connect;
	inet_stream_ops.listen = inet_listen;
	inet_stream_ops.sendmsg = inet_sendmsg;
	inet_stream_ops.release = inet_release;
	inet_dgram_ops.bind = inet_bind;
	inet_dgram_ops.connect = inet_dgram_connect;
	inet_dgram_ops.sendmsg = inet_sendmsg;
	inet_dgram_ops.sendpage = inet_sendpage;
	inet_dgram_ops.release = inet_release;
	/* Also restore the remaining overrides installed above */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	inet_sockraw_ops.bind = inet_bind;
	inet_sockraw_ops.connect = inet_dgram_connect;
	inet_sockraw_ops.sendmsg = inet_sendmsg;
	inet_sockraw_ops.sendpage = inet_sendpage;
	inet_sockraw_ops.release = inet_release;
#endif
	tcp_prot.hash = tcp_v4_hash;
	tcp_prot.unhash = tcp_v4_unhash;
	udp_prot.hash = udp_v4_hash;
	udp_prot.unhash = udp_v4_unhash;
	tun_cleanup();
cleanup_packet:
	packet_cleanup();
cleanup_proc:
	proc_cleanup();
cleanup_bind:
	bind_cleanup();
	return ret;
}
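/*
 * Usage sketch (assumed command line, following the POSTROUTING
 * comment in vnet_hook()): after loading the module, per-subclass
 * classification rules are installed in the vnet table, e.g.
 *
 *	iptables -t vnet -A POSTROUTING -m mark --mark 0x7b \
 *		-j CLASSIFY --set-class 1:2000
 *
 * vnet_hook() then ORs the xid into the minor number, placing the
 * connection in class 1:207b.
 */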
static void __exit
vnet_exit(void)
{
	unsigned int i;

	/* Print banner */
	printk("VNET: exiting\n");

	/* Disable any runtime kernel support for VNET */
	vnet_active = 0;

	/* Stop handling packets first */
	for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++)
		nf_unregister_hook(&vnet_ops[i]);
	ipt_unregister_table(&vnet_table);

	/* Stop handling PF_INET socket operations */
	override_op(inet_family_ops.create, vnet_inet_create, inet_create);
	override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect);
	override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen);
	override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_stream_ops.release, vnet_inet_release, inet_release);
	override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
	override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
	override_op(inet_dgram_ops.release, vnet_inet_release, inet_release);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind);
	override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
	override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
	override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
	override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release);
#endif
	override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash);
	override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash);
	override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash);
	override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash);

	/* Disable tap netdevice */
	tun_cleanup();

	/* Disable vnet netdevice and stop handling PF_PACKET sockets */
	packet_cleanup();

	/* Unregister /proc handlers */
	proc_cleanup();

	/* Clean up bind table (must be after nf_unregister_hook()) */
	bind_cleanup();
}

module_init(vnet_init);
module_exit(vnet_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mark Huang");
MODULE_DESCRIPTION("VServer IP isolation");