From 8d5fa6e42e6a276fd43204eb011ae43e6caa0411 Mon Sep 17 00:00:00 2001 From: KyoungSoo Park Date: Wed, 25 Apr 2007 16:11:44 +0000 Subject: [PATCH] erroneously checked in --- vnet_main.c | 1202 --------------------------------------------------- 1 file changed, 1202 deletions(-) delete mode 100644 vnet_main.c diff --git a/vnet_main.c b/vnet_main.c deleted file mode 100644 index 4dfe7b4..0000000 --- a/vnet_main.c +++ /dev/null @@ -1,1202 +0,0 @@ -/* - * VServer IP isolation. - * - * This file implements netfilter hooks and AF_INET socket function - * overrides. - * - * Mark Huang - * Copyright (C) 2004 The Trustees of Princeton University - * - * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "vnet_config.h" -#include "vnet.h" -#include "vnet_dbg.h" -#include "vnet_compat.h" - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) - -#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - -#include - -static inline void -vnet_timewait_put(struct sock* sk) -{ - inet_twsk_put((struct inet_timewait_sock *)sk); -} - -static inline struct sock* -vnet_tcp_lookup(u32 src_ip, u16 src_port, - u32 ip, u16 port, int dif) -{ - return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif); -} - -static inline int vnet_iif(const struct sk_buff *skb) -{ - return inet_iif(skb); -} -#endif - -#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12) - -#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - -static inline void -vnet_timewait_put(struct sock* sk) -{ - /* net/tcp.h */ - tcp_tw_put((struct tcp_tw_bucket*)sk); -} - -static inline struct sock* -vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr,u16 dport, int dif) -{ - extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int); - return tcp_v4_lookup(saddr, sport, daddr, dport, dif); -} - -/* same as tcp_v4_iff() in net/ipv4/tcp_ipv4. */ -static inline int vnet_iif(const struct sk_buff *skb) -{ - return ((struct rtable *)skb->dst)->rt_iif; -} -#endif - -#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX -#warning DEMUX FUNCTIONALITY NOT SUPPORTED -#endif - -int vnet_verbose = 1; - -/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2, - * etc. so that we can represent multiple bandwidth limits. The 1:1 - * subclass has children named 1:1000, 1:1001, etc., one for each - * context (up to 4096). Similarly, the 1:2 subclass has children - * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents - * the node bandwidth cap and 1:1000 represents the root context's - * share of it. */ -int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000); - -#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \ - (1 << NF_IP_LOCAL_OUT) | \ - (1 << NF_IP_POST_ROUTING)) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) - -/* Standard entry. */ -struct ipt_standard -{ - struct ipt_entry entry; - struct ipt_standard_target target; -}; - -struct ipt_error_target -{ - struct ipt_entry_target target; - char errorname[IPT_FUNCTION_MAXNAMELEN]; -}; - -struct ipt_error -{ - struct ipt_entry entry; - struct ipt_error_target target; -}; - -#endif - -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} initial_table __initdata = -{ - .repl = - { - .name = "vnet", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { [NF_IP_LOCAL_IN] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, - .underflow = { [NF_IP_LOCAL_IN] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, - }, - - .entries = - { - /* LOCAL_IN: currently unused */ - { .entry = { .target_offset = sizeof(struct ipt_entry), - .next_offset = sizeof(struct ipt_standard), }, - .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, - .verdict = -NF_ACCEPT - 1, }, - }, - - /* LOCAL_OUT: used for logging */ - { .entry = { .target_offset = sizeof(struct ipt_entry), - .next_offset = sizeof(struct ipt_standard), }, - .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, - .verdict = -NF_ACCEPT - 1, }, - }, - - /* POST_ROUTING: used for priority classification */ - { .entry = { .target_offset = sizeof(struct ipt_entry), - .next_offset = sizeof(struct ipt_standard), }, - .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, - .verdict = -NF_ACCEPT - 1, }, - }, - }, - - /* ERROR */ - .term = - { - .entry = { .target_offset = sizeof(struct ipt_entry), - .next_offset = sizeof(struct ipt_error), }, - .target = { .target = { .u = { .user = { .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), - .name = IPT_ERROR_TARGET, }, }, }, - .errorname = "ERROR", }, - }, -}; - -static struct ipt_table vnet_table = { - .name = "vnet", -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) - .table = &initial_table.repl, -#endif - .valid_hooks = FILTER_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) - .af = AF_INET, -#endif -}; - -static inline u_int16_t -get_dst_port(struct ip_conntrack_tuple *tuple) -{ - switch (tuple->dst.protonum) { - case IPPROTO_GRE: - /* XXX Truncate 32-bit GRE key to 16 bits */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) - return tuple->dst.u.gre.key; -#else - return htons(ntohl(tuple->dst.u.gre.key)); -#endif - case IPPROTO_ICMP: - /* Bind on ICMP echo ID */ - return tuple->src.u.icmp.id; - case IPPROTO_TCP: - return tuple->dst.u.tcp.port; - case IPPROTO_UDP: - return tuple->dst.u.udp.port; - default: - return tuple->dst.u.all; - } -} - -static inline u_int16_t -get_src_port(struct ip_conntrack_tuple *tuple) -{ - switch (tuple->dst.protonum) { - case IPPROTO_GRE: - /* XXX Truncate 32-bit GRE key to 16 bits */ - return htons(ntohl(tuple->src.u.gre.key)); - case IPPROTO_ICMP: - /* Bind on ICMP echo ID */ - return tuple->src.u.icmp.id; - case IPPROTO_TCP: - return tuple->src.u.tcp.port; - case IPPROTO_UDP: - return tuple->src.u.udp.port; - default: - return tuple->src.u.all; - } -} - - - -static unsigned int -vnet_hook(unsigned int hook, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_conntrack *ct; - enum ip_conntrack_info ctinfo; - enum ip_conntrack_dir dir; - u_int8_t protocol; - u_int32_t ip; - u_int16_t port; - struct bind_key *key; - xid_t xid; - unsigned int verdict; - int priority; - struct sock *sk; - int need_to_free_sk = 0; - - ct = ip_conntrack_get(*pskb, &ctinfo); - dir = CTINFO2DIR(ctinfo); - - /* Default to marking packet with root context ID */ - xid = 0; - - switch (hook) { - - case NF_IP_LOCAL_IN: - /* Multicast to 224.0.0.1 is one example */ - if (!ct) - break; - - /* Determine if the packet is destined for a bound port */ - protocol = ct->tuplehash[dir].tuple.dst.protonum; - assert(ctinfo == IP_CT_RELATED || - ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || - protocol == (*pskb)->nh.iph->protocol); - ip = ct->tuplehash[dir].tuple.dst.ip; - port = get_dst_port(&ct->tuplehash[dir].tuple); - - /* Check if the port is bound */ - key = bind_get(protocol, ip, port, NULL); - - if (key && key->sk != NULL) { - - /* A new or established connection to a bound port */ - sk = key->sk; - -#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - /* If the bound socket is a real TCP socket, then the context that - * bound the port could have re-assigned an established connection - * socket to another context. See if this is the case. - */ - if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - struct sock *tcp_sk; - u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip; - u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple); - - tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb)); - if (tcp_sk) { - if (tcp_sk->sk_state == TCP_TIME_WAIT) { - sock_put(tcp_sk); - } else { - dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", - get_sk_xid(tcp_sk), NIPQUAD(src_ip), ntohs(src_port), NIPQUAD(ip), ntohs(port)); - sk = tcp_sk; - need_to_free_sk = 1; - } - /* Remember to sock_put()! */ - } - } -#endif - - /* Indicate to the stack that the packet was "expected", so that it does - * not generate a TCP RST or ICMP Unreachable message. This requires a - * kernel patch. - */ - if (sk->sk_type == SOCK_RAW) - (*pskb)->sk = sk; - - assert(sk); - xid = get_sk_xid(sk); - - /* Steal the reply end of the connection */ - if (get_ct_xid(ct, !dir) != xid) { - dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, - key ? "" : "un", print_protocol(protocol), - NIPQUAD(ip), ntohs(port), - NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), ntohs(ct->tuplehash[!dir].tuple.dst.u.all), - get_ct_xid(ct, !dir)); - set_ct_xid(ct, !dir, xid); - } - - /* Store the owner (if any) of the other side of the connection (if - * localhost) in the peercred struct. - */ - sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir); - - if (ctinfo == IP_CT_NEW) { - dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n", - print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid); - } - -#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - if (need_to_free_sk) { - /* - if (sk->sk_state == TCP_TIME_WAIT) - vnet_timewait_put(sk); - else*/ - sock_put(sk); - need_to_free_sk=0; - } -#endif - bind_put(key); - - } else if ((int) get_ct_xid(ct, !dir) == -1) { - /* A new connection to an unbound port */ - if (ctinfo == IP_CT_NEW) { - dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n", - print_protocol(protocol), NIPQUAD(ip), ntohs(port)); - } - } else { - /* A new or established connection to an unbound port that could be - * associated with an active socket ("could be" because the socket - * could be closed and the connection in a WAIT state). In any case, - * give it to the last owner of the connection. - */ - xid = get_ct_xid(ct, !dir); - } - - break; - - case NF_IP_LOCAL_OUT: - /* Get the context ID of the sender */ - assert((*pskb)->sk); - xid = get_sk_xid((*pskb)->sk); - - /* Default class */ - priority = vnet_root_class; - - if (ct) { - protocol = ct->tuplehash[dir].tuple.dst.protonum; - assert(ctinfo == IP_CT_RELATED || - ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || - protocol == (*pskb)->nh.iph->protocol); - ip = ct->tuplehash[dir].tuple.src.ip; - assert(ctinfo == IP_CT_RELATED || - ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || - ip == __constant_htonl(INADDR_ANY) || ip == (*pskb)->nh.iph->saddr); - port = get_src_port(&ct->tuplehash[dir].tuple); - } else { - protocol = port = 0; - } - - if (xid) { - /* Multicast to 224.0.0.1 is one example */ - if (!ct) { - dbg("vnet_out:%d: dropping untrackable IP packet\n", xid); - return NF_DROP; - } - - /* XXX Is this guaranteed? */ - if ((*pskb)->len < sizeof(struct iphdr)) { - dbg("vnet_out:%d: dropping runt IP packet\n", xid); - return NF_DROP; - } - - /* Check source IP address */ - if (inet_addr_type(ip) != RTN_LOCAL) { - dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n", xid, - NIPQUAD(ip)); - return NF_DROP; - } - - /* Sending of ICMP error messages not allowed */ - if (protocol == IPPROTO_ICMP) { - struct icmphdr *icmph = (struct icmphdr *)((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4)); - - if ((unsigned char *) &icmph[1] > (*pskb)->tail) { - dbg("vnet_out:%d: dropping runt ICMP packet\n", xid); - return NF_DROP; - } - - switch (icmph->type) { - case ICMP_ECHOREPLY: - case ICMP_ECHO: - case ICMP_TIMESTAMP: - case ICMP_TIMESTAMPREPLY: - case ICMP_INFO_REQUEST: - case ICMP_INFO_REPLY: - case ICMP_ADDRESS: - case ICMP_ADDRESSREPLY: - /* Guaranteed by icmp_pkt_to_tuple() */ - assert(port == icmph->un.echo.id); - break; - default: - dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid); - return NF_DROP; - } - } - } else { - /* Let root send anything it wants */ - } - - if (ct) { - /* Check if the port is bound by someone else */ - key = bind_get(protocol, ip, port, NULL); - } else { - assert(xid == 0); - key = NULL; - } - - if (key && key->sk != NULL) { - /* A new or established connection from a bound port */ - assert(ct); - - sk = key->sk; - -#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - /* If the bound socket is a real TCP socket, then the context that - * bound the port could have re-assigned an established connection - * socket to the sender's context. See if this is the case. - */ - if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM && get_sk_xid(sk) != xid) { - struct sock *tcp_sk; - u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip; - u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple); - - tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb)); - if (tcp_sk) { - if (tcp_sk->sk_state == TCP_TIME_WAIT) { - sock_put(tcp_sk); - //vnet_timewait_put(tcp_sk); - } else { - need_to_free_sk = 1; - sk = tcp_sk; - /* Remember to sock_put()! */ - } - } - } -#endif - - verdict = NF_ACCEPT; - - /* Stealing connections from established sockets is not allowed */ - assert(sk); - if (get_sk_xid(sk) != xid) { - if (xid) { - dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n", xid, - print_protocol(protocol), NIPQUAD(ip), ntohs(port), get_sk_xid(sk)); - verdict = NF_DROP; - } else { - /* Let root send whatever it wants but do not steal the packet or - * connection. Kernel sockets owned by root may send packets on - * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or - * TIME_WAIT). - */ - xid = get_sk_xid(sk); - } - } - -#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX - if (need_to_free_sk) { - /* - if (sk->sk_state == TCP_TIME_WAIT) - vnet_timewait_put(sk); - else */ - sock_put(sk); - need_to_free_sk = 0; - } -#endif - bind_put(key); - - if (verdict == NF_DROP) - goto done; - } else { - /* A new or established or untrackable connection from an unbound port */ - - /* Reserved ports must be bound. Usually only root is capable of - * CAP_NET_BIND_SERVICE. - */ - if (xid && - (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) && - ntohs(port) < PROT_SOCK) { - assert(ct); - dbg("vnet_out:%d: %s port %u is reserved\n", xid, - print_protocol(protocol), ntohs(port)); - return NF_DROP; - } - } - - if (ct) { - /* Steal the connection */ - if (get_ct_xid(ct, dir) != xid) { - dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, - key ? "" : "un", print_protocol(protocol), - NIPQUAD(ip), ntohs(port), - NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), - get_ct_xid(ct, dir)); - set_ct_xid(ct, dir, xid); - } - - /* Classify traffic once per connection */ - if (ct->priority == (u_int32_t) -1) { - /* The POSTROUTING chain should classify packets into a minor subclass - * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet - * MARK early so that rules can take xid into account. */ - set_skb_xid(*pskb, xid); - (*pskb)->priority = priority; - (void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL); - priority = (*pskb)->priority | xid; - dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n", xid, - NIPQUAD(ip), ntohs(port), - NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), - TC_H_MAJ(priority) >> 16, TC_H_MIN(priority)); - ct->priority = priority; - } else - priority = ct->priority; - } else { - assert(xid == 0); - } - - /* Set class */ - (*pskb)->priority = priority; - - break; - - default: - /* Huh? */ - assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT); - break; - } - - /* Mark packet */ - set_skb_xid(*pskb, xid); - -#ifdef VNET_DEBUG - if (vnet_verbose >= 3) { - if (ct) - print_conntrack(ct, ctinfo, hook); - if (vnet_verbose >= 4) - print_packet(*pskb); - } -#endif - - get_verdict: - verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL); - - /* Pass to network taps */ - if (verdict == NF_ACCEPT) - verdict = packet_hook(*pskb, hook); - - done: - return verdict; -} - -static struct nf_hook_ops vnet_ops[] = { - { - .hook = vnet_hook, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - .owner = THIS_MODULE, -#endif - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = NF_IP_PRI_LAST, - }, - { - .hook = vnet_hook, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - .owner = THIS_MODULE, -#endif - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_LAST, - }, -}; - -/* Exported by net/ipv4/af_inet.c */ -extern struct net_proto_family inet_family_ops; -extern struct proto_ops inet_stream_ops; -extern struct proto_ops inet_dgram_ops; -extern struct proto_ops inet_sockraw_ops; -extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags); -extern int inet_listen(struct socket *sock, int backlog); -extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, - int addr_len, int flags); -extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size); -extern int inet_release(struct socket *sock); - -/* Exported by net/ipv4/tcp_ipv4.c */ -extern struct proto tcp_prot; -extern int tcp_port_rover; -extern int sysctl_local_port_range[2]; - -/* Exported by net/ipv4/udp.c */ -extern struct proto udp_prot; -extern int udp_port_rover; - -/* Functions that are not exported */ -static int (*inet_create)(struct socket *sock, int protocol); -static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); -static void (*tcp_v4_hash)(struct sock *sk); -static void (*tcp_v4_unhash)(struct sock *sk); -static void (*udp_v4_hash)(struct sock *sk); -static void (*udp_v4_unhash)(struct sock *sk); - -static int -vnet_inet_create(struct socket *sock, int protocol) -{ - int ret; - - if (sock->type == SOCK_RAW) { - /* Temporarily give CAP_NET_RAW to root VServer accounts */ - if (current->euid) - return -EPERM; - cap_raise(current->cap_effective, CAP_NET_RAW); - } - ret = inet_create(sock, protocol); - if (sock->type == SOCK_RAW) - cap_lower(current->cap_effective, CAP_NET_RAW); - if (ret) - return ret; - - if (sock->type == SOCK_RAW) { - struct sock *sk = sock->sk; - struct inet_opt *inet = inet_sk(sk); - /* Usually redundant and unused */ - assert(inet->sport == htons(inet->num)); - /* So we can track double raw binds */ - inet->sport = 0; - } - - return ret; -} - -/* Make sure our bind table gets updated whenever the stack decides to - * unhash or rehash a socket. - */ -static void -vnet_inet_unhash(struct sock *sk) -{ - struct inet_opt *inet = inet_sk(sk); - struct bind_key *key; - - key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk); - if (key) { - dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); - bind_del(key); - bind_put(key); - } - - if (sk->sk_protocol == IPPROTO_TCP) - tcp_v4_unhash(sk); - else if (sk->sk_protocol == IPPROTO_UDP) - udp_v4_unhash(sk); -} - -static void -vnet_inet_hash(struct sock *sk) -{ - struct inet_opt *inet = inet_sk(sk); - - if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) { - dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); - } - - if (sk->sk_protocol == IPPROTO_TCP) - tcp_v4_hash(sk); - else if (sk->sk_protocol == IPPROTO_UDP) - udp_v4_hash(sk); -} - -/* Port reservation */ -static int -vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - struct inet_opt *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct bind_key *key; - int ret; - - /* Bind socket */ - if ((ret = inet_bind(sock, uaddr, addr_len))) - return ret; - - lock_sock(sk); - - /* Backward compatibility with safe raw sockets */ - if (sock->type == SOCK_RAW) { - /* Runt sockaddr */ - if (addr_len < sizeof(struct sockaddr_in)) - ret = -EINVAL; - /* Non-local bind */ - else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) && inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL) - ret = -EINVAL; - /* Unspecified port */ - else if (!sin->sin_port) - ret = -EINVAL; - /* Reserved port */ - else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) && - ntohs(sin->sin_port) < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) - ret = -EACCES; - /* Double bind */ - else if (inet->sport) - ret = -EINVAL; - if (ret) - goto done; - inet->saddr = sin->sin_addr.s_addr; - inet->sport = sin->sin_port; - } - - key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL); - if (key) { - /* - * If we are root or own the already bound socket, and - * SO_REUSEADDR has been set on both. - */ - if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) && - key->sk->sk_reuse && sk->sk_reuse) { - if (key->ip == __constant_htonl(INADDR_ANY)) { - /* Keep the current bind key */ - bind_put(key); - goto done; - } else if (inet->saddr == __constant_htonl(INADDR_ANY)) { - /* Consider the port to be bound to this socket now */ - bind_del(key); - } - } - bind_put(key); - } - - if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) { - dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); - } - - done: - release_sock(sk); - return ret; -} - -/* Override TCP and UDP port rovers since they do not know about raw - * socket binds. - */ -static int -vnet_autobind(struct sock *sk) -{ - int (*get_port)(struct sock *, unsigned short); - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int port; - struct inet_opt *inet = inet_sk(sk); - struct bind_key *key; - - /* Must be locked */ - assert(sock_owned_by_user(sk)); - - /* Already bound to a port */ - if (inet->num) - return 0; - - if (sk->sk_protocol == IPPROTO_TCP) { - get_port = tcp_prot.get_port; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) - /* Approximate the tcp_v4_get_port() strategy */ - port = tcp_port_rover + 1; -#else - /* Approximate the inet_csk_get_port() strategy */ - port = net_random() % (high - low) + low; -#endif - } else if (sk->sk_protocol == IPPROTO_UDP) { - get_port = udp_prot.get_port; - port = udp_port_rover; - } else if (sk->sk_prot->get_port) { - err("vnet_get_port:%d: %s unhandled\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol)); - if (sk->sk_prot->get_port(sk, 0)) - return -EAGAIN; - inet->sport = htons(inet->num); - return 0; - } else { - return 0; - } - - dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); - - /* Find a free port by linear search. Note that the standard - * udp_v4_get_port() function attempts to pick a port that - * keeps its hash tables balanced. If the UDP hash table keeps - * getting bombed, we should try implementing this strategy - * here. - */ - do { - if (port < low || port > high) - port = low; - - /* XXX We could probably try something more clever - * like checking to see if the bound socket is a - * regular TCP socket owned by the same context (or we - * are root) and, if so, letting tcp_v4_get_port() - * apply its fast reuse logic to determine if the port - * can be reused. - */ - if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) { - dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); - goto next; - } - - if (get_port(sk, port)) { - /* Can happen if we are unloaded when there are active sockets */ - dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); - key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk); - assert(key); - bind_del(key); - bind_put(key); - } else { - assert(port == inet->num); - inet->sport = htons(inet->num); - break; - } - next: - port++; - } while (--remaining > 0); - - if (sk->sk_protocol == IPPROTO_UDP) - udp_port_rover = port; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) - else if (sk->sk_protocol == IPPROTO_TCP) - tcp_port_rover = port; -#endif - - if (remaining <= 0) { - err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); - return -EAGAIN; - } else { - dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); - return 0; - } -} - -static int -vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - - /* Duplicates checks in inet_stream_connect() */ - if (uaddr->sa_family != AF_UNSPEC && - sock->state == SS_UNCONNECTED && - sk->sk_state == TCP_CLOSE) { - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && vnet_autobind(sk)) { - release_sock(sk); - return -EAGAIN; - } - } - - release_sock(sk); - - return inet_stream_connect(sock, uaddr, addr_len, flags); -} - -static int -vnet_inet_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - - /* Duplicates checks in inet_listen() */ - if (sock->type == SOCK_STREAM && - sock->state == SS_UNCONNECTED && - sk->sk_state == TCP_CLOSE) { - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && vnet_autobind(sk)) { - release_sock(sk); - return -EAGAIN; - } - } - - release_sock(sk); - - return inet_listen(sock, backlog); -} - -static int -vnet_inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, - int addr_len, int flags) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - - /* Duplicates checks in inet_dgram_connect() */ - if (uaddr->sa_family != AF_UNSPEC) { - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && vnet_autobind(sk)) { - release_sock(sk); - return -EAGAIN; - } - } - - release_sock(sk); - - return inet_dgram_connect(sock, uaddr, addr_len, flags); -} - -static int -vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && vnet_autobind(sk)) { - release_sock(sk); - return -EAGAIN; - } - - release_sock(sk); - - return inet_sendmsg(iocb, sock, msg, size); -} - -static ssize_t -vnet_inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - - /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && vnet_autobind(sk)) { - release_sock(sk); - return -EAGAIN; - } - - release_sock(sk); - - return inet_sendpage(sock, page, offset, size, flags); -} - -static int -vnet_inet_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct inet_opt *inet = inet_sk(sk); - struct bind_key *key; - - /* Partial socket created by accept() */ - if (!sk) - goto done; - - lock_sock(sk); - - key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk); - if (key) { - dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), - print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); - bind_del(key); - bind_put(key); - } - - release_sock(sk); - - done: - return inet_release(sock); -} - -/* Sanity check */ -#define override_op(op, from, to) do { assert((op) == (from)); (op) = (to); } while (0) - -static int __init -vnet_init(void) -{ - int ret; - - /* Initialize bind table */ - ret = bind_init(); - if (ret < 0) - return ret; - - /* Register /proc entries */ - ret = proc_init(); - if (ret < 0) - goto cleanup_bind; - - /* Register dummy netdevice */ - ret = packet_init(); - if (ret < 0) - goto cleanup_proc; - - /* Register tap netdevice */ - ret = tun_init(); - if (ret < 0) - goto cleanup_packet; - - /* Get pointers to unexported functions */ - inet_create = inet_family_ops.create; - inet_sendpage = inet_dgram_ops.sendpage; - tcp_v4_hash = tcp_prot.hash; - tcp_v4_unhash = tcp_prot.unhash; - udp_v4_hash = udp_prot.hash; - udp_v4_unhash = udp_prot.unhash; - - /* Override PF_INET socket operations */ - override_op(inet_family_ops.create, inet_create, vnet_inet_create); - override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind); - override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect); - override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen); - override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); - override_op(inet_stream_ops.release, inet_release, vnet_inet_release); - override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind); - override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect); - override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); - override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage); - override_op(inet_dgram_ops.release, inet_release, vnet_inet_release); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) - override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind); - override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect); - override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); - override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage); - override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release); -#endif - override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash); - override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash); - override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash); - override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash); - - /* Register table */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) - ret = ipt_register_table(&vnet_table, &initial_table.repl); -#else - ret = ipt_register_table(&vnet_table); -#endif - if (ret < 0) - goto cleanup_override; - - /* Register hooks */ - ret = nf_register_hook(&vnet_ops[0]); - if (ret < 0) - goto cleanup_table; - - ret = nf_register_hook(&vnet_ops[1]); - if (ret < 0) - goto cleanup_hook0; - - /* Enables any runtime kernel support for VNET */ - vnet_active = 1; - - /* Print banner */ - printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n"); - - return ret; - - cleanup_hook0: - nf_unregister_hook(&vnet_ops[0]); - cleanup_table: - ipt_unregister_table(&vnet_table); - cleanup_override: - inet_family_ops.create = inet_create; - inet_stream_ops.bind = inet_bind; - inet_stream_ops.connect = inet_stream_connect; - inet_stream_ops.listen = inet_listen; - inet_stream_ops.sendmsg = inet_sendmsg; - inet_stream_ops.release = inet_release; - inet_dgram_ops.bind = inet_bind; - inet_dgram_ops.connect = inet_dgram_connect; - inet_dgram_ops.sendmsg = inet_sendmsg; - inet_dgram_ops.sendpage = inet_sendpage; - inet_dgram_ops.release = inet_release; - tun_cleanup(); - cleanup_packet: - packet_cleanup(); - cleanup_proc: - proc_cleanup(); - cleanup_bind: - bind_cleanup(); - - return ret; -} - -static void __exit -vnet_exit(void) -{ - unsigned int i; - - /* Print banner */ - printk("VNET: exiting\n"); - - /* Disables any runtime kernel support for VNET */ - vnet_active = 0; - - /* Stop handling packets first */ - for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++) - nf_unregister_hook(&vnet_ops[i]); - - ipt_unregister_table(&vnet_table); - - /* Stop handling PF_INET socket operations */ - override_op(inet_family_ops.create, vnet_inet_create, inet_create); - override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind); - override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect); - override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen); - override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); - override_op(inet_stream_ops.release, vnet_inet_release, inet_release); - override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind); - override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect); - override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); - override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage); - override_op(inet_dgram_ops.release, vnet_inet_release, inet_release); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) - override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind); - override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect); - override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); - override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage); - override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release); -#endif - override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash); - override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash); - override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash); - override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash); - - /* Disable tap netdevice */ - tun_cleanup(); - - /* Disable vnet netdevice and stop handling PF_PACKET sockets */ - packet_cleanup(); - - /* Unregister /proc handlers */ - proc_cleanup(); - - /* Cleanup bind table (must be after nf_unregister_hook()) */ - bind_cleanup(); -} - -module_init(vnet_init); -module_exit(vnet_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Mark Huang "); -MODULE_DESCRIPTION("VServer IP isolation"); -- 2.43.0