From: KyoungSoo Park Date: Wed, 25 Apr 2007 16:07:02 +0000 (+0000) Subject: This commit was generated by cvs2svn to compensate for changes in r759, X-Git-Tag: planetlab-4_0-rc2~6 X-Git-Url: http://git.onelab.eu/?p=codemux.git;a=commitdiff_plain;h=c208f01c6a4c8d896c81692f65a302b426cb7c5f This commit was generated by cvs2svn to compensate for changes in r759, which included commits to RCS files with non-trunk default branches. --- diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..480c997 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +CC = gcc +CFLAGS = -Wall -O -g + +TARGS = codemux + +all: ${TARGS} + +clean: + rm -f ${TARGS} *.o *~ + +SHARED_OBJ = applib.o gettimeofdayex.o + +CODEMUX_OBJ = codemux.o ${SHARED_OBJ} + +codemux: ${CODEMUX_OBJ} + diff --git a/appdef.h b/appdef.h new file mode 100644 index 0000000..cc82ae8 --- /dev/null +++ b/appdef.h @@ -0,0 +1,53 @@ +#ifndef _APPDEF_H_ +#define _APPDEF_H_ + +/* useful definitions */ +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef SUCCESS +#define SUCCESS 0 +#endif + +#ifndef FAILURE +#define FAILURE (-1) +#endif + +#ifndef INT_MAX +#ifdef MAX_INT +#define INT_MAX MAXINT +#endif +#endif + +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#endif + +#ifndef NELEMS +#define NELEMS(x) (sizeof(x)/sizeof(x[0])) +#endif + +#ifndef SKIP_CHARS +#define SKIP_CHARS(x) while(!isspace((int)*x)) (x)++ +#endif + +#ifndef SKIP_SPACES +#define SKIP_SPACES(x) while (isspace((int)*x)) (x)++ +#endif + +#ifndef SKIP_WORD +#define SKIP_WORD(x) do { SKIP_SPACES(x); \ + SKIP_CHARS(x); \ + SKIP_SPACES(x);} while(0) +#endif + +#endif //_APPDEF_H_ diff --git a/applib.c b/applib.c new file mode 100644 index 0000000..a91c6aa --- /dev/null +++ b/applib.c @@ -0,0 +1,1156 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "applib.h" +#include "debug.h" +#include "codns.h" + + + +int defaultTraceSync = TRUE; + +/*-----------------------------------------------------------------*/ +float +DiffTimeVal(const struct timeval *start, const struct timeval *end) +{ + struct timeval temp; + if (end == NULL) { + end = &temp; + gettimeofday(&temp, NULL); + } + return(end->tv_sec - start->tv_sec + + 1e-6*(end->tv_usec - start->tv_usec)); +} +/*-----------------------------------------------------------------*/ +int +CreatePrivateAcceptSocketEx(int portNum, int nonBlocking, int loopbackOnly) +{ + int doReuse = 1; + struct linger doLinger; + int sock; + struct sockaddr_in sa; + + /* Create socket. */ + if ((sock = socket(PF_INET, SOCK_STREAM, 0)) == -1) + return(-1); + + /* don't linger on close */ + doLinger.l_onoff = doLinger.l_linger = 0; + if (setsockopt(sock, SOL_SOCKET, SO_LINGER, + &doLinger, sizeof(doLinger)) == -1) { + close(sock); + return(-1); + } + + /* reuse addresses */ + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + &doReuse, sizeof(doReuse)) == -1) { + close(sock); + return(-1); + } + + if (nonBlocking) { + /* make listen socket nonblocking */ + if (fcntl(sock, F_SETFL, O_NDELAY) == -1) { + close(sock); + return(-1); + } + } + + /* set up info for binding listen */ + memset(&sa, 0, sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_addr.s_addr = (loopbackOnly) ? htonl(INADDR_LOOPBACK) + : htonl(INADDR_ANY); + sa.sin_port = htons(portNum); + + /* bind the sock */ + if (bind(sock, (struct sockaddr *) &sa, sizeof(sa)) == -1) { + close(sock); + return(-1); + } + + /* start listening */ + if (listen(sock, 32) == -1) { + close(sock); + return(-1); + } + + return(sock); +} +/*-----------------------------------------------------------------*/ +int +CreatePrivateAcceptSocket(int portNum, int nonBlocking) +{ + return CreatePrivateAcceptSocketEx(portNum, nonBlocking, FALSE); +} +/*-----------------------------------------------------------------*/ +int +CreatePublicUDPSocket(int portNum) +{ + struct sockaddr_in hb_sin; + int sock; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) + return(-1); + + memset(&hb_sin, 0, sizeof(hb_sin)); + hb_sin.sin_family = AF_INET; + hb_sin.sin_addr.s_addr = INADDR_ANY; + hb_sin.sin_port = htons(portNum); + + if (bind(sock, (struct sockaddr *) &hb_sin, sizeof(hb_sin)) < 0) { + close(sock); + return(-1); + } + return(sock); +} +/*-----------------------------------------------------------------*/ +int +MakeConnection(char *name, in_addr_t netAddr, int portNum, int nonBlocking) +{ + struct sockaddr_in saddr; + int fd; + + if (name != NULL) { + struct hostent *ent; + if ((ent = gethostbyname(name)) == NULL) { + if (hdebugLog) + TRACE("failed in name lookup - %s\n", name); + return(-1); + } + memcpy(&netAddr, ent->h_addr, sizeof(netAddr)); + } + + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + if (hdebugLog) + TRACE("failed creating socket - %d\n", errno); + return(-1); + } + + if (nonBlocking) { + if (fcntl(fd, F_SETFL, O_NDELAY) < 0) { + if (hdebugLog) + TRACE("failed fcntl'ing socket - %d\n", errno); + close(fd); + return(-1); + } + } + + saddr.sin_family = AF_INET; + saddr.sin_addr.s_addr = netAddr; + saddr.sin_port = htons(portNum); + + if (connect(fd, (struct sockaddr *) &saddr, + sizeof(struct sockaddr_in)) < 0) { + if (errno == EINPROGRESS) + return(fd); + if (hdebugLog) + TRACE("failed connecting socket - %d\n", errno); + close(fd); + return(-1); + } + + return(fd); +} +/*-----------------------------------------------------------------*/ +int +MakeLoopbackConnection(int portNum, int nonBlocking) +{ + return(MakeConnection(NULL, htonl(INADDR_LOOPBACK), + portNum, nonBlocking)); +} +/*-----------------------------------------------------------------*/ +char * +GetField(const unsigned char *start, int whichField) +{ + int currField; + + /* move to first non-blank char */ + while (isspace(*start)) + start++; + + if (*start == '\0') + return(NULL); + + for (currField = 0; currField < whichField; currField++) { + /* move over this field */ + while (*start != '\0' && (!isspace(*start))) + start++; + /* move over blanks before next field */ + while (isspace(*start)) + start++; + if (*start == '\0') + return(NULL); + } + return((char *) start); +} +/* ---------------------------------------------------------------- */ +char * +GetWord(const unsigned char *start, int whichWord) +{ + /* returns a newly allocated string containing the desired word, + or NULL if there was a problem */ + unsigned char *temp; + int len = 0; + char *res; + + temp = (unsigned char *) GetField(start, whichWord); + if (!temp) + return(NULL); + while (!(temp[len] == '\0' || isspace(temp[len]))) + len++; + if (!len) + NiceExit(-1, "internal error"); + res = (char *)xcalloc(1, len+1); + if (!res) + NiceExit(-1, "out of memory"); + memcpy(res, temp, len); + return(res); +} +/* ---------------------------------------------------------------- */ +char * +GetWordEx(const unsigned char *start, int whichWord, + char* dest, int max) +{ + /* returns a newly allocated string containing the desired word, + or NULL if there was a problem */ + unsigned char *temp; + int len = 0; + + memset(dest, 0, max); + temp = (unsigned char *) GetField(start, whichWord); + if (!temp) + return(NULL); + while (!(temp[len] == '\0' || isspace(temp[len]))) + len++; + if (!len) + NiceExit(-1, "internal error"); + if (len >= max-1) + len = max-1; + memcpy(dest, temp, len); + return(dest); +} +/*-----------------------------------------------------------------*/ +int +Base36Digit(int a) +{ + if (a < 0) + return('0'); + if (a > 35) + return('z'); + if (a < 10) + return('0' + a); + return('a' + a-10); +} +/*-----------------------------------------------------------------*/ +int +Base36To10(int a) +{ + if (a >= '0' && a <= '9') + return(a - '0'); + if (a >= 'a' && a <= 'z') + return(10 + a - 'a'); + if (a >= 'A' && a <= 'Z') + return(10 + a - 'A'); + return(0); +} +/*-----------------------------------------------------------------*/ +int +PopCount(int val) +{ + int i; + int count = 0; + + for (i = 0; i < (int)sizeof(int) * 8; i++) { + if (val & (1<= l2) { + l1--; + if (!memcmp(s1,s2,l2)) + return (char *) s1; + s1++; + } + return NULL; +} +#endif +/*-----------------------------------------------------------------*/ +/* same as strnstr, except case insensitive */ +char * strncasestr(const char * s1, int s1_len, const char * s2) +{ + int l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *) s1; + + l1 = s1_len; + while (l1 >= l2) { + l1--; + if (!strncasecmp(s1,s2,l2)) + return (char *) s1; + s1++; + } + return NULL; +} + +/*-----------------------------------------------------------------*/ +char * +StrdupLower(const char *orig) +{ + char *temp; + int i; + + if ((temp = xstrdup(orig)) == NULL) + NiceExit(-1, "no memory in strduplower"); + for (i = 0; temp[i]; i++) { + if (isupper((int) temp[i])) + temp[i] = tolower(temp[i]); + } + return(temp); +} + +/*-----------------------------------------------------------------*/ +void +StrcpyLower(char *dest, const char *src) +{ + /* copy 'src' to 'dest' in lower cases. + 'dest' should have enough free space to hold src */ + int i; + + for (i = 0; src[i]; i++) { + dest[i] = (isupper((int) src[i])) ? tolower(src[i]) : src[i]; + } + + /* mark it as NULL */ + dest[i] = 0; +} +/*-----------------------------------------------------------------*/ +void +StrcpyLowerExcept(char *dest, int dest_max, const char* src, const char* except) +{ + /* copy 'src' to 'dest' in lower cases, skipping the chars in except. + 'dest' should have enough free space to hold src */ + int i, j; + + if (src == NULL) + return; + + for (i = 0, j= 0; src[i]; i++) { + if (strchr(except, src[i])) + continue; + + if (j == dest_max - 1) + break; + dest[j++] = (isupper((int) src[i])) ? tolower(src[i]) : src[i]; + } + + /* mark it as NULL */ + dest[j] = 0; +} + +/*-----------------------------------------------------------------*/ +static char * +GetNextLineBack(FILE *file, int lower, int stripComments) +{ + /* reads the next non-blank line of the file. strips off any leading + space, any comments, and any trailing space. returns a lowercase + version of the line that has been malloc'd */ + char line[1024]; + + while (fgets(line, sizeof(line), file) != NULL) { + char *temp; + int len; + + /* strip off any comments, leading and trailing whitespace */ + if (stripComments) { + if ((temp = strchr(line, '#')) != NULL) + *temp = 0; + } + len = strlen(line); + while (len > 0 && isspace((int) line[len-1])) { + len--; + line[len] = 0; + } + temp = line; + while (isspace((int) *temp)) + temp++; + if (temp[0] == 0) + continue; /* blank line, move on */ + + if (lower) + return(StrdupLower(temp)); + return(xstrdup(temp)); + } + + return(NULL); +} +/*-----------------------------------------------------------------*/ +char * +GetLowerNextLine(FILE *file) +{ + return(GetNextLineBack(file, TRUE, TRUE)); +} +/*-----------------------------------------------------------------*/ +char * +GetNextLine(FILE *file) +{ + return(GetNextLineBack(file, FALSE, TRUE)); +} +/*-----------------------------------------------------------------*/ +char * +GetNextLineNoCommStrip(FILE *file) +{ + return(GetNextLineBack(file, FALSE, FALSE)); +} +/*-----------------------------------------------------------------*/ +int +DoesSuffixMatch(char *start, int len, char *suffix) +{ + int sufLen = strlen(suffix); + + if (len < 1) + len = strlen(start); + if (len < sufLen) + return(FALSE); + if (strncasecmp(start+len-sufLen, suffix, sufLen)) + return(FALSE); + return(TRUE); +} +/*-----------------------------------------------------------------*/ +int +DoesDotlessSuffixMatch(char *start, int len, char *suffix) +{ + /* ignores any dots on end of start, suffix */ + int sufLen = strlen(suffix); + + if (len < 1) + len = strlen(start); + + while (len > 1 && start[len-1] == '.') + len--; + while (sufLen > 1 && suffix[sufLen-1] == '.') + sufLen--; + + if (len < sufLen) + return(FALSE); + if (strncasecmp(start+len-sufLen, suffix, sufLen)) + return(FALSE); + return(TRUE); +} + +/*-----------------------------------------------------------------*/ +/* */ +/* Bit Vector Implementation */ +/* */ +/*-----------------------------------------------------------------*/ +#define BIT_INDEX (0x0000001F) + +void +SetBits(int* bits, int idx, int maxNum) +{ + if (idx > (maxNum << 5)) { + TRACE("Invalid index: %d", idx); + return; + } + bits[(idx >> 5)] |= (1 << (idx & BIT_INDEX)); +} +/*-----------------------------------------------------------------*/ +int +GetBits(int* bits, int idx, int maxNum) +{ + if (idx > (maxNum << 5)) { + TRACE("Invalid index: %d", idx); + return FALSE; + } + return (bits[(idx >> 5)] & (1 << (idx & BIT_INDEX))); +} + +/*-----------------------------------------------------------------*/ +static inline int +GetNumBits_I(int bitvec) +{ + int i, count; + + for (i = 0, count = 0; i < 32; i++) + if (bitvec & (1 << i)) count++; + return count; +} + +/*-----------------------------------------------------------------*/ +int +GetNumBits(int* bitvecs, int maxNum) +{ + int i, count; + + /* get the number of bits that have been set to 1 */ + for (i = 0, count = 0; i < maxNum; i++) + count += GetNumBits_I(bitvecs[i]); + return count; +} + +/*-----------------------------------------------------------------*/ + +/* Logging & Trace support */ + +/* buffer threshold : when the size hits this value, it flushes its content + to the file */ +#define LOG_BYTES_THRESHOLD (32*1024) + +/* this flag indicates that it preserves the base file name for the current + one, and changes its name when it actually closes it off */ +#define CHANGE_FILE_NAME_ON_SAVE 0x01 + +/* size of the actual log buffer */ +#define LOG_BYTES_MAX (2*LOG_BYTES_THRESHOLD) + +/* log/trace book keeping information */ +typedef struct ExtendedLog { + char buffer[LOG_BYTES_MAX]; /* 64 KB */ + int bytes; /* number of bytes written */ + int filesize; /* number of bytes written into this file so far */ + int fd; /* file descriptor */ + char* sig; /* base file name */ + int flags; /* flags */ + time_t nextday; +} ExtendedLog, *PExtendedLog; + +/* maximum single file size */ +int maxSingleLogSize = 100 * (1024*1024); + +static time_t +GetNextLogFileName(char* file, int size, const char* sig) +{ +#define COMPRESS_EXT1 ".bz2" +#define COMPRESS_EXT2 ".gz" + + struct tm cur_tm; + time_t cur_t; + int idx = 0; + + cur_t = timeex(NULL); + cur_tm = *gmtime(&cur_t); + + for (;;) { + /* check if .bz2 exists */ + snprintf(file, size, "%s.%04d%02d%02d_%03d%s", + sig, cur_tm.tm_year+1900, cur_tm.tm_mon+1, cur_tm.tm_mday, + idx, COMPRESS_EXT1); + + if (access(file, F_OK) == 0) { + idx++; + continue; + } + + /* check if .gz exists */ + snprintf(file, size, "%s.%04d%02d%02d_%03d%s", + sig, cur_tm.tm_year+1900, cur_tm.tm_mon+1, cur_tm.tm_mday, + idx++, COMPRESS_EXT2); + + if (access(file, F_OK) == 0) + continue; + + /* strip the extension and see if the (uncompressed) file exists */ + file[strlen(file) - sizeof(COMPRESS_EXT2) + 1] = 0; + if (access(file, F_OK) != 0) + break; + } + + /* calculate & return the next day */ + cur_t -= (3600*cur_tm.tm_hour + 60*cur_tm.tm_min + cur_tm.tm_sec); + return cur_t + 60*60*24; + +#undef COMPRESS_EXT1 +#undef COMPRESS_EXT2 +} + +/*-----------------------------------------------------------------*/ +static void +FlushBuffer(HANDLE file) +{ + /* write data into the file */ + ExtendedLog* pel = (ExtendedLog *)file; + int written; + + if (pel == NULL || pel->fd < 0) + return; + + if ((written = write(pel->fd, pel->buffer, pel->bytes)) > 0) { + pel->bytes -= written; + + /* if it hasn't written all data, then we need to move memory */ + if (pel->bytes > 0) + memmove(pel->buffer, pel->buffer + written, pel->bytes); + pel->buffer[pel->bytes] = 0; + pel->filesize += written; + } + + /* if the filesize is bigger than maxSignleLogSize, then close it off */ + if (pel->filesize >= maxSingleLogSize) + OpenLogF(file); +} + +/*-----------------------------------------------------------------*/ +HANDLE +CreateLogFHandle(const char* signature, int change_file_name_on_save) +{ + ExtendedLog* pel; + + if ((pel = (ExtendedLog *)xcalloc(1, sizeof(ExtendedLog))) == NULL) + NiceExit(-1, "failed"); + + pel->fd = -1; + pel->sig = xstrdup(signature); + if (pel->sig == NULL) + NiceExit(-1, "signature copying failed"); + if (change_file_name_on_save) + pel->flags |= CHANGE_FILE_NAME_ON_SAVE; + + return pel; +} + + +/*-----------------------------------------------------------------*/ +int +OpenLogF(HANDLE file) +{ + char filename[1024]; + ExtendedLog* pel = (ExtendedLog *)file; + + if (pel == NULL) + return -1; + + if (pel->fd != -1) + close(pel->fd); + + pel->nextday = GetNextLogFileName(filename, sizeof(filename), pel->sig); + + /* change the file name only at saving time + use pel->sig as current file name */ + if (pel->flags & CHANGE_FILE_NAME_ON_SAVE) { + if (access(pel->sig, F_OK) == 0) + rename(pel->sig, filename); + strcpy(filename, pel->sig); + } + + /* file opening */ + if ((pel->fd = open(filename, O_RDWR | O_CREAT | O_APPEND, + S_IRUSR | S_IWUSR)) == -1) { + char errMessage[2048]; + sprintf(errMessage, "couldn't open the extended log file %s\n", filename); + NiceExit(-1, errMessage); + } + + /* reset the file size */ + pel->filesize = 0; + return 0; +} + +/*-----------------------------------------------------------------*/ +int +WriteLog(HANDLE file, const char* data, int size, int forceFlush) +{ + ExtendedLog* pel = (ExtendedLog *)file; + + /* if an error might occur, then stop here */ + if (pel == NULL || pel->fd < 0 || size > LOG_BYTES_MAX) + return -1; + + if (data != NULL) { + /* flush the previous data, if this data would overfill the buffer */ + if (pel->bytes + size >= LOG_BYTES_MAX) + FlushBuffer(file); + + /* write into the buffer */ + memcpy(pel->buffer + pel->bytes, data, size); + pel->bytes += size; + } + + /* need to flush ? */ + if ((forceFlush && (pel->bytes > 0)) || (pel->bytes >= LOG_BYTES_THRESHOLD)) + FlushBuffer(file); + + return 0; +} + +/*-----------------------------------------------------------------*/ +void +DailyReopenLogF(HANDLE file) +{ + /* check if current file is a day long, + opens another for today's file */ + ExtendedLog* pel = (ExtendedLog *)file; + + if (pel && (timeex(NULL) >= pel->nextday)) { + FlushLogF(file); /* flush */ + OpenLogF(file); /* close previous one & reopen today's */ + } +} +/*-----------------------------------------------------------------*/ +int +HandleToFileNo(HANDLE file) +{ + ExtendedLog* pel = (ExtendedLog *)file; + + return (pel != NULL) ? pel->fd : -1; +} +/*-----------------------------------------------------------------*/ +#define TO_HOST_L(x) x = ntohl(x); +#define TO_NETWORK_L(x) x = htonl(x); +void +HtoN_LocalQueryInfo(LocalQueryInfo *p) +{ + TO_NETWORK_L(p->lqi_size); + TO_NETWORK_L(p->lqi_id); + TO_NETWORK_L(p->lqi_cache); +} +/*----------------------------------------------------------------*/ +void +NtoH_LocalQueryResult(LocalQueryResult* p) +{ + TO_HOST_L(p->lq_id); + TO_HOST_L(p->lq_ttl); +} +/*----------------------------------------------------------------*/ +int +CoDNSGetHostByNameSync(const char *name, struct in_addr *res) +{ + static int fd = -1; + LocalQuery query; + int size; + LocalQueryResult result; + + /* create a connection to CoDNS */ + if (fd == -1 && (fd = MakeLoopbackConnection(CODNS_PORT, 0)) < 0) { + TRACE("CoDNS connection try has failed!\n"); + + /* try to resolve names using gethostbyname() */ + { + struct hostent* hp; + if ((hp = gethostbyname(name)) == NULL) + NiceExit(-1, "gethostbyname also failed!"); + if (res) + *res = *(struct in_addr *)hp->h_addr; + } + return 0; + } + + memset(&query, 0, sizeof(query)); + size = strlen(name) + 1; + query.lq_info.lqi_size = size; /* length of name */ + query.lq_info.lqi_cache = TRUE; + strcpy(query.lq_name, name); + size += sizeof(query.lq_info) + sizeof(query.lq_zero); + + /* send a query */ + HtoN_LocalQueryInfo(&query.lq_info); + if (write(fd, &query, size) != size) { + close(fd); + fd = -1; + return(-1); + } + + /* get answer */ + do { + size = read(fd, &result, sizeof(result)); + } while (size == -1 && errno == EINTR); + /* close(fd); */ + NtoH_LocalQueryResult(&result); + + if (size != sizeof(result)) + return(-1); + + *res = result.lq_address[0]; + return 0; +} +/*-----------------------------------------------------------------*/ +void +FeedbackDelay(struct timeval *start, float minSec, float maxSec) +{ + float diff = DiffTimeVal(start, NULL); + if (diff < minSec) + diff = minSec; + if (diff > maxSec) + diff = maxSec; + usleep((unsigned int)(diff * 1e6)); +} +/*-----------------------------------------------------------------*/ +static int niceExitLogFD = -1; +int +NiceExitOpenLog(char *logName) +{ + /* log file to record exit reasons */ + if (niceExitLogFD >= 0) + close(niceExitLogFD); + + niceExitLogFD = open(logName, O_WRONLY | O_APPEND | O_CREAT, + S_IRUSR | S_IWUSR); + return((niceExitLogFD >= 0) ? SUCCESS : FAILURE); +} +/*-----------------------------------------------------------------*/ +void +NiceExitBack(int val, char *reason, char *file, int line) +{ + char buf[1024]; + time_t currT = time(NULL); + + sprintf(buf, "[%s, %d] %.24s %s\n", file, line, ctime(&currT), reason); + if (hdebugLog) { + TRACE("%s", buf); + FlushLogF(hdebugLog); + } + else + fprintf(stderr, "%s", buf); + + if (niceExitLogFD >= 0) + write(niceExitLogFD, buf, strlen(buf)); + exit(val); +} +/*-----------------------------------------------------------------*/ +int +WordCount(char *buf) +{ + int count = 0; + int wasSpace = TRUE; + + while (*buf != '\0') { + int isSpace = isspace(*buf); + if (wasSpace && (!isSpace)) + count++; + wasSpace = isSpace; + buf++; + } + return(count); +} +/*-----------------------------------------------------------------*/ +char * +ReadFile(const char *filename) +{ + int dummySize; + return(ReadFileEx(filename, &dummySize)); +} +/*-----------------------------------------------------------------*/ +char * +ReadFileEx(const char *filename, int *size) +{ + /* allocate a buffer, read the file into it and + return the buffer */ + char *content = NULL; + struct stat buf; + int fd; + + *size = -1; + if (access(filename, R_OK) < 0 + || stat(filename, &buf) < 0 + || (fd = open(filename, O_RDONLY)) < 0) { + TRACE("opening captcha file %s failed\n", filename); + exit(-1); + } + + if ((content = (char *)xmalloc(buf.st_size + 1)) == NULL) { + TRACE("memory alloc failed\n"); + exit(-1); + } + + if (read(fd, content, buf.st_size) != buf.st_size) { + TRACE("opening captcha test file failed\n"); + exit(-1); + } + close(fd); + content[buf.st_size] = 0; + *size = buf.st_size; + return content; +} +/*-----------------------------------------------------------------*/ +char * +MmapFile(const char *filename, int *size) +{ + /* allocate a buffer, read the file into it and + return the buffer */ + char *content = NULL; + struct stat buf; + int fd; + + *size = -1; + if (access(filename, R_OK) < 0 + || stat(filename, &buf) < 0 + || (fd = open(filename, O_RDONLY)) < 0) { + TRACE("opening captcha file %s failed\n", filename); + exit(-1); + } + + content = (char *)mmap(NULL, buf.st_size, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (content != MAP_FAILED) { + *size = buf.st_size; + return content; + } + return(NULL); +} +/*-----------------------------------------------------------------*/ +unsigned int +HashString(const char *name, unsigned int hash, int endOnQuery, + int skipLastIfDot) +{ + /* if endOnQuery, we stop the hashing when we hit the question mark. + if skipLastIfDot, we check the last component of the path to see + if it includes a dot, and it so, we skip it. if both are specified, + we first try the query, and if that exists, we don't trim the path */ + + int i; + int len; + char *temp; + + if (name == NULL) + return 0; + + len = strlen(name); + if (endOnQuery && (temp = strchr(name, '?')) != NULL) + len = temp - name; + else if (skipLastIfDot) { + /* first, find last component by searching backward */ + if ((temp = strrchr(name, '/')) != NULL) { + /* now search forward for the dot */ + if (strchr(temp, '.') != NULL) + len = temp - name; + } + } + + for (i = 0; i < len; i ++) + hash += (_rotl(hash, 19) + name[i]); + + return hash; +} +/*-------------------------------------------------------------*/ +unsigned int +CalcAgentHash(const char* agent) +{ + char p[strlen(agent)+1]; + int i; + + if (agent == NULL) + return 0; + + /* we remove all spaces */ + for (i = 0; *agent; agent++) { + if (isspace(*agent)) + continue; + p[i++] = *agent; + } + p[i] = 0; + + return HashString(p, 0, FALSE, FALSE); +} +/*-----------------------------------------------------------------*/ +char * +ZapSpacesAndZeros(char *src) +{ + /* get rid of excess spaces between words, and remove any trailing + (post-decimal) zeros from floating point numbers */ + static char smallLine[4096]; + char *dst = smallLine; + char *word; + int addSpace = FALSE; + + while (src != NULL && + (word = GetWord((const unsigned char*)src, 0)) != NULL) { + char *temp; + int isDotNumber = TRUE; + + src = GetField((const unsigned char*)src, 1); /* advance to next */ + + /* check to make sure it has exactly one decimal point */ + if ((temp = strchr(word, '.')) != NULL && + (temp = strchr(temp+1, '.')) == NULL) { + /* make sure it's all digits or the dot */ + for (temp = word; *temp != '\0'; temp++) { + if (!(isdigit(*temp) || *temp == '.')) { + isDotNumber = FALSE; + break; + } + } + } + else + isDotNumber = FALSE; + + if (isDotNumber) { + /* strip off any trailing zeros and possibly the decimal point */ + int len = strlen(word) - 1; + + while (word[len] == '0') { + word[len] = '\0'; + len--; + } + if (word[len] == '.') + word[len] = '\0'; + } + + if (addSpace) + sprintf(dst, " %s", word); + else + sprintf(dst, "%s", word); + dst += strlen(dst); + addSpace = TRUE; + xfree(word); + } + + *dst = 0; + return(smallLine); +} +/*-----------------------------------------------------------------*/ diff --git a/applib.h b/applib.h new file mode 100644 index 0000000..40b283a --- /dev/null +++ b/applib.h @@ -0,0 +1,130 @@ +#ifndef _APPLIB_H_ +#define _APPLIB_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "appdef.h" + +float DiffTimeVal(const struct timeval *start, const struct timeval *end); + +int CreatePrivateAcceptSocketEx(int portNum, + int nonBlocking, int loopbackOnly); +int CreatePrivateAcceptSocket(int portNum, int nonBlocking); +int CreatePublicUDPSocket(int portNum); +int MakeLoopbackConnection(int portNum, int nonBlocking); +int MakeConnection(char *name, in_addr_t netAddr, + int portNum, int nonBlocking); + +char *GetField(const unsigned char *start, int whichField); +char *GetWord(const unsigned char *start, int whichWord); +char *GetWordEx(const unsigned char *start, int whichWord, + char* dest, int max); +int WordCount(char *buf); +char *ZapSpacesAndZeros(char *src); + +int Base36Digit(int a); +int Base36To10(int a); +int PopCount(int val); +int PopCountChar(int val); +int LogValChar(int val); + +const char *StringOrNull(const char *s); +char *strchars(const char *string, const char *list); +char *strnchars(const char *string, int length, const char *list); + +#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#define HAS_STRNSTR +#endif + +#ifndef HAS_STRNSTR +char *strnstr(const char * s1, int s1_len, const char * s2); +#endif + +char * strncasestr(const char * s1, int s1_len, const char * s2); +int strncspn(const char* string, int length, const char* reject); +char *strnchr(const char *string, int length, const char needle); +char *StrdupLower(const char *orig); +void StrcpyLower(char *dest, const char *src); +void StrcpyLowerExcept(char *dest, int dest_max, const char* src, const char* except); +char *GetLowerNextLine(FILE *file); +char *GetNextLine(FILE *file); +char *GetNextLineNoCommStrip(FILE *file); +int DoesSuffixMatch(char *start, int len, char *suffix); +int DoesDotlessSuffixMatch(char *start, int len, char *suffix); + +/* resolves a name using CoDNS */ +#include "codns.h" +void HtoN_LocalQueryInfo(LocalQueryInfo *p); +void NtoH_LocalQueryResult(LocalQueryResult* p); +int CoDNSGetHostByNameSync(const char *name, struct in_addr *res); + +#ifdef OS_LINUX +#include +#endif + +#include + +/* allocate stack memory to copy "src" to "dest" in lower cases */ +#define LOCAL_STR_DUP_LOWER(dest, src) \ + { dest = alloca(strlen(src) + 1); \ + StrcpyLower(dest, src); \ + } + +/* allocate stack memory to copy "src" to "dest" */ +#define LOCAL_STR_DUP(dest, src) \ + { dest = alloca(strlen(src) + 1); \ + strcpy(dest, src); \ + } + +/* release memory pointer after checking NULL */ +#define FREE_PTR(x) if ((x) != NULL) { xfree(x);} + +/* Bit vector implementation */ +void SetBits(int* bits, int idx, int maxNum); +int GetBits(int* bits, int idx, int maxNum); +int GetNumBits(int* bitvecs, int maxNum); + +/* extended logging */ +typedef void* HANDLE; + +HANDLE CreateLogFHandle(const char* signature, int change_file_name_on_save); +int OpenLogF(HANDLE file); +int WriteLog(HANDLE file, const char* data, int size, int forceFlush); +void DailyReopenLogF(HANDLE file); +int HandleToFileNo(HANDLE file); + +/* flush the buffer */ +#define FlushLogF(h) WriteLog(h, NULL, 0, TRUE) + +/* maximum single log file size, defined in applib.c */ +extern int maxSingleLogSize; + +#include "gettimeofdayex.h" + +void FeedbackDelay(struct timeval *start, float minSec, float maxSec); + +int NiceExitOpenLog(char *logName); +void NiceExitBack(int val, char *reason, char *file, int line) __attribute__ ((noreturn)); +#define NiceExit(val, reason) NiceExitBack(val, reason, __FILE__, __LINE__) + +char *ReadFile(const char *filename); +char *ReadFileEx(const char *filename, int *size); +char *MmapFile(const char *filename, int *size); + +/* use 32-bit unsigned integer, higher bits are thrown away */ +#define WORD_BITS 32 +#define MASK 0xFFFFFFFF +/* bitwise left rotate operator */ +#define _rotl(Val, Bits) ((((Val)<<(Bits)) | (((Val) & MASK)>>(32 - (Bits)))) & MASK) + +unsigned int HashString(const char *name, unsigned int hash, + int endOnQuery, int skipLastIfDot); +unsigned int CalcAgentHash(const char* agent); +#endif + diff --git a/codemux.c b/codemux.c new file mode 100644 index 0000000..732059a --- /dev/null +++ b/codemux.c @@ -0,0 +1,944 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "applib.h" + +#define CONF_FILE "/etc/codemux/codemux.conf" +#define DEMUX_PORT 80 +#define REAL_WEBSERVER_CONFLINE "* root 1080" +#define TARG_SETSIZE 4096 + +/* set aside some small number of fds for us, allow the rest for + connections */ +#define MAX_CONNS ((TARG_SETSIZE-20)/2) + +/* no single service can take more than half the connections */ +#define SERVICE_MAX (MAX_CONNS/2) + +/* how many total connections before we get concerned about fairness + among them */ +#define FAIRNESS_CUTOFF (MAX_CONNS * 0.85) + + +typedef struct FlowBuf { + int fb_refs; /* num refs */ + char *fb_buf; /* actual buffer */ + int fb_used; /* bytes used in buffer */ +} FlowBuf; +#define FB_SIZE 3800 /* max usable size */ +#define FB_ALLOCSIZE 4000 /* extra to include IP address */ + +typedef struct SockInfo { + int si_peerFd; /* fd of peer */ + struct in_addr si_cliAddr; /* address of client */ + int si_blocked; /* are we blocked? */ + int si_needsHeaderSince; /* since when are we waiting for a header */ + int si_whichService; /* index of service */ + FlowBuf *si_readBuf; /* read data into this buffer */ + FlowBuf *si_writeBuf; /* drain this buffer for writing */ +} SockInfo; + +static SockInfo sockInfo[TARG_SETSIZE]; /* fd number of peer socket */ + +typedef struct ServiceSig { + char *ss_host; /* suffix in host */ + char *ss_slice; + short ss_port; + int ss_slicePos; /* position in slices array */ +} ServiceSig; + +static ServiceSig *serviceSig; +static int numServices; +static int confFileReadTime; +static int now; + +typedef struct SliceInfo { + char *si_sliceName; + int si_inUse; /* do any services refer to this? */ + int si_numConns; + int si_xid; +} SliceInfo; + +static SliceInfo *slices; +static int numSlices; +static int numActiveSlices; +static int numTotalSliceConns; +static int anySliceXidsNeeded; + +typedef struct OurFDSet { + long __fds_bits[TARG_SETSIZE/32]; +} OurFDSet; +static OurFDSet masterReadSet, masterWriteSet; +static int highestSetFd; +static int numNeedingHeaders; /* how many conns waiting on headers? */ + +static int numForks; + +HANDLE hdebugLog; + +#ifndef SO_SETXID +#define SO_SETXID SO_PEERCRED +#endif +/*-----------------------------------------------------------------*/ +static SliceInfo * +ServiceToSlice(int whichService) +{ + if (whichService < 0) + return(NULL); + return(&slices[serviceSig[whichService].ss_slicePos]); +} +/*-----------------------------------------------------------------*/ +static void +DumpStatus(int fd) +{ + char buf[65535]; + char *start = buf; + int i; + int len; + + sprintf(start, + "numForks %d, numActiveSlices %d, numTotalSliceConns %d\n" + "numNeedingHeaders %d, anySliceXidsNeeded %d\n", + numForks, numActiveSlices, numTotalSliceConns, + numNeedingHeaders, anySliceXidsNeeded); + start += strlen(start); + + for (i = 0; i < numSlices; i++) { + SliceInfo *si = &slices[i]; + sprintf(start, "Slice %d: %s xid %d, %d conns, inUse %d\n", + i, si->si_sliceName, si->si_xid, si->si_numConns, + si->si_inUse); + start += strlen(start); + } + + for (i = 0; i < numServices; i++) { + ServiceSig *ss = &serviceSig[i]; + sprintf(start, "Service %d: %s %s port %d, slice# %d\n", i, ss->ss_host, + ss->ss_slice, (int) ss->ss_port, ss->ss_slicePos); + start += strlen(start); + } + + len = start - buf; + write(fd, buf, len); +} +/*-----------------------------------------------------------------*/ +static void +GetSliceXids(void) +{ + /* walks through /etc/passwd, and gets the uid for every slice we + have */ + FILE *f; + char *line; + int i; + + if (!anySliceXidsNeeded) + return; + + for (i = 0; i < numSlices; i++) { + SliceInfo *si = &slices[i]; + si->si_inUse = 0; + } + for (i = 0; i < numServices; i++) { + SliceInfo *si = ServiceToSlice(i); + if (si != NULL) + si->si_inUse++; + } + + if ((f = fopen("/etc/passwd", "r")) == NULL) + return; + + while ((line = GetNextLine(f)) != NULL) { + char *temp; + int xid; + + if ((temp = strchr(line, ':')) == NULL) + continue; /* weird line */ + *temp = '\0'; /* terminate slice name */ + temp++; + if ((temp = strchr(temp+1, ':')) == NULL) + continue; /* weird line */ + if ((xid = atoi(temp+1)) < 1) + continue; /* weird xid */ + + /* we've got a slice name and xid, let's try to match */ + for (i = 0; i < numSlices; i++) { + if (slices[i].si_xid == 0 && + strcasecmp(slices[i].si_sliceName, line) == 0) { + slices[i].si_xid = xid; + break; + } + } + } + + /* assume service 0 is the root service, and don't check it since + it'll have xid zero */ + anySliceXidsNeeded = FALSE; + for (i = 1; i < numSlices; i++) { + if (slices[i].si_xid == 0 && slices[i].si_inUse > 0) { + anySliceXidsNeeded = TRUE; + break; + } + } + + fclose(f); +} +/*-----------------------------------------------------------------*/ +static void +SliceConnsInc(int whichService) +{ + SliceInfo *si = ServiceToSlice(whichService); + + if (si == NULL) + return; + numTotalSliceConns++; + si->si_numConns++; + if (si->si_numConns == 1) + numActiveSlices++; +} +/*-----------------------------------------------------------------*/ +static void +SliceConnsDec(int whichService) +{ + SliceInfo *si = ServiceToSlice(whichService); + + if (si == NULL) + return; + numTotalSliceConns--; + si->si_numConns--; + if (si->si_numConns == 0) + numActiveSlices--; +} +/*-----------------------------------------------------------------*/ +static int +WhichSlicePos(char *slice) +{ + /* adds the new slice if necessary, returns the index into slice + array. Never change the ordering of existing slices */ + int i; + static int numSlicesAlloc; + + for (i = 0; i < numSlices; i++) { + if (strcasecmp(slice, slices[i].si_sliceName) == 0) + return(i); + } + + if (numSlices >= numSlicesAlloc) { + numSlicesAlloc = MAX(8, numSlicesAlloc * 2); + slices = realloc(slices, numSlicesAlloc * sizeof(SliceInfo)); + } + + memset(&slices[numSlices], 0, sizeof(SliceInfo)); + slices[numSlices].si_sliceName = strdup(slice); + numSlices++; + return(numSlices-1); +} +/*-----------------------------------------------------------------*/ +static void +ReadConfFile(void) +{ + int numAlloc = 0; + int num = 0; + ServiceSig *servs = NULL; + FILE *f; + char *line = NULL; + struct stat statBuf; + int i; + + if (stat(CONF_FILE, &statBuf) != 0) { + fprintf(stderr, "failed stat on codemux.conf\n"); + if (numServices) + return; + exit(-1); + } + if (statBuf.st_mtime == confFileReadTime) + return; + + if ((f = fopen(CONF_FILE, "r")) == NULL) { + fprintf(stderr, "failed reading codemux.conf\n"); + if (numServices) + return; + exit(-1); + } + + /* conf file entries look like + coblitz.codeen.org princeton_coblitz 3125 + */ + + while (1) { + ServiceSig serv; + int port; + if (line != NULL) + free(line); + + /* on the first pass, put in a fake entry for apache */ + if (num == 0) + line = strdup(REAL_WEBSERVER_CONFLINE); + else { + if ((line = GetNextLine(f)) == NULL) + break; + } + + memset(&serv, 0, sizeof(serv)); + if (WordCount(line) != 3) { + fprintf(stderr, "bad line: %s\n", line); + continue; + } + serv.ss_port = port = atoi(GetField(line, 2)); + if (port < 1 || port > 65535 || port == DEMUX_PORT) { + fprintf(stderr, "bad port: %s\n", line); + continue; + } + + serv.ss_host = GetWord(line, 0); + serv.ss_slice = GetWord(line, 1); + if (num >= numAlloc) { + numAlloc = MAX(numAlloc * 2, 8); + servs = realloc(servs, numAlloc * sizeof(ServiceSig)); + } + serv.ss_slicePos = WhichSlicePos(serv.ss_slice); + if (slices[serv.ss_slicePos].si_inUse == 0 && + slices[serv.ss_slicePos].si_xid < 1) + anySliceXidsNeeded = TRUE; /* if new/inactive, we need xid */ + servs[num] = serv; + num++; + } + + fclose(f); + + if (num == 1) { + if (numServices == 0) { + fprintf(stderr, "nothing found in codemux.conf\n"); + exit(-1); + } + return; + } + + for (i = 0; i < numServices; i++) { + free(serviceSig[i].ss_host); + free(serviceSig[i].ss_slice); + } + free(serviceSig); + serviceSig = servs; + numServices = num; + confFileReadTime = statBuf.st_mtime; +} +/*-----------------------------------------------------------------*/ +static char *err400BadRequest = +"HTTP/1.0 400 Bad Request\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, and your\n" +"request header exceeded the allowable size. Please\n" +"try again if you believe this error is temporary.\n"; +/*-----------------------------------------------------------------*/ +static char *err503Unavailable = +"HTTP/1.0 503 Service Unavailable\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, but the service\n" +"seems to be unavailable at the moment. Please try again.\n"; +/*-----------------------------------------------------------------*/ +static char *err503TooBusy = +"HTTP/1.0 503 Service Unavailable\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, but the service\n" +"seems to be overloaded at the moment. Please try again.\n"; +/*-----------------------------------------------------------------*/ +static void +SetFd(int fd, OurFDSet *set) +{ + if (highestSetFd < fd) + highestSetFd = fd; + FD_SET(fd, set); +} +/*-----------------------------------------------------------------*/ +static void +ClearFd(int fd, OurFDSet *set) +{ + FD_CLR(fd, set); +} +/*-----------------------------------------------------------------*/ +static int +RemoveHeader(char *lower, char *real, int totalSize, char *header) +{ + /* returns number of characters removed */ + char h2[256]; + int start, end, len; + char *temp, *conn; + + sprintf(h2, "\n%s", header); + + if ((conn = strstr(lower, h2)) == NULL) + return(0); + + conn++; + /* determine how many characters to remove */ + if ((temp = strchr(conn, '\n')) != NULL) + len = (temp - conn) + 1; + else + len = strlen(conn) + 1; + start = conn - lower; + end = start + len; + memmove(&real[start], &real[end], totalSize - end); + memmove(&lower[start], &lower[end], totalSize - end); + + return(len); +} +/*-----------------------------------------------------------------*/ +static int +InsertHeader(char *buf, int totalSize, char *header) +{ + /* returns number of bytes inserted */ + + char h2[256]; + char *temp; + int len; + + sprintf(h2, "%s\r\n", header); + len = strlen(h2); + + /* if we don't encounter a \n, it means that we have only a single + line, and we'd converted the \n to a \0 */ + if ((temp = strchr(buf, '\n')) == NULL) + temp = strchr(buf, '\0'); + temp++; + + memmove(temp + len, temp, totalSize - (temp - buf)); + memcpy(temp, h2, len); + + return(len); +} +/*-----------------------------------------------------------------*/ +static int +FindService(FlowBuf *fb, int *whichService, struct in_addr addr) +{ + char *end; + char *lowerBuf; + char *hostVal; + char *buf = fb->fb_buf; + char orig[256]; +#if 0 + char *url; + int i; + int len; +#endif + + if (strstr(buf, "\n\r\n") == NULL && strstr(buf, "\n\n") == NULL) + return(FAILURE); + + /* insert client info after first line */ + sprintf(orig, "X-CoDemux-Client: %s", inet_ntoa(addr)); + fb->fb_used += InsertHeader(buf, fb->fb_used + 1, orig); + + /* get just the header, so we can work on it */ + LOCAL_STR_DUP_LOWER(lowerBuf, buf); + if ((end = strstr(lowerBuf, "\n\r\n")) == NULL) + end = strstr(lowerBuf, "\n\n"); + *end = '\0'; + + /* remove any existing connection, keep-alive headers, add ours */ + fb->fb_used -= RemoveHeader(lowerBuf, buf, fb->fb_used + 1, "keep-alive:"); + fb->fb_used -= RemoveHeader(lowerBuf, buf, fb->fb_used + 1, "connection:"); + fb->fb_used += InsertHeader(buf, fb->fb_used + 1, "Connection: close"); + InsertHeader(lowerBuf, fb->fb_used + 1, "connection: close"); + + /* isolate host, see if it matches */ + if ((hostVal = strstr(lowerBuf, "\nhost:")) != NULL) { + int i; + hostVal += strlen("\nhost:"); + if ((end = strchr(hostVal, '\n')) != NULL) + *end = '\0'; + if ((end = strchr(hostVal, ':')) != NULL) + *end = '\0'; + while (isspace(*hostVal)) + hostVal++; + if (strlen(hostVal) > 0) { + hostVal = GetWord(hostVal, 0); + for (i = 1; i < numServices; i++) { + if (serviceSig[i].ss_host != NULL && + DoesDotlessSuffixMatch(hostVal, 0, serviceSig[i].ss_host)) { + *whichService = i; + free(hostVal); + /* printf("%s", buf); */ + return(SUCCESS); + } + } + free(hostVal); + } + } + +#if 0 + /* see if URL prefix matches */ + if ((end = strchr(lowerBuf, '\n')) != NULL) + *end = 0; + if ((url = GetField(lowerBuf, 1)) == NULL || + url[0] != '/') { + /* bad request - let apache handle it ? */ + *whichService = 0; + return(SUCCESS); + } + url++; /* skip the leading slash */ + for (i = 1; i < numServices; i++) { + if (serviceSig[i].ss_prefix != NULL && + (len = strlen(serviceSig[i].ss_prefix)) > 0 && + strncmp(url, serviceSig[i].ss_prefix, len) == 0 && + (url[len] == ' ' || url[len] == '/')) { + int startPos = url - lowerBuf; + int stripLen = len + ((url[len] == '/') ? 1 : 0); + /* strip out prefix */ + fb->fb_used -= stripLen; + memmove(&buf[startPos], &buf[startPos+stripLen], + fb->fb_used + 1 - startPos); + /* printf("%s", buf); */ + *whichService = i; + return(SUCCESS); + } + } +#endif + + /* default to first service */ + *whichService = 0; + return(SUCCESS); +} +/*-----------------------------------------------------------------*/ +static int +StartConnect(int origFD, int whichService) +{ + int sock; + struct sockaddr_in dest; + SockInfo *si; + + /* create socket */ + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + return(FAILURE); + } + + /* make socket non-blocking */ + if (fcntl(sock, F_SETFL, O_NONBLOCK) < 0) { + close(sock); + return(FAILURE); + } + + /* set addr structure */ + memset(&dest, 0, sizeof(dest)); + dest.sin_family = AF_INET; + dest.sin_port = htons(serviceSig[whichService].ss_port); + dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + /* start connection process - we should be told that it's in + progress */ + if (connect(sock, (struct sockaddr *) &dest, sizeof(dest)) != -1 || + errno != EINPROGRESS) { + close(sock); + return(FAILURE); + } + + SetFd(sock, &masterWriteSet); /* determine when connect finishes */ + sockInfo[origFD].si_peerFd = sock; + si = &sockInfo[sock]; + memset(si, 0, sizeof(SockInfo)); + si->si_peerFd = origFD; + si->si_blocked = TRUE; /* still connecting */ + si->si_whichService = whichService; + si->si_writeBuf = sockInfo[origFD].si_readBuf; + sockInfo[origFD].si_readBuf->fb_refs++; + if (whichService >= 0) + SliceConnsInc(whichService); + + return(SUCCESS); +} +/*-----------------------------------------------------------------*/ +static int +WriteAvailData(int fd) +{ + SockInfo *si = &sockInfo[fd]; + FlowBuf *fb = si->si_writeBuf; + int res; + + /* printf("trying to write fd %d\n", fd); */ + if (fb->fb_used < 1 || si->si_blocked) + return(SUCCESS); + + /* printf("trying to write %d bytes\n", fb->fb_used); */ + /* write(STDOUT_FILENO, fb->fb_buf, fb->fb_used); */ + if ((res = write(fd, fb->fb_buf, fb->fb_used)) > 0) { + fb->fb_used -= res; + if (fb->fb_used > 0) { + /* couldn't write all - assume blocked */ + memmove(fb->fb_buf, &fb->fb_buf[res], fb->fb_used); + si->si_blocked = TRUE; + SetFd(fd, &masterWriteSet); + } + /* printf("wrote %d\n", res); */ + return(SUCCESS); + } + + /* we might have been full but didn't realize it */ + if (res == -1 && errno == EAGAIN) { + si->si_blocked = TRUE; + SetFd(fd, &masterWriteSet); + return(SUCCESS); + } + + /* otherwise, assume the worst */ + return(FAILURE); +} +/*-----------------------------------------------------------------*/ +static OurFDSet socksToCloseVec; +static int numSocksToClose; +static int whichSocksToClose[TARG_SETSIZE]; +/*-----------------------------------------------------------------*/ +static void +CloseSock(int fd) +{ + if (FD_ISSET(fd, &socksToCloseVec)) + return; + SetFd(fd, &socksToCloseVec); + whichSocksToClose[numSocksToClose] = fd; + numSocksToClose++; +} +/*-----------------------------------------------------------------*/ +static void +DecBuf(FlowBuf *buf) +{ + if (buf == NULL) + return; + buf->fb_refs--; + if (buf->fb_refs == 0) { + free(buf->fb_buf); + free(buf); + } +} +/*-----------------------------------------------------------------*/ +static void +ReallyCloseSocks(void) +{ + int i; + + memset(&socksToCloseVec, 0, sizeof(socksToCloseVec)); + + for (i = 0; i < numSocksToClose; i++) { + int fd = whichSocksToClose[i]; + close(fd); + DecBuf(sockInfo[fd].si_readBuf); + DecBuf(sockInfo[fd].si_writeBuf); + ClearFd(fd, &masterReadSet); + ClearFd(fd, &masterWriteSet); + if (sockInfo[fd].si_needsHeaderSince) { + sockInfo[fd].si_needsHeaderSince = 0; + numNeedingHeaders--; + } + if (sockInfo[fd].si_whichService >= 0) { + SliceConnsDec(sockInfo[fd].si_whichService); + sockInfo[fd].si_whichService = -1; + } + } + numSocksToClose = 0; +} +/*-----------------------------------------------------------------*/ +static void +SocketReadyToRead(int fd) +{ + SockInfo *si = &sockInfo[fd]; + int spaceLeft; + FlowBuf *fb; + int res; + + /* if peer is closed, close ourselves */ + if (si->si_peerFd < 0 && (!si->si_needsHeaderSince)) { + CloseSock(fd); + return; + } + + if ((fb = si->si_readBuf) == NULL) { + fb = si->si_readBuf = calloc(1, sizeof(FlowBuf)); + fb->fb_refs = 1; + if (si->si_peerFd >= 0) { + sockInfo[si->si_peerFd].si_writeBuf = fb; + fb->fb_refs = 2; + } + } + + if (fb->fb_buf == NULL) + fb->fb_buf = malloc(FB_ALLOCSIZE); + + /* determine read buffer size - if 0, then block reads and return */ + if ((spaceLeft = FB_SIZE - fb->fb_used) < 0) { + if (si->si_needsHeaderSince) { + write(fd, err400BadRequest, strlen(err400BadRequest)); + CloseSock(fd); + return; + } + else { + ClearFd(fd, &masterReadSet); + return; + } + } + + /* read as much as allowed, and is available */ + if ((res = read(fd, &fb->fb_buf[fb->fb_used], spaceLeft)) == 0) { + CloseSock(fd); + if (fb->fb_used == 0 && si->si_peerFd >= 0) { + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } + return; + } + if (res == -1) { + if (errno == EAGAIN) + return; + CloseSock(fd); + if (fb->fb_used == 0 && si->si_peerFd >= 0) { + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } + return; + } + fb->fb_used += res; + fb->fb_buf[fb->fb_used] = 0; /* terminate it for convenience */ + printf("sock %d, read %d, total %d\n", fd, res, fb->fb_used); + + /* if we need header, check if we've gotten it. if so, do + modifications and continue. if not, check if we've read the + maximum, and if so, fail */ + if (si->si_needsHeaderSince) { + int whichService; + SliceInfo *slice; + +#define STATUS_REQ "GET /codemux/status.txt" + if (strncasecmp(fb->fb_buf, STATUS_REQ, sizeof(STATUS_REQ)-1) == 0) { + DumpStatus(fd); + CloseSock(fd); + return; + } + + printf("trying to find service\n"); + if (FindService(fb, &whichService, si->si_cliAddr) != SUCCESS) + return; + printf("found service %d\n", whichService); + slice = ServiceToSlice(whichService); + + /* no service can have more than some absolute max number of + connections. Also, when we're too busy, start enforcing + fairness across the servers */ + if (slice->si_numConns > SERVICE_MAX || + (numTotalSliceConns > FAIRNESS_CUTOFF && + slice->si_numConns > MAX_CONNS/numActiveSlices)) { + write(fd, err503TooBusy, strlen(err503TooBusy)); + CloseSock(fd); + return; + } + + if (slice->si_xid > 0) { + setsockopt(fd, SOL_SOCKET, SO_SETXID, + &slice->si_xid, sizeof(slice->si_xid)); + fprintf(stderr, "setsockopt() with XID = %d name = %s\n", + slice->si_xid, slice->si_sliceName); + } + + si->si_needsHeaderSince = 0; + numNeedingHeaders--; + if (StartConnect(fd, whichService) != SUCCESS) { + write(fd, err503Unavailable, strlen(err503Unavailable)); + CloseSock(fd); + return; + } + return; + } + + /* write anything possible */ + if (WriteAvailData(si->si_peerFd) != SUCCESS) { + /* assume the worst and close */ + CloseSock(fd); + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } +} +/*-----------------------------------------------------------------*/ +static void +SocketReadyToWrite(int fd) +{ + SockInfo *si = &sockInfo[fd]; + + /* unblock it and read what it has */ + si->si_blocked = FALSE; + ClearFd(fd, &masterWriteSet); + SetFd(fd, &masterReadSet); + + /* enable reading on peer just in case it was off */ + if (si->si_peerFd >= 0) + SetFd(si->si_peerFd, &masterReadSet); + + /* if we have data, write it */ + if (WriteAvailData(fd) != SUCCESS) { + /* assume the worst and close */ + CloseSock(fd); + if (si->si_peerFd >= 0) { + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } + return; + } + + /* if peer is closed and we're done writing, we should close */ + if (si->si_peerFd < 0 && si->si_writeBuf->fb_used == 0) + CloseSock(fd); +} +/*-----------------------------------------------------------------*/ +static void +CloseReqlessConns(void) +{ + static int lastSweep; + int maxAge; + int i; + + if (lastSweep == now) + return; + lastSweep = now; + + if (numTotalSliceConns + numNeedingHeaders > MAX_CONNS || + numNeedingHeaders > TARG_SETSIZE/20) { + /* second condition is probably an attack - close aggressively */ + maxAge = 5; + } + else if (numTotalSliceConns + numNeedingHeaders > FAIRNESS_CUTOFF || + numNeedingHeaders > TARG_SETSIZE/40) { + /* sweep a little aggressively */ + maxAge = 10; + } + else if (numNeedingHeaders > TARG_SETSIZE/80) { + /* just sweep to close strays */ + maxAge = 30; + } + else { + /* too little gained - not worth sweeping */ + return; + } + + /* if it's too old, close it */ + for (i = 0; i < highestSetFd+1; i++) { + if (sockInfo[i].si_needsHeaderSince && + (now - sockInfo[i].si_needsHeaderSince) > maxAge) + CloseSock(i); + } +} +/*-----------------------------------------------------------------*/ +static void +MainLoop(int lisSock) +{ + int i; + OurFDSet tempReadSet, tempWriteSet; + int res; + int lastConfCheck = 0; + + signal(SIGPIPE, SIG_IGN); + + while (1) { + int newSock; + int ceiling; + struct timeval timeout; + + now = time(NULL); + + if (now - lastConfCheck > 300) { + ReadConfFile(); + GetSliceXids(); /* always call - in case new slices created */ + lastConfCheck = now; + } + + /* see if there's any activity */ + tempReadSet = masterReadSet; + tempWriteSet = masterWriteSet; + + /* trim it down if needed */ + while (highestSetFd > 1 && + (!FD_ISSET(highestSetFd, &tempReadSet)) && + (!FD_ISSET(highestSetFd, &tempWriteSet))) + highestSetFd--; + timeout.tv_sec = 1; + timeout.tv_usec = 0; + res = select(highestSetFd+1, (fd_set *) &tempReadSet, + (fd_set *) &tempWriteSet, NULL, &timeout); + if (res < 0 && errno != EINTR) { + perror("select"); + exit(-1); + } + + now = time(NULL); + + /* clear the bit for listen socket to avoid confusion */ + ClearFd(lisSock, &tempReadSet); + + ceiling = highestSetFd+1; /* copy it, since it changes during loop */ + /* pass data back and forth as needed */ + for (i = 0; i < ceiling; i++) { + if (FD_ISSET(i, &tempWriteSet)) + SocketReadyToWrite(i); + } + for (i = 0; i < ceiling; i++) { + if (FD_ISSET(i, &tempReadSet)) + SocketReadyToRead(i); + } + + /* see if we need to close conns w/o requests */ + CloseReqlessConns(); + + /* do all closes */ + ReallyCloseSocks(); + + /* try accepting new connections */ + do { + struct sockaddr_in addr; + socklen_t lenAddr = sizeof(addr); + if ((newSock = accept(lisSock, (struct sockaddr *) &addr, + &lenAddr)) >= 0) { + memset(&sockInfo[newSock], 0, sizeof(SockInfo)); + sockInfo[newSock].si_needsHeaderSince = now; + numNeedingHeaders++; + sockInfo[newSock].si_peerFd = -1; + sockInfo[newSock].si_cliAddr = addr.sin_addr; + sockInfo[newSock].si_whichService = -1; + SetFd(newSock, &masterReadSet); + } + } while (newSock >= 0); + } +} +/*-----------------------------------------------------------------*/ +int +main(int argc, char *argv[]) +{ + int lisSock; + + if ((lisSock = CreatePrivateAcceptSocket(DEMUX_PORT, TRUE)) < 0) { + fprintf(stderr, "failed creating accept socket\n"); + exit(-1); + } + SetFd(lisSock, &masterReadSet); + + while (1) { + numForks++; + if (fork()) { + /* this is the parent - just wait */ + while (wait3(NULL, 0, NULL) < 1) + ; /* just keep waiting for a real pid */ + } + else { + /* child process */ + MainLoop(lisSock); + exit(-1); + } + } +} +/*-----------------------------------------------------------------*/ diff --git a/codemux.conf b/codemux.conf new file mode 100644 index 0000000..b5c86fb --- /dev/null +++ b/codemux.conf @@ -0,0 +1,6 @@ +coblitz.codeen.org princeton_coblitz 3125 +cdn.rd.tp.pl princeton_coblitz 3125 +nyud.net nyu_d 8080 +nyucd.net nyu_d 8080 +nyuld.net nyu_oasis 8096 +cob-web.org cornell_cobweb 8888 diff --git a/codemux.initscript b/codemux.initscript new file mode 100644 index 0000000..5a69a70 --- /dev/null +++ b/codemux.initscript @@ -0,0 +1,69 @@ +#!/bin/sh +# +# chkconfig: 345 85 02 +# description: codemux startup script +# + +PROC=codemux + +. /etc/rc.d/init.d/functions + +RETVAL=0 + +pidfile=/var/run/$PROC.pid + +check_status() { + pid=`cat $pidfile 2>/dev/null` + # + # this eliminates a race condition between checking existence of pidfile + # and reading its value + # + [ -n "$pid" -a -d /proc/$pid ] +} + +case "$1" in + start) + echo -n "starting $PROC:" + pid=`cat $pidfile 2>/dev/null` + if [ -n "$pid" ]; then + # check whether process really exists + # yes - don't try to start + [ -d /proc/$pid ] && action "already running" /bin/true && exit 1 + + # no - PID file is stale + rm -f $pidfile + fi + + initlog -c /usr/local/planetlab/sbin/codemux + + cmd=success + check_status && touch /var/lock/subsys/$PROC || cmd=failure + $cmd "$PROC startup" + echo + ;; + + stop) + echo -n "shutting down $PROC: " + killproc $PROC + RETVAL=$? + echo + [ $RETVAL -eq 0 ] && rm -f /var/lock/subsys/$PROC + ;; + + restart|reload) + $0 stop + $0 start + RETVAL=$? + ;; + + status) + check_status && echo 'running' && exit 0 || \ + echo 'not running' && exit 1 + ;; + + *) + echo "Usage: $0 {start|stop|restart|status}" + RETVAL=1 +esac + +exit $RETVAL diff --git a/codemux.spec b/codemux.spec new file mode 100644 index 0000000..a713adf --- /dev/null +++ b/codemux.spec @@ -0,0 +1,63 @@ +Summary: CoDemux - HTTP port demultiplexer +Name: codemux +Version: 0.5 +Release: 1 +License: Private +Group: System Environment/Base +URL: http://codeen.cs.princeton.edu/ +Source0: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +%description + +%prep +%setup -q + +make clean + +%build +make + +%install +rm -rf $RPM_BUILD_ROOT + +make INSTALL_ROOT=$RPM_BUILD_ROOT install-all + +# Build and install in %post so that it gets put in the right site-packages directory +#rm -rf $RPM_BUILD_ROOT/usr/lib/python* +#install -D -m 755 python/Proper.py $RPM_BUILD_ROOT/%{_datadir}/%{name}/Proper.py + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root,-) +%attr(0755,root,root) %{_initrddir}/codemux +%config /etc/codemux/codemux.conf +%attr(0755,root,root) /usr/local/planetlab/sbin/codemux + +%post +chkconfig codemux reset + +if [ -z "$PL_BOOTCD" ]; then + /sbin/ldconfig + /etc/init.d/codemux restart +fi + +%preun +if [ "$1" = 0 ]; then + # erase, not upgrade + chkconfig --del codemux + + # stop daemon if its currently running + if [ "`/etc/init.d/codemux status`" = "running" ]; then + /etc/init.d/codemux stop + fi +fi + +%doc + +%changelog +* Sun Apr 22 2007 KYOUNGSOO PARK - +- Initial build. + diff --git a/codns.h b/codns.h new file mode 100644 index 0000000..2f7f36e --- /dev/null +++ b/codns.h @@ -0,0 +1,42 @@ +#ifndef _CODNS_H_ +#define _CODNS_H_ +#include "ports.h" + +/* query info - fixed part */ +typedef struct LocalQueryInfo { + int lqi_size; /* length of the name string */ + int lqi_id; /* query id */ + int lqi_cache; /* not being used now */ +} LocalQueryInfo; + +/* query info + name + query structure expected from a client */ +#define MAX_QUERY_NAME 256 +#define SIG_SPLIT_TRANSACTION 0 /* signature for split-transaction */ +typedef struct LocalQuery { + int lq_zero; /* always set to SIG_SPLIT_TRANSACTION(=0) */ + LocalQueryInfo lq_info; /* query info */ + char lq_name[MAX_QUERY_NAME]; /* name */ +} LocalQuery; + +/* query result from CoDNS + we set MAX_ANSWERS for easy implementation. + if lq.address[i].s_addr == 0, that means it returned i-1 valid anwers. */ +#define MAX_ANSWERS 8 +typedef struct LocalQueryResult { + int lq_id; /* query id */ + int lq_ttl; /* TTL of the record */ + struct in_addr lq_address[MAX_ANSWERS]; /* IP addresses for the query */ +} LocalQueryResult; + +/*----------------------------------------------------------------------*/ + +/* temporary section : from here to the end + used for defining variables or constants for testing */ + +/* for testing in HBTWGET */ +#define HBTWGET_CODNS_ID (-3) + +#endif // _CODNS_H_ + + diff --git a/debug.h b/debug.h new file mode 100644 index 0000000..a7ddc87 --- /dev/null +++ b/debug.h @@ -0,0 +1,97 @@ +#ifndef _DEBUG_H_ +#define _DEBUG_H_ +#include +#include "applib.h" +#undef max + + +/* + TRACE : print with function name + TRACE0 : print without function name + TRACE1 : print "buf" whose size is "size" +*/ + +#define DEBUG + +extern HANDLE hdebugLog; +extern int defaultTraceSync; +#define TRACE0(fmt, msg...) { \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), fmt, ##msg); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} +#define TRACE1(buf, size) { \ + WriteLog(hdebugLog, buf, size, defaultTraceSync); \ +} +#define TRACE(fmt, msg...) { \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), "[%s] " fmt, __FUNCTION__, ##msg); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} +#define TRACEX(fmt) { \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), "[%s] " fmt, __FUNCTION__); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} + +#ifndef HERE +#define HERE TRACE("file %s, line %d, func %s\n", __FILE__, __LINE__, __FUNCTION__) +#endif + +#ifdef DEBUG +#define ASSERT(exp) { \ + if (!(exp)) { \ + TRACE("ASSERTION (%s) FAILED in %s (%s:%d)\n", \ + (#exp), __FUNCTION__, __FILE__, __LINE__); \ + } \ +} +#else +#define ASSERT(exp) 1 ? (void)0 : (exp) +#endif // DEBUG + +/*-------------------------------------------------------------- + macros used for debugging memory leaks + if DEBUG_MEMORY_LEAK is enabled, we track down all the memory + allocation/freeing to count the number of allocations + -------------------------------------------------------------*/ +//#define DEBUG_MEMORY_LEAK + +#ifndef DEBUG_MEMORY_LEAK + +#define xcalloc(nmemb, size) calloc(nmemb, size) +#define xmalloc(size) malloc(size) +#define xrealloc(ptr, size) realloc(ptr, size) +#define xstrdup(s) strdup(s) +#define xfree(ptr) free(ptr) + +#else + +#define xcalloc(nmemb, size) dbgcalloc(nmemb, size, __FUNCTION__, \ + __FILE__, __LINE__) +#define xmalloc(size) dbgmalloc(size, __FUNCTION__,\ + __FILE__, __LINE__) +#define xrealloc(ptr, size) dbgrealloc(ptr, size, __FUNCTION__,\ + __FILE__, __LINE__) +#define xstrdup(s) dbgstrdup(s, __FUNCTION__, __FILE__, __LINE__) +#define xfree(ptr) dbgfree(ptr, __FUNCTION__, __FILE__, __LINE__) + +void *dbgcalloc(size_t nmemb, size_t size, + const char* func, const char* file, const int line); +void *dbgmalloc(size_t size, + const char* func, const char* file, const int line); +void *dbgrealloc(void *ptr, size_t size, + const char* func, const char* file, const int line); +char *dbgstrdup(const char *s, + const char* func, const char* file, const int line); +void dbgfree(void *ptr, const char* func, const char* file, const int line); +void dbg_print_memtrace(void); + +#endif /* DEBUG_MEMOTY_LEAK */ + +#endif /* _DEBUG_H_ */ diff --git a/gettimeofdayex.c b/gettimeofdayex.c new file mode 100644 index 0000000..4fc6718 --- /dev/null +++ b/gettimeofdayex.c @@ -0,0 +1,149 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include "gettimeofdayex.h" + +#if defined(__linux) && !defined(ALPHA) + +/* Macros */ +#define rdtsc(low, high) \ + asm volatile("rdtsc":"=a" (low), "=d" (high)) + +/* get cycle counts */ +#define GET_CC(cc) \ +do \ +{ \ + cc = 0; \ + unsigned long __cc_low__,__cc_high__; \ + rdtsc(__cc_low__,__cc_high__); \ + cc = __cc_high__; \ + cc = (cc << 32) + __cc_low__; \ +} \ +while(0) + +/*-------------------------------------------------------------------------*/ +static int +get_cpu_speed(double* cpu_speed) +{ + FILE* fp = NULL; + char *line = NULL; + size_t len = 0; + ssize_t num; + char* ptr = 0; + + if (cpu_speed == NULL) + return -1; + + if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) + return -1; + + while ((num = getline(&line, &len, fp)) != -1) { + ptr = strtok(line, ":\n"); + if (ptr && strstr(ptr, "cpu MHz")) { + ptr = strtok(0," \r\t\n"); + *cpu_speed = strtod(ptr, 0); + break; + } + } + if (line) + free(line); + + fclose(fp); + return (*cpu_speed == 0) ? -1 : 0; +} + +/*--------------------------------------------------------------------------*/ +int +gettimeofdayex(struct timeval *tv, struct timezone *tz) +{ +#define MAX_64_BIT_NUM ((unsigned long long)(-1)) + + /* TO DO: timezone adjustment */ + static int first = 1, impossible = 0; + static double cpu_speed; + static struct timeval start; + static unsigned long long cc_start; + unsigned long long cc_now, cc_diff, usec; + + /* initialize : get the current time + and fix it as a starting point */ + if (first) { + if (get_cpu_speed(&cpu_speed) < 0) + impossible = 1; + + GET_CC(cc_start); + gettimeofday(&start, 0); + if (tv) + *tv = start; + first = 0; + return 0; + } + + /* if it's impossible to get cycle counts, + then use original gettimeofday() */ + if (impossible) + return gettimeofday(tv, tz); + + if (tv) { + GET_CC(cc_now); + + /* when overflow happens, we need to take care of the carry bit, + otherwise we get wrong result since we are using unsigned variables. + assuming cycle counter starts from zero at time zero, + this would happen after 136 years on 4GHz CPU. + however, lets just play on the safer side. + */ + + cc_diff = (cc_now < cc_start) ? cc_now + (MAX_64_BIT_NUM - cc_start) + : cc_now - cc_start; + usec = (unsigned long long)(cc_diff / cpu_speed); + + *tv = start; + tv->tv_sec += (usec / 1000000); + tv->tv_usec += (usec % 1000000); + + if (tv->tv_usec >= 1000000) { + tv->tv_sec++; + tv->tv_usec -= 1000000; + } + } + + return 0; +} + +/*--------------------------------------------------------------------------*/ +time_t +timeex(time_t *t) +{ + /* simple version of time() */ + struct timeval s; + + gettimeofdayex(&s, NULL); + if (t) + *t = (time_t)s.tv_sec; + + return (time_t)s.tv_sec; +} + +#else +#include +/*--------------------------------------------------------------------------*/ +int +gettimeofdayex(struct timeval *tv, struct timezone *tz) +{ + return(gettimeofday(tv, tz)); +} +/*--------------------------------------------------------------------------*/ +time_t +timeex(time_t *t) +{ + return(time(t)); +} +/*--------------------------------------------------------------------------*/ +#endif diff --git a/gettimeofdayex.h b/gettimeofdayex.h new file mode 100644 index 0000000..b9d6108 --- /dev/null +++ b/gettimeofdayex.h @@ -0,0 +1,8 @@ +#ifndef _GETTIMEOFDAYEX_H_ +#define _GETTIMEOFDAYEX_H_ +#include + +int gettimeofdayex(struct timeval *tv, struct timezone *tz); +time_t timeex(time_t *t); + +#endif // _GETTIMEOFDAYEX_H_ diff --git a/ports.h b/ports.h new file mode 100644 index 0000000..807dd2e --- /dev/null +++ b/ports.h @@ -0,0 +1,102 @@ +#ifndef _PORTS_H_ +#define _PORTS_H_ + +#define PORTS_COBLITZ + +#ifdef PORTS_CODEEN + +#define HOMEDIR "/home/princeton_codeen" +/* options in prox.conf or defined implicitly + 1161 snmpPort + 31415 APIDummyServPort + 3130 ICPPort */ +/* different ports for CoDNS and CoDNS2 */ +#ifdef CODNS2 +#define CODNS_PORT 24119 /* CODNS2 PORT */ +#define CODNS_STAT_PORT 24118 /* CODNS2 STATISTICS PORT */ +#define CODNS_HB_PORT 24121 +#else +#define CODNS_PORT 4119 /* CODNS PORT */ +#define CODNS_STAT_PORT 4118 /* CODNS STATISTICS PORT */ +#define CODNS_HB_PORT 4121 +#endif + +#define HB_PORT 23127 /* CoDeeN heartbeat port */ +#define CODEMON_PORT 2123 +/* 3124 proxy listen port */ +#define CODEPLOY_PORT 2125 +#define FAKE_WEBSERVER_PORT 3126 +#define FAKE_WS_PORT_STR "3126" +/* 3127 proxy listen port */ +/* 3128 proxy listen port */ +#define MAIN_PROXYPORT_STR "3128" +#define CODEEN_TM_PORT 3129 +#define PROXY_PORT 3128 +#define PROXY_PORT2 3127 +#define PROXY_PORT3 3124 + +/* used by redir_helper.c */ +#define QUERY_PORT 3122 +#define RTTSNAP_PORT 3110 + +#elif defined(PORTS_COBLITZ) + +#define HOMEDIR "/home/princeton_coblitz" +/* options in prox.conf + 2111 snmpPort + 2112 APIDummyServPort + 2113 ICPPort */ +#define CODNS_PORT 2119 /* CODNS PORT */ +#define CODNS_STAT_PORT 2120 /* CODNS STATISTICS PORT */ +#define CODNS_HB_PORT 2121 +#define HB_PORT 2122 /* CoDeeN heartbeat port */ +#define CODEMON_PORT 23126 +/* 2124 proxy listen port */ +#define CODEPLOY_PORT 3125 +#define FAKE_WEBSERVER_PORT 2126 +#define FAKE_WS_PORT_STR "2126" +/* 2127 proxy listen port */ +/* 2128 proxy listen port */ +#define MAIN_PROXYPORT_STR "2128" +#define CODEEN_TM_PORT 2129 +#define PROXY_PORT 2128 +#define PROXY_PORT2 2127 +#define PROXY_PORT3 2124 + +/* used by redir_helper.c */ +#define QUERY_PORT 2122 +#define RTTSNAP_PORT 2110 + +#else +#error "need to define either PORTS_CODEEN or PORTS_COBLITZ" +#endif + + +#endif + + +/* + +other changed files: prox_monitor prox.conf + + 4119 tcp 415 princeton_codeen codns + 4119 udp 415 princeton_codeen codns + 4118 tcp 412 princeton_codeen codns + 4121 udp 411 princeton_codeen codns + +23126 tcp 412 princeton_codeen codemon +23126 udp 412 princeton_codeen codemon +23127 tcp 399 princeton_codeen codeen +23127 udp 399 princeton_codeen codeen + 3126 tcp 399 princeton_codeen codeen + 1161 udp 401 princeton_codeen specified via snmpPort +31415 tcp 401 princeton_codeen can be specified via APIDummyServPort + 3124 tcp 399 princeton_codeen specified in prox.conf + 3127 tcp 399 princeton_codeen specified in prox.conf + 3128 tcp 398 princeton_codeen specified in prox.conf + 3129 tcp 399 princeton_codeen codeen traffic mon + 3130 udp 401 princeton_codeen can be specified from ICPPort + 3125 tcp 410 princeton_codeen codeploy + 3135 tcp 370 princeton_codeen + +*/ diff --git a/vnet_main.c b/vnet_main.c new file mode 100644 index 0000000..4dfe7b4 --- /dev/null +++ b/vnet_main.c @@ -0,0 +1,1202 @@ +/* + * VServer IP isolation. + * + * This file implements netfilter hooks and AF_INET socket function + * overrides. + * + * Mark Huang + * Copyright (C) 2004 The Trustees of Princeton University + * + * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vnet_config.h" +#include "vnet.h" +#include "vnet_dbg.h" +#include "vnet_compat.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + +#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + +#include + +static inline void +vnet_timewait_put(struct sock* sk) +{ + inet_twsk_put((struct inet_timewait_sock *)sk); +} + +static inline struct sock* +vnet_tcp_lookup(u32 src_ip, u16 src_port, + u32 ip, u16 port, int dif) +{ + return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif); +} + +static inline int vnet_iif(const struct sk_buff *skb) +{ + return inet_iif(skb); +} +#endif + +#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12) + +#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + +static inline void +vnet_timewait_put(struct sock* sk) +{ + /* net/tcp.h */ + tcp_tw_put((struct tcp_tw_bucket*)sk); +} + +static inline struct sock* +vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr,u16 dport, int dif) +{ + extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int); + return tcp_v4_lookup(saddr, sport, daddr, dport, dif); +} + +/* same as tcp_v4_iff() in net/ipv4/tcp_ipv4. */ +static inline int vnet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} +#endif + +#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX +#warning DEMUX FUNCTIONALITY NOT SUPPORTED +#endif + +int vnet_verbose = 1; + +/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2, + * etc. so that we can represent multiple bandwidth limits. The 1:1 + * subclass has children named 1:1000, 1:1001, etc., one for each + * context (up to 4096). Similarly, the 1:2 subclass has children + * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents + * the node bandwidth cap and 1:1000 represents the root context's + * share of it. */ +int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000); + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \ + (1 << NF_IP_LOCAL_OUT) | \ + (1 << NF_IP_POST_ROUTING)) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) + +/* Standard entry. */ +struct ipt_standard +{ + struct ipt_entry entry; + struct ipt_standard_target target; +}; + +struct ipt_error_target +{ + struct ipt_entry_target target; + char errorname[IPT_FUNCTION_MAXNAMELEN]; +}; + +struct ipt_error +{ + struct ipt_entry entry; + struct ipt_error_target target; +}; + +#endif + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata = +{ + .repl = + { + .name = "vnet", + .valid_hooks = FILTER_VALID_HOOKS, + .num_entries = 4, + .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + .hook_entry = { [NF_IP_LOCAL_IN] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, + .underflow = { [NF_IP_LOCAL_IN] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, + }, + + .entries = + { + /* LOCAL_IN: currently unused */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + + /* LOCAL_OUT: used for logging */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + + /* POST_ROUTING: used for priority classification */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + }, + + /* ERROR */ + .term = + { + .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), }, + .target = { .target = { .u = { .user = { .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, }, }, }, + .errorname = "ERROR", }, + }, +}; + +static struct ipt_table vnet_table = { + .name = "vnet", +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) + .table = &initial_table.repl, +#endif + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .af = AF_INET, +#endif +}; + +static inline u_int16_t +get_dst_port(struct ip_conntrack_tuple *tuple) +{ + switch (tuple->dst.protonum) { + case IPPROTO_GRE: + /* XXX Truncate 32-bit GRE key to 16 bits */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) + return tuple->dst.u.gre.key; +#else + return htons(ntohl(tuple->dst.u.gre.key)); +#endif + case IPPROTO_ICMP: + /* Bind on ICMP echo ID */ + return tuple->src.u.icmp.id; + case IPPROTO_TCP: + return tuple->dst.u.tcp.port; + case IPPROTO_UDP: + return tuple->dst.u.udp.port; + default: + return tuple->dst.u.all; + } +} + +static inline u_int16_t +get_src_port(struct ip_conntrack_tuple *tuple) +{ + switch (tuple->dst.protonum) { + case IPPROTO_GRE: + /* XXX Truncate 32-bit GRE key to 16 bits */ + return htons(ntohl(tuple->src.u.gre.key)); + case IPPROTO_ICMP: + /* Bind on ICMP echo ID */ + return tuple->src.u.icmp.id; + case IPPROTO_TCP: + return tuple->src.u.tcp.port; + case IPPROTO_UDP: + return tuple->src.u.udp.port; + default: + return tuple->src.u.all; + } +} + + + +static unsigned int +vnet_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + u_int8_t protocol; + u_int32_t ip; + u_int16_t port; + struct bind_key *key; + xid_t xid; + unsigned int verdict; + int priority; + struct sock *sk; + int need_to_free_sk = 0; + + ct = ip_conntrack_get(*pskb, &ctinfo); + dir = CTINFO2DIR(ctinfo); + + /* Default to marking packet with root context ID */ + xid = 0; + + switch (hook) { + + case NF_IP_LOCAL_IN: + /* Multicast to 224.0.0.1 is one example */ + if (!ct) + break; + + /* Determine if the packet is destined for a bound port */ + protocol = ct->tuplehash[dir].tuple.dst.protonum; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + protocol == (*pskb)->nh.iph->protocol); + ip = ct->tuplehash[dir].tuple.dst.ip; + port = get_dst_port(&ct->tuplehash[dir].tuple); + + /* Check if the port is bound */ + key = bind_get(protocol, ip, port, NULL); + + if (key && key->sk != NULL) { + + /* A new or established connection to a bound port */ + sk = key->sk; + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + /* If the bound socket is a real TCP socket, then the context that + * bound the port could have re-assigned an established connection + * socket to another context. See if this is the case. + */ + if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { + struct sock *tcp_sk; + u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip; + u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple); + + tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb)); + if (tcp_sk) { + if (tcp_sk->sk_state == TCP_TIME_WAIT) { + sock_put(tcp_sk); + } else { + dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", + get_sk_xid(tcp_sk), NIPQUAD(src_ip), ntohs(src_port), NIPQUAD(ip), ntohs(port)); + sk = tcp_sk; + need_to_free_sk = 1; + } + /* Remember to sock_put()! */ + } + } +#endif + + /* Indicate to the stack that the packet was "expected", so that it does + * not generate a TCP RST or ICMP Unreachable message. This requires a + * kernel patch. + */ + if (sk->sk_type == SOCK_RAW) + (*pskb)->sk = sk; + + assert(sk); + xid = get_sk_xid(sk); + + /* Steal the reply end of the connection */ + if (get_ct_xid(ct, !dir) != xid) { + dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, + key ? "" : "un", print_protocol(protocol), + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), ntohs(ct->tuplehash[!dir].tuple.dst.u.all), + get_ct_xid(ct, !dir)); + set_ct_xid(ct, !dir, xid); + } + + /* Store the owner (if any) of the other side of the connection (if + * localhost) in the peercred struct. + */ + sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir); + + if (ctinfo == IP_CT_NEW) { + dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n", + print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid); + } + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + if (need_to_free_sk) { + /* + if (sk->sk_state == TCP_TIME_WAIT) + vnet_timewait_put(sk); + else*/ + sock_put(sk); + need_to_free_sk=0; + } +#endif + bind_put(key); + + } else if ((int) get_ct_xid(ct, !dir) == -1) { + /* A new connection to an unbound port */ + if (ctinfo == IP_CT_NEW) { + dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n", + print_protocol(protocol), NIPQUAD(ip), ntohs(port)); + } + } else { + /* A new or established connection to an unbound port that could be + * associated with an active socket ("could be" because the socket + * could be closed and the connection in a WAIT state). In any case, + * give it to the last owner of the connection. + */ + xid = get_ct_xid(ct, !dir); + } + + break; + + case NF_IP_LOCAL_OUT: + /* Get the context ID of the sender */ + assert((*pskb)->sk); + xid = get_sk_xid((*pskb)->sk); + + /* Default class */ + priority = vnet_root_class; + + if (ct) { + protocol = ct->tuplehash[dir].tuple.dst.protonum; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + protocol == (*pskb)->nh.iph->protocol); + ip = ct->tuplehash[dir].tuple.src.ip; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + ip == __constant_htonl(INADDR_ANY) || ip == (*pskb)->nh.iph->saddr); + port = get_src_port(&ct->tuplehash[dir].tuple); + } else { + protocol = port = 0; + } + + if (xid) { + /* Multicast to 224.0.0.1 is one example */ + if (!ct) { + dbg("vnet_out:%d: dropping untrackable IP packet\n", xid); + return NF_DROP; + } + + /* XXX Is this guaranteed? */ + if ((*pskb)->len < sizeof(struct iphdr)) { + dbg("vnet_out:%d: dropping runt IP packet\n", xid); + return NF_DROP; + } + + /* Check source IP address */ + if (inet_addr_type(ip) != RTN_LOCAL) { + dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n", xid, + NIPQUAD(ip)); + return NF_DROP; + } + + /* Sending of ICMP error messages not allowed */ + if (protocol == IPPROTO_ICMP) { + struct icmphdr *icmph = (struct icmphdr *)((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4)); + + if ((unsigned char *) &icmph[1] > (*pskb)->tail) { + dbg("vnet_out:%d: dropping runt ICMP packet\n", xid); + return NF_DROP; + } + + switch (icmph->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + /* Guaranteed by icmp_pkt_to_tuple() */ + assert(port == icmph->un.echo.id); + break; + default: + dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid); + return NF_DROP; + } + } + } else { + /* Let root send anything it wants */ + } + + if (ct) { + /* Check if the port is bound by someone else */ + key = bind_get(protocol, ip, port, NULL); + } else { + assert(xid == 0); + key = NULL; + } + + if (key && key->sk != NULL) { + /* A new or established connection from a bound port */ + assert(ct); + + sk = key->sk; + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + /* If the bound socket is a real TCP socket, then the context that + * bound the port could have re-assigned an established connection + * socket to the sender's context. See if this is the case. + */ + if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM && get_sk_xid(sk) != xid) { + struct sock *tcp_sk; + u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip; + u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple); + + tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb)); + if (tcp_sk) { + if (tcp_sk->sk_state == TCP_TIME_WAIT) { + sock_put(tcp_sk); + //vnet_timewait_put(tcp_sk); + } else { + need_to_free_sk = 1; + sk = tcp_sk; + /* Remember to sock_put()! */ + } + } + } +#endif + + verdict = NF_ACCEPT; + + /* Stealing connections from established sockets is not allowed */ + assert(sk); + if (get_sk_xid(sk) != xid) { + if (xid) { + dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n", xid, + print_protocol(protocol), NIPQUAD(ip), ntohs(port), get_sk_xid(sk)); + verdict = NF_DROP; + } else { + /* Let root send whatever it wants but do not steal the packet or + * connection. Kernel sockets owned by root may send packets on + * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or + * TIME_WAIT). + */ + xid = get_sk_xid(sk); + } + } + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + if (need_to_free_sk) { + /* + if (sk->sk_state == TCP_TIME_WAIT) + vnet_timewait_put(sk); + else */ + sock_put(sk); + need_to_free_sk = 0; + } +#endif + bind_put(key); + + if (verdict == NF_DROP) + goto done; + } else { + /* A new or established or untrackable connection from an unbound port */ + + /* Reserved ports must be bound. Usually only root is capable of + * CAP_NET_BIND_SERVICE. + */ + if (xid && + (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) && + ntohs(port) < PROT_SOCK) { + assert(ct); + dbg("vnet_out:%d: %s port %u is reserved\n", xid, + print_protocol(protocol), ntohs(port)); + return NF_DROP; + } + } + + if (ct) { + /* Steal the connection */ + if (get_ct_xid(ct, dir) != xid) { + dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, + key ? "" : "un", print_protocol(protocol), + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), + get_ct_xid(ct, dir)); + set_ct_xid(ct, dir, xid); + } + + /* Classify traffic once per connection */ + if (ct->priority == (u_int32_t) -1) { + /* The POSTROUTING chain should classify packets into a minor subclass + * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet + * MARK early so that rules can take xid into account. */ + set_skb_xid(*pskb, xid); + (*pskb)->priority = priority; + (void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL); + priority = (*pskb)->priority | xid; + dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n", xid, + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), + TC_H_MAJ(priority) >> 16, TC_H_MIN(priority)); + ct->priority = priority; + } else + priority = ct->priority; + } else { + assert(xid == 0); + } + + /* Set class */ + (*pskb)->priority = priority; + + break; + + default: + /* Huh? */ + assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT); + break; + } + + /* Mark packet */ + set_skb_xid(*pskb, xid); + +#ifdef VNET_DEBUG + if (vnet_verbose >= 3) { + if (ct) + print_conntrack(ct, ctinfo, hook); + if (vnet_verbose >= 4) + print_packet(*pskb); + } +#endif + + get_verdict: + verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL); + + /* Pass to network taps */ + if (verdict == NF_ACCEPT) + verdict = packet_hook(*pskb, hook); + + done: + return verdict; +} + +static struct nf_hook_ops vnet_ops[] = { + { + .hook = vnet_hook, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .owner = THIS_MODULE, +#endif + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_LAST, + }, + { + .hook = vnet_hook, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .owner = THIS_MODULE, +#endif + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_LAST, + }, +}; + +/* Exported by net/ipv4/af_inet.c */ +extern struct net_proto_family inet_family_ops; +extern struct proto_ops inet_stream_ops; +extern struct proto_ops inet_dgram_ops; +extern struct proto_ops inet_sockraw_ops; +extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags); +extern int inet_listen(struct socket *sock, int backlog); +extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags); +extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size); +extern int inet_release(struct socket *sock); + +/* Exported by net/ipv4/tcp_ipv4.c */ +extern struct proto tcp_prot; +extern int tcp_port_rover; +extern int sysctl_local_port_range[2]; + +/* Exported by net/ipv4/udp.c */ +extern struct proto udp_prot; +extern int udp_port_rover; + +/* Functions that are not exported */ +static int (*inet_create)(struct socket *sock, int protocol); +static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); +static void (*tcp_v4_hash)(struct sock *sk); +static void (*tcp_v4_unhash)(struct sock *sk); +static void (*udp_v4_hash)(struct sock *sk); +static void (*udp_v4_unhash)(struct sock *sk); + +static int +vnet_inet_create(struct socket *sock, int protocol) +{ + int ret; + + if (sock->type == SOCK_RAW) { + /* Temporarily give CAP_NET_RAW to root VServer accounts */ + if (current->euid) + return -EPERM; + cap_raise(current->cap_effective, CAP_NET_RAW); + } + ret = inet_create(sock, protocol); + if (sock->type == SOCK_RAW) + cap_lower(current->cap_effective, CAP_NET_RAW); + if (ret) + return ret; + + if (sock->type == SOCK_RAW) { + struct sock *sk = sock->sk; + struct inet_opt *inet = inet_sk(sk); + /* Usually redundant and unused */ + assert(inet->sport == htons(inet->num)); + /* So we can track double raw binds */ + inet->sport = 0; + } + + return ret; +} + +/* Make sure our bind table gets updated whenever the stack decides to + * unhash or rehash a socket. + */ +static void +vnet_inet_unhash(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + struct bind_key *key; + + key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk); + if (key) { + dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + bind_del(key); + bind_put(key); + } + + if (sk->sk_protocol == IPPROTO_TCP) + tcp_v4_unhash(sk); + else if (sk->sk_protocol == IPPROTO_UDP) + udp_v4_unhash(sk); +} + +static void +vnet_inet_hash(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + + if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) { + dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + } + + if (sk->sk_protocol == IPPROTO_TCP) + tcp_v4_hash(sk); + else if (sk->sk_protocol == IPPROTO_UDP) + udp_v4_hash(sk); +} + +/* Port reservation */ +static int +vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct inet_opt *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct bind_key *key; + int ret; + + /* Bind socket */ + if ((ret = inet_bind(sock, uaddr, addr_len))) + return ret; + + lock_sock(sk); + + /* Backward compatibility with safe raw sockets */ + if (sock->type == SOCK_RAW) { + /* Runt sockaddr */ + if (addr_len < sizeof(struct sockaddr_in)) + ret = -EINVAL; + /* Non-local bind */ + else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) && inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL) + ret = -EINVAL; + /* Unspecified port */ + else if (!sin->sin_port) + ret = -EINVAL; + /* Reserved port */ + else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) && + ntohs(sin->sin_port) < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + ret = -EACCES; + /* Double bind */ + else if (inet->sport) + ret = -EINVAL; + if (ret) + goto done; + inet->saddr = sin->sin_addr.s_addr; + inet->sport = sin->sin_port; + } + + key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL); + if (key) { + /* + * If we are root or own the already bound socket, and + * SO_REUSEADDR has been set on both. + */ + if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) && + key->sk->sk_reuse && sk->sk_reuse) { + if (key->ip == __constant_htonl(INADDR_ANY)) { + /* Keep the current bind key */ + bind_put(key); + goto done; + } else if (inet->saddr == __constant_htonl(INADDR_ANY)) { + /* Consider the port to be bound to this socket now */ + bind_del(key); + } + } + bind_put(key); + } + + if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) { + dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + } + + done: + release_sock(sk); + return ret; +} + +/* Override TCP and UDP port rovers since they do not know about raw + * socket binds. + */ +static int +vnet_autobind(struct sock *sk) +{ + int (*get_port)(struct sock *, unsigned short); + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int port; + struct inet_opt *inet = inet_sk(sk); + struct bind_key *key; + + /* Must be locked */ + assert(sock_owned_by_user(sk)); + + /* Already bound to a port */ + if (inet->num) + return 0; + + if (sk->sk_protocol == IPPROTO_TCP) { + get_port = tcp_prot.get_port; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) + /* Approximate the tcp_v4_get_port() strategy */ + port = tcp_port_rover + 1; +#else + /* Approximate the inet_csk_get_port() strategy */ + port = net_random() % (high - low) + low; +#endif + } else if (sk->sk_protocol == IPPROTO_UDP) { + get_port = udp_prot.get_port; + port = udp_port_rover; + } else if (sk->sk_prot->get_port) { + err("vnet_get_port:%d: %s unhandled\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol)); + if (sk->sk_prot->get_port(sk, 0)) + return -EAGAIN; + inet->sport = htons(inet->num); + return 0; + } else { + return 0; + } + + dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); + + /* Find a free port by linear search. Note that the standard + * udp_v4_get_port() function attempts to pick a port that + * keeps its hash tables balanced. If the UDP hash table keeps + * getting bombed, we should try implementing this strategy + * here. + */ + do { + if (port < low || port > high) + port = low; + + /* XXX We could probably try something more clever + * like checking to see if the bound socket is a + * regular TCP socket owned by the same context (or we + * are root) and, if so, letting tcp_v4_get_port() + * apply its fast reuse logic to determine if the port + * can be reused. + */ + if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) { + dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + goto next; + } + + if (get_port(sk, port)) { + /* Can happen if we are unloaded when there are active sockets */ + dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk); + assert(key); + bind_del(key); + bind_put(key); + } else { + assert(port == inet->num); + inet->sport = htons(inet->num); + break; + } + next: + port++; + } while (--remaining > 0); + + if (sk->sk_protocol == IPPROTO_UDP) + udp_port_rover = port; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) + else if (sk->sk_protocol == IPPROTO_TCP) + tcp_port_rover = port; +#endif + + if (remaining <= 0) { + err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); + return -EAGAIN; + } else { + dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + return 0; + } +} + +static int +vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_stream_connect() */ + if (uaddr->sa_family != AF_UNSPEC && + sock->state == SS_UNCONNECTED && + sk->sk_state == TCP_CLOSE) { + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_stream_connect(sock, uaddr, addr_len, flags); +} + +static int +vnet_inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_listen() */ + if (sock->type == SOCK_STREAM && + sock->state == SS_UNCONNECTED && + sk->sk_state == TCP_CLOSE) { + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_listen(sock, backlog); +} + +static int +vnet_inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_dgram_connect() */ + if (uaddr->sa_family != AF_UNSPEC) { + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_dgram_connect(sock, uaddr, addr_len, flags); +} + +static int +vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + + release_sock(sk); + + return inet_sendmsg(iocb, sock, msg, size); +} + +static ssize_t +vnet_inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + + release_sock(sk); + + return inet_sendpage(sock, page, offset, size, flags); +} + +static int +vnet_inet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct inet_opt *inet = inet_sk(sk); + struct bind_key *key; + + /* Partial socket created by accept() */ + if (!sk) + goto done; + + lock_sock(sk); + + key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk); + if (key) { + dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + bind_del(key); + bind_put(key); + } + + release_sock(sk); + + done: + return inet_release(sock); +} + +/* Sanity check */ +#define override_op(op, from, to) do { assert((op) == (from)); (op) = (to); } while (0) + +static int __init +vnet_init(void) +{ + int ret; + + /* Initialize bind table */ + ret = bind_init(); + if (ret < 0) + return ret; + + /* Register /proc entries */ + ret = proc_init(); + if (ret < 0) + goto cleanup_bind; + + /* Register dummy netdevice */ + ret = packet_init(); + if (ret < 0) + goto cleanup_proc; + + /* Register tap netdevice */ + ret = tun_init(); + if (ret < 0) + goto cleanup_packet; + + /* Get pointers to unexported functions */ + inet_create = inet_family_ops.create; + inet_sendpage = inet_dgram_ops.sendpage; + tcp_v4_hash = tcp_prot.hash; + tcp_v4_unhash = tcp_prot.unhash; + udp_v4_hash = udp_prot.hash; + udp_v4_unhash = udp_prot.unhash; + + /* Override PF_INET socket operations */ + override_op(inet_family_ops.create, inet_create, vnet_inet_create); + override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind); + override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect); + override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen); + override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); + override_op(inet_stream_ops.release, inet_release, vnet_inet_release); + override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind); + override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect); + override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); + override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage); + override_op(inet_dgram_ops.release, inet_release, vnet_inet_release); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind); + override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect); + override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg); + override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage); + override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release); +#endif + override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash); + override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash); + override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash); + override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash); + + /* Register table */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) + ret = ipt_register_table(&vnet_table, &initial_table.repl); +#else + ret = ipt_register_table(&vnet_table); +#endif + if (ret < 0) + goto cleanup_override; + + /* Register hooks */ + ret = nf_register_hook(&vnet_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&vnet_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + /* Enables any runtime kernel support for VNET */ + vnet_active = 1; + + /* Print banner */ + printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n"); + + return ret; + + cleanup_hook0: + nf_unregister_hook(&vnet_ops[0]); + cleanup_table: + ipt_unregister_table(&vnet_table); + cleanup_override: + inet_family_ops.create = inet_create; + inet_stream_ops.bind = inet_bind; + inet_stream_ops.connect = inet_stream_connect; + inet_stream_ops.listen = inet_listen; + inet_stream_ops.sendmsg = inet_sendmsg; + inet_stream_ops.release = inet_release; + inet_dgram_ops.bind = inet_bind; + inet_dgram_ops.connect = inet_dgram_connect; + inet_dgram_ops.sendmsg = inet_sendmsg; + inet_dgram_ops.sendpage = inet_sendpage; + inet_dgram_ops.release = inet_release; + tun_cleanup(); + cleanup_packet: + packet_cleanup(); + cleanup_proc: + proc_cleanup(); + cleanup_bind: + bind_cleanup(); + + return ret; +} + +static void __exit +vnet_exit(void) +{ + unsigned int i; + + /* Print banner */ + printk("VNET: exiting\n"); + + /* Disables any runtime kernel support for VNET */ + vnet_active = 0; + + /* Stop handling packets first */ + for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&vnet_ops[i]); + + ipt_unregister_table(&vnet_table); + + /* Stop handling PF_INET socket operations */ + override_op(inet_family_ops.create, vnet_inet_create, inet_create); + override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind); + override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect); + override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen); + override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); + override_op(inet_stream_ops.release, vnet_inet_release, inet_release); + override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind); + override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect); + override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); + override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage); + override_op(inet_dgram_ops.release, vnet_inet_release, inet_release); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind); + override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect); + override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg); + override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage); + override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release); +#endif + override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash); + override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash); + override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash); + override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash); + + /* Disable tap netdevice */ + tun_cleanup(); + + /* Disable vnet netdevice and stop handling PF_PACKET sockets */ + packet_cleanup(); + + /* Unregister /proc handlers */ + proc_cleanup(); + + /* Cleanup bind table (must be after nf_unregister_hook()) */ + bind_cleanup(); +} + +module_init(vnet_init); +module_exit(vnet_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mark Huang "); +MODULE_DESCRIPTION("VServer IP isolation");