From 76b34e0978fa5e5faf93a0b7f77c4efb5d2f745a Mon Sep 17 00:00:00 2001 From: KyoungSoo Park Date: Wed, 25 Apr 2007 16:07:02 +0000 Subject: [PATCH 1/1] codemux --- Makefile | 16 + appdef.h | 53 ++ applib.c | 1156 ++++++++++++++++++++++++++++++++++++++++++ applib.h | 130 +++++ codemux.c | 944 ++++++++++++++++++++++++++++++++++ codemux.conf | 6 + codemux.initscript | 69 +++ codemux.spec | 63 +++ codns.h | 42 ++ debug.h | 97 ++++ gettimeofdayex.c | 149 ++++++ gettimeofdayex.h | 8 + ports.h | 102 ++++ vnet_main.c | 1202 ++++++++++++++++++++++++++++++++++++++++++++ 14 files changed, 4037 insertions(+) create mode 100644 Makefile create mode 100644 appdef.h create mode 100644 applib.c create mode 100644 applib.h create mode 100644 codemux.c create mode 100644 codemux.conf create mode 100644 codemux.initscript create mode 100644 codemux.spec create mode 100644 codns.h create mode 100644 debug.h create mode 100644 gettimeofdayex.c create mode 100644 gettimeofdayex.h create mode 100644 ports.h create mode 100644 vnet_main.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..480c997 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +CC = gcc +CFLAGS = -Wall -O -g + +TARGS = codemux + +all: ${TARGS} + +clean: + rm -f ${TARGS} *.o *~ + +SHARED_OBJ = applib.o gettimeofdayex.o + +CODEMUX_OBJ = codemux.o ${SHARED_OBJ} + +codemux: ${CODEMUX_OBJ} + diff --git a/appdef.h b/appdef.h new file mode 100644 index 0000000..cc82ae8 --- /dev/null +++ b/appdef.h @@ -0,0 +1,53 @@ +#ifndef _APPDEF_H_ +#define _APPDEF_H_ + +/* useful definitions */ +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef SUCCESS +#define SUCCESS 0 +#endif + +#ifndef FAILURE +#define FAILURE (-1) +#endif + +#ifndef INT_MAX +#ifdef MAX_INT +#define INT_MAX MAXINT +#endif +#endif + +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) +#endif + +#ifndef NELEMS +#define NELEMS(x) (sizeof(x)/sizeof(x[0])) +#endif + +#ifndef SKIP_CHARS +#define SKIP_CHARS(x) while(!isspace((int)*x)) (x)++ +#endif + +#ifndef SKIP_SPACES +#define SKIP_SPACES(x) while (isspace((int)*x)) (x)++ +#endif + +#ifndef SKIP_WORD +#define SKIP_WORD(x) do { SKIP_SPACES(x); \ + SKIP_CHARS(x); \ + SKIP_SPACES(x);} while(0) +#endif + +#endif //_APPDEF_H_ diff --git a/applib.c b/applib.c new file mode 100644 index 0000000..a91c6aa --- /dev/null +++ b/applib.c @@ -0,0 +1,1156 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "applib.h" +#include "debug.h" +#include "codns.h" + + + +int defaultTraceSync = TRUE; + +/*-----------------------------------------------------------------*/ +float +DiffTimeVal(const struct timeval *start, const struct timeval *end) +{ + struct timeval temp; + if (end == NULL) { + end = &temp; + gettimeofday(&temp, NULL); + } + return(end->tv_sec - start->tv_sec + + 1e-6*(end->tv_usec - start->tv_usec)); +} +/*-----------------------------------------------------------------*/ +int +CreatePrivateAcceptSocketEx(int portNum, int nonBlocking, int loopbackOnly) +{ + int doReuse = 1; + struct linger doLinger; + int sock; + struct sockaddr_in sa; + + /* Create socket. 
*/ + if ((sock = socket(PF_INET, SOCK_STREAM, 0)) == -1) + return(-1); + + /* don't linger on close */ + doLinger.l_onoff = doLinger.l_linger = 0; + if (setsockopt(sock, SOL_SOCKET, SO_LINGER, + &doLinger, sizeof(doLinger)) == -1) { + close(sock); + return(-1); + } + + /* reuse addresses */ + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + &doReuse, sizeof(doReuse)) == -1) { + close(sock); + return(-1); + } + + if (nonBlocking) { + /* make listen socket nonblocking */ + if (fcntl(sock, F_SETFL, O_NDELAY) == -1) { + close(sock); + return(-1); + } + } + + /* set up info for binding listen */ + memset(&sa, 0, sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_addr.s_addr = (loopbackOnly) ? htonl(INADDR_LOOPBACK) + : htonl(INADDR_ANY); + sa.sin_port = htons(portNum); + + /* bind the sock */ + if (bind(sock, (struct sockaddr *) &sa, sizeof(sa)) == -1) { + close(sock); + return(-1); + } + + /* start listening */ + if (listen(sock, 32) == -1) { + close(sock); + return(-1); + } + + return(sock); +} +/*-----------------------------------------------------------------*/ +int +CreatePrivateAcceptSocket(int portNum, int nonBlocking) +{ + return CreatePrivateAcceptSocketEx(portNum, nonBlocking, FALSE); +} +/*-----------------------------------------------------------------*/ +int +CreatePublicUDPSocket(int portNum) +{ + struct sockaddr_in hb_sin; + int sock; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) + return(-1); + + memset(&hb_sin, 0, sizeof(hb_sin)); + hb_sin.sin_family = AF_INET; + hb_sin.sin_addr.s_addr = INADDR_ANY; + hb_sin.sin_port = htons(portNum); + + if (bind(sock, (struct sockaddr *) &hb_sin, sizeof(hb_sin)) < 0) { + close(sock); + return(-1); + } + return(sock); +} +/*-----------------------------------------------------------------*/ +int +MakeConnection(char *name, in_addr_t netAddr, int portNum, int nonBlocking) +{ + struct sockaddr_in saddr; + int fd; + + if (name != NULL) { + struct hostent *ent; + if ((ent = gethostbyname(name)) == NULL) { 
+ if (hdebugLog) + TRACE("failed in name lookup - %s\n", name); + return(-1); + } + memcpy(&netAddr, ent->h_addr, sizeof(netAddr)); + } + + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + if (hdebugLog) + TRACE("failed creating socket - %d\n", errno); + return(-1); + } + + if (nonBlocking) { + if (fcntl(fd, F_SETFL, O_NDELAY) < 0) { + if (hdebugLog) + TRACE("failed fcntl'ing socket - %d\n", errno); + close(fd); + return(-1); + } + } + + saddr.sin_family = AF_INET; + saddr.sin_addr.s_addr = netAddr; + saddr.sin_port = htons(portNum); + + if (connect(fd, (struct sockaddr *) &saddr, + sizeof(struct sockaddr_in)) < 0) { + if (errno == EINPROGRESS) + return(fd); + if (hdebugLog) + TRACE("failed connecting socket - %d\n", errno); + close(fd); + return(-1); + } + + return(fd); +} +/*-----------------------------------------------------------------*/ +int +MakeLoopbackConnection(int portNum, int nonBlocking) +{ + return(MakeConnection(NULL, htonl(INADDR_LOOPBACK), + portNum, nonBlocking)); +} +/*-----------------------------------------------------------------*/ +char * +GetField(const unsigned char *start, int whichField) +{ + int currField; + + /* move to first non-blank char */ + while (isspace(*start)) + start++; + + if (*start == '\0') + return(NULL); + + for (currField = 0; currField < whichField; currField++) { + /* move over this field */ + while (*start != '\0' && (!isspace(*start))) + start++; + /* move over blanks before next field */ + while (isspace(*start)) + start++; + if (*start == '\0') + return(NULL); + } + return((char *) start); +} +/* ---------------------------------------------------------------- */ +char * +GetWord(const unsigned char *start, int whichWord) +{ + /* returns a newly allocated string containing the desired word, + or NULL if there was a problem */ + unsigned char *temp; + int len = 0; + char *res; + + temp = (unsigned char *) GetField(start, whichWord); + if (!temp) + return(NULL); + while (!(temp[len] == '\0' || 
isspace(temp[len]))) + len++; + if (!len) + NiceExit(-1, "internal error"); + res = (char *)xcalloc(1, len+1); + if (!res) + NiceExit(-1, "out of memory"); + memcpy(res, temp, len); + return(res); +} +/* ---------------------------------------------------------------- */ +char * +GetWordEx(const unsigned char *start, int whichWord, + char* dest, int max) +{ + /* returns a newly allocated string containing the desired word, + or NULL if there was a problem */ + unsigned char *temp; + int len = 0; + + memset(dest, 0, max); + temp = (unsigned char *) GetField(start, whichWord); + if (!temp) + return(NULL); + while (!(temp[len] == '\0' || isspace(temp[len]))) + len++; + if (!len) + NiceExit(-1, "internal error"); + if (len >= max-1) + len = max-1; + memcpy(dest, temp, len); + return(dest); +} +/*-----------------------------------------------------------------*/ +int +Base36Digit(int a) +{ + if (a < 0) + return('0'); + if (a > 35) + return('z'); + if (a < 10) + return('0' + a); + return('a' + a-10); +} +/*-----------------------------------------------------------------*/ +int +Base36To10(int a) +{ + if (a >= '0' && a <= '9') + return(a - '0'); + if (a >= 'a' && a <= 'z') + return(10 + a - 'a'); + if (a >= 'A' && a <= 'Z') + return(10 + a - 'A'); + return(0); +} +/*-----------------------------------------------------------------*/ +int +PopCount(int val) +{ + int i; + int count = 0; + + for (i = 0; i < (int)sizeof(int) * 8; i++) { + if (val & (1<= l2) { + l1--; + if (!memcmp(s1,s2,l2)) + return (char *) s1; + s1++; + } + return NULL; +} +#endif +/*-----------------------------------------------------------------*/ +/* same as strnstr, except case insensitive */ +char * strncasestr(const char * s1, int s1_len, const char * s2) +{ + int l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *) s1; + + l1 = s1_len; + while (l1 >= l2) { + l1--; + if (!strncasecmp(s1,s2,l2)) + return (char *) s1; + s1++; + } + return NULL; +} + 
+/*-----------------------------------------------------------------*/ +char * +StrdupLower(const char *orig) +{ + char *temp; + int i; + + if ((temp = xstrdup(orig)) == NULL) + NiceExit(-1, "no memory in strduplower"); + for (i = 0; temp[i]; i++) { + if (isupper((int) temp[i])) + temp[i] = tolower(temp[i]); + } + return(temp); +} + +/*-----------------------------------------------------------------*/ +void +StrcpyLower(char *dest, const char *src) +{ + /* copy 'src' to 'dest' in lower cases. + 'dest' should have enough free space to hold src */ + int i; + + for (i = 0; src[i]; i++) { + dest[i] = (isupper((int) src[i])) ? tolower(src[i]) : src[i]; + } + + /* mark it as NULL */ + dest[i] = 0; +} +/*-----------------------------------------------------------------*/ +void +StrcpyLowerExcept(char *dest, int dest_max, const char* src, const char* except) +{ + /* copy 'src' to 'dest' in lower cases, skipping the chars in except. + 'dest' should have enough free space to hold src */ + int i, j; + + if (src == NULL) + return; + + for (i = 0, j= 0; src[i]; i++) { + if (strchr(except, src[i])) + continue; + + if (j == dest_max - 1) + break; + dest[j++] = (isupper((int) src[i])) ? tolower(src[i]) : src[i]; + } + + /* mark it as NULL */ + dest[j] = 0; +} + +/*-----------------------------------------------------------------*/ +static char * +GetNextLineBack(FILE *file, int lower, int stripComments) +{ + /* reads the next non-blank line of the file. strips off any leading + space, any comments, and any trailing space. 
returns a lowercase + version of the line that has been malloc'd */ + char line[1024]; + + while (fgets(line, sizeof(line), file) != NULL) { + char *temp; + int len; + + /* strip off any comments, leading and trailing whitespace */ + if (stripComments) { + if ((temp = strchr(line, '#')) != NULL) + *temp = 0; + } + len = strlen(line); + while (len > 0 && isspace((int) line[len-1])) { + len--; + line[len] = 0; + } + temp = line; + while (isspace((int) *temp)) + temp++; + if (temp[0] == 0) + continue; /* blank line, move on */ + + if (lower) + return(StrdupLower(temp)); + return(xstrdup(temp)); + } + + return(NULL); +} +/*-----------------------------------------------------------------*/ +char * +GetLowerNextLine(FILE *file) +{ + return(GetNextLineBack(file, TRUE, TRUE)); +} +/*-----------------------------------------------------------------*/ +char * +GetNextLine(FILE *file) +{ + return(GetNextLineBack(file, FALSE, TRUE)); +} +/*-----------------------------------------------------------------*/ +char * +GetNextLineNoCommStrip(FILE *file) +{ + return(GetNextLineBack(file, FALSE, FALSE)); +} +/*-----------------------------------------------------------------*/ +int +DoesSuffixMatch(char *start, int len, char *suffix) +{ + int sufLen = strlen(suffix); + + if (len < 1) + len = strlen(start); + if (len < sufLen) + return(FALSE); + if (strncasecmp(start+len-sufLen, suffix, sufLen)) + return(FALSE); + return(TRUE); +} +/*-----------------------------------------------------------------*/ +int +DoesDotlessSuffixMatch(char *start, int len, char *suffix) +{ + /* ignores any dots on end of start, suffix */ + int sufLen = strlen(suffix); + + if (len < 1) + len = strlen(start); + + while (len > 1 && start[len-1] == '.') + len--; + while (sufLen > 1 && suffix[sufLen-1] == '.') + sufLen--; + + if (len < sufLen) + return(FALSE); + if (strncasecmp(start+len-sufLen, suffix, sufLen)) + return(FALSE); + return(TRUE); +} + 
+/*-----------------------------------------------------------------*/ +/* */ +/* Bit Vector Implementation */ +/* */ +/*-----------------------------------------------------------------*/ +#define BIT_INDEX (0x0000001F) + +void +SetBits(int* bits, int idx, int maxNum) +{ + if (idx > (maxNum << 5)) { + TRACE("Invalid index: %d", idx); + return; + } + bits[(idx >> 5)] |= (1 << (idx & BIT_INDEX)); +} +/*-----------------------------------------------------------------*/ +int +GetBits(int* bits, int idx, int maxNum) +{ + if (idx > (maxNum << 5)) { + TRACE("Invalid index: %d", idx); + return FALSE; + } + return (bits[(idx >> 5)] & (1 << (idx & BIT_INDEX))); +} + +/*-----------------------------------------------------------------*/ +static inline int +GetNumBits_I(int bitvec) +{ + int i, count; + + for (i = 0, count = 0; i < 32; i++) + if (bitvec & (1 << i)) count++; + return count; +} + +/*-----------------------------------------------------------------*/ +int +GetNumBits(int* bitvecs, int maxNum) +{ + int i, count; + + /* get the number of bits that have been set to 1 */ + for (i = 0, count = 0; i < maxNum; i++) + count += GetNumBits_I(bitvecs[i]); + return count; +} + +/*-----------------------------------------------------------------*/ + +/* Logging & Trace support */ + +/* buffer threshold : when the size hits this value, it flushes its content + to the file */ +#define LOG_BYTES_THRESHOLD (32*1024) + +/* this flag indicates that it preserves the base file name for the current + one, and changes its name when it actually closes it off */ +#define CHANGE_FILE_NAME_ON_SAVE 0x01 + +/* size of the actual log buffer */ +#define LOG_BYTES_MAX (2*LOG_BYTES_THRESHOLD) + +/* log/trace book keeping information */ +typedef struct ExtendedLog { + char buffer[LOG_BYTES_MAX]; /* 64 KB */ + int bytes; /* number of bytes written */ + int filesize; /* number of bytes written into this file so far */ + int fd; /* file descriptor */ + char* sig; /* base file name */ + int flags; 
/* flags */ + time_t nextday; +} ExtendedLog, *PExtendedLog; + +/* maximum single file size */ +int maxSingleLogSize = 100 * (1024*1024); + +static time_t +GetNextLogFileName(char* file, int size, const char* sig) +{ +#define COMPRESS_EXT1 ".bz2" +#define COMPRESS_EXT2 ".gz" + + struct tm cur_tm; + time_t cur_t; + int idx = 0; + + cur_t = timeex(NULL); + cur_tm = *gmtime(&cur_t); + + for (;;) { + /* check if .bz2 exists */ + snprintf(file, size, "%s.%04d%02d%02d_%03d%s", + sig, cur_tm.tm_year+1900, cur_tm.tm_mon+1, cur_tm.tm_mday, + idx, COMPRESS_EXT1); + + if (access(file, F_OK) == 0) { + idx++; + continue; + } + + /* check if .gz exists */ + snprintf(file, size, "%s.%04d%02d%02d_%03d%s", + sig, cur_tm.tm_year+1900, cur_tm.tm_mon+1, cur_tm.tm_mday, + idx++, COMPRESS_EXT2); + + if (access(file, F_OK) == 0) + continue; + + /* strip the extension and see if the (uncompressed) file exists */ + file[strlen(file) - sizeof(COMPRESS_EXT2) + 1] = 0; + if (access(file, F_OK) != 0) + break; + } + + /* calculate & return the next day */ + cur_t -= (3600*cur_tm.tm_hour + 60*cur_tm.tm_min + cur_tm.tm_sec); + return cur_t + 60*60*24; + +#undef COMPRESS_EXT1 +#undef COMPRESS_EXT2 +} + +/*-----------------------------------------------------------------*/ +static void +FlushBuffer(HANDLE file) +{ + /* write data into the file */ + ExtendedLog* pel = (ExtendedLog *)file; + int written; + + if (pel == NULL || pel->fd < 0) + return; + + if ((written = write(pel->fd, pel->buffer, pel->bytes)) > 0) { + pel->bytes -= written; + + /* if it hasn't written all data, then we need to move memory */ + if (pel->bytes > 0) + memmove(pel->buffer, pel->buffer + written, pel->bytes); + pel->buffer[pel->bytes] = 0; + pel->filesize += written; + } + + /* if the filesize is bigger than maxSignleLogSize, then close it off */ + if (pel->filesize >= maxSingleLogSize) + OpenLogF(file); +} + +/*-----------------------------------------------------------------*/ +HANDLE +CreateLogFHandle(const char* 
signature, int change_file_name_on_save) +{ + ExtendedLog* pel; + + if ((pel = (ExtendedLog *)xcalloc(1, sizeof(ExtendedLog))) == NULL) + NiceExit(-1, "failed"); + + pel->fd = -1; + pel->sig = xstrdup(signature); + if (pel->sig == NULL) + NiceExit(-1, "signature copying failed"); + if (change_file_name_on_save) + pel->flags |= CHANGE_FILE_NAME_ON_SAVE; + + return pel; +} + + +/*-----------------------------------------------------------------*/ +int +OpenLogF(HANDLE file) +{ + char filename[1024]; + ExtendedLog* pel = (ExtendedLog *)file; + + if (pel == NULL) + return -1; + + if (pel->fd != -1) + close(pel->fd); + + pel->nextday = GetNextLogFileName(filename, sizeof(filename), pel->sig); + + /* change the file name only at saving time + use pel->sig as current file name */ + if (pel->flags & CHANGE_FILE_NAME_ON_SAVE) { + if (access(pel->sig, F_OK) == 0) + rename(pel->sig, filename); + strcpy(filename, pel->sig); + } + + /* file opening */ + if ((pel->fd = open(filename, O_RDWR | O_CREAT | O_APPEND, + S_IRUSR | S_IWUSR)) == -1) { + char errMessage[2048]; + sprintf(errMessage, "couldn't open the extended log file %s\n", filename); + NiceExit(-1, errMessage); + } + + /* reset the file size */ + pel->filesize = 0; + return 0; +} + +/*-----------------------------------------------------------------*/ +int +WriteLog(HANDLE file, const char* data, int size, int forceFlush) +{ + ExtendedLog* pel = (ExtendedLog *)file; + + /* if an error might occur, then stop here */ + if (pel == NULL || pel->fd < 0 || size > LOG_BYTES_MAX) + return -1; + + if (data != NULL) { + /* flush the previous data, if this data would overfill the buffer */ + if (pel->bytes + size >= LOG_BYTES_MAX) + FlushBuffer(file); + + /* write into the buffer */ + memcpy(pel->buffer + pel->bytes, data, size); + pel->bytes += size; + } + + /* need to flush ? 
*/ + if ((forceFlush && (pel->bytes > 0)) || (pel->bytes >= LOG_BYTES_THRESHOLD)) + FlushBuffer(file); + + return 0; +} + +/*-----------------------------------------------------------------*/ +void +DailyReopenLogF(HANDLE file) +{ + /* check if current file is a day long, + opens another for today's file */ + ExtendedLog* pel = (ExtendedLog *)file; + + if (pel && (timeex(NULL) >= pel->nextday)) { + FlushLogF(file); /* flush */ + OpenLogF(file); /* close previous one & reopen today's */ + } +} +/*-----------------------------------------------------------------*/ +int +HandleToFileNo(HANDLE file) +{ + ExtendedLog* pel = (ExtendedLog *)file; + + return (pel != NULL) ? pel->fd : -1; +} +/*-----------------------------------------------------------------*/ +#define TO_HOST_L(x) x = ntohl(x); +#define TO_NETWORK_L(x) x = htonl(x); +void +HtoN_LocalQueryInfo(LocalQueryInfo *p) +{ + TO_NETWORK_L(p->lqi_size); + TO_NETWORK_L(p->lqi_id); + TO_NETWORK_L(p->lqi_cache); +} +/*----------------------------------------------------------------*/ +void +NtoH_LocalQueryResult(LocalQueryResult* p) +{ + TO_HOST_L(p->lq_id); + TO_HOST_L(p->lq_ttl); +} +/*----------------------------------------------------------------*/ +int +CoDNSGetHostByNameSync(const char *name, struct in_addr *res) +{ + static int fd = -1; + LocalQuery query; + int size; + LocalQueryResult result; + + /* create a connection to CoDNS */ + if (fd == -1 && (fd = MakeLoopbackConnection(CODNS_PORT, 0)) < 0) { + TRACE("CoDNS connection try has failed!\n"); + + /* try to resolve names using gethostbyname() */ + { + struct hostent* hp; + if ((hp = gethostbyname(name)) == NULL) + NiceExit(-1, "gethostbyname also failed!"); + if (res) + *res = *(struct in_addr *)hp->h_addr; + } + return 0; + } + + memset(&query, 0, sizeof(query)); + size = strlen(name) + 1; + query.lq_info.lqi_size = size; /* length of name */ + query.lq_info.lqi_cache = TRUE; + strcpy(query.lq_name, name); + size += sizeof(query.lq_info) + 
sizeof(query.lq_zero); + + /* send a query */ + HtoN_LocalQueryInfo(&query.lq_info); + if (write(fd, &query, size) != size) { + close(fd); + fd = -1; + return(-1); + } + + /* get answer */ + do { + size = read(fd, &result, sizeof(result)); + } while (size == -1 && errno == EINTR); + /* close(fd); */ + NtoH_LocalQueryResult(&result); + + if (size != sizeof(result)) + return(-1); + + *res = result.lq_address[0]; + return 0; +} +/*-----------------------------------------------------------------*/ +void +FeedbackDelay(struct timeval *start, float minSec, float maxSec) +{ + float diff = DiffTimeVal(start, NULL); + if (diff < minSec) + diff = minSec; + if (diff > maxSec) + diff = maxSec; + usleep((unsigned int)(diff * 1e6)); +} +/*-----------------------------------------------------------------*/ +static int niceExitLogFD = -1; +int +NiceExitOpenLog(char *logName) +{ + /* log file to record exit reasons */ + if (niceExitLogFD >= 0) + close(niceExitLogFD); + + niceExitLogFD = open(logName, O_WRONLY | O_APPEND | O_CREAT, + S_IRUSR | S_IWUSR); + return((niceExitLogFD >= 0) ? 
SUCCESS : FAILURE); +} +/*-----------------------------------------------------------------*/ +void +NiceExitBack(int val, char *reason, char *file, int line) +{ + char buf[1024]; + time_t currT = time(NULL); + + sprintf(buf, "[%s, %d] %.24s %s\n", file, line, ctime(&currT), reason); + if (hdebugLog) { + TRACE("%s", buf); + FlushLogF(hdebugLog); + } + else + fprintf(stderr, "%s", buf); + + if (niceExitLogFD >= 0) + write(niceExitLogFD, buf, strlen(buf)); + exit(val); +} +/*-----------------------------------------------------------------*/ +int +WordCount(char *buf) +{ + int count = 0; + int wasSpace = TRUE; + + while (*buf != '\0') { + int isSpace = isspace(*buf); + if (wasSpace && (!isSpace)) + count++; + wasSpace = isSpace; + buf++; + } + return(count); +} +/*-----------------------------------------------------------------*/ +char * +ReadFile(const char *filename) +{ + int dummySize; + return(ReadFileEx(filename, &dummySize)); +} +/*-----------------------------------------------------------------*/ +char * +ReadFileEx(const char *filename, int *size) +{ + /* allocate a buffer, read the file into it and + return the buffer */ + char *content = NULL; + struct stat buf; + int fd; + + *size = -1; + if (access(filename, R_OK) < 0 + || stat(filename, &buf) < 0 + || (fd = open(filename, O_RDONLY)) < 0) { + TRACE("opening captcha file %s failed\n", filename); + exit(-1); + } + + if ((content = (char *)xmalloc(buf.st_size + 1)) == NULL) { + TRACE("memory alloc failed\n"); + exit(-1); + } + + if (read(fd, content, buf.st_size) != buf.st_size) { + TRACE("opening captcha test file failed\n"); + exit(-1); + } + close(fd); + content[buf.st_size] = 0; + *size = buf.st_size; + return content; +} +/*-----------------------------------------------------------------*/ +char * +MmapFile(const char *filename, int *size) +{ + /* allocate a buffer, read the file into it and + return the buffer */ + char *content = NULL; + struct stat buf; + int fd; + + *size = -1; + if 
(access(filename, R_OK) < 0 + || stat(filename, &buf) < 0 + || (fd = open(filename, O_RDONLY)) < 0) { + TRACE("opening captcha file %s failed\n", filename); + exit(-1); + } + + content = (char *)mmap(NULL, buf.st_size, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (content != MAP_FAILED) { + *size = buf.st_size; + return content; + } + return(NULL); +} +/*-----------------------------------------------------------------*/ +unsigned int +HashString(const char *name, unsigned int hash, int endOnQuery, + int skipLastIfDot) +{ + /* if endOnQuery, we stop the hashing when we hit the question mark. + if skipLastIfDot, we check the last component of the path to see + if it includes a dot, and it so, we skip it. if both are specified, + we first try the query, and if that exists, we don't trim the path */ + + int i; + int len; + char *temp; + + if (name == NULL) + return 0; + + len = strlen(name); + if (endOnQuery && (temp = strchr(name, '?')) != NULL) + len = temp - name; + else if (skipLastIfDot) { + /* first, find last component by searching backward */ + if ((temp = strrchr(name, '/')) != NULL) { + /* now search forward for the dot */ + if (strchr(temp, '.') != NULL) + len = temp - name; + } + } + + for (i = 0; i < len; i ++) + hash += (_rotl(hash, 19) + name[i]); + + return hash; +} +/*-------------------------------------------------------------*/ +unsigned int +CalcAgentHash(const char* agent) +{ + char p[strlen(agent)+1]; + int i; + + if (agent == NULL) + return 0; + + /* we remove all spaces */ + for (i = 0; *agent; agent++) { + if (isspace(*agent)) + continue; + p[i++] = *agent; + } + p[i] = 0; + + return HashString(p, 0, FALSE, FALSE); +} +/*-----------------------------------------------------------------*/ +char * +ZapSpacesAndZeros(char *src) +{ + /* get rid of excess spaces between words, and remove any trailing + (post-decimal) zeros from floating point numbers */ + static char smallLine[4096]; + char *dst = smallLine; + char *word; + int addSpace = 
FALSE; + + while (src != NULL && + (word = GetWord((const unsigned char*)src, 0)) != NULL) { + char *temp; + int isDotNumber = TRUE; + + src = GetField((const unsigned char*)src, 1); /* advance to next */ + + /* check to make sure it has exactly one decimal point */ + if ((temp = strchr(word, '.')) != NULL && + (temp = strchr(temp+1, '.')) == NULL) { + /* make sure it's all digits or the dot */ + for (temp = word; *temp != '\0'; temp++) { + if (!(isdigit(*temp) || *temp == '.')) { + isDotNumber = FALSE; + break; + } + } + } + else + isDotNumber = FALSE; + + if (isDotNumber) { + /* strip off any trailing zeros and possibly the decimal point */ + int len = strlen(word) - 1; + + while (word[len] == '0') { + word[len] = '\0'; + len--; + } + if (word[len] == '.') + word[len] = '\0'; + } + + if (addSpace) + sprintf(dst, " %s", word); + else + sprintf(dst, "%s", word); + dst += strlen(dst); + addSpace = TRUE; + xfree(word); + } + + *dst = 0; + return(smallLine); +} +/*-----------------------------------------------------------------*/ diff --git a/applib.h b/applib.h new file mode 100644 index 0000000..40b283a --- /dev/null +++ b/applib.h @@ -0,0 +1,130 @@ +#ifndef _APPLIB_H_ +#define _APPLIB_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "appdef.h" + +float DiffTimeVal(const struct timeval *start, const struct timeval *end); + +int CreatePrivateAcceptSocketEx(int portNum, + int nonBlocking, int loopbackOnly); +int CreatePrivateAcceptSocket(int portNum, int nonBlocking); +int CreatePublicUDPSocket(int portNum); +int MakeLoopbackConnection(int portNum, int nonBlocking); +int MakeConnection(char *name, in_addr_t netAddr, + int portNum, int nonBlocking); + +char *GetField(const unsigned char *start, int whichField); +char *GetWord(const unsigned char *start, int whichWord); +char *GetWordEx(const unsigned char *start, int whichWord, + char* dest, int max); +int WordCount(char *buf); +char *ZapSpacesAndZeros(char *src); + +int 
Base36Digit(int a); +int Base36To10(int a); +int PopCount(int val); +int PopCountChar(int val); +int LogValChar(int val); + +const char *StringOrNull(const char *s); +char *strchars(const char *string, const char *list); +char *strnchars(const char *string, int length, const char *list); + +#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#define HAS_STRNSTR +#endif + +#ifndef HAS_STRNSTR +char *strnstr(const char * s1, int s1_len, const char * s2); +#endif + +char * strncasestr(const char * s1, int s1_len, const char * s2); +int strncspn(const char* string, int length, const char* reject); +char *strnchr(const char *string, int length, const char needle); +char *StrdupLower(const char *orig); +void StrcpyLower(char *dest, const char *src); +void StrcpyLowerExcept(char *dest, int dest_max, const char* src, const char* except); +char *GetLowerNextLine(FILE *file); +char *GetNextLine(FILE *file); +char *GetNextLineNoCommStrip(FILE *file); +int DoesSuffixMatch(char *start, int len, char *suffix); +int DoesDotlessSuffixMatch(char *start, int len, char *suffix); + +/* resolves a name using CoDNS */ +#include "codns.h" +void HtoN_LocalQueryInfo(LocalQueryInfo *p); +void NtoH_LocalQueryResult(LocalQueryResult* p); +int CoDNSGetHostByNameSync(const char *name, struct in_addr *res); + +#ifdef OS_LINUX +#include +#endif + +#include + +/* allocate stack memory to copy "src" to "dest" in lower cases */ +#define LOCAL_STR_DUP_LOWER(dest, src) \ + { dest = alloca(strlen(src) + 1); \ + StrcpyLower(dest, src); \ + } + +/* allocate stack memory to copy "src" to "dest" */ +#define LOCAL_STR_DUP(dest, src) \ + { dest = alloca(strlen(src) + 1); \ + strcpy(dest, src); \ + } + +/* release memory pointer after checking NULL */ +#define FREE_PTR(x) if ((x) != NULL) { xfree(x);} + +/* Bit vector implementation */ +void SetBits(int* bits, int idx, int maxNum); +int GetBits(int* bits, int idx, int maxNum); +int GetNumBits(int* bitvecs, int maxNum); + +/* extended logging */ +typedef void* 
HANDLE; + +HANDLE CreateLogFHandle(const char* signature, int change_file_name_on_save); +int OpenLogF(HANDLE file); +int WriteLog(HANDLE file, const char* data, int size, int forceFlush); +void DailyReopenLogF(HANDLE file); +int HandleToFileNo(HANDLE file); + +/* flush the buffer */ +#define FlushLogF(h) WriteLog(h, NULL, 0, TRUE) + +/* maximum single log file size, defined in applib.c */ +extern int maxSingleLogSize; + +#include "gettimeofdayex.h" + +void FeedbackDelay(struct timeval *start, float minSec, float maxSec); + +int NiceExitOpenLog(char *logName); +void NiceExitBack(int val, char *reason, char *file, int line) __attribute__ ((noreturn)); +#define NiceExit(val, reason) NiceExitBack(val, reason, __FILE__, __LINE__) + +char *ReadFile(const char *filename); +char *ReadFileEx(const char *filename, int *size); +char *MmapFile(const char *filename, int *size); + +/* use 32-bit unsigned integer, higher bits are thrown away */ +#define WORD_BITS 32 +#define MASK 0xFFFFFFFF +/* bitwise left rotate operator */ +#define _rotl(Val, Bits) ((((Val)<<(Bits)) | (((Val) & MASK)>>(32 - (Bits)))) & MASK) + +unsigned int HashString(const char *name, unsigned int hash, + int endOnQuery, int skipLastIfDot); +unsigned int CalcAgentHash(const char* agent); +#endif + diff --git a/codemux.c b/codemux.c new file mode 100644 index 0000000..732059a --- /dev/null +++ b/codemux.c @@ -0,0 +1,944 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "applib.h" + +#define CONF_FILE "/etc/codemux/codemux.conf" +#define DEMUX_PORT 80 +#define REAL_WEBSERVER_CONFLINE "* root 1080" +#define TARG_SETSIZE 4096 + +/* set aside some small number of fds for us, allow the rest for + connections */ +#define MAX_CONNS ((TARG_SETSIZE-20)/2) + +/* no single service can take more than half the connections */ +#define SERVICE_MAX (MAX_CONNS/2) + +/* how many total connections before we get concerned about 
fairness + among them */ +#define FAIRNESS_CUTOFF (MAX_CONNS * 0.85) + + +typedef struct FlowBuf { + int fb_refs; /* num refs */ + char *fb_buf; /* actual buffer */ + int fb_used; /* bytes used in buffer */ +} FlowBuf; +#define FB_SIZE 3800 /* max usable size */ +#define FB_ALLOCSIZE 4000 /* extra to include IP address */ + +typedef struct SockInfo { + int si_peerFd; /* fd of peer */ + struct in_addr si_cliAddr; /* address of client */ + int si_blocked; /* are we blocked? */ + int si_needsHeaderSince; /* since when are we waiting for a header */ + int si_whichService; /* index of service */ + FlowBuf *si_readBuf; /* read data into this buffer */ + FlowBuf *si_writeBuf; /* drain this buffer for writing */ +} SockInfo; + +static SockInfo sockInfo[TARG_SETSIZE]; /* fd number of peer socket */ + +typedef struct ServiceSig { + char *ss_host; /* suffix in host */ + char *ss_slice; + short ss_port; + int ss_slicePos; /* position in slices array */ +} ServiceSig; + +static ServiceSig *serviceSig; +static int numServices; +static int confFileReadTime; +static int now; + +typedef struct SliceInfo { + char *si_sliceName; + int si_inUse; /* do any services refer to this? */ + int si_numConns; + int si_xid; +} SliceInfo; + +static SliceInfo *slices; +static int numSlices; +static int numActiveSlices; +static int numTotalSliceConns; +static int anySliceXidsNeeded; + +typedef struct OurFDSet { + long __fds_bits[TARG_SETSIZE/32]; +} OurFDSet; +static OurFDSet masterReadSet, masterWriteSet; +static int highestSetFd; +static int numNeedingHeaders; /* how many conns waiting on headers? 
*/ + +static int numForks; + +HANDLE hdebugLog; + +#ifndef SO_SETXID +#define SO_SETXID SO_PEERCRED +#endif +/*-----------------------------------------------------------------*/ +static SliceInfo * +ServiceToSlice(int whichService) +{ + if (whichService < 0) + return(NULL); + return(&slices[serviceSig[whichService].ss_slicePos]); +} +/*-----------------------------------------------------------------*/ +static void +DumpStatus(int fd) +{ + char buf[65535]; + char *start = buf; + int i; + int len; + + sprintf(start, + "numForks %d, numActiveSlices %d, numTotalSliceConns %d\n" + "numNeedingHeaders %d, anySliceXidsNeeded %d\n", + numForks, numActiveSlices, numTotalSliceConns, + numNeedingHeaders, anySliceXidsNeeded); + start += strlen(start); + + for (i = 0; i < numSlices; i++) { + SliceInfo *si = &slices[i]; + sprintf(start, "Slice %d: %s xid %d, %d conns, inUse %d\n", + i, si->si_sliceName, si->si_xid, si->si_numConns, + si->si_inUse); + start += strlen(start); + } + + for (i = 0; i < numServices; i++) { + ServiceSig *ss = &serviceSig[i]; + sprintf(start, "Service %d: %s %s port %d, slice# %d\n", i, ss->ss_host, + ss->ss_slice, (int) ss->ss_port, ss->ss_slicePos); + start += strlen(start); + } + + len = start - buf; + write(fd, buf, len); +} +/*-----------------------------------------------------------------*/ +static void +GetSliceXids(void) +{ + /* walks through /etc/passwd, and gets the uid for every slice we + have */ + FILE *f; + char *line; + int i; + + if (!anySliceXidsNeeded) + return; + + for (i = 0; i < numSlices; i++) { + SliceInfo *si = &slices[i]; + si->si_inUse = 0; + } + for (i = 0; i < numServices; i++) { + SliceInfo *si = ServiceToSlice(i); + if (si != NULL) + si->si_inUse++; + } + + if ((f = fopen("/etc/passwd", "r")) == NULL) + return; + + while ((line = GetNextLine(f)) != NULL) { + char *temp; + int xid; + + if ((temp = strchr(line, ':')) == NULL) + continue; /* weird line */ + *temp = '\0'; /* terminate slice name */ + temp++; + if ((temp = 
strchr(temp+1, ':')) == NULL) + continue; /* weird line */ + if ((xid = atoi(temp+1)) < 1) + continue; /* weird xid */ + + /* we've got a slice name and xid, let's try to match */ + for (i = 0; i < numSlices; i++) { + if (slices[i].si_xid == 0 && + strcasecmp(slices[i].si_sliceName, line) == 0) { + slices[i].si_xid = xid; + break; + } + } + } + + /* assume service 0 is the root service, and don't check it since + it'll have xid zero */ + anySliceXidsNeeded = FALSE; + for (i = 1; i < numSlices; i++) { + if (slices[i].si_xid == 0 && slices[i].si_inUse > 0) { + anySliceXidsNeeded = TRUE; + break; + } + } + + fclose(f); +} +/*-----------------------------------------------------------------*/ +static void +SliceConnsInc(int whichService) +{ + SliceInfo *si = ServiceToSlice(whichService); + + if (si == NULL) + return; + numTotalSliceConns++; + si->si_numConns++; + if (si->si_numConns == 1) + numActiveSlices++; +} +/*-----------------------------------------------------------------*/ +static void +SliceConnsDec(int whichService) +{ + SliceInfo *si = ServiceToSlice(whichService); + + if (si == NULL) + return; + numTotalSliceConns--; + si->si_numConns--; + if (si->si_numConns == 0) + numActiveSlices--; +} +/*-----------------------------------------------------------------*/ +static int +WhichSlicePos(char *slice) +{ + /* adds the new slice if necessary, returns the index into slice + array. 
Never change the ordering of existing slices */ + int i; + static int numSlicesAlloc; + + for (i = 0; i < numSlices; i++) { + if (strcasecmp(slice, slices[i].si_sliceName) == 0) + return(i); + } + + if (numSlices >= numSlicesAlloc) { + numSlicesAlloc = MAX(8, numSlicesAlloc * 2); + slices = realloc(slices, numSlicesAlloc * sizeof(SliceInfo)); + } + + memset(&slices[numSlices], 0, sizeof(SliceInfo)); + slices[numSlices].si_sliceName = strdup(slice); + numSlices++; + return(numSlices-1); +} +/*-----------------------------------------------------------------*/ +static void +ReadConfFile(void) +{ + int numAlloc = 0; + int num = 0; + ServiceSig *servs = NULL; + FILE *f; + char *line = NULL; + struct stat statBuf; + int i; + + if (stat(CONF_FILE, &statBuf) != 0) { + fprintf(stderr, "failed stat on codemux.conf\n"); + if (numServices) + return; + exit(-1); + } + if (statBuf.st_mtime == confFileReadTime) + return; + + if ((f = fopen(CONF_FILE, "r")) == NULL) { + fprintf(stderr, "failed reading codemux.conf\n"); + if (numServices) + return; + exit(-1); + } + + /* conf file entries look like + coblitz.codeen.org princeton_coblitz 3125 + */ + + while (1) { + ServiceSig serv; + int port; + if (line != NULL) + free(line); + + /* on the first pass, put in a fake entry for apache */ + if (num == 0) + line = strdup(REAL_WEBSERVER_CONFLINE); + else { + if ((line = GetNextLine(f)) == NULL) + break; + } + + memset(&serv, 0, sizeof(serv)); + if (WordCount(line) != 3) { + fprintf(stderr, "bad line: %s\n", line); + continue; + } + serv.ss_port = port = atoi(GetField(line, 2)); + if (port < 1 || port > 65535 || port == DEMUX_PORT) { + fprintf(stderr, "bad port: %s\n", line); + continue; + } + + serv.ss_host = GetWord(line, 0); + serv.ss_slice = GetWord(line, 1); + if (num >= numAlloc) { + numAlloc = MAX(numAlloc * 2, 8); + servs = realloc(servs, numAlloc * sizeof(ServiceSig)); + } + serv.ss_slicePos = WhichSlicePos(serv.ss_slice); + if (slices[serv.ss_slicePos].si_inUse == 0 && + 
slices[serv.ss_slicePos].si_xid < 1) + anySliceXidsNeeded = TRUE; /* if new/inactive, we need xid */ + servs[num] = serv; + num++; + } + + fclose(f); + + if (num == 1) { + if (numServices == 0) { + fprintf(stderr, "nothing found in codemux.conf\n"); + exit(-1); + } + return; + } + + for (i = 0; i < numServices; i++) { + free(serviceSig[i].ss_host); + free(serviceSig[i].ss_slice); + } + free(serviceSig); + serviceSig = servs; + numServices = num; + confFileReadTime = statBuf.st_mtime; +} +/*-----------------------------------------------------------------*/ +static char *err400BadRequest = +"HTTP/1.0 400 Bad Request\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, and your\n" +"request header exceeded the allowable size. Please\n" +"try again if you believe this error is temporary.\n"; +/*-----------------------------------------------------------------*/ +static char *err503Unavailable = +"HTTP/1.0 503 Service Unavailable\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, but the service\n" +"seems to be unavailable at the moment. Please try again.\n"; +/*-----------------------------------------------------------------*/ +static char *err503TooBusy = +"HTTP/1.0 503 Service Unavailable\r\n" +"Content-Type: text/html\r\n" +"\r\n" +"You are trying to access a PlanetLab node, but the service\n" +"seems to be overloaded at the moment. 
Please try again.\n"; +/*-----------------------------------------------------------------*/ +static void +SetFd(int fd, OurFDSet *set) +{ + if (highestSetFd < fd) + highestSetFd = fd; + FD_SET(fd, set); +} +/*-----------------------------------------------------------------*/ +static void +ClearFd(int fd, OurFDSet *set) +{ + FD_CLR(fd, set); +} +/*-----------------------------------------------------------------*/ +static int +RemoveHeader(char *lower, char *real, int totalSize, char *header) +{ + /* returns number of characters removed */ + char h2[256]; + int start, end, len; + char *temp, *conn; + + sprintf(h2, "\n%s", header); + + if ((conn = strstr(lower, h2)) == NULL) + return(0); + + conn++; + /* determine how many characters to remove */ + if ((temp = strchr(conn, '\n')) != NULL) + len = (temp - conn) + 1; + else + len = strlen(conn) + 1; + start = conn - lower; + end = start + len; + memmove(&real[start], &real[end], totalSize - end); + memmove(&lower[start], &lower[end], totalSize - end); + + return(len); +} +/*-----------------------------------------------------------------*/ +static int +InsertHeader(char *buf, int totalSize, char *header) +{ + /* returns number of bytes inserted */ + + char h2[256]; + char *temp; + int len; + + sprintf(h2, "%s\r\n", header); + len = strlen(h2); + + /* if we don't encounter a \n, it means that we have only a single + line, and we'd converted the \n to a \0 */ + if ((temp = strchr(buf, '\n')) == NULL) + temp = strchr(buf, '\0'); + temp++; + + memmove(temp + len, temp, totalSize - (temp - buf)); + memcpy(temp, h2, len); + + return(len); +} +/*-----------------------------------------------------------------*/ +static int +FindService(FlowBuf *fb, int *whichService, struct in_addr addr) +{ + char *end; + char *lowerBuf; + char *hostVal; + char *buf = fb->fb_buf; + char orig[256]; +#if 0 + char *url; + int i; + int len; +#endif + + if (strstr(buf, "\n\r\n") == NULL && strstr(buf, "\n\n") == NULL) + return(FAILURE); + + /* 
insert client info after first line */ + sprintf(orig, "X-CoDemux-Client: %s", inet_ntoa(addr)); + fb->fb_used += InsertHeader(buf, fb->fb_used + 1, orig); + + /* get just the header, so we can work on it */ + LOCAL_STR_DUP_LOWER(lowerBuf, buf); + if ((end = strstr(lowerBuf, "\n\r\n")) == NULL) + end = strstr(lowerBuf, "\n\n"); + *end = '\0'; + + /* remove any existing connection, keep-alive headers, add ours */ + fb->fb_used -= RemoveHeader(lowerBuf, buf, fb->fb_used + 1, "keep-alive:"); + fb->fb_used -= RemoveHeader(lowerBuf, buf, fb->fb_used + 1, "connection:"); + fb->fb_used += InsertHeader(buf, fb->fb_used + 1, "Connection: close"); + InsertHeader(lowerBuf, fb->fb_used + 1, "connection: close"); + + /* isolate host, see if it matches */ + if ((hostVal = strstr(lowerBuf, "\nhost:")) != NULL) { + int i; + hostVal += strlen("\nhost:"); + if ((end = strchr(hostVal, '\n')) != NULL) + *end = '\0'; + if ((end = strchr(hostVal, ':')) != NULL) + *end = '\0'; + while (isspace(*hostVal)) + hostVal++; + if (strlen(hostVal) > 0) { + hostVal = GetWord(hostVal, 0); + for (i = 1; i < numServices; i++) { + if (serviceSig[i].ss_host != NULL && + DoesDotlessSuffixMatch(hostVal, 0, serviceSig[i].ss_host)) { + *whichService = i; + free(hostVal); + /* printf("%s", buf); */ + return(SUCCESS); + } + } + free(hostVal); + } + } + +#if 0 + /* see if URL prefix matches */ + if ((end = strchr(lowerBuf, '\n')) != NULL) + *end = 0; + if ((url = GetField(lowerBuf, 1)) == NULL || + url[0] != '/') { + /* bad request - let apache handle it ? */ + *whichService = 0; + return(SUCCESS); + } + url++; /* skip the leading slash */ + for (i = 1; i < numServices; i++) { + if (serviceSig[i].ss_prefix != NULL && + (len = strlen(serviceSig[i].ss_prefix)) > 0 && + strncmp(url, serviceSig[i].ss_prefix, len) == 0 && + (url[len] == ' ' || url[len] == '/')) { + int startPos = url - lowerBuf; + int stripLen = len + ((url[len] == '/') ? 
1 : 0); + /* strip out prefix */ + fb->fb_used -= stripLen; + memmove(&buf[startPos], &buf[startPos+stripLen], + fb->fb_used + 1 - startPos); + /* printf("%s", buf); */ + *whichService = i; + return(SUCCESS); + } + } +#endif + + /* default to first service */ + *whichService = 0; + return(SUCCESS); +} +/*-----------------------------------------------------------------*/ +static int +StartConnect(int origFD, int whichService) +{ + int sock; + struct sockaddr_in dest; + SockInfo *si; + + /* create socket */ + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + return(FAILURE); + } + + /* make socket non-blocking */ + if (fcntl(sock, F_SETFL, O_NONBLOCK) < 0) { + close(sock); + return(FAILURE); + } + + /* set addr structure */ + memset(&dest, 0, sizeof(dest)); + dest.sin_family = AF_INET; + dest.sin_port = htons(serviceSig[whichService].ss_port); + dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + /* start connection process - we should be told that it's in + progress */ + if (connect(sock, (struct sockaddr *) &dest, sizeof(dest)) != -1 || + errno != EINPROGRESS) { + close(sock); + return(FAILURE); + } + + SetFd(sock, &masterWriteSet); /* determine when connect finishes */ + sockInfo[origFD].si_peerFd = sock; + si = &sockInfo[sock]; + memset(si, 0, sizeof(SockInfo)); + si->si_peerFd = origFD; + si->si_blocked = TRUE; /* still connecting */ + si->si_whichService = whichService; + si->si_writeBuf = sockInfo[origFD].si_readBuf; + sockInfo[origFD].si_readBuf->fb_refs++; + if (whichService >= 0) + SliceConnsInc(whichService); + + return(SUCCESS); +} +/*-----------------------------------------------------------------*/ +static int +WriteAvailData(int fd) +{ + SockInfo *si = &sockInfo[fd]; + FlowBuf *fb = si->si_writeBuf; + int res; + + /* printf("trying to write fd %d\n", fd); */ + if (fb->fb_used < 1 || si->si_blocked) + return(SUCCESS); + + /* printf("trying to write %d bytes\n", fb->fb_used); */ + /* write(STDOUT_FILENO, fb->fb_buf, fb->fb_used); */ + if 
((res = write(fd, fb->fb_buf, fb->fb_used)) > 0) {
+    fb->fb_used -= res;
+    if (fb->fb_used > 0) {
+      /* couldn't write all - assume blocked */
+      memmove(fb->fb_buf, &fb->fb_buf[res], fb->fb_used);
+      si->si_blocked = TRUE;
+      SetFd(fd, &masterWriteSet);
+    }
+    /* printf("wrote %d\n", res); */
+    return(SUCCESS);
+  }
+
+  /* we might have been full but didn't realize it; an interrupted
+     write is retried the same way */
+  if (res == -1 && (errno == EAGAIN || errno == EINTR)) {
+    si->si_blocked = TRUE;
+    SetFd(fd, &masterWriteSet);
+    return(SUCCESS);
+  }
+  
+  /* otherwise, assume the worst */
+  return(FAILURE);
+}
+/*-----------------------------------------------------------------*/
+/* Closes are deferred: CloseSock() only queues the fd, and
+   ReallyCloseSocks() performs the queued closes in one batch. */
+static OurFDSet socksToCloseVec;        /* fds already queued */
+static int numSocksToClose;
+static int whichSocksToClose[TARG_SETSIZE];
+/*-----------------------------------------------------------------*/
+/* Queues fd for closing; queuing the same fd twice is harmless. */
+static void
+CloseSock(int fd)
+{
+  /* out-of-range descriptors have no slot in our tables */
+  if (fd < 0 || fd >= TARG_SETSIZE)
+    return;
+  if (FD_ISSET(fd, &socksToCloseVec))
+    return;
+  SetFd(fd, &socksToCloseVec);
+  whichSocksToClose[numSocksToClose] = fd;
+  numSocksToClose++;
+}
+/*-----------------------------------------------------------------*/
+/* Drops one reference to buf and frees it when none remain. */
+static void
+DecBuf(FlowBuf *buf)
+{
+  if (buf == NULL)
+    return;
+  buf->fb_refs--;
+  if (buf->fb_refs == 0) {
+    free(buf->fb_buf);
+    free(buf);
+  }
+}
+/*-----------------------------------------------------------------*/
+/* Closes every queued socket and releases its per-fd state: buffer
+   references, fd-set membership, the header-wait count and the
+   slice connection count. */
+static void
+ReallyCloseSocks(void)
+{
+  int i;
+
+  memset(&socksToCloseVec, 0, sizeof(socksToCloseVec));
+
+  for (i = 0; i < numSocksToClose; i++) {
+    int fd = whichSocksToClose[i];
+    int peer = sockInfo[fd].si_peerFd;
+
+    close(fd);
+
+    /* tell a surviving peer that we are gone; otherwise it keeps
+       pointing at a descriptor that is closed and may be handed out
+       again by the next accept or socket call */
+    if (peer >= 0 && peer < TARG_SETSIZE && sockInfo[peer].si_peerFd == fd)
+      sockInfo[peer].si_peerFd = -1;
+    sockInfo[fd].si_peerFd = -1;
+
+    DecBuf(sockInfo[fd].si_readBuf);
+    DecBuf(sockInfo[fd].si_writeBuf);
+    sockInfo[fd].si_readBuf = NULL;     /* do not keep dangling pointers */
+    sockInfo[fd].si_writeBuf = NULL;
+    ClearFd(fd, &masterReadSet);
+    ClearFd(fd, &masterWriteSet);
+    if (sockInfo[fd].si_needsHeaderSince) {
+      sockInfo[fd].si_needsHeaderSince = 0;
+      numNeedingHeaders--;
+    }
+    if (sockInfo[fd].si_whichService >= 0) {
+      SliceConnsDec(sockInfo[fd].si_whichService);
+      sockInfo[fd].si_whichService = -1;
+    }
+  }
+  numSocksToClose = 0;
+}
+/*-----------------------------------------------------------------*/
+static void
+SocketReadyToRead(int fd)
+{
+  SockInfo 
*si = &sockInfo[fd]; + int spaceLeft; + FlowBuf *fb; + int res; + + /* if peer is closed, close ourselves */ + if (si->si_peerFd < 0 && (!si->si_needsHeaderSince)) { + CloseSock(fd); + return; + } + + if ((fb = si->si_readBuf) == NULL) { + fb = si->si_readBuf = calloc(1, sizeof(FlowBuf)); + fb->fb_refs = 1; + if (si->si_peerFd >= 0) { + sockInfo[si->si_peerFd].si_writeBuf = fb; + fb->fb_refs = 2; + } + } + + if (fb->fb_buf == NULL) + fb->fb_buf = malloc(FB_ALLOCSIZE); + + /* determine read buffer size - if 0, then block reads and return */ + if ((spaceLeft = FB_SIZE - fb->fb_used) < 0) { + if (si->si_needsHeaderSince) { + write(fd, err400BadRequest, strlen(err400BadRequest)); + CloseSock(fd); + return; + } + else { + ClearFd(fd, &masterReadSet); + return; + } + } + + /* read as much as allowed, and is available */ + if ((res = read(fd, &fb->fb_buf[fb->fb_used], spaceLeft)) == 0) { + CloseSock(fd); + if (fb->fb_used == 0 && si->si_peerFd >= 0) { + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } + return; + } + if (res == -1) { + if (errno == EAGAIN) + return; + CloseSock(fd); + if (fb->fb_used == 0 && si->si_peerFd >= 0) { + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } + return; + } + fb->fb_used += res; + fb->fb_buf[fb->fb_used] = 0; /* terminate it for convenience */ + printf("sock %d, read %d, total %d\n", fd, res, fb->fb_used); + + /* if we need header, check if we've gotten it. if so, do + modifications and continue. 
if not, check if we've read the + maximum, and if so, fail */ + if (si->si_needsHeaderSince) { + int whichService; + SliceInfo *slice; + +#define STATUS_REQ "GET /codemux/status.txt" + if (strncasecmp(fb->fb_buf, STATUS_REQ, sizeof(STATUS_REQ)-1) == 0) { + DumpStatus(fd); + CloseSock(fd); + return; + } + + printf("trying to find service\n"); + if (FindService(fb, &whichService, si->si_cliAddr) != SUCCESS) + return; + printf("found service %d\n", whichService); + slice = ServiceToSlice(whichService); + + /* no service can have more than some absolute max number of + connections. Also, when we're too busy, start enforcing + fairness across the servers */ + if (slice->si_numConns > SERVICE_MAX || + (numTotalSliceConns > FAIRNESS_CUTOFF && + slice->si_numConns > MAX_CONNS/numActiveSlices)) { + write(fd, err503TooBusy, strlen(err503TooBusy)); + CloseSock(fd); + return; + } + + if (slice->si_xid > 0) { + setsockopt(fd, SOL_SOCKET, SO_SETXID, + &slice->si_xid, sizeof(slice->si_xid)); + fprintf(stderr, "setsockopt() with XID = %d name = %s\n", + slice->si_xid, slice->si_sliceName); + } + + si->si_needsHeaderSince = 0; + numNeedingHeaders--; + if (StartConnect(fd, whichService) != SUCCESS) { + write(fd, err503Unavailable, strlen(err503Unavailable)); + CloseSock(fd); + return; + } + return; + } + + /* write anything possible */ + if (WriteAvailData(si->si_peerFd) != SUCCESS) { + /* assume the worst and close */ + CloseSock(fd); + CloseSock(si->si_peerFd); + si->si_peerFd = -1; + } +} +/*-----------------------------------------------------------------*/ +static void +SocketReadyToWrite(int fd) +{ + SockInfo *si = &sockInfo[fd]; + + /* unblock it and read what it has */ + si->si_blocked = FALSE; + ClearFd(fd, &masterWriteSet); + SetFd(fd, &masterReadSet); + + /* enable reading on peer just in case it was off */ + if (si->si_peerFd >= 0) + SetFd(si->si_peerFd, &masterReadSet); + + /* if we have data, write it */ + if (WriteAvailData(fd) != SUCCESS) { + /* assume the worst and 
close */
+    CloseSock(fd);
+    if (si->si_peerFd >= 0) {
+      CloseSock(si->si_peerFd);
+      si->si_peerFd = -1;       /* peer is queued for close; forget it */
+    }
+    return;
+  }
+
+  /* if peer is closed and we're done writing, we should close */
+  if (si->si_peerFd < 0 && si->si_writeBuf->fb_used == 0)
+    CloseSock(fd);
+}
+/*-----------------------------------------------------------------
+   CloseReqlessConns: queues for closing every connection that has
+   been waiting for its request header for more than maxAge seconds.
+   Runs at most once per distinct value of "now".  maxAge gets
+   shorter as the number of connections and of header-less
+   connections grows; below the lowest threshold nothing is swept.
+  -----------------------------------------------------------------*/
+static void
+CloseReqlessConns(void)
+{
+  static int lastSweep;         /* value of "now" at the last sweep */
+  int maxAge;                   /* allowed header wait, in units of "now" */
+  int i;
+
+  if (lastSweep == now)
+    return;
+  lastSweep = now;
+
+  if (numTotalSliceConns + numNeedingHeaders > MAX_CONNS ||
+      numNeedingHeaders > TARG_SETSIZE/20) {
+    /* second condition is probably an attack - close aggressively */
+    maxAge = 5;
+  }
+  else if (numTotalSliceConns + numNeedingHeaders > FAIRNESS_CUTOFF ||
+           numNeedingHeaders > TARG_SETSIZE/40) {
+    /* sweep a little aggressively */
+    maxAge = 10;
+  }
+  else if (numNeedingHeaders > TARG_SETSIZE/80) {
+    /* just sweep to close strays */
+    maxAge = 30;
+  }
+  else {
+    /* too little gained - not worth sweeping */
+    return;
+  }
+
+  /* if it's too old, close it */
+  for (i = 0; i < highestSetFd+1; i++) {
+    if (sockInfo[i].si_needsHeaderSince &&
+        (now - sockInfo[i].si_needsHeaderSince) > maxAge)
+      CloseSock(i);
+  }
+}
+/*-----------------------------------------------------------------
+   MainLoop: the event loop.  Each pass refreshes "now", rereads the
+   configuration and slice xids when more than 300 seconds have
+   passed since the last check, then waits up to one second in
+   select() on copies of the master read/write sets.  lisSock is the
+   listening socket.
+  -----------------------------------------------------------------*/
+static void
+MainLoop(int lisSock)
+{
+  int i;
+  OurFDSet tempReadSet, tempWriteSet;   /* per-pass copies for select */
+  int res;
+  int lastConfCheck = 0;        /* 0 forces a config load on the first pass */
+
+  signal(SIGPIPE, SIG_IGN);     /* write errors are handled via return codes */
+
+  while (1) {
+    int newSock;
+    int ceiling;
+    struct timeval timeout;
+
+    now = time(NULL);
+
+    if (now - lastConfCheck > 300) {
+      ReadConfFile();
+      GetSliceXids();           /* always call - in case new slices created */
+      lastConfCheck = now;
+    }
+
+    /* see if there's any activity */
+    tempReadSet = masterReadSet;
+    tempWriteSet = masterWriteSet;
+
+    /* trim it down if needed */
+    while (highestSetFd > 1 &&
+           (!FD_ISSET(highestSetFd, &tempReadSet)) &&
+           (!FD_ISSET(highestSetFd, &tempWriteSet)))
+      highestSetFd--;
+    timeout.tv_sec = 1;         /* wake at least once a second */
+    timeout.tv_usec = 0;
+    res = 
select(highestSetFd+1, (fd_set *) &tempReadSet, + (fd_set *) &tempWriteSet, NULL, &timeout); + if (res < 0 && errno != EINTR) { + perror("select"); + exit(-1); + } + + now = time(NULL); + + /* clear the bit for listen socket to avoid confusion */ + ClearFd(lisSock, &tempReadSet); + + ceiling = highestSetFd+1; /* copy it, since it changes during loop */ + /* pass data back and forth as needed */ + for (i = 0; i < ceiling; i++) { + if (FD_ISSET(i, &tempWriteSet)) + SocketReadyToWrite(i); + } + for (i = 0; i < ceiling; i++) { + if (FD_ISSET(i, &tempReadSet)) + SocketReadyToRead(i); + } + + /* see if we need to close conns w/o requests */ + CloseReqlessConns(); + + /* do all closes */ + ReallyCloseSocks(); + + /* try accepting new connections */ + do { + struct sockaddr_in addr; + socklen_t lenAddr = sizeof(addr); + if ((newSock = accept(lisSock, (struct sockaddr *) &addr, + &lenAddr)) >= 0) { + memset(&sockInfo[newSock], 0, sizeof(SockInfo)); + sockInfo[newSock].si_needsHeaderSince = now; + numNeedingHeaders++; + sockInfo[newSock].si_peerFd = -1; + sockInfo[newSock].si_cliAddr = addr.sin_addr; + sockInfo[newSock].si_whichService = -1; + SetFd(newSock, &masterReadSet); + } + } while (newSock >= 0); + } +} +/*-----------------------------------------------------------------*/ +int +main(int argc, char *argv[]) +{ + int lisSock; + + if ((lisSock = CreatePrivateAcceptSocket(DEMUX_PORT, TRUE)) < 0) { + fprintf(stderr, "failed creating accept socket\n"); + exit(-1); + } + SetFd(lisSock, &masterReadSet); + + while (1) { + numForks++; + if (fork()) { + /* this is the parent - just wait */ + while (wait3(NULL, 0, NULL) < 1) + ; /* just keep waiting for a real pid */ + } + else { + /* child process */ + MainLoop(lisSock); + exit(-1); + } + } +} +/*-----------------------------------------------------------------*/ diff --git a/codemux.conf b/codemux.conf new file mode 100644 index 0000000..b5c86fb --- /dev/null +++ b/codemux.conf @@ -0,0 +1,6 @@ +coblitz.codeen.org 
princeton_coblitz 3125 +cdn.rd.tp.pl princeton_coblitz 3125 +nyud.net nyu_d 8080 +nyucd.net nyu_d 8080 +nyuld.net nyu_oasis 8096 +cob-web.org cornell_cobweb 8888 diff --git a/codemux.initscript b/codemux.initscript new file mode 100644 index 0000000..5a69a70 --- /dev/null +++ b/codemux.initscript @@ -0,0 +1,69 @@ +#!/bin/sh +# +# chkconfig: 345 85 02 +# description: codemux startup script +# + +PROC=codemux + +. /etc/rc.d/init.d/functions + +RETVAL=0 + +pidfile=/var/run/$PROC.pid + +check_status() { + pid=`cat $pidfile 2>/dev/null` + # + # this eliminates a race condition between checking existence of pidfile + # and reading its value + # + [ -n "$pid" -a -d /proc/$pid ] +} + +case "$1" in + start) + echo -n "starting $PROC:" + pid=`cat $pidfile 2>/dev/null` + if [ -n "$pid" ]; then + # check whether process really exists + # yes - don't try to start + [ -d /proc/$pid ] && action "already running" /bin/true && exit 1 + + # no - PID file is stale + rm -f $pidfile + fi + + initlog -c /usr/local/planetlab/sbin/codemux + + cmd=success + check_status && touch /var/lock/subsys/$PROC || cmd=failure + $cmd "$PROC startup" + echo + ;; + + stop) + echo -n "shutting down $PROC: " + killproc $PROC + RETVAL=$? + echo + [ $RETVAL -eq 0 ] && rm -f /var/lock/subsys/$PROC + ;; + + restart|reload) + $0 stop + $0 start + RETVAL=$? 
+ ;; + + status) + check_status && echo 'running' && exit 0 || \ + echo 'not running' && exit 1 + ;; + + *) + echo "Usage: $0 {start|stop|restart|status}" + RETVAL=1 +esac + +exit $RETVAL diff --git a/codemux.spec b/codemux.spec new file mode 100644 index 0000000..a713adf --- /dev/null +++ b/codemux.spec @@ -0,0 +1,63 @@ +Summary: CoDemux - HTTP port demultiplexer +Name: codemux +Version: 0.5 +Release: 1 +License: Private +Group: System Environment/Base +URL: http://codeen.cs.princeton.edu/ +Source0: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +%description + +%prep +%setup -q + +make clean + +%build +make + +%install +rm -rf $RPM_BUILD_ROOT + +make INSTALL_ROOT=$RPM_BUILD_ROOT install-all + +# Build and install in %post so that it gets put in the right site-packages directory +#rm -rf $RPM_BUILD_ROOT/usr/lib/python* +#install -D -m 755 python/Proper.py $RPM_BUILD_ROOT/%{_datadir}/%{name}/Proper.py + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root,-) +%attr(0755,root,root) %{_initrddir}/codemux +%config /etc/codemux/codemux.conf +%attr(0755,root,root) /usr/local/planetlab/sbin/codemux + +%post +chkconfig codemux reset + +if [ -z "$PL_BOOTCD" ]; then + /sbin/ldconfig + /etc/init.d/codemux restart +fi + +%preun +if [ "$1" = 0 ]; then + # erase, not upgrade + chkconfig --del codemux + + # stop daemon if its currently running + if [ "`/etc/init.d/codemux status`" = "running" ]; then + /etc/init.d/codemux stop + fi +fi + +%doc + +%changelog +* Sun Apr 22 2007 KYOUNGSOO PARK - +- Initial build. 
+ diff --git a/codns.h b/codns.h new file mode 100644 index 0000000..2f7f36e --- /dev/null +++ b/codns.h @@ -0,0 +1,42 @@ +#ifndef _CODNS_H_ +#define _CODNS_H_ +#include "ports.h" + +/* query info - fixed part */ +typedef struct LocalQueryInfo { + int lqi_size; /* length of the name string */ + int lqi_id; /* query id */ + int lqi_cache; /* not being used now */ +} LocalQueryInfo; + +/* query info + name + query structure expected from a client */ +#define MAX_QUERY_NAME 256 +#define SIG_SPLIT_TRANSACTION 0 /* signature for split-transaction */ +typedef struct LocalQuery { + int lq_zero; /* always set to SIG_SPLIT_TRANSACTION(=0) */ + LocalQueryInfo lq_info; /* query info */ + char lq_name[MAX_QUERY_NAME]; /* name */ +} LocalQuery; + +/* query result from CoDNS + we set MAX_ANSWERS for easy implementation. + if lq.address[i].s_addr == 0, that means it returned i-1 valid anwers. */ +#define MAX_ANSWERS 8 +typedef struct LocalQueryResult { + int lq_id; /* query id */ + int lq_ttl; /* TTL of the record */ + struct in_addr lq_address[MAX_ANSWERS]; /* IP addresses for the query */ +} LocalQueryResult; + +/*----------------------------------------------------------------------*/ + +/* temporary section : from here to the end + used for defining variables or constants for testing */ + +/* for testing in HBTWGET */ +#define HBTWGET_CODNS_ID (-3) + +#endif // _CODNS_H_ + + diff --git a/debug.h b/debug.h new file mode 100644 index 0000000..a7ddc87 --- /dev/null +++ b/debug.h @@ -0,0 +1,97 @@ +#ifndef _DEBUG_H_ +#define _DEBUG_H_ +#include +#include "applib.h" +#undef max + + +/* + TRACE : print with function name + TRACE0 : print without function name + TRACE1 : print "buf" whose size is "size" +*/ + +#define DEBUG + +extern HANDLE hdebugLog; +extern int defaultTraceSync; +#define TRACE0(fmt, msg...) 
{ \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), fmt, ##msg); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} +#define TRACE1(buf, size) { \ + WriteLog(hdebugLog, buf, size, defaultTraceSync); \ +} +#define TRACE(fmt, msg...) { \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), "[%s] " fmt, __FUNCTION__, ##msg); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} +#define TRACEX(fmt) { \ + char __buf[2048]; \ + if (hdebugLog) { \ + snprintf(__buf, sizeof(__buf), "[%s] " fmt, __FUNCTION__); \ + WriteLog(hdebugLog, __buf, strlen(__buf), defaultTraceSync); \ + } \ +} + +#ifndef HERE +#define HERE TRACE("file %s, line %d, func %s\n", __FILE__, __LINE__, __FUNCTION__) +#endif + +#ifdef DEBUG +#define ASSERT(exp) { \ + if (!(exp)) { \ + TRACE("ASSERTION (%s) FAILED in %s (%s:%d)\n", \ + (#exp), __FUNCTION__, __FILE__, __LINE__); \ + } \ +} +#else +#define ASSERT(exp) 1 ? (void)0 : (exp) +#endif // DEBUG + +/*-------------------------------------------------------------- + macros used for debugging memory leaks + if DEBUG_MEMORY_LEAK is enabled, we track down all the memory + allocation/freeing to count the number of allocations + -------------------------------------------------------------*/ +//#define DEBUG_MEMORY_LEAK + +#ifndef DEBUG_MEMORY_LEAK + +#define xcalloc(nmemb, size) calloc(nmemb, size) +#define xmalloc(size) malloc(size) +#define xrealloc(ptr, size) realloc(ptr, size) +#define xstrdup(s) strdup(s) +#define xfree(ptr) free(ptr) + +#else + +#define xcalloc(nmemb, size) dbgcalloc(nmemb, size, __FUNCTION__, \ + __FILE__, __LINE__) +#define xmalloc(size) dbgmalloc(size, __FUNCTION__,\ + __FILE__, __LINE__) +#define xrealloc(ptr, size) dbgrealloc(ptr, size, __FUNCTION__,\ + __FILE__, __LINE__) +#define xstrdup(s) dbgstrdup(s, __FUNCTION__, __FILE__, __LINE__) +#define xfree(ptr) dbgfree(ptr, __FUNCTION__, __FILE__, __LINE__) + +void *dbgcalloc(size_t 
nmemb, size_t size, + const char* func, const char* file, const int line); +void *dbgmalloc(size_t size, + const char* func, const char* file, const int line); +void *dbgrealloc(void *ptr, size_t size, + const char* func, const char* file, const int line); +char *dbgstrdup(const char *s, + const char* func, const char* file, const int line); +void dbgfree(void *ptr, const char* func, const char* file, const int line); +void dbg_print_memtrace(void); + +#endif /* DEBUG_MEMOTY_LEAK */ + +#endif /* _DEBUG_H_ */ diff --git a/gettimeofdayex.c b/gettimeofdayex.c new file mode 100644 index 0000000..4fc6718 --- /dev/null +++ b/gettimeofdayex.c @@ -0,0 +1,149 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include "gettimeofdayex.h" + +#if defined(__linux) && !defined(ALPHA) + +/* Macros */ +#define rdtsc(low, high) \ + asm volatile("rdtsc":"=a" (low), "=d" (high)) + +/* get cycle counts */ +#define GET_CC(cc) \ +do \ +{ \ + cc = 0; \ + unsigned long __cc_low__,__cc_high__; \ + rdtsc(__cc_low__,__cc_high__); \ + cc = __cc_high__; \ + cc = (cc << 32) + __cc_low__; \ +} \ +while(0) + +/*-------------------------------------------------------------------------*/ +static int +get_cpu_speed(double* cpu_speed) +{ + FILE* fp = NULL; + char *line = NULL; + size_t len = 0; + ssize_t num; + char* ptr = 0; + + if (cpu_speed == NULL) + return -1; + + if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) + return -1; + + while ((num = getline(&line, &len, fp)) != -1) { + ptr = strtok(line, ":\n"); + if (ptr && strstr(ptr, "cpu MHz")) { + ptr = strtok(0," \r\t\n"); + *cpu_speed = strtod(ptr, 0); + break; + } + } + if (line) + free(line); + + fclose(fp); + return (*cpu_speed == 0) ? 
-1 : 0; +} + +/*--------------------------------------------------------------------------*/ +int +gettimeofdayex(struct timeval *tv, struct timezone *tz) +{ +#define MAX_64_BIT_NUM ((unsigned long long)(-1)) + + /* TO DO: timezone adjustment */ + static int first = 1, impossible = 0; + static double cpu_speed; + static struct timeval start; + static unsigned long long cc_start; + unsigned long long cc_now, cc_diff, usec; + + /* initialize : get the current time + and fix it as a starting point */ + if (first) { + if (get_cpu_speed(&cpu_speed) < 0) + impossible = 1; + + GET_CC(cc_start); + gettimeofday(&start, 0); + if (tv) + *tv = start; + first = 0; + return 0; + } + + /* if it's impossible to get cycle counts, + then use original gettimeofday() */ + if (impossible) + return gettimeofday(tv, tz); + + if (tv) { + GET_CC(cc_now); + + /* when overflow happens, we need to take care of the carry bit, + otherwise we get wrong result since we are using unsigned variables. + assuming cycle counter starts from zero at time zero, + this would happen after 136 years on 4GHz CPU. + however, lets just play on the safer side. + */ + + cc_diff = (cc_now < cc_start) ? 
cc_now + (MAX_64_BIT_NUM - cc_start) + : cc_now - cc_start; + usec = (unsigned long long)(cc_diff / cpu_speed); + + *tv = start; + tv->tv_sec += (usec / 1000000); + tv->tv_usec += (usec % 1000000); + + if (tv->tv_usec >= 1000000) { + tv->tv_sec++; + tv->tv_usec -= 1000000; + } + } + + return 0; +} + +/*--------------------------------------------------------------------------*/ +time_t +timeex(time_t *t) +{ + /* simple version of time() */ + struct timeval s; + + gettimeofdayex(&s, NULL); + if (t) + *t = (time_t)s.tv_sec; + + return (time_t)s.tv_sec; +} + +#else +#include +/*--------------------------------------------------------------------------*/ +int +gettimeofdayex(struct timeval *tv, struct timezone *tz) +{ + return(gettimeofday(tv, tz)); +} +/*--------------------------------------------------------------------------*/ +time_t +timeex(time_t *t) +{ + return(time(t)); +} +/*--------------------------------------------------------------------------*/ +#endif diff --git a/gettimeofdayex.h b/gettimeofdayex.h new file mode 100644 index 0000000..b9d6108 --- /dev/null +++ b/gettimeofdayex.h @@ -0,0 +1,8 @@ +#ifndef _GETTIMEOFDAYEX_H_ +#define _GETTIMEOFDAYEX_H_ +#include + +int gettimeofdayex(struct timeval *tv, struct timezone *tz); +time_t timeex(time_t *t); + +#endif // _GETTIMEOFDAYEX_H_ diff --git a/ports.h b/ports.h new file mode 100644 index 0000000..807dd2e --- /dev/null +++ b/ports.h @@ -0,0 +1,102 @@ +#ifndef _PORTS_H_ +#define _PORTS_H_ + +#define PORTS_COBLITZ + +#ifdef PORTS_CODEEN + +#define HOMEDIR "/home/princeton_codeen" +/* options in prox.conf or defined implicitly + 1161 snmpPort + 31415 APIDummyServPort + 3130 ICPPort */ +/* different ports for CoDNS and CoDNS2 */ +#ifdef CODNS2 +#define CODNS_PORT 24119 /* CODNS2 PORT */ +#define CODNS_STAT_PORT 24118 /* CODNS2 STATISTICS PORT */ +#define CODNS_HB_PORT 24121 +#else +#define CODNS_PORT 4119 /* CODNS PORT */ +#define CODNS_STAT_PORT 4118 /* CODNS STATISTICS PORT */ +#define CODNS_HB_PORT 4121 
+#endif + +#define HB_PORT 23127 /* CoDeeN heartbeat port */ +#define CODEMON_PORT 2123 +/* 3124 proxy listen port */ +#define CODEPLOY_PORT 2125 +#define FAKE_WEBSERVER_PORT 3126 +#define FAKE_WS_PORT_STR "3126" +/* 3127 proxy listen port */ +/* 3128 proxy listen port */ +#define MAIN_PROXYPORT_STR "3128" +#define CODEEN_TM_PORT 3129 +#define PROXY_PORT 3128 +#define PROXY_PORT2 3127 +#define PROXY_PORT3 3124 + +/* used by redir_helper.c */ +#define QUERY_PORT 3122 +#define RTTSNAP_PORT 3110 + +#elif defined(PORTS_COBLITZ) + +#define HOMEDIR "/home/princeton_coblitz" +/* options in prox.conf + 2111 snmpPort + 2112 APIDummyServPort + 2113 ICPPort */ +#define CODNS_PORT 2119 /* CODNS PORT */ +#define CODNS_STAT_PORT 2120 /* CODNS STATISTICS PORT */ +#define CODNS_HB_PORT 2121 +#define HB_PORT 2122 /* CoDeeN heartbeat port */ +#define CODEMON_PORT 23126 +/* 2124 proxy listen port */ +#define CODEPLOY_PORT 3125 +#define FAKE_WEBSERVER_PORT 2126 +#define FAKE_WS_PORT_STR "2126" +/* 2127 proxy listen port */ +/* 2128 proxy listen port */ +#define MAIN_PROXYPORT_STR "2128" +#define CODEEN_TM_PORT 2129 +#define PROXY_PORT 2128 +#define PROXY_PORT2 2127 +#define PROXY_PORT3 2124 + +/* used by redir_helper.c */ +#define QUERY_PORT 2122 +#define RTTSNAP_PORT 2110 + +#else +#error "need to define either PORTS_CODEEN or PORTS_COBLITZ" +#endif + + +#endif + + +/* + +other changed files: prox_monitor prox.conf + + 4119 tcp 415 princeton_codeen codns + 4119 udp 415 princeton_codeen codns + 4118 tcp 412 princeton_codeen codns + 4121 udp 411 princeton_codeen codns + +23126 tcp 412 princeton_codeen codemon +23126 udp 412 princeton_codeen codemon +23127 tcp 399 princeton_codeen codeen +23127 udp 399 princeton_codeen codeen + 3126 tcp 399 princeton_codeen codeen + 1161 udp 401 princeton_codeen specified via snmpPort +31415 tcp 401 princeton_codeen can be specified via APIDummyServPort + 3124 tcp 399 princeton_codeen specified in prox.conf + 3127 tcp 399 princeton_codeen specified in 
prox.conf + 3128 tcp 398 princeton_codeen specified in prox.conf + 3129 tcp 399 princeton_codeen codeen traffic mon + 3130 udp 401 princeton_codeen can be specified from ICPPort + 3125 tcp 410 princeton_codeen codeploy + 3135 tcp 370 princeton_codeen + +*/ diff --git a/vnet_main.c b/vnet_main.c new file mode 100644 index 0000000..4dfe7b4 --- /dev/null +++ b/vnet_main.c @@ -0,0 +1,1202 @@ +/* + * VServer IP isolation. + * + * This file implements netfilter hooks and AF_INET socket function + * overrides. + * + * Mark Huang + * Copyright (C) 2004 The Trustees of Princeton University + * + * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vnet_config.h" +#include "vnet.h" +#include "vnet_dbg.h" +#include "vnet_compat.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + +#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + +#include + +static inline void +vnet_timewait_put(struct sock* sk) +{ + inet_twsk_put((struct inet_timewait_sock *)sk); +} + +static inline struct sock* +vnet_tcp_lookup(u32 src_ip, u16 src_port, + u32 ip, u16 port, int dif) +{ + return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif); +} + +static inline int vnet_iif(const struct sk_buff *skb) +{ + return inet_iif(skb); +} +#endif + +#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12) + +#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + +static inline void +vnet_timewait_put(struct sock* sk) +{ + /* net/tcp.h */ + tcp_tw_put((struct tcp_tw_bucket*)sk); +} + +static inline struct sock* +vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr,u16 dport, int dif) +{ + extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int); + return tcp_v4_lookup(saddr, sport, daddr, dport, dif); +} + +/* same as tcp_v4_iff() in net/ipv4/tcp_ipv4. 
*/ +static inline int vnet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} +#endif + +#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX +#warning DEMUX FUNCTIONALITY NOT SUPPORTED +#endif + +int vnet_verbose = 1; + +/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2, + * etc. so that we can represent multiple bandwidth limits. The 1:1 + * subclass has children named 1:1000, 1:1001, etc., one for each + * context (up to 4096). Similarly, the 1:2 subclass has children + * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents + * the node bandwidth cap and 1:1000 represents the root context's + * share of it. */ +int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000); + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \ + (1 << NF_IP_LOCAL_OUT) | \ + (1 << NF_IP_POST_ROUTING)) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) + +/* Standard entry. */ +struct ipt_standard +{ + struct ipt_entry entry; + struct ipt_standard_target target; +}; + +struct ipt_error_target +{ + struct ipt_entry_target target; + char errorname[IPT_FUNCTION_MAXNAMELEN]; +}; + +struct ipt_error +{ + struct ipt_entry entry; + struct ipt_error_target target; +}; + +#endif + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata = +{ + .repl = + { + .name = "vnet", + .valid_hooks = FILTER_VALID_HOOKS, + .num_entries = 4, + .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + .hook_entry = { [NF_IP_LOCAL_IN] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, + .underflow = { [NF_IP_LOCAL_IN] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard), + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, }, + }, + + .entries = + { + /* LOCAL_IN: currently unused */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { 
.target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + + /* LOCAL_OUT: used for logging */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + + /* POST_ROUTING: used for priority classification */ + { .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), }, + .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, }, + .verdict = -NF_ACCEPT - 1, }, + }, + }, + + /* ERROR */ + .term = + { + .entry = { .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), }, + .target = { .target = { .u = { .user = { .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, }, }, }, + .errorname = "ERROR", }, + }, +}; + +static struct ipt_table vnet_table = { + .name = "vnet", +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) + .table = &initial_table.repl, +#endif + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .af = AF_INET, +#endif +}; + +static inline u_int16_t +get_dst_port(struct ip_conntrack_tuple *tuple) +{ + switch (tuple->dst.protonum) { + case IPPROTO_GRE: + /* XXX Truncate 32-bit GRE key to 16 bits */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) + return tuple->dst.u.gre.key; +#else + return htons(ntohl(tuple->dst.u.gre.key)); +#endif + case IPPROTO_ICMP: + /* Bind on ICMP echo ID */ + return tuple->src.u.icmp.id; + case IPPROTO_TCP: + return tuple->dst.u.tcp.port; + case IPPROTO_UDP: + return tuple->dst.u.udp.port; + default: + return tuple->dst.u.all; + } +} + +static inline u_int16_t +get_src_port(struct ip_conntrack_tuple *tuple) +{ + switch (tuple->dst.protonum) { + 
case IPPROTO_GRE: + /* XXX Truncate 32-bit GRE key to 16 bits */ + return htons(ntohl(tuple->src.u.gre.key)); + case IPPROTO_ICMP: + /* Bind on ICMP echo ID */ + return tuple->src.u.icmp.id; + case IPPROTO_TCP: + return tuple->src.u.tcp.port; + case IPPROTO_UDP: + return tuple->src.u.udp.port; + default: + return tuple->src.u.all; + } +} + + + +static unsigned int +vnet_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + u_int8_t protocol; + u_int32_t ip; + u_int16_t port; + struct bind_key *key; + xid_t xid; + unsigned int verdict; + int priority; + struct sock *sk; + int need_to_free_sk = 0; + + ct = ip_conntrack_get(*pskb, &ctinfo); + dir = CTINFO2DIR(ctinfo); + + /* Default to marking packet with root context ID */ + xid = 0; + + switch (hook) { + + case NF_IP_LOCAL_IN: + /* Multicast to 224.0.0.1 is one example */ + if (!ct) + break; + + /* Determine if the packet is destined for a bound port */ + protocol = ct->tuplehash[dir].tuple.dst.protonum; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + protocol == (*pskb)->nh.iph->protocol); + ip = ct->tuplehash[dir].tuple.dst.ip; + port = get_dst_port(&ct->tuplehash[dir].tuple); + + /* Check if the port is bound */ + key = bind_get(protocol, ip, port, NULL); + + if (key && key->sk != NULL) { + + /* A new or established connection to a bound port */ + sk = key->sk; + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + /* If the bound socket is a real TCP socket, then the context that + * bound the port could have re-assigned an established connection + * socket to another context. See if this is the case. 
+ */ + if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { + struct sock *tcp_sk; + u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip; + u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple); + + tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb)); + if (tcp_sk) { + if (tcp_sk->sk_state == TCP_TIME_WAIT) { + sock_put(tcp_sk); + } else { + dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", + get_sk_xid(tcp_sk), NIPQUAD(src_ip), ntohs(src_port), NIPQUAD(ip), ntohs(port)); + sk = tcp_sk; + need_to_free_sk = 1; + } + /* Remember to sock_put()! */ + } + } +#endif + + /* Indicate to the stack that the packet was "expected", so that it does + * not generate a TCP RST or ICMP Unreachable message. This requires a + * kernel patch. + */ + if (sk->sk_type == SOCK_RAW) + (*pskb)->sk = sk; + + assert(sk); + xid = get_sk_xid(sk); + + /* Steal the reply end of the connection */ + if (get_ct_xid(ct, !dir) != xid) { + dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, + key ? "" : "un", print_protocol(protocol), + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), ntohs(ct->tuplehash[!dir].tuple.dst.u.all), + get_ct_xid(ct, !dir)); + set_ct_xid(ct, !dir, xid); + } + + /* Store the owner (if any) of the other side of the connection (if + * localhost) in the peercred struct. 
+ */ + sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir); + + if (ctinfo == IP_CT_NEW) { + dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n", + print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid); + } + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + if (need_to_free_sk) { + /* + if (sk->sk_state == TCP_TIME_WAIT) + vnet_timewait_put(sk); + else*/ + sock_put(sk); + need_to_free_sk=0; + } +#endif + bind_put(key); + + } else if ((int) get_ct_xid(ct, !dir) == -1) { + /* A new connection to an unbound port */ + if (ctinfo == IP_CT_NEW) { + dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n", + print_protocol(protocol), NIPQUAD(ip), ntohs(port)); + } + } else { + /* A new or established connection to an unbound port that could be + * associated with an active socket ("could be" because the socket + * could be closed and the connection in a WAIT state). In any case, + * give it to the last owner of the connection. + */ + xid = get_ct_xid(ct, !dir); + } + + break; + + case NF_IP_LOCAL_OUT: + /* Get the context ID of the sender */ + assert((*pskb)->sk); + xid = get_sk_xid((*pskb)->sk); + + /* Default class */ + priority = vnet_root_class; + + if (ct) { + protocol = ct->tuplehash[dir].tuple.dst.protonum; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + protocol == (*pskb)->nh.iph->protocol); + ip = ct->tuplehash[dir].tuple.src.ip; + assert(ctinfo == IP_CT_RELATED || + ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) || + ip == __constant_htonl(INADDR_ANY) || ip == (*pskb)->nh.iph->saddr); + port = get_src_port(&ct->tuplehash[dir].tuple); + } else { + protocol = port = 0; + } + + if (xid) { + /* Multicast to 224.0.0.1 is one example */ + if (!ct) { + dbg("vnet_out:%d: dropping untrackable IP packet\n", xid); + return NF_DROP; + } + + /* XXX Is this guaranteed? 
*/ + if ((*pskb)->len < sizeof(struct iphdr)) { + dbg("vnet_out:%d: dropping runt IP packet\n", xid); + return NF_DROP; + } + + /* Check source IP address */ + if (inet_addr_type(ip) != RTN_LOCAL) { + dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n", xid, + NIPQUAD(ip)); + return NF_DROP; + } + + /* Sending of ICMP error messages not allowed */ + if (protocol == IPPROTO_ICMP) { + struct icmphdr *icmph = (struct icmphdr *)((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4)); + + if ((unsigned char *) &icmph[1] > (*pskb)->tail) { + dbg("vnet_out:%d: dropping runt ICMP packet\n", xid); + return NF_DROP; + } + + switch (icmph->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMP_INFO_REQUEST: + case ICMP_INFO_REPLY: + case ICMP_ADDRESS: + case ICMP_ADDRESSREPLY: + /* Guaranteed by icmp_pkt_to_tuple() */ + assert(port == icmph->un.echo.id); + break; + default: + dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid); + return NF_DROP; + } + } + } else { + /* Let root send anything it wants */ + } + + if (ct) { + /* Check if the port is bound by someone else */ + key = bind_get(protocol, ip, port, NULL); + } else { + assert(xid == 0); + key = NULL; + } + + if (key && key->sk != NULL) { + /* A new or established connection from a bound port */ + assert(ct); + + sk = key->sk; + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + /* If the bound socket is a real TCP socket, then the context that + * bound the port could have re-assigned an established connection + * socket to the sender's context. See if this is the case. 
+ */ + if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM && get_sk_xid(sk) != xid) { + struct sock *tcp_sk; + u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip; + u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple); + + tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb)); + if (tcp_sk) { + if (tcp_sk->sk_state == TCP_TIME_WAIT) { + sock_put(tcp_sk); + //vnet_timewait_put(tcp_sk); + } else { + need_to_free_sk = 1; + sk = tcp_sk; + /* Remember to sock_put()! */ + } + } + } +#endif + + verdict = NF_ACCEPT; + + /* Stealing connections from established sockets is not allowed */ + assert(sk); + if (get_sk_xid(sk) != xid) { + if (xid) { + dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n", xid, + print_protocol(protocol), NIPQUAD(ip), ntohs(port), get_sk_xid(sk)); + verdict = NF_DROP; + } else { + /* Let root send whatever it wants but do not steal the packet or + * connection. Kernel sockets owned by root may send packets on + * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or + * TIME_WAIT). + */ + xid = get_sk_xid(sk); + } + } + +#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX + if (need_to_free_sk) { + /* + if (sk->sk_state == TCP_TIME_WAIT) + vnet_timewait_put(sk); + else */ + sock_put(sk); + need_to_free_sk = 0; + } +#endif + bind_put(key); + + if (verdict == NF_DROP) + goto done; + } else { + /* A new or established or untrackable connection from an unbound port */ + + /* Reserved ports must be bound. Usually only root is capable of + * CAP_NET_BIND_SERVICE. + */ + if (xid && + (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) && + ntohs(port) < PROT_SOCK) { + assert(ct); + dbg("vnet_out:%d: %s port %u is reserved\n", xid, + print_protocol(protocol), ntohs(port)); + return NF_DROP; + } + } + + if (ct) { + /* Steal the connection */ + if (get_ct_xid(ct, dir) != xid) { + dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid, + key ? 
"" : "un", print_protocol(protocol), + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), + get_ct_xid(ct, dir)); + set_ct_xid(ct, dir, xid); + } + + /* Classify traffic once per connection */ + if (ct->priority == (u_int32_t) -1) { + /* The POSTROUTING chain should classify packets into a minor subclass + * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet + * MARK early so that rules can take xid into account. */ + set_skb_xid(*pskb, xid); + (*pskb)->priority = priority; + (void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL); + priority = (*pskb)->priority | xid; + dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n", xid, + NIPQUAD(ip), ntohs(port), + NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all), + TC_H_MAJ(priority) >> 16, TC_H_MIN(priority)); + ct->priority = priority; + } else + priority = ct->priority; + } else { + assert(xid == 0); + } + + /* Set class */ + (*pskb)->priority = priority; + + break; + + default: + /* Huh? 
*/ + assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT); + break; + } + + /* Mark packet */ + set_skb_xid(*pskb, xid); + +#ifdef VNET_DEBUG + if (vnet_verbose >= 3) { + if (ct) + print_conntrack(ct, ctinfo, hook); + if (vnet_verbose >= 4) + print_packet(*pskb); + } +#endif + + get_verdict: + verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL); + + /* Pass to network taps */ + if (verdict == NF_ACCEPT) + verdict = packet_hook(*pskb, hook); + + done: + return verdict; +} + +static struct nf_hook_ops vnet_ops[] = { + { + .hook = vnet_hook, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .owner = THIS_MODULE, +#endif + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_LAST, + }, + { + .hook = vnet_hook, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .owner = THIS_MODULE, +#endif + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_LAST, + }, +}; + +/* Exported by net/ipv4/af_inet.c */ +extern struct net_proto_family inet_family_ops; +extern struct proto_ops inet_stream_ops; +extern struct proto_ops inet_dgram_ops; +extern struct proto_ops inet_sockraw_ops; +extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags); +extern int inet_listen(struct socket *sock, int backlog); +extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags); +extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size); +extern int inet_release(struct socket *sock); + +/* Exported by net/ipv4/tcp_ipv4.c */ +extern struct proto tcp_prot; +extern int tcp_port_rover; +extern int sysctl_local_port_range[2]; + +/* Exported by net/ipv4/udp.c */ +extern struct proto udp_prot; +extern int udp_port_rover; + +/* Functions that are not exported */ +static int (*inet_create)(struct socket *sock, int protocol); +static ssize_t 
(*inet_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); +static void (*tcp_v4_hash)(struct sock *sk); +static void (*tcp_v4_unhash)(struct sock *sk); +static void (*udp_v4_hash)(struct sock *sk); +static void (*udp_v4_unhash)(struct sock *sk); + +static int +vnet_inet_create(struct socket *sock, int protocol) +{ + int ret; + + if (sock->type == SOCK_RAW) { + /* Temporarily give CAP_NET_RAW to root VServer accounts */ + if (current->euid) + return -EPERM; + cap_raise(current->cap_effective, CAP_NET_RAW); + } + ret = inet_create(sock, protocol); + if (sock->type == SOCK_RAW) + cap_lower(current->cap_effective, CAP_NET_RAW); + if (ret) + return ret; + + if (sock->type == SOCK_RAW) { + struct sock *sk = sock->sk; + struct inet_opt *inet = inet_sk(sk); + /* Usually redundant and unused */ + assert(inet->sport == htons(inet->num)); + /* So we can track double raw binds */ + inet->sport = 0; + } + + return ret; +} + +/* Make sure our bind table gets updated whenever the stack decides to + * unhash or rehash a socket. 
+ */ +static void +vnet_inet_unhash(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + struct bind_key *key; + + key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk); + if (key) { + dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + bind_del(key); + bind_put(key); + } + + if (sk->sk_protocol == IPPROTO_TCP) + tcp_v4_unhash(sk); + else if (sk->sk_protocol == IPPROTO_UDP) + udp_v4_unhash(sk); +} + +static void +vnet_inet_hash(struct sock *sk) +{ + struct inet_opt *inet = inet_sk(sk); + + if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) { + dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + } + + if (sk->sk_protocol == IPPROTO_TCP) + tcp_v4_hash(sk); + else if (sk->sk_protocol == IPPROTO_UDP) + udp_v4_hash(sk); +} + +/* Port reservation */ +static int +vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct inet_opt *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct bind_key *key; + int ret; + + /* Bind socket */ + if ((ret = inet_bind(sock, uaddr, addr_len))) + return ret; + + lock_sock(sk); + + /* Backward compatibility with safe raw sockets */ + if (sock->type == SOCK_RAW) { + /* Runt sockaddr */ + if (addr_len < sizeof(struct sockaddr_in)) + ret = -EINVAL; + /* Non-local bind */ + else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) && inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL) + ret = -EINVAL; + /* Unspecified port */ + else if (!sin->sin_port) + ret = -EINVAL; + /* Reserved port */ + else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) && + ntohs(sin->sin_port) < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + ret = -EACCES; + /* Double bind */ + else if (inet->sport) + ret = -EINVAL; + 
if (ret) + goto done; + inet->saddr = sin->sin_addr.s_addr; + inet->sport = sin->sin_port; + } + + key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL); + if (key) { + /* + * If we are root or own the already bound socket, and + * SO_REUSEADDR has been set on both. + */ + if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) && + key->sk->sk_reuse && sk->sk_reuse) { + if (key->ip == __constant_htonl(INADDR_ANY)) { + /* Keep the current bind key */ + bind_put(key); + goto done; + } else if (inet->saddr == __constant_htonl(INADDR_ANY)) { + /* Consider the port to be bound to this socket now */ + bind_del(key); + } + } + bind_put(key); + } + + if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) { + dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport)); + } + + done: + release_sock(sk); + return ret; +} + +/* Override TCP and UDP port rovers since they do not know about raw + * socket binds. 
+ */ +static int +vnet_autobind(struct sock *sk) +{ + int (*get_port)(struct sock *, unsigned short); + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int port; + struct inet_opt *inet = inet_sk(sk); + struct bind_key *key; + + /* Must be locked */ + assert(sock_owned_by_user(sk)); + + /* Already bound to a port */ + if (inet->num) + return 0; + + if (sk->sk_protocol == IPPROTO_TCP) { + get_port = tcp_prot.get_port; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) + /* Approximate the tcp_v4_get_port() strategy */ + port = tcp_port_rover + 1; +#else + /* Approximate the inet_csk_get_port() strategy */ + port = net_random() % (high - low) + low; +#endif + } else if (sk->sk_protocol == IPPROTO_UDP) { + get_port = udp_prot.get_port; + port = udp_port_rover; + } else if (sk->sk_prot->get_port) { + err("vnet_get_port:%d: %s unhandled\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol)); + if (sk->sk_prot->get_port(sk, 0)) + return -EAGAIN; + inet->sport = htons(inet->num); + return 0; + } else { + return 0; + } + + dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); + + /* Find a free port by linear search. Note that the standard + * udp_v4_get_port() function attempts to pick a port that + * keeps its hash tables balanced. If the UDP hash table keeps + * getting bombed, we should try implementing this strategy + * here. + */ + do { + if (port < low || port > high) + port = low; + + /* XXX We could probably try something more clever + * like checking to see if the bound socket is a + * regular TCP socket owned by the same context (or we + * are root) and, if so, letting tcp_v4_get_port() + * apply its fast reuse logic to determine if the port + * can be reused. 
+ */ + if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) { + dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + goto next; + } + + if (get_port(sk, port)) { + /* Can happen if we are unloaded when there are active sockets */ + dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk); + assert(key); + bind_del(key); + bind_put(key); + } else { + assert(port == inet->num); + inet->sport = htons(inet->num); + break; + } + next: + port++; + } while (--remaining > 0); + + if (sk->sk_protocol == IPPROTO_UDP) + udp_port_rover = port; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) + else if (sk->sk_protocol == IPPROTO_TCP) + tcp_port_rover = port; +#endif + + if (remaining <= 0) { + err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high); + return -EAGAIN; + } else { + dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk), + print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port); + return 0; + } +} + +static int +vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_stream_connect() */ + if (uaddr->sa_family != AF_UNSPEC && + sock->state == SS_UNCONNECTED && + sk->sk_state == TCP_CLOSE) { + /* We may need to bind the socket. 
*/ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_stream_connect(sock, uaddr, addr_len, flags); +} + +static int +vnet_inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_listen() */ + if (sock->type == SOCK_STREAM && + sock->state == SS_UNCONNECTED && + sk->sk_state == TCP_CLOSE) { + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_listen(sock, backlog); +} + +static int +vnet_inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* Duplicates checks in inet_dgram_connect() */ + if (uaddr->sa_family != AF_UNSPEC) { + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + } + + release_sock(sk); + + return inet_dgram_connect(sock, uaddr, addr_len, flags); +} + +static int +vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && vnet_autobind(sk)) { + release_sock(sk); + return -EAGAIN; + } + + release_sock(sk); + + return inet_sendmsg(iocb, sock, msg, size); +} + +static ssize_t +vnet_inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + /* We may need to bind the socket. 
*/
+	if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+		release_sock(sk);
+		return -EAGAIN;
+	}
+
+	release_sock(sk);
+
+	return inet_sendpage(sock, page, offset, size, flags);
+}
+
+/*
+ * release() replacement that vnet_init() installs on the stream, dgram
+ * and (2.6.10+) raw PF_INET operation tables.
+ *
+ * If bind_get() finds a bind-table entry for this socket's protocol,
+ * source address and source port, the release is logged and the entry is
+ * removed. The call is then always forwarded to the original
+ * inet_release(), whose return value is returned unchanged.
+ */
+static int
+vnet_inet_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct inet_opt *inet;
+	struct bind_key *key;
+
+	/* Partial socket created by accept() */
+	if (!sk)
+		goto done;
+
+	/* Derive the inet view only once sk is known to be non-NULL */
+	inet = inet_sk(sk);
+
+	lock_sock(sk);
+
+	key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
+	if (key) {
+		dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+		    print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
+		bind_del(key);
+		/* NOTE(review): presumably drops the reference taken by bind_get() - confirm */
+		bind_put(key);
+	}
+
+	release_sock(sk);
+
+ done:
+	return inet_release(sock);
+}
+
+/*
+ * Swap the function pointer (op) from (from) to (to).
+ * Sanity check: asserts that (op) currently holds (from) before replacing it.
+ */
+#define override_op(op, from, to) do { assert((op) == (from)); (op) = (to); } while (0)
+
+/*
+ * Module entry point.
+ *
+ * Brings up, in order: the bind table, the /proc entries, the dummy
+ * netdevice, the tap netdevice, the PF_INET operation overrides, the
+ * iptables table and the two netfilter hooks. On any failure, everything
+ * set up so far is undone in reverse order and the failing call's
+ * negative return value is propagated.
+ *
+ * Returns the (non-negative) result of the last registration on success.
+ */
+static int __init
+vnet_init(void)
+{
+	int ret;
+
+	/* Initialize bind table */
+	ret = bind_init();
+	if (ret < 0)
+		return ret;
+
+	/* Register /proc entries */
+	ret = proc_init();
+	if (ret < 0)
+		goto cleanup_bind;
+
+	/* Register dummy netdevice */
+	ret = packet_init();
+	if (ret < 0)
+		goto cleanup_proc;
+
+	/* Register tap netdevice */
+	ret = tun_init();
+	if (ret < 0)
+		goto cleanup_packet;
+
+	/* Get pointers to unexported functions */
+	inet_create = inet_family_ops.create;
+	inet_sendpage = inet_dgram_ops.sendpage;
+	tcp_v4_hash = tcp_prot.hash;
+	tcp_v4_unhash = tcp_prot.unhash;
+	udp_v4_hash = udp_prot.hash;
+	udp_v4_unhash = udp_prot.unhash;
+
+	/* Override PF_INET socket operations */
+	override_op(inet_family_ops.create, inet_create, vnet_inet_create);
+	override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind);
+	override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect);
+	override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen);
+	override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+	override_op(inet_stream_ops.release, inet_release, vnet_inet_release);
+	override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind);
+	override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
+	override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+	override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
+	override_op(inet_dgram_ops.release, inet_release, vnet_inet_release);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+	override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind);
+	override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
+	override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+	override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
+	override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release);
+#endif
+	override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash);
+	override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash);
+	override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash);
+	override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash);
+
+	/* Register table */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
+	ret = ipt_register_table(&vnet_table, &initial_table.repl);
+#else
+	ret = ipt_register_table(&vnet_table);
+#endif
+	if (ret < 0)
+		goto cleanup_override;
+
+	/* Register hooks */
+	ret = nf_register_hook(&vnet_ops[0]);
+	if (ret < 0)
+		goto cleanup_table;
+
+	ret = nf_register_hook(&vnet_ops[1]);
+	if (ret < 0)
+		goto cleanup_hook0;
+
+	/* Enables any runtime kernel support for VNET */
+	vnet_active = 1;
+
+	/* Print banner */
+	printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n");
+
+	return ret;
+
+ cleanup_hook0:
+	nf_unregister_hook(&vnet_ops[0]);
+ cleanup_table:
+	ipt_unregister_table(&vnet_table);
+ cleanup_override:
+	/*
+	 * Undo every override installed above. This must cover the same
+	 * set of pointers as the install sequence (and as vnet_exit()),
+	 * including the raw-socket ops and the TCP/UDP hash hooks:
+	 * anything left behind would keep pointing at functions of a
+	 * module whose initialization failed.
+	 */
+	inet_family_ops.create = inet_create;
+	inet_stream_ops.bind = inet_bind;
+	inet_stream_ops.connect = inet_stream_connect;
+	inet_stream_ops.listen = inet_listen;
+	inet_stream_ops.sendmsg = inet_sendmsg;
+	inet_stream_ops.release = inet_release;
+	inet_dgram_ops.bind = inet_bind;
+	inet_dgram_ops.connect = inet_dgram_connect;
+	inet_dgram_ops.sendmsg = inet_sendmsg;
+	inet_dgram_ops.sendpage = inet_sendpage;
+	inet_dgram_ops.release = inet_release;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+	inet_sockraw_ops.bind = inet_bind;
+	inet_sockraw_ops.connect = inet_dgram_connect;
+	inet_sockraw_ops.sendmsg = inet_sendmsg;
+	inet_sockraw_ops.sendpage = inet_sendpage;
+	inet_sockraw_ops.release = inet_release;
+#endif
+	tcp_prot.hash = tcp_v4_hash;
+	tcp_prot.unhash = tcp_v4_unhash;
+	udp_prot.hash = udp_v4_hash;
+	udp_prot.unhash = udp_v4_unhash;
+	tun_cleanup();
+ cleanup_packet:
+	packet_cleanup();
+ cleanup_proc:
+	proc_cleanup();
+ cleanup_bind:
+	bind_cleanup();
+
+	return ret;
+}
+
+/*
+ * Module exit point.
+ *
+ * Tears down what vnet_init() set up: clears vnet_active, unregisters
+ * the netfilter hooks and the iptables table, puts the original PF_INET
+ * operations back (asserting that each pointer still holds the VNET
+ * replacement), then cleans up the tap device, the packet device, the
+ * /proc entries and finally the bind table.
+ */
+static void __exit
+vnet_exit(void)
+{
+	unsigned int i;
+
+	/* Print banner */
+	printk("VNET: exiting\n");
+
+	/* Disables any runtime kernel support for VNET */
+	vnet_active = 0;
+
+	/* Stop handling packets first */
+	for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++)
+		nf_unregister_hook(&vnet_ops[i]);
+
+	ipt_unregister_table(&vnet_table);
+
+	/* Stop handling PF_INET socket operations */
+	override_op(inet_family_ops.create, vnet_inet_create, inet_create);
+	override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind);
+	override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect);
+	override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen);
+	override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+	override_op(inet_stream_ops.release, vnet_inet_release, inet_release);
+	override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind);
+	override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
+	override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+	override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
+	override_op(inet_dgram_ops.release, vnet_inet_release, inet_release);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+	override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind);
+	override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
+	override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+	override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
+	override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release);
+#endif
+	override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash);
+	override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash);
+	override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash);
+	override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash);
+
+	/* Disable tap netdevice */
+	tun_cleanup();
+
+	/* Disable vnet netdevice and stop handling PF_PACKET sockets */
+	packet_cleanup();
+
+	/* Unregister /proc handlers */
+	proc_cleanup();
+
+	/* Cleanup bind table (must be after nf_unregister_hook()) */
+	bind_cleanup();
+}
+
+module_init(vnet_init);
+module_exit(vnet_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mark Huang ");
+MODULE_DESCRIPTION("VServer IP isolation");
-- 
2.43.0