+/*
+ * VServer IP isolation.
+ *
+ * This file implements netfilter hooks and AF_INET socket function
+ * overrides.
+ *
+ * Mark Huang <mlhuang@cs.princeton.edu>
+ * Copyright (C) 2004 The Trustees of Princeton University
+ *
+ * $Id: vnet_main.c,v 1.40 2007/03/08 15:46:07 mef Exp $
+ */
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/pkt_sched.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#include "vnet_config.h"
+#include "vnet.h"
+#include "vnet_dbg.h"
+#include "vnet_compat.h"
+
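+/*
+ * Version-specific helpers used by the TCP demultiplexing code below:
+ * looking up the socket that owns an established connection, dropping a
+ * reference to a TIME_WAIT socket, and finding the input interface.
+ */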
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+
+#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+
+#include <net/inet_hashtables.h>
+
+static inline void
+vnet_timewait_put(struct sock* sk)
+{
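+ /* TIME_WAIT "sockets" are struct inet_timewait_sock on 2.6.16+, so
+ * drop the reference with inet_twsk_put() rather than sock_put(). */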
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+}
+
+static inline struct sock*
+vnet_tcp_lookup(u32 src_ip, u16 src_port,
+ u32 ip, u16 port, int dif)
+{
+ return inet_lookup(&tcp_hashinfo, src_ip, src_port, ip, port, dif);
+}
+
+static inline int vnet_iif(const struct sk_buff *skb)
+{
+ return inet_iif(skb);
+}
+#endif
+
+#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,12)
+
+#define HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+
+static inline void
+vnet_timewait_put(struct sock* sk)
+{
+ /* net/tcp.h */
+ tcp_tw_put((struct tcp_tw_bucket*)sk);
+}
+
+static inline struct sock*
+vnet_tcp_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+{
+ extern struct sock *tcp_v4_lookup(u32, u16, u32, u16, int);
+ return tcp_v4_lookup(saddr, sport, daddr, dport, dif);
+}
+
+/* Same as tcp_v4_iif() in net/ipv4/tcp_ipv4.c */
+static inline int vnet_iif(const struct sk_buff *skb)
+{
+ return ((struct rtable *)skb->dst)->rt_iif;
+}
+#endif
+
+#ifndef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+#warning DEMUX FUNCTIONALITY NOT SUPPORTED
+#endif
+
+int vnet_verbose = 1;
+
+/* We subdivide the 1: major class into 15 minor subclasses 1:1, 1:2,
+ * etc. so that we can represent multiple bandwidth limits. The 1:1
+ * subclass has children named 1:1000, 1:1001, etc., one for each
+ * context (up to 4096). Similarly, the 1:2 subclass has children
+ * named 1:2000, 1:2001, etc. By default, the 1:1 subclass represents
+ * the node bandwidth cap and 1:1000 represents the root context's
+ * share of it. */
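+/* TC_H_MAKE(1 << 16, 0x1000) is class 1:1000 in tc notation: major
+ * handle 1 in the upper 16 bits, minor handle 0x1000 in the lower. */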
+int vnet_root_class = TC_H_MAKE(1 << 16, 0x1000);
+
+#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | \
+ (1 << NF_IP_LOCAL_OUT) | \
+ (1 << NF_IP_POST_ROUTING))
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
+
+/* Standard entry. */
+struct ipt_standard
+{
+ struct ipt_entry entry;
+ struct ipt_standard_target target;
+};
+
+struct ipt_error_target
+{
+ struct ipt_entry_target target;
+ char errorname[IPT_FUNCTION_MAXNAMELEN];
+};
+
+struct ipt_error
+{
+ struct ipt_entry entry;
+ struct ipt_error_target target;
+};
+
+#endif
+
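+/* Bootstrap ruleset for the "vnet" table: an ACCEPT policy entry for
+ * each of the three valid hooks plus the terminating ERROR entry,
+ * following the same layout the stock iptables filter table uses. */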
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+} initial_table __initdata =
+{
+ .repl =
+ {
+ .name = "vnet",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .num_entries = 4,
+ .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ .hook_entry = { [NF_IP_LOCAL_IN] = 0,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, },
+ .underflow = { [NF_IP_LOCAL_IN] = 0,
+ [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard),
+ [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 2, },
+ },
+
+ .entries =
+ {
+ /* LOCAL_IN: currently unused */
+ { .entry = { .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_standard), },
+ .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
+ .verdict = -NF_ACCEPT - 1, },
+ },
+
+ /* LOCAL_OUT: used for logging */
+ { .entry = { .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_standard), },
+ .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
+ .verdict = -NF_ACCEPT - 1, },
+ },
+
+ /* POST_ROUTING: used for priority classification */
+ { .entry = { .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_standard), },
+ .target = { .target = { .u = { .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), }, },
+ .verdict = -NF_ACCEPT - 1, },
+ },
+ },
+
+ /* ERROR */
+ .term =
+ {
+ .entry = { .target_offset = sizeof(struct ipt_entry),
+ .next_offset = sizeof(struct ipt_error), },
+ .target = { .target = { .u = { .user = { .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)),
+ .name = IPT_ERROR_TARGET, }, }, },
+ .errorname = "ERROR", },
+ },
+};
+
+static struct ipt_table vnet_table = {
+ .name = "vnet",
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
+ .table = &initial_table.repl,
+#endif
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .lock = RW_LOCK_UNLOCKED,
+ .me = THIS_MODULE,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+ .af = AF_INET,
+#endif
+};
+
+static inline u_int16_t
+get_dst_port(struct ip_conntrack_tuple *tuple)
+{
+ switch (tuple->dst.protonum) {
+ case IPPROTO_GRE:
+ /* XXX Truncate 32-bit GRE key to 16 bits */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
+ return tuple->dst.u.gre.key;
+#else
+ return htons(ntohl(tuple->dst.u.gre.key));
+#endif
+ case IPPROTO_ICMP:
+ /* Bind on ICMP echo ID */
+ return tuple->src.u.icmp.id;
+ case IPPROTO_TCP:
+ return tuple->dst.u.tcp.port;
+ case IPPROTO_UDP:
+ return tuple->dst.u.udp.port;
+ default:
+ return tuple->dst.u.all;
+ }
+}
+
+static inline u_int16_t
+get_src_port(struct ip_conntrack_tuple *tuple)
+{
+ switch (tuple->dst.protonum) {
+ case IPPROTO_GRE:
+ /* XXX Truncate 32-bit GRE key to 16 bits */
+ return htons(ntohl(tuple->src.u.gre.key));
+ case IPPROTO_ICMP:
+ /* Bind on ICMP echo ID */
+ return tuple->src.u.icmp.id;
+ case IPPROTO_TCP:
+ return tuple->src.u.tcp.port;
+ case IPPROTO_UDP:
+ return tuple->src.u.udp.port;
+ default:
+ return tuple->src.u.all;
+ }
+}
+
+
+
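+/*
+ * Shared LOCAL_IN/LOCAL_OUT netfilter hook. On input, the packet is
+ * attributed to the context (xid) that owns the bound port or the
+ * established connection. On output, the sender's context is checked
+ * against source address, ICMP, and reserved port restrictions, and the
+ * connection is classified into the sender's tc class. In both cases
+ * the skb is marked with the xid and then run through the "vnet" table
+ * and the packet taps.
+ */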
+static unsigned int
+vnet_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ u_int8_t protocol;
+ u_int32_t ip;
+ u_int16_t port;
+ struct bind_key *key;
+ xid_t xid;
+ unsigned int verdict;
+ int priority;
+ struct sock *sk;
+ int need_to_free_sk = 0;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ dir = CTINFO2DIR(ctinfo);
+
+ /* Default to marking packet with root context ID */
+ xid = 0;
+
+ switch (hook) {
+
+ case NF_IP_LOCAL_IN:
+ /* Multicast to 224.0.0.1 is one example */
+ if (!ct)
+ break;
+
+ /* Determine if the packet is destined for a bound port */
+ protocol = ct->tuplehash[dir].tuple.dst.protonum;
+ assert(ctinfo == IP_CT_RELATED ||
+ ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
+ protocol == (*pskb)->nh.iph->protocol);
+ ip = ct->tuplehash[dir].tuple.dst.ip;
+ port = get_dst_port(&ct->tuplehash[dir].tuple);
+
+ /* Check if the port is bound */
+ key = bind_get(protocol, ip, port, NULL);
+
+ if (key && key->sk != NULL) {
+
+ /* A new or established connection to a bound port */
+ sk = key->sk;
+
+#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+ /* If the bound socket is a real TCP socket, then the context that
+ * bound the port could have re-assigned an established connection
+ * socket to another context. See if this is the case.
+ */
+ if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) {
+ struct sock *tcp_sk;
+ u_int32_t src_ip = ct->tuplehash[dir].tuple.src.ip;
+ u_int16_t src_port = get_src_port(&ct->tuplehash[dir].tuple);
+
+ tcp_sk = vnet_tcp_lookup(src_ip, src_port, ip, port, vnet_iif(*pskb));
+ if (tcp_sk) {
+ if (tcp_sk->sk_state == TCP_TIME_WAIT) {
+ sock_put(tcp_sk);
+ } else {
+ dbg("vnet_in:%d: established TCP socket %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
+ get_sk_xid(tcp_sk), NIPQUAD(src_ip), ntohs(src_port), NIPQUAD(ip), ntohs(port));
+ sk = tcp_sk;
+ need_to_free_sk = 1;
+ }
+ /* Remember to sock_put()! */
+ }
+ }
+#endif
+
+ /* Indicate to the stack that the packet was "expected", so that it does
+ * not generate a TCP RST or ICMP Unreachable message. This requires a
+ * kernel patch.
+ */
+ if (sk->sk_type == SOCK_RAW)
+ (*pskb)->sk = sk;
+
+ assert(sk);
+ xid = get_sk_xid(sk);
+
+ /* Steal the reply end of the connection */
+ if (get_ct_xid(ct, !dir) != xid) {
+ dbg("vnet_in:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid,
+ key ? "" : "un", print_protocol(protocol),
+ NIPQUAD(ip), ntohs(port),
+ NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip), ntohs(ct->tuplehash[!dir].tuple.dst.u.all),
+ get_ct_xid(ct, !dir));
+ set_ct_xid(ct, !dir, xid);
+ }
+
+ /* Store the owner (if any) of the other side of the connection (if
+ * localhost) in the peercred struct.
+ */
+ sk->sk_peercred.uid = sk->sk_peercred.gid = (__u32) get_ct_xid(ct, dir);
+
+ if (ctinfo == IP_CT_NEW) {
+ dbg("vnet_in: %s port %u.%u.%u.%u:%u bound by context %d\n",
+ print_protocol(protocol), NIPQUAD(ip), ntohs(port), xid);
+ }
+
+#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+ if (need_to_free_sk) {
+ /*
+ if (sk->sk_state == TCP_TIME_WAIT)
+ vnet_timewait_put(sk);
+ else*/
+ sock_put(sk);
+ need_to_free_sk = 0;
+ }
+#endif
+ bind_put(key);
+
+ } else if ((int) get_ct_xid(ct, !dir) == -1) {
+ /* A new connection to an unbound port */
+ if (ctinfo == IP_CT_NEW) {
+ dbg("vnet_in: %s port %u.%u.%u.%u:%u not bound\n",
+ print_protocol(protocol), NIPQUAD(ip), ntohs(port));
+ }
+ } else {
+ /* A new or established connection to an unbound port that could be
+ * associated with an active socket ("could be" because the socket
+ * could be closed and the connection in a WAIT state). In any case,
+ * give it to the last owner of the connection.
+ */
+ xid = get_ct_xid(ct, !dir);
+ }
+
+ break;
+
+ case NF_IP_LOCAL_OUT:
+ /* Get the context ID of the sender */
+ assert((*pskb)->sk);
+ xid = get_sk_xid((*pskb)->sk);
+
+ /* Default class */
+ priority = vnet_root_class;
+
+ if (ct) {
+ protocol = ct->tuplehash[dir].tuple.dst.protonum;
+ assert(ctinfo == IP_CT_RELATED ||
+ ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
+ protocol == (*pskb)->nh.iph->protocol);
+ ip = ct->tuplehash[dir].tuple.src.ip;
+ assert(ctinfo == IP_CT_RELATED ||
+ ctinfo == (IP_CT_IS_REPLY + IP_CT_RELATED) ||
+ ip == __constant_htonl(INADDR_ANY) || ip == (*pskb)->nh.iph->saddr);
+ port = get_src_port(&ct->tuplehash[dir].tuple);
+ } else {
+ protocol = port = 0;
+ }
+
+ if (xid) {
+ /* Multicast to 224.0.0.1 is one example */
+ if (!ct) {
+ dbg("vnet_out:%d: dropping untrackable IP packet\n", xid);
+ return NF_DROP;
+ }
+
+ /* XXX Is this guaranteed? */
+ if ((*pskb)->len < sizeof(struct iphdr)) {
+ dbg("vnet_out:%d: dropping runt IP packet\n", xid);
+ return NF_DROP;
+ }
+
+ /* Check source IP address */
+ if (inet_addr_type(ip) != RTN_LOCAL) {
+ dbg("vnet_out:%d: non-local source IP address %u.%u.%u.%u not allowed\n", xid,
+ NIPQUAD(ip));
+ return NF_DROP;
+ }
+
+ /* Sending of ICMP error messages not allowed */
+ if (protocol == IPPROTO_ICMP) {
+ struct icmphdr *icmph = (struct icmphdr *)((*pskb)->nh.raw + ((*pskb)->nh.iph->ihl * 4));
+
+ if ((unsigned char *) &icmph[1] > (*pskb)->tail) {
+ dbg("vnet_out:%d: dropping runt ICMP packet\n", xid);
+ return NF_DROP;
+ }
+
+ switch (icmph->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMP_INFO_REQUEST:
+ case ICMP_INFO_REPLY:
+ case ICMP_ADDRESS:
+ case ICMP_ADDRESSREPLY:
+ /* Guaranteed by icmp_pkt_to_tuple() */
+ assert(port == icmph->un.echo.id);
+ break;
+ default:
+ dbg("vnet_out:%d: sending of ICMP error messages not allowed\n", xid);
+ return NF_DROP;
+ }
+ }
+ } else {
+ /* Let root send anything it wants */
+ }
+
+ if (ct) {
+ /* Check if the port is bound by someone else */
+ key = bind_get(protocol, ip, port, NULL);
+ } else {
+ assert(xid == 0);
+ key = NULL;
+ }
+
+ if (key && key->sk != NULL) {
+ /* A new or established connection from a bound port */
+ assert(ct);
+
+ sk = key->sk;
+
+#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+ /* If the bound socket is a real TCP socket, then the context that
+ * bound the port could have re-assigned an established connection
+ * socket to the sender's context. See if this is the case.
+ */
+ if (protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM && get_sk_xid(sk) != xid) {
+ struct sock *tcp_sk;
+ u_int32_t dst_ip = ct->tuplehash[dir].tuple.dst.ip;
+ u_int16_t dst_port = get_dst_port(&ct->tuplehash[dir].tuple);
+
+ tcp_sk = vnet_tcp_lookup(dst_ip, dst_port, ip, port, vnet_iif(*pskb));
+ if (tcp_sk) {
+ if (tcp_sk->sk_state == TCP_TIME_WAIT) {
+ sock_put(tcp_sk);
+ //vnet_timewait_put(tcp_sk);
+ } else {
+ need_to_free_sk = 1;
+ sk = tcp_sk;
+ /* Remember to sock_put()! */
+ }
+ }
+ }
+#endif
+
+ verdict = NF_ACCEPT;
+
+ /* Stealing connections from established sockets is not allowed */
+ assert(sk);
+ if (get_sk_xid(sk) != xid) {
+ if (xid) {
+ dbg("vnet_out:%d: %s port %u.%u.%u.%u:%u already bound by context %d\n", xid,
+ print_protocol(protocol), NIPQUAD(ip), ntohs(port), get_sk_xid(sk));
+ verdict = NF_DROP;
+ } else {
+ /* Let root send whatever it wants but do not steal the packet or
+ * connection. Kernel sockets owned by root may send packets on
+ * behalf of bound sockets (for instance, TCP ACK in SYN_RECV or
+ * TIME_WAIT).
+ */
+ xid = get_sk_xid(sk);
+ }
+ }
+
+#ifdef HAVE_FUNCTIONALITY_REQUIRED_BY_DEMUX
+ if (need_to_free_sk) {
+ /*
+ if (sk->sk_state == TCP_TIME_WAIT)
+ vnet_timewait_put(sk);
+ else */
+ sock_put(sk);
+ need_to_free_sk = 0;
+ }
+#endif
+ bind_put(key);
+
+ if (verdict == NF_DROP)
+ goto done;
+ } else {
+ /* A new or established or untrackable connection from an unbound port */
+
+ /* Reserved ports must be bound. Usually only root is capable of
+ * CAP_NET_BIND_SERVICE.
+ */
+ if (xid &&
+ (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP) &&
+ ntohs(port) < PROT_SOCK) {
+ assert(ct);
+ dbg("vnet_out:%d: %s port %u is reserved\n", xid,
+ print_protocol(protocol), ntohs(port));
+ return NF_DROP;
+ }
+ }
+
+ if (ct) {
+ /* Steal the connection */
+ if (get_ct_xid(ct, dir) != xid) {
+ dbg("vnet_out:%d: stealing %sbound %s connection %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u from context %d\n", xid,
+ key ? "" : "un", print_protocol(protocol),
+ NIPQUAD(ip), ntohs(port),
+ NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all),
+ get_ct_xid(ct, dir));
+ set_ct_xid(ct, dir, xid);
+ }
+
+ /* Classify traffic once per connection */
+ if (ct->priority == (u_int32_t) -1) {
+ /* The POSTROUTING chain should classify packets into a minor subclass
+ * (1:1000, 1:2000, etc.) with -j CLASSIFY --set-class. Set the packet
+ * MARK early so that rules can take xid into account. */
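+ /* For example (illustrative only; the exact rule and the context ID
+ * 1234 are hypothetical, set by site policy from userspace):
+ * iptables -t vnet -A POSTROUTING -m mark --mark 1234 \
+ * -j CLASSIFY --set-class 1:2000
+ */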
+ set_skb_xid(*pskb, xid);
+ (*pskb)->priority = priority;
+ (void) ipt_do_table(pskb, NF_IP_POST_ROUTING, in, out, &vnet_table, NULL);
+ priority = (*pskb)->priority | xid;
+ dbg("vnet_out:%d: %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u class %x:%x\n", xid,
+ NIPQUAD(ip), ntohs(port),
+ NIPQUAD(ct->tuplehash[dir].tuple.dst.ip), ntohs(ct->tuplehash[dir].tuple.dst.u.all),
+ TC_H_MAJ(priority) >> 16, TC_H_MIN(priority));
+ ct->priority = priority;
+ } else
+ priority = ct->priority;
+ } else {
+ assert(xid == 0);
+ }
+
+ /* Set class */
+ (*pskb)->priority = priority;
+
+ break;
+
+ default:
+ /* Huh? */
+ assert(hook == NF_IP_LOCAL_IN || hook == NF_IP_LOCAL_OUT);
+ break;
+ }
+
+ /* Mark packet */
+ set_skb_xid(*pskb, xid);
+
+#ifdef VNET_DEBUG
+ if (vnet_verbose >= 3) {
+ if (ct)
+ print_conntrack(ct, ctinfo, hook);
+ if (vnet_verbose >= 4)
+ print_packet(*pskb);
+ }
+#endif
+
+ get_verdict:
+ verdict = ipt_do_table(pskb, hook, in, out, &vnet_table, NULL);
+
+ /* Pass to network taps */
+ if (verdict == NF_ACCEPT)
+ verdict = packet_hook(*pskb, hook);
+
+ done:
+ return verdict;
+}
+
+static struct nf_hook_ops vnet_ops[] = {
+ {
+ .hook = vnet_hook,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ .owner = THIS_MODULE,
+#endif
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_LAST,
+ },
+ {
+ .hook = vnet_hook,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+ .owner = THIS_MODULE,
+#endif
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_LAST,
+ },
+};
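+/* Note that only LOCAL_IN and LOCAL_OUT are registered as netfilter
+ * hooks; the vnet table's POSTROUTING chain is traversed indirectly
+ * from vnet_hook() via ipt_do_table() to classify outbound traffic. */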
+
+/* Exported by net/ipv4/af_inet.c */
+extern struct net_proto_family inet_family_ops;
+extern struct proto_ops inet_stream_ops;
+extern struct proto_ops inet_dgram_ops;
+extern struct proto_ops inet_sockraw_ops;
+extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags);
+extern int inet_listen(struct socket *sock, int backlog);
+extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags);
+extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size);
+extern int inet_release(struct socket *sock);
+
+/* Exported by net/ipv4/tcp_ipv4.c */
+extern struct proto tcp_prot;
+extern int tcp_port_rover;
+extern int sysctl_local_port_range[2];
+
+/* Exported by net/ipv4/udp.c */
+extern struct proto udp_prot;
+extern int udp_port_rover;
+
+/* Functions that are not exported */
+static int (*inet_create)(struct socket *sock, int protocol);
+static ssize_t (*inet_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+static void (*tcp_v4_hash)(struct sock *sk);
+static void (*tcp_v4_unhash)(struct sock *sk);
+static void (*udp_v4_hash)(struct sock *sk);
+static void (*udp_v4_unhash)(struct sock *sk);
+
+static int
+vnet_inet_create(struct socket *sock, int protocol)
+{
+ int ret;
+
+ if (sock->type == SOCK_RAW) {
+ /* Temporarily give CAP_NET_RAW to root VServer accounts */
+ if (current->euid)
+ return -EPERM;
+ cap_raise(current->cap_effective, CAP_NET_RAW);
+ }
+ ret = inet_create(sock, protocol);
+ if (sock->type == SOCK_RAW)
+ cap_lower(current->cap_effective, CAP_NET_RAW);
+ if (ret)
+ return ret;
+
+ if (sock->type == SOCK_RAW) {
+ struct sock *sk = sock->sk;
+ struct inet_opt *inet = inet_sk(sk);
+ /* Usually redundant and unused */
+ assert(inet->sport == htons(inet->num));
+ /* So we can track double raw binds */
+ inet->sport = 0;
+ }
+
+ return ret;
+}
+
+/* Make sure our bind table gets updated whenever the stack decides to
+ * unhash or rehash a socket.
+ */
+static void
+vnet_inet_unhash(struct sock *sk)
+{
+ struct inet_opt *inet = inet_sk(sk);
+ struct bind_key *key;
+
+ key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
+ if (key) {
+ dbg("vnet_inet_unhash:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
+ bind_del(key);
+ bind_put(key);
+ }
+
+ if (sk->sk_protocol == IPPROTO_TCP)
+ tcp_v4_unhash(sk);
+ else if (sk->sk_protocol == IPPROTO_UDP)
+ udp_v4_unhash(sk);
+}
+
+static void
+vnet_inet_hash(struct sock *sk)
+{
+ struct inet_opt *inet = inet_sk(sk);
+
+ if (bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk) == 0) {
+ dbg("vnet_inet_hash:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
+ }
+
+ if (sk->sk_protocol == IPPROTO_TCP)
+ tcp_v4_hash(sk);
+ else if (sk->sk_protocol == IPPROTO_UDP)
+ udp_v4_hash(sk);
+}
+
+/* Port reservation */
+static int
+vnet_inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct inet_opt *inet = inet_sk(sk);
+ struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
+ struct bind_key *key;
+ int ret;
+
+ /* Bind socket */
+ if ((ret = inet_bind(sock, uaddr, addr_len)))
+ return ret;
+
+ lock_sock(sk);
+
+ /* Backward compatibility with safe raw sockets */
+ if (sock->type == SOCK_RAW) {
+ /* Runt sockaddr */
+ if (addr_len < sizeof(struct sockaddr_in))
+ ret = -EINVAL;
+ /* Non-local bind */
+ else if (sin->sin_addr.s_addr != __constant_htonl(INADDR_ANY) && inet_addr_type(sin->sin_addr.s_addr) != RTN_LOCAL)
+ ret = -EINVAL;
+ /* Unspecified port */
+ else if (!sin->sin_port)
+ ret = -EINVAL;
+ /* Reserved port */
+ else if ((sk->sk_protocol == IPPROTO_TCP || sk->sk_protocol == IPPROTO_UDP) &&
+ ntohs(sin->sin_port) < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ ret = -EACCES;
+ /* Double bind */
+ else if (inet->sport)
+ ret = -EINVAL;
+ if (ret)
+ goto done;
+ inet->saddr = sin->sin_addr.s_addr;
+ inet->sport = sin->sin_port;
+ }
+
+ key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, NULL);
+ if (key) {
+ /*
+ * Allow the rebind only if we are root or own the already bound
+ * socket, and SO_REUSEADDR has been set on both sockets.
+ */
+ if ((get_sk_xid(sk) == 0 || get_sk_xid(sk) == get_sk_xid(key->sk)) &&
+ key->sk->sk_reuse && sk->sk_reuse) {
+ if (key->ip == __constant_htonl(INADDR_ANY)) {
+ /* Keep the current bind key */
+ bind_put(key);
+ goto done;
+ } else if (inet->saddr == __constant_htonl(INADDR_ANY)) {
+ /* Consider the port to be bound to this socket now */
+ bind_del(key);
+ }
+ }
+ bind_put(key);
+ }
+
+ if ((ret = bind_add(sk->sk_protocol, inet->saddr, inet->sport, sk)) == 0) {
+ dbg("vnet_inet_bind:%d: bound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
+ }
+
+ done:
+ release_sock(sk);
+ return ret;
+}
+
+/* Override TCP and UDP port rovers since they do not know about raw
+ * socket binds.
+ */
+static int
+vnet_autobind(struct sock *sk)
+{
+ int (*get_port)(struct sock *, unsigned short);
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ int port;
+ struct inet_opt *inet = inet_sk(sk);
+ struct bind_key *key;
+
+ /* Must be locked */
+ assert(sock_owned_by_user(sk));
+
+ /* Already bound to a port */
+ if (inet->num)
+ return 0;
+
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ get_port = tcp_prot.get_port;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+ /* Approximate the tcp_v4_get_port() strategy */
+ port = tcp_port_rover + 1;
+#else
+ /* Approximate the inet_csk_get_port() strategy */
+ port = net_random() % (high - low) + low;
+#endif
+ } else if (sk->sk_protocol == IPPROTO_UDP) {
+ get_port = udp_prot.get_port;
+ port = udp_port_rover;
+ } else if (sk->sk_prot->get_port) {
+ err("vnet_get_port:%d: %s unhandled\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol));
+ if (sk->sk_prot->get_port(sk, 0))
+ return -EAGAIN;
+ inet->sport = htons(inet->num);
+ return 0;
+ } else {
+ return 0;
+ }
+
+ dbg("vnet_autobind:%d: roving %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high);
+
+ /* Find a free port by linear search. Note that the standard
+ * udp_v4_get_port() function attempts to pick a port that
+ * keeps its hash tables balanced. If the UDP hash table keeps
+ * getting bombed, we should try implementing this strategy
+ * here.
+ */
+ do {
+ if (port < low || port > high)
+ port = low;
+
+ /* XXX We could probably try something more clever
+ * like checking to see if the bound socket is a
+ * regular TCP socket owned by the same context (or we
+ * are root) and, if so, letting tcp_v4_get_port()
+ * apply its fast reuse logic to determine if the port
+ * can be reused.
+ */
+ if (bind_add(sk->sk_protocol, inet->saddr, htons(port), sk)) {
+ dbg("vnet_get_port:%d: %s port %u.%u.%u.%u:%u already bound\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
+ goto next;
+ }
+
+ if (get_port(sk, port)) {
+ /* Can happen if we are unloaded when there are active sockets */
+ dbg("vnet_get_port:%d: failed to hash unbound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
+ key = bind_get(sk->sk_protocol, inet->saddr, htons(port), sk);
+ assert(key);
+ bind_del(key);
+ bind_put(key);
+ } else {
+ assert(port == inet->num);
+ inet->sport = htons(inet->num);
+ break;
+ }
+ next:
+ port++;
+ } while (--remaining > 0);
+
+ if (sk->sk_protocol == IPPROTO_UDP)
+ udp_port_rover = port;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+ else if (sk->sk_protocol == IPPROTO_TCP)
+ tcp_port_rover = port;
+#endif
+
+ if (remaining <= 0) {
+ err("vnet_get_port:%d: exhausted local %s port range %u.%u.%u.%u:%u-%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), low, high);
+ return -EAGAIN;
+ } else {
+ dbg("vnet_get_port:%d: autobound %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), port);
+ return 0;
+ }
+}
+
+static int
+vnet_inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ /* Duplicates checks in inet_stream_connect() */
+ if (uaddr->sa_family != AF_UNSPEC &&
+ sock->state == SS_UNCONNECTED &&
+ sk->sk_state == TCP_CLOSE) {
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ }
+
+ release_sock(sk);
+
+ return inet_stream_connect(sock, uaddr, addr_len, flags);
+}
+
+static int
+vnet_inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ /* Duplicates checks in inet_listen() */
+ if (sock->type == SOCK_STREAM &&
+ sock->state == SS_UNCONNECTED &&
+ sk->sk_state == TCP_CLOSE) {
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ }
+
+ release_sock(sk);
+
+ return inet_listen(sock, backlog);
+}
+
+static int
+vnet_inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ /* Duplicates checks in inet_dgram_connect() */
+ if (uaddr->sa_family != AF_UNSPEC) {
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ }
+
+ release_sock(sk);
+
+ return inet_dgram_connect(sock, uaddr, addr_len, flags);
+}
+
+static int
+vnet_inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+
+ release_sock(sk);
+
+ return inet_sendmsg(iocb, sock, msg, size);
+}
+
+static ssize_t
+vnet_inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && vnet_autobind(sk)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+
+ release_sock(sk);
+
+ return inet_sendpage(sock, page, offset, size, flags);
+}
+
+static int
+vnet_inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct inet_opt *inet = inet_sk(sk);
+ struct bind_key *key;
+
+ /* Partial socket created by accept() */
+ if (!sk)
+ goto done;
+
+ lock_sock(sk);
+
+ key = bind_get(sk->sk_protocol, inet->saddr, inet->sport, sk);
+ if (key) {
+ dbg("vnet_inet_release:%d: released %s port %u.%u.%u.%u:%u\n", get_sk_xid(sk),
+ print_protocol(sk->sk_protocol), NIPQUAD(inet->saddr), ntohs(inet->sport));
+ bind_del(key);
+ bind_put(key);
+ }
+
+ release_sock(sk);
+
+ done:
+ return inet_release(sock);
+}
+
+/* Sanity check: only swap an op if it still points at the function we expect */
+#define override_op(op, from, to) do { assert((op) == (from)); (op) = (to); } while (0)
+
+static int __init
+vnet_init(void)
+{
+ int ret;
+
+ /* Initialize bind table */
+ ret = bind_init();
+ if (ret < 0)
+ return ret;
+
+ /* Register /proc entries */
+ ret = proc_init();
+ if (ret < 0)
+ goto cleanup_bind;
+
+ /* Register dummy netdevice */
+ ret = packet_init();
+ if (ret < 0)
+ goto cleanup_proc;
+
+ /* Register tap netdevice */
+ ret = tun_init();
+ if (ret < 0)
+ goto cleanup_packet;
+
+ /* Get pointers to unexported functions */
+ inet_create = inet_family_ops.create;
+ inet_sendpage = inet_dgram_ops.sendpage;
+ tcp_v4_hash = tcp_prot.hash;
+ tcp_v4_unhash = tcp_prot.unhash;
+ udp_v4_hash = udp_prot.hash;
+ udp_v4_unhash = udp_prot.unhash;
+
+ /* Override PF_INET socket operations */
+ override_op(inet_family_ops.create, inet_create, vnet_inet_create);
+ override_op(inet_stream_ops.bind, inet_bind, vnet_inet_bind);
+ override_op(inet_stream_ops.connect, inet_stream_connect, vnet_inet_stream_connect);
+ override_op(inet_stream_ops.listen, inet_listen, vnet_inet_listen);
+ override_op(inet_stream_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+ override_op(inet_stream_ops.release, inet_release, vnet_inet_release);
+ override_op(inet_dgram_ops.bind, inet_bind, vnet_inet_bind);
+ override_op(inet_dgram_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
+ override_op(inet_dgram_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+ override_op(inet_dgram_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
+ override_op(inet_dgram_ops.release, inet_release, vnet_inet_release);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+ override_op(inet_sockraw_ops.bind, inet_bind, vnet_inet_bind);
+ override_op(inet_sockraw_ops.connect, inet_dgram_connect, vnet_inet_dgram_connect);
+ override_op(inet_sockraw_ops.sendmsg, inet_sendmsg, vnet_inet_sendmsg);
+ override_op(inet_sockraw_ops.sendpage, inet_sendpage, vnet_inet_sendpage);
+ override_op(inet_sockraw_ops.release, inet_release, vnet_inet_release);
+#endif
+ override_op(tcp_prot.hash, tcp_v4_hash, vnet_inet_hash);
+ override_op(tcp_prot.unhash, tcp_v4_unhash, vnet_inet_unhash);
+ override_op(udp_prot.hash, udp_v4_hash, vnet_inet_hash);
+ override_op(udp_prot.unhash, udp_v4_unhash, vnet_inet_unhash);
+
+ /* Register table */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)
+ ret = ipt_register_table(&vnet_table, &initial_table.repl);
+#else
+ ret = ipt_register_table(&vnet_table);
+#endif
+ if (ret < 0)
+ goto cleanup_override;
+
+ /* Register hooks */
+ ret = nf_register_hook(&vnet_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&vnet_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ /* Enables any runtime kernel support for VNET */
+ vnet_active = 1;
+
+ /* Print banner */
+ printk("VNET: version " VNET_VERSION " compiled on " __DATE__ " at " __TIME__ "\n");
+
+ return ret;
+
+ cleanup_hook0:
+ nf_unregister_hook(&vnet_ops[0]);
+ cleanup_table:
+ ipt_unregister_table(&vnet_table);
+ cleanup_override:
+ /* Undo every override applied above */
+ inet_family_ops.create = inet_create;
+ inet_stream_ops.bind = inet_bind;
+ inet_stream_ops.connect = inet_stream_connect;
+ inet_stream_ops.listen = inet_listen;
+ inet_stream_ops.sendmsg = inet_sendmsg;
+ inet_stream_ops.release = inet_release;
+ inet_dgram_ops.bind = inet_bind;
+ inet_dgram_ops.connect = inet_dgram_connect;
+ inet_dgram_ops.sendmsg = inet_sendmsg;
+ inet_dgram_ops.sendpage = inet_sendpage;
+ inet_dgram_ops.release = inet_release;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+ inet_sockraw_ops.bind = inet_bind;
+ inet_sockraw_ops.connect = inet_dgram_connect;
+ inet_sockraw_ops.sendmsg = inet_sendmsg;
+ inet_sockraw_ops.sendpage = inet_sendpage;
+ inet_sockraw_ops.release = inet_release;
+#endif
+ tcp_prot.hash = tcp_v4_hash;
+ tcp_prot.unhash = tcp_v4_unhash;
+ udp_prot.hash = udp_v4_hash;
+ udp_prot.unhash = udp_v4_unhash;
+ tun_cleanup();
+ cleanup_packet:
+ packet_cleanup();
+ cleanup_proc:
+ proc_cleanup();
+ cleanup_bind:
+ bind_cleanup();
+
+ return ret;
+}
+
+static void __exit
+vnet_exit(void)
+{
+ unsigned int i;
+
+ /* Print banner */
+ printk("VNET: exiting\n");
+
+ /* Disables any runtime kernel support for VNET */
+ vnet_active = 0;
+
+ /* Stop handling packets first */
+ for (i = 0; i < sizeof(vnet_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&vnet_ops[i]);
+
+ ipt_unregister_table(&vnet_table);
+
+ /* Stop handling PF_INET socket operations */
+ override_op(inet_family_ops.create, vnet_inet_create, inet_create);
+ override_op(inet_stream_ops.bind, vnet_inet_bind, inet_bind);
+ override_op(inet_stream_ops.connect, vnet_inet_stream_connect, inet_stream_connect);
+ override_op(inet_stream_ops.listen, vnet_inet_listen, inet_listen);
+ override_op(inet_stream_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+ override_op(inet_stream_ops.release, vnet_inet_release, inet_release);
+ override_op(inet_dgram_ops.bind, vnet_inet_bind, inet_bind);
+ override_op(inet_dgram_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
+ override_op(inet_dgram_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+ override_op(inet_dgram_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
+ override_op(inet_dgram_ops.release, vnet_inet_release, inet_release);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+ override_op(inet_sockraw_ops.bind, vnet_inet_bind, inet_bind);
+ override_op(inet_sockraw_ops.connect, vnet_inet_dgram_connect, inet_dgram_connect);
+ override_op(inet_sockraw_ops.sendmsg, vnet_inet_sendmsg, inet_sendmsg);
+ override_op(inet_sockraw_ops.sendpage, vnet_inet_sendpage, inet_sendpage);
+ override_op(inet_sockraw_ops.release, vnet_inet_release, inet_release);
+#endif
+ override_op(tcp_prot.hash, vnet_inet_hash, tcp_v4_hash);
+ override_op(tcp_prot.unhash, vnet_inet_unhash, tcp_v4_unhash);
+ override_op(udp_prot.hash, vnet_inet_hash, udp_v4_hash);
+ override_op(udp_prot.unhash, vnet_inet_unhash, udp_v4_unhash);
+
+ /* Disable tap netdevice */
+ tun_cleanup();
+
+ /* Disable vnet netdevice and stop handling PF_PACKET sockets */
+ packet_cleanup();
+
+ /* Unregister /proc handlers */
+ proc_cleanup();
+
+ /* Clean up the bind table (must be done after nf_unregister_hook()) */
+ bind_cleanup();
+}
+
+module_init(vnet_init);
+module_exit(vnet_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mark Huang <mlhuang@cs.princeton.edu>");
+MODULE_DESCRIPTION("VServer IP isolation");