Print OpenFlow ref. impl. and protocol versions at secchan startup time.

[sliver-openvswitch.git] / secchan / secchan.c
diff --git a/secchan/secchan.c b/secchan/secchan.c

index 50ebebb..3cb1006 100644 (file)
--- a/secchan/secchan.c
+++ b/secchan/secchan.c
@@ -40,6 +40,7 @@
  #include <poll.h>
  #include <regex.h>
  #include <stdlib.h>
+#include <signal.h>
  #include <string.h>
  #include <time.h>
  #include <unistd.h>
@@ -61,6 +62,7 @@
  #include "packets.h"
  #include "poll-loop.h"
  #include "rconn.h"
+#include "timeval.h"
  #include "util.h"
  #include "vconn-ssl.h"
  #include "vconn.h"
@@ -93,6 +95,10 @@ struct settings {
      int probe_interval;       /* # seconds idle before sending echo request. */
      int max_backoff;          /* Max # seconds between connection attempts. */
  
+    /* Packet-in rate-limiting. */
+    int rate_limit;           /* Tokens added to bucket per second. */
+    int burst_limit;          /* Maximum number token bucket size. */
+
      /* Discovery behavior. */
      regex_t accept_controller_regex;  /* Controller vconns to accept. */
      const char *accept_controller_re; /* String version of regex. */
@@ -102,6 +108,7 @@ struct settings {
  struct half {
      struct rconn *rconn;
      struct buffer *rxbuf;
+    int n_txq;                  /* No. of packets queued for tx on 'rconn'. */
  };
  
  struct relay {
@@ -117,6 +124,7 @@ struct relay {
  struct hook {
      bool (*packet_cb)(struct relay *, int half, void *aux);
      void (*periodic_cb)(void *aux);
+    void (*wait_cb)(void *aux);
      void *aux;
  };
  
@@ -132,6 +140,7 @@ static void relay_destroy(struct relay *);
  
  static struct hook make_hook(bool (*packet_cb)(struct relay *, int, void *),
                               void (*periodic_cb)(void *),
+                             void (*wait_cb)(void *),
                               void *aux);
  
  static struct discovery *discovery_init(const struct settings *);
@@ -143,6 +152,9 @@ static struct hook in_band_hook_create(const struct settings *);
  static struct hook fail_open_hook_create(const struct settings *,
                                           struct rconn *local,
                                           struct rconn *remote);
+static struct hook rate_limit_hook_create(const struct settings *,
+                                          struct rconn *local,
+                                          struct rconn *remote);
  
  static void modify_dhcp_request(struct dhcp_msg *, void *aux);
  static bool validate_dhcp_offer(const struct dhcp_msg *, void *aux);
@@ -154,7 +166,7 @@ main(int argc, char *argv[])
  
      struct list relays = LIST_INITIALIZER(&relays);
  
-    struct hook hooks[3];
+    struct hook hooks[4];
      size_t n_hooks;
  
      struct rconn *local_rconn, *remote_rconn;
@@ -165,8 +177,10 @@ main(int argc, char *argv[])
  
      set_program_name(argv[0]);
      register_fault_handlers();
+    time_init();
      vlog_init();
      parse_options(argc, argv, &s);
+    signal(SIGPIPE, SIG_IGN);
  
      /* Start listening for management connections. */
      if (s.listen_vconn_name) {
@@ -192,12 +206,15 @@ main(int argc, char *argv[])
  
      daemonize();
  
+    VLOG_WARN("OpenFlow reference implementation version %s", VERSION);
+    VLOG_WARN("OpenFlow protocol version 0x%02x", OFP_VERSION);
+
      /* Connect to datapath. */
-    local_rconn = rconn_create(1, 0, s.max_backoff);
+    local_rconn = rconn_create(0, s.max_backoff);
      rconn_connect(local_rconn, s.nl_name);
  
      /* Connect to controller. */
-    remote_rconn = rconn_create(1, s.probe_interval, s.max_backoff);
+    remote_rconn = rconn_create(s.probe_interval, s.max_backoff);
      if (s.controller_name) {
          retval = rconn_connect(remote_rconn, s.controller_name);
          if (retval == EAFNOSUPPORT) {
@@ -218,6 +235,10 @@ main(int argc, char *argv[])
          hooks[n_hooks++] = fail_open_hook_create(&s,
                                                   local_rconn, remote_rconn);
      }
+    if (s.rate_limit) {
+        hooks[n_hooks++] = rate_limit_hook_create(&s,
+                                                  local_rconn, remote_rconn);
+    }
      assert(n_hooks <= ARRAY_SIZE(hooks));
  
      for (;;) {
@@ -263,6 +284,11 @@ main(int argc, char *argv[])
          if (listen_vconn) {
              vconn_accept_wait(listen_vconn);
          }
+        for (i = 0; i < n_hooks; i++) {
+            if (hooks[i].wait_cb) {
+                hooks[i].wait_cb(hooks[i].aux);
+            }
+        }
          if (discovery) {
              discovery_wait(discovery);
          }
@@ -275,11 +301,13 @@ main(int argc, char *argv[])
  static struct hook
  make_hook(bool (*packet_cb)(struct relay *, int half, void *aux),
            void (*periodic_cb)(void *aux),
+          void (*wait_cb)(void *aux),
            void *aux)
  {
      struct hook h;
      h.packet_cb = packet_cb;
      h.periodic_cb = periodic_cb;
+    h.wait_cb = wait_cb;
      h.aux = aux;
      return h;
  }
@@ -319,11 +347,11 @@ relay_accept(const struct settings *s, struct vconn *listen_vconn)
      }
  
      /* Create and return relay. */
-    r1 = rconn_create(1, 0, 0);
+    r1 = rconn_create(0, 0);
      rconn_connect_unreliably(r1, nl_name_without_subscription, new_local);
      free(nl_name_without_subscription);
  
-    r2 = rconn_create(1, 0, 0);
+    r2 = rconn_create(0, 0);
      rconn_connect_unreliably(r2, "passive", new_remote);
  
      return relay_create(r1, r2, true);
@@ -332,15 +360,9 @@ relay_accept(const struct settings *s, struct vconn *listen_vconn)
  static struct relay *
  relay_create(struct rconn *local, struct rconn *remote, bool is_mgmt_conn)
  {
-    struct relay *r;
-    int i;
-
-    r = xmalloc(sizeof *r);
+    struct relay *r = xcalloc(1, sizeof *r);
      r->halves[HALF_LOCAL].rconn = local;
      r->halves[HALF_REMOTE].rconn = remote;
-    for (i = 0; i < 2; i++) {
-        r->halves[i].rxbuf = NULL;
-    }
      r->is_mgmt_conn = is_mgmt_conn;
      return r;
  }
@@ -377,8 +399,9 @@ relay_run(struct relay *r, const struct hook hooks[], size_t n_hooks)
                  }
              }
  
-            if (this->rxbuf) {
-                int retval = rconn_send(peer->rconn, this->rxbuf);
+            if (this->rxbuf && !this->n_txq) {
+                int retval = rconn_send(peer->rconn, this->rxbuf,
+                                        &this->n_txq);
                  if (retval != EAGAIN) {
                      if (!retval) {
                          progress = true;
@@ -441,19 +464,17 @@ struct in_band_data {
      struct mac_learning *ml;
      struct netdev *of_device;
      uint8_t mac[ETH_ADDR_LEN];
+    int n_queued;
  };
  
  static void
-queue_tx(struct rconn *rc, struct buffer *b)
+queue_tx(struct rconn *rc, struct in_band_data *in_band, struct buffer *b)
  {
-    if (rconn_force_send(rc, b)) {
-        buffer_delete(b);
-    }
+    rconn_send_with_limit(rc, b, &in_band->n_queued, 10);
  }
  
-static bool
-is_controller_mac(const uint8_t dl_addr[ETH_ADDR_LEN], struct netdev *netdev,
-                  struct rconn *controller)
+static const uint8_t *
+get_controller_mac(struct netdev *netdev, struct rconn *controller)
  {
      static uint32_t ip, last_nonzero_ip;
      static uint8_t mac[ETH_ADDR_LEN], last_nonzero_mac[ETH_ADDR_LEN];
@@ -461,7 +482,7 @@ is_controller_mac(const uint8_t dl_addr[ETH_ADDR_LEN], struct netdev *netdev,
  
      uint32_t last_ip = ip;
  
-    time_t now = time(0);
+    time_t now = time_now();
  
      ip = rconn_get_ip(controller);
      if (last_ip != ip || !next_refresh || now >= next_refresh) {
@@ -498,7 +519,14 @@ is_controller_mac(const uint8_t dl_addr[ETH_ADDR_LEN], struct netdev *netdev,
           * Otherwise, we can afford to wait a little while. */
          next_refresh = now + (!ip || have_mac ? 10 : 1);
      }
-    return !eth_addr_is_zero(mac) && eth_addr_equals(mac, dl_addr);
+    return !eth_addr_is_zero(mac) ? mac : NULL;
+}
+
+static bool
+is_controller_mac(const uint8_t mac[ETH_ADDR_LEN],
+                  const uint8_t *controller_mac)
+{
+    return controller_mac && eth_addr_equals(mac, controller_mac);
  }
  
  static bool
@@ -513,6 +541,7 @@ in_band_packet_cb(struct relay *r, int half, void *in_band_)
      struct buffer pkt;
      struct flow flow;
      uint16_t in_port, out_port;
+    const uint8_t *controller_mac;
  
      if (half != HALF_LOCAL || r->is_mgmt_conn) {
          return false;
@@ -537,9 +566,13 @@ in_band_packet_cb(struct relay *r, int half, void *in_band_)
      flow_extract(&pkt, in_port, &flow);
  
      /* Deal with local stuff. */
+    controller_mac = get_controller_mac(in_band->of_device,
+                                        r->halves[HALF_REMOTE].rconn);
      if (in_port == OFPP_LOCAL) {
+        /* Sent by secure channel. */
          out_port = mac_learning_lookup(in_band->ml, flow.dl_dst);
      } else if (eth_addr_equals(flow.dl_dst, in_band->mac)) {
+        /* Sent to secure channel. */
          out_port = OFPP_LOCAL;
          if (mac_learning_learn(in_band->ml, flow.dl_src, in_port)) {
              VLOG_DBG("learned that "ETH_ADDR_FMT" is on port %"PRIu16,
@@ -547,21 +580,30 @@ in_band_packet_cb(struct relay *r, int half, void *in_band_)
          }
      } else if (flow.dl_type == htons(ETH_TYPE_ARP)
                 && eth_addr_is_broadcast(flow.dl_dst)
-               && is_controller_mac(flow.dl_src, in_band->of_device,
-                                    r->halves[HALF_REMOTE].rconn)) {
+               && is_controller_mac(flow.dl_src, controller_mac)) {
+        /* ARP sent by controller. */
          out_port = OFPP_FLOOD;
+    } else if (is_controller_mac(flow.dl_dst, controller_mac)
+               && in_port == mac_learning_lookup(in_band->ml,
+                                                 controller_mac)) {
+        /* Drop controller traffic that arrives on the controller port. */
+        queue_tx(rc, in_band, make_add_flow(&flow, ntohl(opi->buffer_id),
+                                            in_band->s->max_idle, 0));
+        return true;
      } else {
          return false;
      }
  
      if (out_port != OFPP_FLOOD) {
          /* The output port is known, so add a new flow. */
-        queue_tx(rc, make_add_simple_flow(&flow, ntohl(opi->buffer_id),
-                                          out_port, in_band->s->max_idle));
+        queue_tx(rc, in_band,
+                 make_add_simple_flow(&flow, ntohl(opi->buffer_id),
+                                      out_port, in_band->s->max_idle));
  
          /* If the switch didn't buffer the packet, we need to send a copy. */
          if (ntohl(opi->buffer_id) == UINT32_MAX) {
-            queue_tx(rc, make_unbuffered_packet_out(&pkt, in_port, out_port));
+            queue_tx(rc, in_band,
+                     make_unbuffered_packet_out(&pkt, in_port, out_port));
          }
      } else {
          /* We don't know that MAC.  Send along the packet without setting up a
@@ -573,7 +615,7 @@ in_band_packet_cb(struct relay *r, int half, void *in_band_)
              b = make_buffered_packet_out(ntohl(opi->buffer_id),
                                           in_port, out_port);
          }
-        queue_tx(rc, b);
+        queue_tx(rc, in_band, b);
      }
      return true;
  }
@@ -584,7 +626,7 @@ in_band_hook_create(const struct settings *s)
      struct in_band_data *in_band;
      int retval;
  
-    in_band = xmalloc(sizeof *in_band);
+    in_band = xcalloc(1, sizeof *in_band);
      in_band->s = s;
      in_band->ml = mac_learning_create();
      retval = netdev_open(s->of_name, NETDEV_ETH_TYPE_NONE,
@@ -595,7 +637,7 @@ in_band_hook_create(const struct settings *s)
      memcpy(in_band->mac, netdev_get_etheraddr(in_band->of_device),
             ETH_ADDR_LEN);
  
-    return make_hook(in_band_packet_cb, NULL, in_band);
+    return make_hook(in_band_packet_cb, NULL, NULL, in_band);
  }
  \f
  /* Fail open support. */
@@ -660,7 +702,198 @@ fail_open_hook_create(const struct settings *s, struct rconn *local_rconn,
      fail_open->local_rconn = local_rconn;
      fail_open->remote_rconn = remote_rconn;
      fail_open->lswitch = NULL;
-    return make_hook(fail_open_packet_cb, fail_open_periodic_cb, fail_open);
+    return make_hook(fail_open_packet_cb, fail_open_periodic_cb, NULL,
+                     fail_open);
+}
+\f
+struct rate_limiter {
+    const struct settings *s;
+    struct rconn *remote_rconn;
+
+    /* One queue per physical port. */
+    struct queue queues[OFPP_MAX];
+    int n_queued;               /* Sum over queues[*].n. */
+    int next_tx_port;           /* Next port to check in round-robin. */
+
+    /* Token bucket.
+     *
+     * It costs 1000 tokens to send a single packet_in message.  A single token
+     * per message would be more straightforward, but this choice lets us avoid
+     * round-off error in refill_bucket()'s calculation of how many tokens to
+     * add to the bucket, since no division step is needed. */
+    long long int last_fill;    /* Time at which we last added tokens. */
+    int tokens;                 /* Current number of tokens. */
+
+    /* Transmission queue. */
+    int n_txq;                  /* No. of packets waiting in rconn for tx. */
+};
+
+/* Drop a packet from the longest queue in 'rl'. */
+static void
+drop_packet(struct rate_limiter *rl)
+{
+    struct queue *longest;      /* Queue currently selected as longest. */
+    int n_longest;              /* # of queues of same length as 'longest'. */
+    struct queue *q;
+
+    longest = &rl->queues[0];
+    n_longest = 1;
+    for (q = &rl->queues[0]; q < &rl->queues[OFPP_MAX]; q++) {
+        if (longest->n < q->n) {
+            longest = q;
+            n_longest = 1;
+        } else if (longest->n == q->n) {
+            n_longest++;
+
+            /* Randomly select one of the longest queues, with a uniform
+             * distribution (Knuth algorithm 3.4.2R). */
+            if (!random_range(n_longest)) {
+                longest = q;
+            }
+        }
+    }
+
+    /* FIXME: do we want to pop the tail instead? */
+    buffer_delete(queue_pop_head(longest));
+    rl->n_queued--;
+}
+
+/* Remove and return the next packet to transmit (in round-robin order). */
+static struct buffer *
+dequeue_packet(struct rate_limiter *rl)
+{
+    unsigned int i;
+
+    for (i = 0; i < OFPP_MAX; i++) {
+        unsigned int port = (rl->next_tx_port + i) % OFPP_MAX;
+        struct queue *q = &rl->queues[port];
+        if (q->n) {
+            rl->next_tx_port = (port + 1) % OFPP_MAX;
+            rl->n_queued--;
+            return queue_pop_head(q);
+        }
+    }
+    NOT_REACHED();
+}
+
+/* Add tokens to the bucket based on elapsed time. */
+static void
+refill_bucket(struct rate_limiter *rl)
+{
+    const struct settings *s = rl->s;
+    long long int now = time_msec();
+    long long int tokens = (now - rl->last_fill) * s->rate_limit + rl->tokens;
+    if (tokens >= 1000) {
+        rl->last_fill = now;
+        rl->tokens = MIN(tokens, s->burst_limit * 1000);
+    }
+}
+
+/* Attempts to remove enough tokens from 'rl' to transmit a packet.  Returns
+ * true if successful, false otherwise.  (In the latter case no tokens are
+ * removed.) */
+static bool
+get_token(struct rate_limiter *rl)
+{
+    if (rl->tokens >= 1000) {
+        rl->tokens -= 1000;
+        return true;
+    } else {
+        return false;
+    }
+}
+
+static bool
+rate_limit_packet_cb(struct relay *r, int half, void *rl_)
+{
+    struct rate_limiter *rl = rl_;
+    const struct settings *s = rl->s;
+    struct buffer *msg = r->halves[HALF_LOCAL].rxbuf;
+    struct ofp_header *oh;
+
+    if (half == HALF_REMOTE) {
+        return false;
+    }
+
+    oh = msg->data;
+    if (oh->type != OFPT_PACKET_IN) {
+        return false;
+    }
+    if (msg->size < offsetof(struct ofp_packet_in, data)) {
+        VLOG_WARN("packet too short (%zu bytes) for packet_in", msg->size);
+        return false;
+    }
+
+    if (!rl->n_queued && get_token(rl)) {
+        /* In the common case where we are not constrained by the rate limit,
+         * let the packet take the normal path. */
+        return false;
+    } else {
+        /* Otherwise queue it up for the periodic callback to drain out. */
+        struct ofp_packet_in *opi = msg->data;
+        int port = ntohs(opi->in_port) % OFPP_MAX;
+        if (rl->n_queued >= s->burst_limit) {
+            drop_packet(rl);
+        }
+        queue_push_tail(&rl->queues[port], buffer_clone(msg));
+        rl->n_queued++;
+        return true;
+    }
+}
+
+static void
+rate_limit_periodic_cb(void *rl_)
+{
+    struct rate_limiter *rl = rl_;
+    int i;
+
+    /* Drain some packets out of the bucket if possible, but limit the number
+     * of iterations to allow other code to get work done too. */
+    refill_bucket(rl);
+    for (i = 0; rl->n_queued && get_token(rl) && i < 50; i++) {
+        /* Use a small, arbitrary limit for the amount of queuing to do here,
+         * because the TCP connection is responsible for buffering and there is
+         * no point in trying to transmit faster than the TCP connection can
+         * handle. */
+        struct buffer *b = dequeue_packet(rl);
+        rconn_send_with_limit(rl->remote_rconn, b, &rl->n_txq, 10);
+    }
+}
+
+static void
+rate_limit_wait_cb(void *rl_)
+{
+    struct rate_limiter *rl = rl_;
+    if (rl->n_queued) {
+        if (rl->tokens >= 1000) {
+            /* We can transmit more packets as soon as we're called again. */
+            poll_immediate_wake();
+        } else {
+            /* We have to wait for the bucket to re-fill.  We could calculate
+             * the exact amount of time here for increased smoothness. */
+            poll_timer_wait(TIME_UPDATE_INTERVAL / 2);
+        }
+    }
+}
+
+static struct hook
+rate_limit_hook_create(const struct settings *s,
+                       struct rconn *local,
+                       struct rconn *remote)
+{
+    struct rate_limiter *rl;
+    size_t i;
+
+    rl = xcalloc(1, sizeof *rl);
+    rl->s = s;
+    rl->remote_rconn = remote;
+    for (i = 0; i < ARRAY_SIZE(rl->queues); i++) {
+        queue_init(&rl->queues[i]);
+    }
+    rl->last_fill = time_msec();
+    rl->tokens = s->rate_limit * 100;
+    return make_hook(rate_limit_packet_cb, rate_limit_periodic_cb,
+                     rate_limit_wait_cb, rl);
  }
  \f
  /* Controller discovery. */
@@ -782,7 +1015,9 @@ parse_options(int argc, char *argv[], struct settings *s)
          OPT_NO_RESOLV_CONF,
          OPT_INACTIVITY_PROBE,
          OPT_MAX_IDLE,
-        OPT_MAX_BACKOFF
+        OPT_MAX_BACKOFF,
+        OPT_RATE_LIMIT,
+        OPT_BURST_LIMIT
      };
      static struct option long_options[] = {
          {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN},
@@ -792,6 +1027,8 @@ parse_options(int argc, char *argv[], struct settings *s)
          {"max-idle",    required_argument, 0, OPT_MAX_IDLE},
          {"max-backoff", required_argument, 0, OPT_MAX_BACKOFF},
          {"listen",      required_argument, 0, 'l'},
+        {"rate-limit",  optional_argument, 0, OPT_RATE_LIMIT},
+        {"burst-limit", required_argument, 0, OPT_BURST_LIMIT},
          {"detach",      no_argument, 0, 'D'},
          {"pidfile",     optional_argument, 0, 'P'},
          {"verbose",     optional_argument, 0, 'v'},
@@ -811,6 +1048,8 @@ parse_options(int argc, char *argv[], struct settings *s)
      s->probe_interval = 15;
      s->max_backoff = 15;
      s->update_resolv_conf = true;
+    s->rate_limit = 0;
+    s->burst_limit = 0;
      for (;;) {
          int c;
  
@@ -867,6 +1106,24 @@ parse_options(int argc, char *argv[], struct settings *s)
              }
              break;
  
+        case OPT_RATE_LIMIT:
+            if (optarg) {
+                s->rate_limit = atoi(optarg);
+                if (s->rate_limit < 1) {
+                    fatal(0, "--rate-limit argument must be at least 1");
+                }
+            } else {
+                s->rate_limit = 1000;
+            }
+            break;
+
+        case OPT_BURST_LIMIT:
+            s->burst_limit = atoi(optarg);
+            if (s->burst_limit < 1) {
+                fatal(0, "--burst-limit argument must be at least 1");
+            }
+            break;
+
          case 'D':
              set_detach();
              break;
@@ -960,6 +1217,19 @@ parse_options(int argc, char *argv[], struct settings *s)
  
          netdev_close(netdev);
      }
+
+    /* Rate limiting. */
+    if (s->rate_limit) {
+        if (s->rate_limit < 100) {
+            VLOG_WARN("Rate limit set to unusually low value %d",
+                      s->rate_limit);
+        }
+        if (!s->burst_limit) {
+            s->burst_limit = s->rate_limit / 4;
+        }
+        s->burst_limit = MAX(s->burst_limit, 1);
+        s->burst_limit = MIN(s->burst_limit, INT_MAX / 1000);
+    }
  }
  
  static void
@@ -985,6 +1255,9 @@ usage(void)
             "                          attempts (default: 15 seconds)\n"
             "  -l, --listen=METHOD     allow management connections on METHOD\n"
             "                          (a passive OpenFlow connection method)\n"
+           "\nRate-limiting of \"packet-in\" messages to the controller:\n"
+           "  --rate-limit[=PACKETS]  max rate, in packets/s (default: 1000)\n"
+           "  --burst-limit=BURST     limit on packet credit for idle time\n"
             "\nOther options:\n"
             "  -D, --detach            run in background as daemon\n"
             "  -P, --pidfile[=FILE]    create pidfile (default: %s/secchan.pid)\n"