bond: Bonds never sleep if carrier changes.

[sliver-openvswitch.git] / lib / cfm.c
diff --git a/lib/cfm.c b/lib/cfm.c

index f9b3488..a605c09 100644 (file)
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -45,10 +45,10 @@ struct cfm_internal {
  
      struct timer tx_timer;    /* Send CCM when expired. */
      struct timer fault_timer; /* Check for faults when expired. */
-
-    long long x_recv_time;
  };
  
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
  static int
  ccm_interval_to_ms(uint8_t interval)
  {
@@ -67,6 +67,20 @@ ccm_interval_to_ms(uint8_t interval)
      NOT_REACHED();
  }
  
+static long long int
+cfm_fault_interval(struct cfm_internal *cfmi)
+{
+    /* Returns the interval, in milliseconds, between fault checks.
+     *
+     * According to the 802.1ag specification we should assume every other MP
+     * with the same MAID has the same transmission interval that we have.  A
+     * CCM with a different interval is only logged by cfm_process_heartbeat()
+     * (it is not treated as a fault), so we can check all MPs at once.
+     *
+     * The specification says to check every (ccm_interval_ms * 3.5)ms. */
+    return (cfmi->ccm_interval_ms * 7) / 2;
+}
+
  static uint8_t
  ms_to_ccm_interval(int interval_ms)
  {
@@ -115,8 +129,9 @@ lookup_remote_mp(const struct hmap *hmap, uint16_t mpid)
  }
  
  /* Allocates a 'cfm' object.  This object should have its 'mpid', 'maid',
- * 'eth_src', and 'interval' filled out.  When changes are made to the 'cfm'
- * object, cfm_configure should be called before using it. */
+ * 'eth_src', and 'interval' filled out.  cfm_configure() should be called
+ * whenever changes are made to 'cfm', and before cfm_run() is called for the
+ * first time. */
  struct cfm *
  cfm_create(void)
  {
@@ -125,7 +140,6 @@ cfm_create(void)
  
      cfmi = xzalloc(sizeof *cfmi);
      cfm  = &cfmi->cfm;
-    cfmi->x_recv_time = LLONG_MIN;
  
      hmap_init(&cfm->remote_mps);
      return cfm;
@@ -154,37 +168,29 @@ cfm_destroy(struct cfm *cfm)
  void
  cfm_run(struct cfm *cfm)
  {
-    long long now = time_msec();
      struct cfm_internal *cfmi = cfm_to_internal(cfm);
-    long long fault_interval;
  
-    /* According to the 802.1ag specification we should assume every other MP
-     * with the same MAID has the same transmission interval that we have.  If
-     * an MP has a different interval, cfm_process_heartbeat will register it
-     * as a fault (likely due to a configuration error).  Thus we can check all
-     * MPs at once making this quite a bit simpler.
-     *
-     * According to the specification we should check when (ccm_interval_ms *
-     * 3.5)ms have passed. */
-    fault_interval = (cfmi->ccm_interval_ms * 7) / 2;
      if (timer_expired(&cfmi->fault_timer)) {
-        bool fault;
+        long long int interval = cfm_fault_interval(cfmi);
          struct remote_mp *rmp;
  
-        fault = now < cfmi->x_recv_time + fault_interval;
-
+        cfm->fault = false; /* Recomputed from the RMP flags below. */
          HMAP_FOR_EACH (rmp, node, &cfm->remote_mps) {
-            if (timer_expired_at(&cfmi->fault_timer, rmp->recv_time)) {
-                rmp->fault = true;
-            }
+            rmp->fault = !rmp->recv; /* No CCM since the last check. */
+            rmp->recv = false;       /* Cleared until the next CCM. */
  
              if (rmp->fault) {
-                fault = true;
+                cfm->fault = true;
+                VLOG_DBG("No CCM from RMP %"PRIu16" in the last %lldms",
+                         rmp->mpid, interval);
              }
          }
  
-        cfm->fault = fault;
-        timer_set_duration(&cfmi->fault_timer, fault_interval);
+        if (!cfm->fault) {
+            VLOG_DBG("All RMPs received CCMs in the last %lldms", interval);
+        }
+
+        timer_set_duration(&cfmi->fault_timer, interval);
      }
  }
  
@@ -243,9 +249,8 @@ cfm_configure(struct cfm *cfm)
          cfmi->ccm_interval = interval;
          cfmi->ccm_interval_ms = ccm_interval_to_ms(interval);
  
-        /* Force a resend and check in case anything changed. */
          timer_set_expired(&cfmi->tx_timer);
-        timer_set_expired(&cfmi->fault_timer);
+        timer_set_duration(&cfmi->fault_timer, cfm_fault_interval(cfmi));
      }
  
      return true;
@@ -354,9 +359,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
      uint8_t ccm_interval;
      struct remote_mp *rmp;
      struct eth_header *eth;
-
-    struct cfm_internal *cfmi        = cfm_to_internal(cfm);
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+    struct cfm_internal *cfmi = cfm_to_internal(cfm);
  
      eth = p->l2;
      ccm = ofpbuf_at(p, (uint8_t *)p->l3 - (uint8_t *)p->data, CCM_LEN);
@@ -372,9 +375,16 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
          return;
      }
  
+    /* According to the 802.1ag specification, reception of a CCM with an
+     * incorrect ccm_interval, unexpected MAID, or unexpected MPID should
+     * trigger a fault.  We ignore this requirement for several reasons.
+     *
+     * Faults can cause a controller or Open vSwitch to make potentially
+     * expensive changes to the network topology.  It seems prudent to trigger
+     * them judiciously, especially when CFM is used to check slave status of
+     * bonds. Furthermore, faults can be maliciously triggered by crafting
+     * invalid CCMs. */
      if (memcmp(ccm->maid, cfm->maid, sizeof ccm->maid)) {
-        cfmi->x_recv_time = time_msec();
-        cfm->fault = true;
          VLOG_WARN_RL(&rl, "Received unexpected remote MAID from MAC "
                       ETH_ADDR_FMT, ETH_ADDR_ARGS(eth->eth_src));
      } else {
@@ -384,15 +394,20 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
          rmp = lookup_remote_mp(&cfm->remote_mps, ccm_mpid);
  
          if (rmp) {
-            rmp->recv_time = time_msec();
-            rmp->fault = ccm_interval != cfmi->ccm_interval;
-            cfm->fault = rmp->fault || cfm->fault;
+            rmp->recv = true;
+
+            if (ccm_interval != cfmi->ccm_interval) {
+                VLOG_WARN_RL(&rl, "received a CCM with an invalid interval"
+                             " (%"PRIu8") from RMP %"PRIu16, ccm_interval,
+                             rmp->mpid);
+            }
          } else {
-            cfmi->x_recv_time = time_msec();
-            cfm->fault = true;
              VLOG_WARN_RL(&rl, "Received unexpected remote MPID %d from MAC "
                           ETH_ADDR_FMT, ccm_mpid, ETH_ADDR_ARGS(eth->eth_src));
          }
+
+        VLOG_DBG("Received CCM (mpid %"PRIu16") (interval %"PRIu8")", ccm_mpid,
+                 ccm_interval);
      }
  }
  
@@ -400,7 +415,6 @@ void
  cfm_dump_ds(const struct cfm *cfm, struct ds *ds)
  {
      const struct cfm_internal *cfmi = cfm_to_internal(cfm);
-    long long int now = time_msec();
      struct remote_mp *rmp;
  
      ds_put_format(ds, "MPID %"PRIu16": %s\n", cfm->mpid,
@@ -412,16 +426,11 @@ cfm_dump_ds(const struct cfm *cfm, struct ds *ds)
      ds_put_format(ds, "\tnext fault check: %lldms\n",
                    timer_msecs_until_expired(&cfmi->fault_timer));
  
-    if (cfmi->x_recv_time != LLONG_MIN) {
-        ds_put_format(ds, "\ttime since bad CCM rx: %lldms\n",
-                      now - cfmi->x_recv_time);
-    }
-
      ds_put_cstr(ds, "\n");
      HMAP_FOR_EACH (rmp, node, &cfm->remote_mps) {
          ds_put_format(ds, "Remote MPID %"PRIu16": %s\n", rmp->mpid,
                        rmp->fault ? "fault" : "");
-        ds_put_format(ds, "\ttime since CCM rx: %lldms\n",
-                      time_msec() - rmp->recv_time);
+        ds_put_format(ds, "\trecv since check: %s",
+                      rmp->recv ? "true" : "false");
      }
  }