2 * Copyright (c) 2008, 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <arpa/inet.h>
27 #include "mac-learning.h"
29 #include "xflow-util.h"
30 #include "ofp-print.h"
33 #include "openflow/openflow.h"
34 #include "openvswitch/xflow.h"
36 #include "poll-loop.h"
44 #define THIS_MODULE VLM_in_band
47 /* In-band control allows a single network to be used for OpenFlow
48 * traffic and other data traffic. Refer to ovs-vswitchd.conf(5) and
49 * secchan(8) for a description of configuring in-band control.
51 * This comment is an attempt to describe how in-band control works at a
52 * wire- and implementation-level. Correctly implementing in-band
53 * control has proven difficult due to its many subtleties, and has thus
54 * gone through many iterations. Please read through and understand the
55 * reasoning behind the chosen rules before making modifications.
57 * In Open vSwitch, in-band control is implemented as "hidden" flows (in
58 * that they are not visible through OpenFlow) and at a higher priority
59 * than wildcarded flows can be set up by the controller. This is done
60 * so that the controller cannot interfere with them and possibly break
61 * connectivity with its switches. It is possible to see all flows,
62 * including in-band ones, with the ovs-appctl "bridge/dump-flows" command.
65 * The following rules are always enabled with the "normal" action by a
66 * switch with in-band control:
68 * a. DHCP requests sent from the local port.
69 * b. ARP replies to the local port's MAC address.
70 * c. ARP requests from the local port's MAC address.
71 * d. ARP replies to the remote side's MAC address. Note that the
72 * remote side is either the controller or the gateway to reach the controller.
74 * e. ARP requests from the remote side's MAC address. Note that
75 * like (d), the MAC is either for the controller or gateway.
76 * f. ARP replies containing the controller's IP address as a target.
77 * g. ARP requests containing the controller's IP address as a source.
78 * h. OpenFlow (6633/tcp) traffic to the controller's IP.
79 * i. OpenFlow (6633/tcp) traffic from the controller's IP.
81 * The goal of these rules is to be as narrow as possible to allow a
82 * switch to join a network and be able to communicate with a
83 * controller. As mentioned earlier, these rules have higher priority
84 * than the controller's rules, so if they are too broad, they may
85 * prevent the controller from implementing its policy. As such,
86 * in-band actively monitors some aspects of flow and packet processing
87 * so that the rules can be made more precise.
89 * In-band control monitors attempts to add flows into the datapath that
90 * could interfere with its duties. The datapath only allows exact
91 * match entries, so in-band control is able to be very precise about
92 * the flows it prevents. Flows that miss in the datapath are sent to
93 * userspace to be processed, so preventing these flows from being
94 * cached in the "fast path" does not affect correctness. The only type
95 * of flow that is currently prevented is one that would prevent DHCP
96 * replies from being seen by the local port. For example, a rule that
97 * forwarded all DHCP traffic to the controller would not be allowed,
98 * but one that forwarded to all ports (including the local port) would.
100 * As mentioned earlier, packets that miss in the datapath are sent to
101 * the userspace for processing. The userspace has its own flow table,
102 * the "classifier", so in-band checks whether any special processing
103 * is needed before the classifier is consulted. If a packet is a DHCP
104 * response to a request from the local port, the packet is forwarded to
105 * the local port, regardless of the flow table. Note that this requires
106 * L7 processing of DHCP replies to determine whether the 'chaddr' field
107 * matches the MAC address of the local port.
109 * It is interesting to note that for an L3-based in-band control
110 * mechanism, the majority of rules are devoted to ARP traffic. At first
111 * glance, some of these rules appear redundant. However, each serves an
112 * important role. First, in order to determine the MAC address of the
113 * remote side (controller or gateway) for other ARP rules, we must allow
114 * ARP traffic for our local port with rules (b) and (c). If we are
115 * between a switch and its connection to the controller, we have to
116 * allow the other switch's ARP traffic through. This is done with
117 * rules (d) and (e), since we do not know the addresses of the other
118 * switches a priori, but do know the controller's or gateway's. Finally,
119 * if the controller is running in a local guest VM that is not reached
120 * through the local port, the switch that is connected to the VM must
121 * allow ARP traffic based on the controller's IP address, since it will
122 * not know the MAC address of the local port that is sending the traffic
123 * or the MAC address of the controller in the guest VM.
125 * With a few notable exceptions below, in-band should work in most
126 * network setups. The following are considered "supported" in the
127 * current implementation:
129 * - Locally Connected. The switch and controller are on the same
130 * subnet. This uses rules (a), (b), (c), (h), and (i).
132 * - Reached through Gateway. The switch and controller are on
133 * different subnets and must go through a gateway. This uses
134 * rules (a), (b), (c), (h), and (i).
136 * - Between Switch and Controller. This switch is between another
137 * switch and the controller, and we want to allow the other
138 * switch's traffic through. This uses rules (d), (e), (h), and
139 * (i). It uses (b) and (c) indirectly in order to know the MAC
140 * address for rules (d) and (e). Note that DHCP for the other
141 * switch will not work unless the controller explicitly lets this
142 * switch pass the traffic.
144 * - Between Switch and Gateway. This switch is between another
145 * switch and the gateway, and we want to allow the other switch's
146 * traffic through. This uses the same rules and logic as the
147 * "Between Switch and Controller" configuration described earlier.
149 * - Controller on Local VM. The controller is a guest VM on the
150 * system running in-band control. This uses rules (a), (b), (c), (h), and (i).
153 * - Controller on Local VM with Different Networks. The controller
154 * is a guest VM on the system running in-band control, but the
155 * local port is not used to connect to the controller. For
156 * example, an IP address is configured on eth0 of the switch. The
157 * controller's VM is connected through eth1 of the switch, but an
158 * IP address has not been configured for that port on the switch.
159 * As such, the switch will use eth0 to connect to the controller,
160 * and eth1's rules about the local port will not work. In the
161 * example, the switch attached to eth0 would use rules (a), (b),
162 * (c), (h), and (i) on eth0. The switch attached to eth1 would use
163 * rules (f), (g), (h), and (i).
165 * The following are explicitly *not* supported by in-band control:
167 * - Specify Controller by Name. Currently, the controller must be
168 * identified by IP address. A naive approach would be to permit
169 * all DNS traffic. Unfortunately, this would prevent the
170 * controller from defining any policy over DNS. Since switches
171 * that are located behind us need to connect to the controller,
172 * in-band cannot simply add a rule that allows DNS traffic from
173 * the local port. The "correct" way to support this is to parse
174 * DNS requests to allow all traffic related to a request for the
175 * controller's name through. Due to the potential security
176 * problems and amount of processing, we decided to hold off for the time being.
179 * - Multiple Controllers. There is nothing intrinsic in the high-
180 * level design that prevents using multiple (known) controllers,
181 * however, the current implementation's data structures assume only one.
184 * - Differing Controllers for Switches. All switches must know
185 * the L3 addresses for all the controllers that other switches
186 * may use, since rules need to be set up to allow traffic related
187 * to those controllers through. See rules (f), (g), (h), and (i).
189 * - Differing Routes for Switches. In order for the switch to
190 * allow other switches to connect to a controller through a
191 * gateway, it allows the gateway's traffic through with rules (d)
192 * and (e). If the routes to the controller differ for the two
193 * switches, we will not know the MAC address of the alternate gateway.
197 #define IB_BASE_PRIORITY 18181800
/* Indexes of the individual in-band rules; in_band_run() passes them as the
 * 'rule_idx' argument of set_up_flow() and drop_flow(), which use them to
 * index 'rules[]' in struct in_band.  The letters match rules (a)-(i) in the
 * design comment.  set_up_flow() computes each rule's priority as
 * IB_BASE_PRIORITY + (N_IB_RULES - rule_idx), so entries listed earlier get a
 * higher priority. */
200 IBR_FROM_LOCAL_DHCP, /* (a) From local port, DHCP. */
201 IBR_TO_LOCAL_ARP, /* (b) To local port, ARP. */
202 IBR_FROM_LOCAL_ARP, /* (c) From local port, ARP. */
203 IBR_TO_REMOTE_ARP, /* (d) To remote MAC, ARP. */
204 IBR_FROM_REMOTE_ARP, /* (e) From remote MAC, ARP. */
205 IBR_TO_CTL_ARP, /* (f) To controller IP, ARP. */
206 IBR_FROM_CTL_ARP, /* (g) From controller IP, ARP. */
207 IBR_TO_CTL_OFP, /* (h) To controller, OpenFlow port. */
208 IBR_FROM_CTL_OFP, /* (i) From controller, OpenFlow port. */
/* Rules (h) and (i) match only OFP_TCP_PORT, so the build is made to fail if
 * the SSL port ever differs from it. */
209 #if OFP_TCP_PORT != OFP_SSL_PORT
210 #error Need to support separate TCP and SSL flows.
/* State kept for in-band control: the cached local and remote addresses, the
 * times at which they are next refreshed, and the rules installed from them. */
221 struct ofproto *ofproto; /* Where in-band rules are added and deleted. */
222 struct rconn *controller; /* Queried for the controller's IP address. */
223 struct status_category *ss_cat; /* "in-band" status category. */
225 /* Keep track of local port's information. */
226 uint8_t local_mac[ETH_ADDR_LEN]; /* Current MAC. */
227 struct netdev *local_netdev; /* Local port's network device. */
228 time_t next_local_refresh; /* Earliest time to re-read 'local_mac'. */
230 /* Keep track of controller and next hop's information. */
231 uint32_t controller_ip; /* Controller IP, 0 if unknown. */
232 uint8_t remote_mac[ETH_ADDR_LEN]; /* Remote MAC. */
233 struct netdev *remote_netdev; /* Next hop's device; NULL until opened. */
234 uint8_t last_remote_mac[ETH_ADDR_LEN]; /* Previous remote MAC. */
235 time_t next_remote_refresh; /* Earliest time to recompute 'remote_mac'. */
237 /* Rules that we set up. */
238 struct ib_rule rules[N_IB_RULES];
241 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
/* Returns the cached MAC address of the "remote side" (the next hop toward the
 * controller), or NULL if 'ib->remote_mac' is all-zero.  The returned pointer
 * aliases 'ib->remote_mac'.
 *
 * The cache is recomputed only once 'ib->next_remote_refresh' has been
 * reached: the route to 'ib->controller_ip' is looked up with
 * netdev_get_next_hop(), 'ib->remote_netdev' is (re)opened when it is unset or
 * its name differs from the next-hop device, and the next hop's MAC is then
 * looked up with netdev_arp_lookup().  A change from the previously seen MAC
 * is logged and recorded in 'ib->last_remote_mac'. */
243 static const uint8_t *
244 get_remote_mac(struct in_band *ib)
248 struct in_addr c_in4; /* Controller's IP address. */
249 struct in_addr r_in4; /* Next hop IP address. */
251 time_t now = time_now();
253 if (now >= ib->next_remote_refresh) {
254 /* Find the next-hop IP address. */
255 c_in4.s_addr = ib->controller_ip;
/* Start from an all-zero MAC, which the return statement at the end of this
 * function reports as NULL. */
256 memset(ib->remote_mac, 0, sizeof ib->remote_mac);
257 retval = netdev_get_next_hop(ib->local_netdev,
258 &c_in4, &r_in4, &next_hop_dev);
260 VLOG_WARN("cannot find route for controller ("IP_FMT"): %s",
261 IP_ARGS(&ib->controller_ip), strerror(retval));
/* No route: try again at now + 1. */
262 ib->next_remote_refresh = now + 1;
266 r_in4.s_addr = c_in4.s_addr;
269 /* Get the next-hop IP and network device. */
270 if (!ib->remote_netdev
271 || strcmp(netdev_get_name(ib->remote_netdev), next_hop_dev))
273 netdev_close(ib->remote_netdev);
275 retval = netdev_open_default(next_hop_dev, &ib->remote_netdev);
277 VLOG_WARN_RL(&rl, "cannot open netdev %s (next hop "
278 "to controller "IP_FMT"): %s",
279 next_hop_dev, IP_ARGS(&ib->controller_ip),
/* Could not open the next-hop device: try again at now + 1. */
281 ib->next_remote_refresh = now + 1;
286 /* Look up the MAC address of the next-hop IP address. */
287 retval = netdev_arp_lookup(ib->remote_netdev, r_in4.s_addr,
290 VLOG_DBG_RL(&rl, "cannot look up remote MAC address ("IP_FMT"): %s",
291 IP_ARGS(&r_in4.s_addr), strerror(retval));
293 have_mac = !eth_addr_is_zero(ib->remote_mac);
296 && !eth_addr_equals(ib->last_remote_mac, ib->remote_mac)) {
297 VLOG_DBG("remote MAC address changed from "ETH_ADDR_FMT" to "
299 ETH_ADDR_ARGS(ib->last_remote_mac),
300 ETH_ADDR_ARGS(ib->remote_mac));
301 memcpy(ib->last_remote_mac, ib->remote_mac, ETH_ADDR_LEN);
304 /* Schedule next refresh.
306 * If we have an IP address but not a MAC address, then refresh
307 * quickly, since we probably will get a MAC address soon (via ARP).
308 * Otherwise, we can afford to wait a little while. */
309 ib->next_remote_refresh
310 = now + (!ib->controller_ip || have_mac ? 10 : 1);
313 return !eth_addr_is_zero(ib->remote_mac) ? ib->remote_mac : NULL;
/* Returns the cached Ethernet address of the local port, or NULL if
 * 'ib->local_mac' is all-zero.  The returned pointer aliases 'ib->local_mac'.
 *
 * Once 'ib->next_local_refresh' has been reached the address is re-read from
 * 'ib->local_netdev' and the next refresh is scheduled for now + 1.  The cache
 * is overwritten only when netdev_get_etheraddr() succeeds, so a failed query
 * leaves the previously cached value in place. */
316 static const uint8_t *
317 get_local_mac(struct in_band *ib)
319 time_t now = time_now();
320 if (now >= ib->next_local_refresh) {
321 uint8_t ea[ETH_ADDR_LEN];
322 if (ib->local_netdev && !netdev_get_etheraddr(ib->local_netdev, ea)) {
323 memcpy(ib->local_mac, ea, ETH_ADDR_LEN);
325 ib->next_local_refresh = now + 1;
327 return !eth_addr_is_zero(ib->local_mac) ? ib->local_mac : NULL;
/* Status callback registered by in_band_create() for the "in-band" category.
 * 'in_band_' is the struct in_band that was passed at registration.  Adds a
 * "local-mac" entry and a "remote-mac" entry to 'sr', each only when the
 * corresponding cached address is not all-zero. */
331 in_band_status_cb(struct status_reply *sr, void *in_band_)
333 struct in_band *in_band = in_band_;
335 if (!eth_addr_is_zero(in_band->local_mac)) {
336 status_reply_put(sr, "local-mac="ETH_ADDR_FMT,
337 ETH_ADDR_ARGS(in_band->local_mac));
340 if (!eth_addr_is_zero(in_band->remote_mac)) {
341 status_reply_put(sr, "remote-mac="ETH_ADDR_FMT,
342 ETH_ADDR_ARGS(in_band->remote_mac));
/* Removes in-band rule 'rule_idx' from ofproto if it is currently marked as
 * installed, clearing the mark first.  Does nothing for a rule that is not
 * marked installed. */
347 drop_flow(struct in_band *in_band, int rule_idx)
349 struct ib_rule *rule = &in_band->rules[rule_idx];
351 if (rule->installed) {
352 rule->installed = false;
353 ofproto_delete_flow(in_band->ofproto, &rule->flow);
/* Ensures that in-band rule 'rule_idx' is installed in ofproto.  The rule
 * matches 'flow' on exactly the OFPFW_* fields named in 'fixed_fields' (all
 * other fields are wildcarded) and carries a single output action to
 * 'out_port' with a max_len of 0.  Any previously installed version of the
 * rule is dropped before the new one is added.
 *
 * NOTE(review): the reinstall check memcmp()s the caller's 'flow' against
 * 'rule->flow', but 'rule->flow.wildcards' and 'rule->flow.priority' are
 * overwritten below, while the callers visible in this file pass a 'flow'
 * that was zeroed with memset() and never set those two fields.  If they lie
 * within the compared bytes the check can never match and the rule is deleted
 * and re-added on every call -- confirm against the definition of flow_t. */
357 /* out_port and fixed_fields are assumed never to change. */
359 set_up_flow(struct in_band *in_band, int rule_idx, const flow_t *flow,
360 uint32_t fixed_fields, uint16_t out_port)
362 struct ib_rule *rule = &in_band->rules[rule_idx];
364 if (!rule->installed || memcmp(flow, &rule->flow, sizeof *flow)) {
365 union ofp_action action;
367 drop_flow(in_band, rule_idx);
369 rule->installed = true;
/* Wildcard everything except the fields the caller asked to match. */
371 rule->flow.wildcards = OFPFW_ALL & ~fixed_fields;
/* Lower rule indexes get a higher priority. */
372 rule->flow.priority = IB_BASE_PRIORITY + (N_IB_RULES - rule_idx);
374 action.type = htons(OFPAT_OUTPUT);
375 action.output.len = htons(sizeof action);
376 action.output.port = htons(out_port);
377 action.output.max_len = htons(0);
378 ofproto_add_flow(in_band->ofproto, &rule->flow, &action, 1, 0);
/* Periodic maintenance for in-band control.  Reads the controller's current
 * IP address from the rconn (logging when a previously known address
 * changes), refreshes the cached remote and local MAC addresses through
 * get_remote_mac() and get_local_mac(), and then calls set_up_flow() for the
 * rules that can be built from those addresses and drop_flow() for the rest:
 *
 *   - local MAC:     rules (a), (b) and (c);
 *   - remote MAC:    rules (d) and (e);
 *   - controller IP: rules (f), (g), (h) and (i).
 *
 * NOTE(review): the test on the two refresh deadlines at the top presumably
 * makes the function return early when neither is due, and each
 * set_up_flow()/drop_flow() group is presumably selected by whether the
 * corresponding address is known -- confirm both. */
383 in_band_run(struct in_band *in_band)
385 time_t now = time_now();
386 uint32_t controller_ip;
387 const uint8_t *remote_mac;
388 const uint8_t *local_mac;
391 if (now < in_band->next_remote_refresh
392 && now < in_band->next_local_refresh) {
396 controller_ip = rconn_get_remote_ip(in_band->controller);
397 if (in_band->controller_ip && controller_ip != in_band->controller_ip) {
398 VLOG_DBG("controller IP address changed from "IP_FMT" to "IP_FMT,
399 IP_ARGS(&in_band->controller_ip),
400 IP_ARGS(&controller_ip));
402 in_band->controller_ip = controller_ip;
404 remote_mac = get_remote_mac(in_band);
405 local_mac = get_local_mac(in_band);
408 /* Allow DHCP requests to be sent from the local port. */
409 memset(&flow, 0, sizeof flow);
410 flow.in_port = OFPP_LOCAL;
411 flow.dl_type = htons(ETH_TYPE_IP);
412 memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN);
413 flow.nw_proto = IP_TYPE_UDP;
414 flow.tp_src = htons(DHCP_CLIENT_PORT);
415 flow.tp_dst = htons(DHCP_SERVER_PORT);
416 set_up_flow(in_band, IBR_FROM_LOCAL_DHCP, &flow,
417 (OFPFW_IN_PORT | OFPFW_DL_TYPE | OFPFW_DL_SRC
418 | OFPFW_NW_PROTO | OFPFW_TP_SRC | OFPFW_TP_DST),
421 /* Allow the connection's interface to receive directed ARP traffic. */
422 memset(&flow, 0, sizeof flow);
423 flow.dl_type = htons(ETH_TYPE_ARP);
424 memcpy(flow.dl_dst, local_mac, ETH_ADDR_LEN);
425 flow.nw_proto = ARP_OP_REPLY;
426 set_up_flow(in_band, IBR_TO_LOCAL_ARP, &flow,
427 (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO),
430 /* Allow the connection's interface to be the source of ARP traffic. */
431 memset(&flow, 0, sizeof flow);
432 flow.dl_type = htons(ETH_TYPE_ARP);
433 memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN);
434 flow.nw_proto = ARP_OP_REQUEST;
435 set_up_flow(in_band, IBR_FROM_LOCAL_ARP, &flow,
436 (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO),
439 drop_flow(in_band, IBR_TO_LOCAL_ARP);
440 drop_flow(in_band, IBR_FROM_LOCAL_ARP);
444 /* Allow ARP replies to the remote side's MAC. */
445 memset(&flow, 0, sizeof flow);
446 flow.dl_type = htons(ETH_TYPE_ARP);
447 memcpy(flow.dl_dst, remote_mac, ETH_ADDR_LEN);
448 flow.nw_proto = ARP_OP_REPLY;
449 set_up_flow(in_band, IBR_TO_REMOTE_ARP, &flow,
450 (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO),
453 /* Allow ARP requests from the remote side's MAC. */
454 memset(&flow, 0, sizeof flow);
455 flow.dl_type = htons(ETH_TYPE_ARP);
456 memcpy(flow.dl_src, remote_mac, ETH_ADDR_LEN);
457 flow.nw_proto = ARP_OP_REQUEST;
458 set_up_flow(in_band, IBR_FROM_REMOTE_ARP, &flow,
459 (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO),
462 drop_flow(in_band, IBR_TO_REMOTE_ARP);
463 drop_flow(in_band, IBR_FROM_REMOTE_ARP);
467 /* Allow ARP replies to the controller's IP. */
468 memset(&flow, 0, sizeof flow);
469 flow.dl_type = htons(ETH_TYPE_ARP);
470 flow.nw_proto = ARP_OP_REPLY;
471 flow.nw_dst = controller_ip;
472 set_up_flow(in_band, IBR_TO_CTL_ARP, &flow,
473 (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK),
476 /* Allow ARP requests from the controller's IP. */
477 memset(&flow, 0, sizeof flow);
478 flow.dl_type = htons(ETH_TYPE_ARP);
479 flow.nw_proto = ARP_OP_REQUEST;
480 flow.nw_src = controller_ip;
481 set_up_flow(in_band, IBR_FROM_CTL_ARP, &flow,
482 (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK),
485 /* OpenFlow traffic to or from the controller.
487 * (A given field's value is completely ignored if it is wildcarded,
488 * which is why we can get away with using a single 'flow' in each
490 memset(&flow, 0, sizeof flow);
491 flow.dl_type = htons(ETH_TYPE_IP);
492 flow.nw_proto = IP_TYPE_TCP;
493 flow.nw_src = controller_ip;
494 flow.nw_dst = controller_ip;
495 flow.tp_src = htons(OFP_TCP_PORT);
496 flow.tp_dst = htons(OFP_TCP_PORT);
497 set_up_flow(in_band, IBR_TO_CTL_OFP, &flow,
498 (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK
499 | OFPFW_TP_DST), OFPP_NORMAL);
500 set_up_flow(in_band, IBR_FROM_CTL_OFP, &flow,
501 (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK
502 | OFPFW_TP_SRC), OFPP_NORMAL);
504 drop_flow(in_band, IBR_TO_CTL_ARP);
505 drop_flow(in_band, IBR_FROM_CTL_ARP);
506 drop_flow(in_band, IBR_TO_CTL_OFP);
507 drop_flow(in_band, IBR_FROM_CTL_OFP);
/* Arranges for the poll loop to wake up for the next in-band refresh.  The
 * target time is the earlier of 'next_remote_refresh' and
 * 'next_local_refresh'; the remaining delay is multiplied by 1000 before
 * being handed to poll_timer_wait().
 *
 * NOTE(review): poll_immediate_wake() is presumably the alternative taken
 * when that target time is not in the future -- confirm the condition. */
512 in_band_wait(struct in_band *in_band)
514 time_t now = time_now();
516 = MIN(in_band->next_remote_refresh, in_band->next_local_refresh);
518 poll_timer_wait((wakeup - now) * 1000);
520 poll_immediate_wake();
/* Marks every in-band rule as not installed, without deleting anything from
 * ofproto, so that a later set_up_flow() call adds each rule again.
 *
 * NOTE(review): presumably called after the flow table has been flushed by
 * someone else -- confirm against the callers. */
525 in_band_flushed(struct in_band *in_band)
529 for (i = 0; i < N_IB_RULES; i++) {
530 in_band->rules[i].installed = false;
/* Sets up in-band control for 'ofproto'.  Obtains the name of the datapath's
 * local port (OFPP_LOCAL) from 'wdp' and opens it as a network device, then
 * allocates a zero-filled struct in_band, records 'ofproto' and 'controller'
 * in it and registers the "in-band" status category on 'ss'.
 *
 * Both refresh deadlines start at TIME_MIN, so the first get_local_mac() and
 * get_remote_mac() calls refresh immediately.  A failure to get the local
 * port's name or to open its device is logged with VLOG_ERR.
 *
 * NOTE(review): the new object is presumably stored in '*in_bandp' and an
 * error code returned on failure -- confirm. */
535 in_band_create(struct ofproto *ofproto, struct wdp *wdp,
536 struct switch_status *ss, struct rconn *controller,
537 struct in_band **in_bandp)
539 struct in_band *in_band;
541 struct netdev *local_netdev;
544 error = wdp_port_get_name(wdp, OFPP_LOCAL, &local_name);
546 VLOG_ERR("failed to initialize in-band control: cannot get name "
547 "of datapath local port (%s)", strerror(error));
551 error = netdev_open_default(local_name, &local_netdev);
553 VLOG_ERR("failed to initialize in-band control: cannot open "
554 "datapath local port %s (%s)", local_name, strerror(error));
560 in_band = xzalloc(sizeof *in_band);
561 in_band->ofproto = ofproto;
562 in_band->controller = controller;
563 in_band->ss_cat = switch_status_register(ss, "in-band",
564 in_band_status_cb, in_band);
565 in_band->local_netdev = local_netdev;
566 in_band->next_local_refresh = TIME_MIN;
567 in_band->remote_netdev = NULL;
568 in_band->next_remote_refresh = TIME_MIN;
/* Releases the resources that in_band_create() and get_remote_mac() acquired
 * for 'in_band': unregisters its status category and closes the local and
 * remote network devices.  The controller rconn is left untouched. */
576 in_band_destroy(struct in_band *in_band)
579 switch_status_unregister(in_band->ss_cat);
580 netdev_close(in_band->local_netdev);
581 netdev_close(in_band->remote_netdev);
582 /* We don't own the rconn. */