From: Justin Pettit Date: Sat, 12 Sep 2009 07:49:30 +0000 (-0700) Subject: Merge commit 'origin/citrix' X-Git-Tag: v0.99.0~103 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=39fb08818bbd9c438dbf23caa89937c663451b5a;hp=6867bc3633fc5734eb81cd8684f0fed5df319bdd;p=sliver-openvswitch.git Merge commit 'origin/citrix' Conflicts: vswitchd/ovs-vswitchd.8.in vswitchd/ovs-vswitchd.conf.5.in xenserver/etc_init.d_vswitch --- diff --git a/lib/automake.mk b/lib/automake.mk index b0d10fded..e5dbfba89 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -155,6 +155,7 @@ EXTRA_DIST += \ lib/daemon.man \ lib/dpif.man \ lib/leak-checker.man \ + lib/vlog-unixctl.man \ lib/vlog.man diff --git a/lib/coverage.c b/lib/coverage.c index ba5d68d84..cdc796ec3 100644 --- a/lib/coverage.c +++ b/lib/coverage.c @@ -122,7 +122,10 @@ coverage_log_counter(enum vlog_level level, const struct coverage_counter *c) } /* Logs the coverage counters at the given vlog 'level'. If - * 'suppress_dups' is true, then duplicate events are not displayed. */ + * 'suppress_dups' is true, then duplicate events are not displayed. + * Care should be taken in the value used for 'level'. Depending on the + * configuration, syslog can write changes synchronously, which can + * cause the coverage messages to take several seconds to write. */ void coverage_log(enum vlog_level level, bool suppress_dups) { diff --git a/lib/timeval.c b/lib/timeval.c index 4c34a1731..3cca338fb 100644 --- a/lib/timeval.c +++ b/lib/timeval.c @@ -294,7 +294,12 @@ log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage) rusage.ru_nvcsw - last_rusage->ru_nvcsw, rusage.ru_nivcsw - last_rusage->ru_nivcsw); } - coverage_log(VLL_WARN, true); + + /* Care should be taken in the value chosen for logging. Depending + * on the configuration, syslog can write changes synchronously, + * which can cause the coverage messages to take longer to log + * than the processing delay that triggered it. 
*/ + coverage_log(VLL_INFO, true); } /* Update exponentially weighted moving average. With these parameters, a diff --git a/lib/vconn.c b/lib/vconn.c index 321092314..aed1880d5 100644 --- a/lib/vconn.c +++ b/lib/vconn.c @@ -1230,7 +1230,7 @@ check_action(const union ofp_action *a, unsigned int len, int max_ports) { int error; - switch (a->type) { + switch (ntohs(a->type)) { case OFPAT_OUTPUT: error = check_action_port(ntohs(a->output.port), max_ports); if (error) { diff --git a/lib/vlog-unixctl.man b/lib/vlog-unixctl.man new file mode 100644 index 000000000..5c79875fc --- /dev/null +++ b/lib/vlog-unixctl.man @@ -0,0 +1,39 @@ +.SS "VLOG COMMANDS" +These commands manage \fB\*(PN\fR's logging settings. +.IP "\fBvlog/set\fR \fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]" +Sets the logging level for \fImodule\fR in \fIfacility\fR to +\fIlevel\fR: +. +.RS +.IP \(bu +\fImodule\fR may be any valid module name (as displayed by the +\fB--list\fR action on \fBovs-appctl\fR(8)), or the special name +\fBANY\fR to set the logging levels for all modules. +. +.IP \(bu +\fIfacility\fR may be \fBsyslog\fR, \fBconsole\fR, or \fBfile\fR to +set the levels for logging to the system log, the console, or a file +respectively, or \fBANY\fR to set the logging levels for both +facilities. If it is omitted, \fIfacility\fR defaults to \fBANY\fR. +.IP +The log level for the \fBfile\fR facility has no effect unless +\fB\*(PN\fR was invoked with the \fB--log-file\fR option. +.IP \(bu +\fIlevel\fR must be one of \fBemer\fR, \fBerr\fR, \fBwarn\fR, +\fBinfo\fR, or +\fBdbg\fR, designating the minimum severity of a message for it to be +logged. If it is omitted, \fIlevel\fR defaults to \fBdbg\fR. +.RE +.IP "\fBvlog/set PATTERN:\fIfacility\fB:\fIpattern\fR" +Sets the log pattern for \fIfacility\fR to \fIpattern\fR. Refer to +\fBovs-appctl\fR(8) for a description of the valid syntax for \fIpattern\fR. +. +.IP "\fBvlog/list\fR" +Lists the supported logging modules and their current levels. +. 
+.IP "\fBvlog/reopen\fR" +Causes \fB\*(PN\fR to close and reopen its log file. (This is useful +after rotating log files, to cause a new log file to be used.) +.IP +This has no effect unless \fB\*(PN\fR was invoked with the +\fB--log-file\fR option. diff --git a/vswitchd/INTERNALS b/vswitchd/INTERNALS new file mode 100644 index 000000000..49a415880 --- /dev/null +++ b/vswitchd/INTERNALS @@ -0,0 +1,129 @@ + ======================== + ovs-vswitchd Internals + ======================== + +This document describes some of the internals of the ovs-vswitchd +process. It is not complete. It tends to be updated on demand, so if +you have questions about the vswitchd implementation, ask them and +perhaps we'll add some appropriate documentation here. + +Most of the ovs-vswitchd implementation is in vswitchd/bridge.c, so +code references below should be assumed to refer to that file except +as otherwise specified. + +Bonding +======= + +Bonding allows two or more interfaces (the "slaves") to share network +traffic. From a high-level point of view, bonded interfaces act like +a single port, but they have the bandwidth of multiple network +devices, e.g. two 1 GB physical interfaces act like a single 2 GB +interface. Bonds also increase robustness: the bonded port does not +go down as long as at least one of its slaves is up. + +In vswitchd, a bond always has at least two slaves (and may have +more). If a configuration error, etc. would cause a bond to have only +one slave, the port becomes an ordinary port, not a bonded port, and +none of the special features of bonded ports described in this section +apply. + +There are many forms of bonding, but ovs-vswitchd currently implements +only a single kind, called "source load balancing" or SLB bonding. +SLB bonding divides traffic among the slaves based on the Ethernet +source address. 
This is useful only if the traffic over the bond has +multiple Ethernet source addresses, for example if network traffic +from multiple VMs is multiplexed over the bond. + +Enabling and Disabling Slaves +----------------------------- + +When a bond is created, a slave is initially enabled or disabled based +on whether carrier is detected on the NIC (see iface_create()). After +that, a slave is disabled if its carrier goes down for a period of +time longer than the downdelay, and it is enabled if carrier comes up +for longer than the updelay (see bond_link_status_update()). There is +one exception where the updelay is skipped: if no slaves at all are +currently enabled, then the first slave on which carrier comes up is +enabled immediately. + +The updelay should be set to a time longer than the STP forwarding +delay of the physical switch to which the bond port is connected (if +STP is enabled on that switch). Otherwise, the slave will be enabled, +and load may be shifted to it, before the physical switch starts +forwarding packets on that port, which can cause some data to be +"blackholed" for a time. The exception for a single enabled slave +does not cause any problem in this regard because when no slaves are +enabled all output packets are blackholed anyway. + +When a slave becomes disabled, the vswitch immediately chooses a new +output port for traffic that was destined for that slave (see +bond_enable_slave()). It also sends a "gratuitous learning packet" on +the bond port (on the newly chosen slave) for each MAC address that +the vswitch has learned on a port other than the bond (see +bond_send_learning_packets()), to teach the physical switch that the +new slave should be used in place of the one that is now disabled. +(This behavior probably makes sense only for a vswitch that has only +one port (the bond) connected to a physical switch; vswitchd should +probably provide a way to disable or configure it in other scenarios.) 
+ +Bond Packet Input +----------------- + +Bond packet input processing takes place in process_flow(). + +Bonding accepts unicast packets on any bond slave. This can +occasionally cause packet duplication for the first few packets sent +to a given MAC, if the physical switch attached to the bond is +flooding packets to that MAC because it has not yet learned the +correct slave for that MAC. + +Bonding only accepts multicast (and broadcast) packets on a single +bond slave (the "active slave") at any given time. Multicast packets +received on other slaves are dropped. Otherwise, every multicast +packet would be duplicated, once for every bond slave, because the +physical switch attached to the bond will flood those packets. + +Bonding also drops some multicast packets received on the active +slave: those for which the vswitch has learned that the packet's MAC is on a +port other than the bond port itself. This is because it is likely +that the vswitch itself sent the multicast packet out the bond port, +on a slave other than the active slave, and is now receiving the +packet back on the active slave. However, the vswitch makes an +exception to this rule for broadcast ARP replies, which indicate that +the MAC has moved to another switch, probably due to VM migration. +(ARP replies are normally unicast, so this exception does not match +normal ARP replies. It will match the learning packets sent on bond +fail-over.) + +The active slave is simply the first slave to be enabled after the +bond is created (see bond_choose_active_iface()). If the active slave +is disabled, then a new active slave is chosen among the slaves that +remain enabled. Currently due to the way that configuration works, +this tends to be the remaining slave whose interface name is first +alphabetically, but this is by no means guaranteed. 
+ +Bond Packet Output +------------------ + +When a packet is sent out a bond port, the bond slave actually used is +selected based on the packet's source MAC (see choose_output_iface()). +In particular, the source MAC is hashed into one of 256 values, and +that value is looked up in a hash table (the "bond hash") kept in the +"bond_hash" member of struct port. The hash table entry identifies a +bond slave. If no bond slave has yet been chosen for that hash table +entry, vswitchd chooses one arbitrarily. + +Every 10 seconds, vswitchd rebalances the bond slaves (see +bond_rebalance_port()). To rebalance, vswitchd examines the +statistics for the number of bytes transmitted by each slave over +approximately the past minute, with data sent more recently weighted +more heavily than data sent less recently. It considers each of the +slaves in order from most-loaded to least-loaded. If highly loaded +slave H is significantly more heavily loaded than the least-loaded +slave L, and slave H carries at least two hashes, then vswitchd shifts +one of H's hashes to L. However, vswitchd will not shift a hash from +H to L if that will cause L's load to exceed H's load. + +Currently, "significantly more loaded" means that H must carry at +least 1 Mbps more traffic, and that traffic must be at least 3% +greater than L's. diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 7081512e9..48d02b383 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1955,7 +1955,7 @@ process_flow(struct bridge *br, const flow_t *flow, goto done; } else { /* Drop all multicast packets for which we have learned a different - * input port, because we probably sent the packet on one slaves + * input port, because we probably sent the packet on one slave * and got it back on the active slave. Broadcast ARP replies are * an exception to this rule: the host has moved to another * switch. 
*/ diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in index 6941bdf5e..788e22372 100644 --- a/vswitchd/ovs-vswitchd.8.in +++ b/vswitchd/ovs-vswitchd.8.in @@ -73,13 +73,71 @@ to be loaded. .so lib/common.man .so lib/leak-checker.man . -.SH "BUGS" +.SH "RUNTIME MANAGEMENT COMMANDS" +\fBovs\-appctl\fR(8) can send commands to a running +\fBovs\-vswitchd\fR process. The currently supported commands are +described below. The command descriptions assume an understanding of +how to configure Open vSwitch, as described in +\fBovs-vswitchd.conf\fR(5). +.SS "OVS\-VSWITCHD COMMANDS" +These commands manage the \fBovs-vswitchd\fR process. +.IP "\fBvswitchd/reload\fR" +Reloads the \fBovs\-vswitchd\fR configuration file, as if a +\fBSIGHUP\fR signal were received. The command completes only after +reloading is finished, in particular after all datapaths have been +created and destroyed and ports added and removed as specified by the +new configuration. +.SS "BRIDGE COMMANDS" +These commands manage bridges. +.IP "\fBfdb/show\fR \fIbridge\fR" +Lists each MAC address/VLAN pair learned by the specified \fIbridge\fR, +along with the port on which it was learned and the age of the entry, +in seconds. +.SS "BOND COMMANDS" +These commands manage bonded ports on an Open vSwitch's bridges. To +understand some of these commands, it is important to understand a +detail of the bonding implementation called ``MAC hashing.'' Instead +of directly assigning Ethernet source addresses to slaves, the bonding +implementation computes a function that maps a 48-bit Ethernet source +address into an 8-bit value (a ``MAC hash'' value). All of the +Ethernet addresses that map to a single 8-bit value are then assigned +to a single slave. +.IP "\fBbond/list\fR" +Lists all of the bonds, and their slaves, on each bridge. . -Only Open vSwitch kernel-based datapaths are currently supported. In the -future, this restriction may be lifted. -.PP -Only Linux 2.6.\fIx\fR is currently supported. 
+.IP "\fBbond/show\fR \fIport\fR" +Lists all of the bond-specific information about the given bonded +\fIport\fR: updelay, downdelay, time until the next rebalance. Also +lists information about each slave: whether it is enabled or disabled, +the time to completion of an updelay or downdelay if one is in +progress, whether it is the active slave, the MAC hashes assigned to +the slave, and the MAC learning table entries that hash to each MAC. +.IP "\fBbond/migrate\fR \fIport\fR \fIhash\fR \fIslave\fR" +Assigns a given MAC hash to a new slave. \fIport\fR specifies the +bond port, \fIhash\fR either the MAC hash to be migrated (as a decimal +number between 0 and 255) or an Ethernet address to be hashed, and +\fIslave\fR the new slave to be assigned. +.IP +The reassignment is not permanent: rebalancing or fail-over will +cause the MAC hash to be shifted to a new slave in the usual +manner. +.IP +A MAC hash cannot be migrated to a disabled slave. +.IP "\fBbond/set-active-slave\fR \fIport\fR \fIslave\fR" +Sets \fIslave\fR as the active slave on \fIport\fR. \fIslave\fR must +currently be enabled. +.IP +The setting is not permanent: a new active slave will be selected +if \fIslave\fR becomes disabled. +.IP "\fBbond/enable-slave\fR \fIport\fR \fIslave\fR" +.IQ "\fBbond/disable-slave\fR \fIport\fR \fIslave\fR" +Enables (or disables) \fIslave\fR on the given bond \fIport\fR, skipping any +updelay (or downdelay). +.IP +This setting is not permanent: it persists only until the carrier +status of \fIslave\fR changes. . +.so lib/vlog-unixctl.man .SH "SEE ALSO" .BR ovs\-appctl (8), .BR ovs\-vswitchd.conf (5), diff --git a/vswitchd/ovs-vswitchd.conf.5.in b/vswitchd/ovs-vswitchd.conf.5.in index af3ff27f5..431e20b9c 100644 --- a/vswitchd/ovs-vswitchd.conf.5.in +++ b/vswitchd/ovs-vswitchd.conf.5.in @@ -523,7 +523,8 @@ the connection to the controller stays down long enough, no packets can pass through the switch at all. .IP The first of these that is set takes effect. 
-If the value is \fBstandalone\fR, \fBovs\-vswitchd\fR will take over +If the value is \fBstandalone\fR, or if neither of these settings +is set, \fBovs\-vswitchd\fR will take over responsibility for setting up flows when no message has been received from the controller for three times the inactivity probe interval (see above). In this mode, @@ -532,9 +533,8 @@ MAC-learning switch. \fBovs\-vswitchd\fR will continue to retry connecting to the controller in the background and, when the connection succeeds, it discontinues its standalone behavior. .IP -If this option is set to \fBsecure\fR, or if neither of these settings -is set, \fBovs\-vswitchd\fR will not set up flows on its own when the -controller connection fails. +If this option is set to \fBsecure\fR, \fBovs\-vswitchd\fR will not +set up flows on its own when the controller connection fails. .IP "\fBbridge.\fIname\fB.controller.max-backoff=\fIsecs\fR" Sets the maximum time between attempts to connect to the controller to \fIsecs\fR, which must be at least 1. The actual interval between diff --git a/xenserver/etc_init.d_vswitch b/xenserver/etc_init.d_vswitch index 8bb8e2a75..8f0adf797 100755 --- a/xenserver/etc_init.d_vswitch +++ b/xenserver/etc_init.d_vswitch @@ -34,8 +34,8 @@ test -e /etc/sysconfig/vswitch && . /etc/sysconfig/vswitch : ${VSWITCHD_RUN_DIR:=/var/xen/vswitch} : ${VSWITCHD_PRIORITY:=-10} : ${VSWITCHD_LOGFILE:=/var/log/ovs-vswitchd.log} -: ${VSWITCHD_FILE_LOGLEVEL:=} -: ${VSWITCHD_SYSLOG_LOGLEVEL:=WARN} +: ${VSWITCHD_FILE_LOGLEVEL:=INFO} +: ${VSWITCHD_SYSLOG_LOGLEVEL:=ERR} : ${VSWITCHD_MEMLEAK_LOGFILE:=} : ${VSWITCHD_STRACE_LOG:=} : ${VSWITCHD_STRACE_OPT:=} @@ -47,8 +47,8 @@ test -e /etc/sysconfig/vswitch && . 
/etc/sysconfig/vswitch : ${BRCOMPATD_RUN_DIR:=/var/xen/vswitch} : ${BRCOMPATD_PRIORITY:=-10} : ${BRCOMPATD_LOGFILE:=/var/log/ovs-brcompatd.log} -: ${BRCOMPATD_FILE_LOGLEVEL:=} -: ${BRCOMPATD_SYSLOG_LOGLEVEL:=INFO} +: ${BRCOMPATD_FILE_LOGLEVEL:=INFO} +: ${BRCOMPATD_SYSLOG_LOGLEVEL:=ERR} : ${BRCOMPATD_MEMLEAK_LOGFILE:=} : ${BRCOMPATD_STRACE_LOG:=} : ${BRCOMPATD_STRACE_OPT:=} @@ -111,6 +111,13 @@ function reload_vswitchd { fi } +function reload_brcompatd { + if [ -f "$BRCOMPATD_PIDFILE" ]; then + "$appctl" \ + --target=ovs-brcompatd.$(cat "$BRCOMPATD_PIDFILE").ctl --reopen + fi +} + function start_vswitchd { local syslog_opt="-vANY:SYSLOG:${VSWITCHD_SYSLOG_LOGLEVEL}" local logfile_file_opt="" @@ -165,7 +172,7 @@ function start_brcompatd { mkdir -p "$BRCOMPATD_RUN_DIR" fi cd "$BRCOMPATD_RUN_DIR" - if [ ! -n "$BRCOMPATD_FILE_LOGLEVEL" ]; then + if [ -n "$BRCOMPATD_FILE_LOGLEVEL" ]; then logfile_level_opt="-vANY:FILE:${BRCOMPATD_FILE_LOGLEVEL}" logfile_file_opt="--log-file=$BRCOMPATD_LOGFILE" fi @@ -288,6 +295,7 @@ case "$1" in ;; reload) reload_vswitchd + reload_brcompatd ;; strace-vswitchd) shift diff --git a/xenserver/etc_logrotate.d_vswitch b/xenserver/etc_logrotate.d_vswitch index dae235bd5..6366c6758 100644 --- a/xenserver/etc_logrotate.d_vswitch +++ b/xenserver/etc_logrotate.d_vswitch @@ -5,10 +5,10 @@ # notice and this notice are preserved. This file is offered as-is, # without warranty of any kind. -/var/log/ovs-vswitchd.log { +/var/log/ovs-vswitchd.log /var/log/ovs-brcompatd.log { sharedscripts postrotate - # Send sighup to vswitch which will cause it to reopen its log files. - /sbin/service vswitch reload + # Tell ovs-vswitchd and ovs-brcompatd to reopen their log files + /sbin/service vswitch reload endscript }