/******************************************************************************
 * Virtual network driver for conversing with remote driver backends.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/arp.h>
#include <net/route.h>
#include <asm/io.h>
#include <asm/page.h>
#include <asm-xen/evtchn.h>
#include <asm-xen/ctrl_if.h>
#include <asm-xen/hypervisor-ifs/io/netif.h>
#include <asm-xen/balloon.h>

#ifndef __GFP_NOWARN
#define __GFP_NOWARN 0
#endif

#define alloc_xen_skb(_l) __dev_alloc_skb((_l), GFP_ATOMIC|__GFP_NOWARN)

#define init_skb_shinfo(_skb)                         \
    do {                                              \
        atomic_set(&(skb_shinfo(_skb)->dataref), 1);  \
        skb_shinfo(_skb)->nr_frags = 0;               \
        skb_shinfo(_skb)->frag_list = NULL;           \
    } while (0)

/* Allow headroom on each rx pkt for Ethernet header, alignment padding, ... */
#define RX_HEADROOM 200

/*
 * If the backend driver is pipelining transmit requests then we can be very
 * aggressive in avoiding new-packet notifications -- we only need to send a
 * notification if there are no outstanding unreceived responses.
 * If the backend may be buffering our transmit buffers for any reason then we
 * are rather more conservative.
 */
#ifdef CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
#define TX_TEST_IDX resp_prod /* aggressive: any outstanding responses? */
#else
#define TX_TEST_IDX req_cons  /* conservative: not seen all our requests? */
#endif

static void network_tx_buf_gc(struct net_device *dev);
static void network_alloc_rx_buffers(struct net_device *dev);

static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];

static struct list_head dev_list;

struct net_private
{
    struct list_head list;
    struct net_device *dev;

    struct net_device_stats stats;
    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
    unsigned int tx_full;

    netif_tx_interface_t *tx;
    netif_rx_interface_t *rx;

    spinlock_t   tx_lock;
    spinlock_t   rx_lock;

    unsigned int handle;
    unsigned int evtchn;
    unsigned int irq;

    /* What is the status of our connection to the remote backend? */
#define BEST_CLOSED       0
#define BEST_DISCONNECTED 1
#define BEST_CONNECTED    2
    unsigned int backend_state;

    /* Is this interface open or closed (down or up)? */
#define UST_CLOSED 0
#define UST_OPEN   1
    unsigned int user_state;

    /* Receive-ring batched refills. */
#define RX_MIN_TARGET 8
#define RX_MAX_TARGET NETIF_RX_RING_SIZE
    int rx_target;
    struct sk_buff_head rx_batch;

    /*
     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
     * array is an index into a chain of free entries.
     */
    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
};

/* Access macros for acquiring and freeing slots in {tx,rx}_skbs[]. */
#define ADD_ID_TO_FREELIST(_list, _id)             \
    (_list)[(_id)] = (_list)[0];                   \
    (_list)[0]     = (void *)(unsigned long)(_id);
#define GET_ID_FROM_FREELIST(_list)                \
 ({ unsigned long _id = (unsigned long)(_list)[0]; \
    (_list)[0]  = (_list)[_id];                    \
    (unsigned short)_id; })
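/*
 * Illustrative walkthrough of the freelist encoding (a sketch for
 * orientation, not extra driver code). create_netdev() below seeds each
 * array so that entry i holds the "next free" index i+1; with a ring of
 * size 4 that gives:
 *
 *     skbs[0..4] = {1, 2, 3, 4, 5}
 *
 *     id = GET_ID_FROM_FREELIST(skbs);  // id == 1, skbs[0] becomes 2
 *     skbs[id] = skb;                   // slot 1 now holds a real skb
 *     ...
 *     ADD_ID_TO_FREELIST(skbs, id);     // skbs[1] = 2, skbs[0] = 1 again
 *
 * Entry 0 is permanently the chain head, so IDs handed out run from 1 to
 * RING_SIZE; that is why the arrays are sized RING_SIZE+1.
 */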
static char *status_name[] = {
    [NETIF_INTERFACE_STATUS_CLOSED]       = "closed",
    [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
    [NETIF_INTERFACE_STATUS_CONNECTED]    = "connected",
    [NETIF_INTERFACE_STATUS_CHANGED]      = "changed",
};

static char *be_state_name[] = {
    [BEST_CLOSED]       = "closed",
    [BEST_DISCONNECTED] = "disconnected",
    [BEST_CONNECTED]    = "connected",
};

#if DEBUG
#define DPRINTK(fmt, args...) \
    printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...) ((void)0)
#endif
#define IPRINTK(fmt, args...) \
    printk(KERN_INFO "xen_net: " fmt, ##args)
#define WPRINTK(fmt, args...) \
    printk(KERN_WARNING "xen_net: " fmt, ##args)

static struct net_device *find_dev_by_handle(unsigned int handle)
{
    struct list_head *ent;
    struct net_private *np;

    list_for_each (ent, &dev_list) {
        np = list_entry(ent, struct net_private, list);
        if (np->handle == handle)
            return np->dev;
    }
    return NULL;
}

/** Network interface info. */
struct netif_ctrl {
    /** Number of interfaces. */
    int interface_n;
    /** Number of connected interfaces. */
    int connected_n;
    /** Error code. */
    int err;
    /** Driver status: NETIF_DRIVER_STATUS_{UP,DOWN}. */
    int up;
};

static struct netif_ctrl netctrl;

static void netctrl_init(void)
{
    memset(&netctrl, 0, sizeof(netctrl));
    netctrl.up = NETIF_DRIVER_STATUS_DOWN;
}

/** Get or set a network interface error. */
static int netctrl_err(int err)
{
    if ((err < 0) && !netctrl.err)
        netctrl.err = err;
    return netctrl.err;
}

/** Test if all network interfaces are connected.
 *
 * @return 1 if all connected, 0 if not, negative error code otherwise
 */
static int netctrl_connected(void)
{
    int ok;

    if (netctrl.err)
        ok = netctrl.err;
    else if (netctrl.up == NETIF_DRIVER_STATUS_UP)
        ok = (netctrl.connected_n == netctrl.interface_n);
    else
        ok = 0;

    return ok;
}

/** Count the connected network interfaces.
 *
 * @return connected count
 */
static int netctrl_connected_count(void)
{
    struct list_head *ent;
    struct net_private *np;
    unsigned int connected;

    connected = 0;

    list_for_each(ent, &dev_list) {
        np = list_entry(ent, struct net_private, list);
        if (np->backend_state == BEST_CONNECTED)
            connected++;
    }

    netctrl.connected_n = connected;
    DPRINTK("> connected_n=%d interface_n=%d\n",
            netctrl.connected_n, netctrl.interface_n);
    return connected;
}

/** Send a packet on a net device to encourage switches to learn the
 * MAC. We send a fake ARP reply.
 *
 * @param dev device
 * @return 0 on success, error code otherwise
 */
static int send_fake_arp(struct net_device *dev)
{
    struct sk_buff *skb;
    u32             src_ip, dst_ip;

    dst_ip = INADDR_BROADCAST;
    src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);

    /* No IP? Then nothing to do. */
    if (src_ip == 0)
        return 0;

    skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
                     dst_ip, dev, src_ip,
                     /*dst_hw*/ NULL, /*src_hw*/ NULL,
                     /*target_hw*/ dev->dev_addr);
    if (skb == NULL)
        return -ENOMEM;

    return dev_queue_xmit(skb);
}
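/*
 * For orientation, the frame arp_create() builds above looks roughly like
 * this on the wire (IPv4 over Ethernet):
 *
 *     ethernet dst : ff:ff:ff:ff:ff:ff   (dst_hw NULL -> broadcast)
 *     ethernet src : dev->dev_addr       (src_hw NULL -> device MAC)
 *     arp op       : ARPOP_REPLY
 *     sender       : src_ip / dev->dev_addr
 *     target       : 255.255.255.255 / dev->dev_addr
 *
 * Nothing is expected to answer it; the point is that switches on the
 * path see our source MAC and (re)learn which port it lives behind.
 */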
static int network_open(struct net_device *dev)
{
    struct net_private *np = netdev_priv(dev);

    memset(&np->stats, 0, sizeof(np->stats));

    np->user_state = UST_OPEN;

    network_alloc_rx_buffers(dev);
    np->rx->event = np->rx_resp_cons + 1;

    netif_start_queue(dev);

    return 0;
}

static void network_tx_buf_gc(struct net_device *dev)
{
    NETIF_RING_IDX i, prod;
    unsigned short id;
    struct net_private *np = netdev_priv(dev);
    struct sk_buff *skb;

    if (np->backend_state != BEST_CONNECTED)
        return;

    do {
        prod = np->tx->resp_prod;
        rmb(); /* Ensure we see responses up to 'prod'. */

        for (i = np->tx_resp_cons; i != prod; i++) {
            id  = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
            skb = np->tx_skbs[id];
            ADD_ID_TO_FREELIST(np->tx_skbs, id);
            dev_kfree_skb_irq(skb);
        }

        np->tx_resp_cons = prod;

        /*
         * Set a new event, then check for race with update of tx_cons. Note
         * that it is essential to schedule a callback, no matter how few
         * buffers are pending. Even if there is space in the transmit ring,
         * higher layers may be blocked because too much data is outstanding:
         * in such cases notification from Xen is likely to be the only kick
         * that we'll get.
         */
        np->tx->event = prod + ((np->tx->req_prod - prod) >> 1) + 1;
        mb();
    } while (prod != np->tx->resp_prod);

    if (np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE)) {
        np->tx_full = 0;
        if (np->user_state == UST_OPEN)
            netif_wake_queue(dev);
    }
}

static void network_alloc_rx_buffers(struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = netdev_priv(dev);
    struct sk_buff *skb;
    int i, batch_target;
    NETIF_RING_IDX req_prod = np->rx->req_prod;

    if (unlikely(np->backend_state != BEST_CONNECTED))
        return;

    /*
     * Allocate skbuffs greedily, even though we batch updates to the
     * receive ring. This creates a less bursty demand on the memory
     * allocator, and so should reduce the chance of failed allocation
     * requests both for ourself and for other kernel subsystems.
     */
    batch_target = np->rx_target - (req_prod - np->rx_resp_cons);
    for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
        if (unlikely((skb = alloc_xen_skb(dev->mtu + RX_HEADROOM)) == NULL))
            break;
        __skb_queue_tail(&np->rx_batch, skb);
    }

    /* Is the batch large enough to be worthwhile? */
    if (i < (np->rx_target/2))
        return;

    for (i = 0; ; i++) {
        if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
            break;

        skb->dev = dev;

        id = GET_ID_FROM_FREELIST(np->rx_skbs);

        np->rx_skbs[id] = skb;

        np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id;

        rx_pfn_array[i] = virt_to_machine(skb->head) >> PAGE_SHIFT;

        /* Remove this page from pseudo phys map before passing back to Xen. */
        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
            INVALID_P2M_ENTRY;

        rx_mcl[i].op      = __HYPERVISOR_update_va_mapping;
        rx_mcl[i].args[0] = (unsigned long)skb->head;
        rx_mcl[i].args[1] = 0;
        rx_mcl[i].args[2] = 0;
    }

    /* After all PTEs have been zapped we blow away stale TLB entries. */
    rx_mcl[i-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL;

    /* Give away a batch of pages. */
    rx_mcl[i].op      = __HYPERVISOR_dom_mem_op;
    rx_mcl[i].args[0] = MEMOP_decrease_reservation;
    rx_mcl[i].args[1] = (unsigned long)rx_pfn_array;
    rx_mcl[i].args[2] = (unsigned long)i;
    rx_mcl[i].args[3] = 0;
    rx_mcl[i].args[4] = DOMID_SELF;

    /* Tell the balloon driver what is going on. */
    balloon_update_driver_allowance(i);

    /* Zap PTEs and give away pages in one big multicall. */
    (void)HYPERVISOR_multicall(rx_mcl, i+1);

    /* Check return status of HYPERVISOR_dom_mem_op(). */
    if (unlikely(rx_mcl[i].args[5] != i))
        panic("Unable to reduce memory reservation\n");

    /* Above is a suitable barrier to ensure backend will see requests. */
    np->rx->req_prod = req_prod + i;

    /* Adjust our floating fill target if we risked running out of buffers. */
    if (((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) &&
        ((np->rx_target *= 2) > RX_MAX_TARGET))
        np->rx_target = RX_MAX_TARGET;
}
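/*
 * Shape of the batch submitted by network_alloc_rx_buffers() for i queued
 * buffers (a sketch of the layout, not extra driver code):
 *
 *     rx_mcl[0] .. rx_mcl[i-1] : update_va_mapping, zap one skb page each
 *                                (the last also requests a TLB flush)
 *     rx_mcl[i]                : dom_mem_op, return all i pages to Xen
 *
 * A single HYPERVISOR_multicall() thus replaces i+1 separate hypercalls,
 * which is the point of batching the refill.
 */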
static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = netdev_priv(dev);
    netif_tx_request_t *tx;
    NETIF_RING_IDX i;

    if (unlikely(np->tx_full)) {
        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
        netif_stop_queue(dev);
        goto drop;
    }

    /* The request must not cross a page boundary; copy the data if it would. */
    if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
                 PAGE_SIZE)) {
        struct sk_buff *nskb;
        if (unlikely((nskb = alloc_xen_skb(skb->len)) == NULL))
            goto drop;
        skb_put(nskb, skb->len);
        memcpy(nskb->data, skb->data, skb->len);
        nskb->dev = skb->dev;
        dev_kfree_skb(skb);
        skb = nskb;
    }

    spin_lock_irq(&np->tx_lock);

    if (np->backend_state != BEST_CONNECTED) {
        spin_unlock_irq(&np->tx_lock);
        goto drop;
    }

    i = np->tx->req_prod;

    id = GET_ID_FROM_FREELIST(np->tx_skbs);
    np->tx_skbs[id] = skb;

    tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;

    tx->id   = id;
    tx->addr = virt_to_machine(skb->data);
    tx->size = skb->len;

    wmb(); /* Ensure that backend will see the request. */
    np->tx->req_prod = i + 1;

    network_tx_buf_gc(dev);

    if ((i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1)) {
        np->tx_full = 1;
        netif_stop_queue(dev);
    }

    spin_unlock_irq(&np->tx_lock);

    np->stats.tx_bytes += skb->len;
    np->stats.tx_packets++;

    /* Only notify Xen if we really have to. */
    mb();
    if (np->tx->TX_TEST_IDX == i)
        notify_via_evtchn(np->evtchn);

    return 0;

 drop:
    np->stats.tx_dropped++;
    dev_kfree_skb(skb);
    return 0;
}
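/*
 * Worked example of the free-running ring indices used above (values are
 * illustrative; suppose NETIF_TX_RING_SIZE is 256). Indices are only
 * masked when addressing a slot, never when compared:
 *
 *     req_prod == 260, tx_resp_cons == 5
 *         -> 260 - 5 == 255 == NETIF_TX_RING_SIZE - 1, so the queue is
 *            stopped before the producer can lap the consumer;
 *     MASK_NETIF_TX_IDX(260) == 260 mod 256 == 4, the slot actually used.
 */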
static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    struct net_device *dev = dev_id;
    struct net_private *np = netdev_priv(dev);
    unsigned long flags;

    spin_lock_irqsave(&np->tx_lock, flags);
    network_tx_buf_gc(dev);
    spin_unlock_irqrestore(&np->tx_lock, flags);

    if ((np->rx_resp_cons != np->rx->resp_prod) &&
        (np->user_state == UST_OPEN))
        netif_rx_schedule(dev);

    return IRQ_HANDLED;
}

static int netif_poll(struct net_device *dev, int *pbudget)
{
    struct net_private *np = netdev_priv(dev);
    struct sk_buff *skb, *nskb;
    netif_rx_response_t *rx;
    NETIF_RING_IDX i, rp;
    mmu_update_t *mmu = rx_mmu;
    multicall_entry_t *mcl = rx_mcl;
    int work_done, budget, more_to_do = 1;
    struct sk_buff_head rxq;
    unsigned long flags;

    spin_lock(&np->rx_lock);

    if (np->backend_state != BEST_CONNECTED) {
        spin_unlock(&np->rx_lock);
        return 0;
    }

    skb_queue_head_init(&rxq);

    if ((budget = *pbudget) > dev->quota)
        budget = dev->quota;

    rp = np->rx->resp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */

    for (i = np->rx_resp_cons, work_done = 0;
         (i != rp) && (work_done < budget);
         i++, work_done++) {
        rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;

        /*
         * An error here is very odd. Usually indicates a backend bug,
         * low-memory condition, or that we didn't have reservation headroom.
         */
        if (unlikely(rx->status <= 0)) {
            if (net_ratelimit())
                printk(KERN_WARNING "Bad rx buffer (memory squeeze?).\n");
            np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
            wmb();
            np->rx->req_prod++;
            work_done--;
            continue;
        }

        skb = np->rx_skbs[rx->id];
        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);

        /* NB. We handle skb overflow later. */
        skb->data = skb->head + (rx->addr & ~PAGE_MASK);
        skb->len  = rx->status;
        skb->tail = skb->data + skb->len;

        np->stats.rx_packets++;
        np->stats.rx_bytes += rx->status;

        /* Remap the page. */
        mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
        mmu->val = __pa(skb->head) >> PAGE_SHIFT;
        mmu++;
        mcl->op      = __HYPERVISOR_update_va_mapping;
        mcl->args[0] = (unsigned long)skb->head;
        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
        mcl->args[2] = 0;
        mcl++;

        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
            rx->addr >> PAGE_SHIFT;

        __skb_queue_tail(&rxq, skb);
    }

    /* Some pages are no longer absent... */
    balloon_update_driver_allowance(-work_done);

    /* Do all the remapping work, and M->P updates, in one big hypercall. */
    if (likely((mcl - rx_mcl) != 0)) {
        mcl->op      = __HYPERVISOR_mmu_update;
        mcl->args[0] = (unsigned long)rx_mmu;
        mcl->args[1] = mmu - rx_mmu;
        mcl->args[2] = 0;
        mcl->args[3] = DOMID_SELF;
        mcl++;
        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
    }

    while ((skb = __skb_dequeue(&rxq)) != NULL) {
        /*
         * Enough room in skbuff for the data we were passed? Also, Linux
         * expects at least 16 bytes headroom in each receive buffer.
         */
        if (unlikely(skb->tail > skb->end) ||
            unlikely((skb->data - skb->head) < 16)) {
            nskb = NULL;

            /* Only copy the packet if it fits in the current MTU. */
            if (skb->len <= (dev->mtu + ETH_HLEN)) {
                if ((skb->tail > skb->end) && net_ratelimit())
                    printk(KERN_INFO "Received packet needs %d bytes more "
                           "headroom.\n", skb->tail - skb->end);

                if ((nskb = alloc_xen_skb(skb->len + 2)) != NULL) {
                    skb_reserve(nskb, 2);
                    skb_put(nskb, skb->len);
                    memcpy(nskb->data, skb->data, skb->len);
                    nskb->dev = skb->dev;
                }
            }
            else if (net_ratelimit())
                printk(KERN_INFO "Received packet too big for MTU "
                       "(%d > %d)\n", skb->len - ETH_HLEN, dev->mtu);

            /* Reinitialise and then destroy the old skbuff. */
            skb->len  = 0;
            skb->tail = skb->data;
            init_skb_shinfo(skb);
            dev_kfree_skb(skb);

            /* Switch old for new, if we copied the buffer. */
            if ((skb = nskb) == NULL)
                continue;
        }

        /* Set the shared-info area, which is hidden behind the real data. */
        init_skb_shinfo(skb);
        /* Ethernet-specific work. Delayed to here as it peeks the header. */
        skb->protocol = eth_type_trans(skb, dev);
        /* Pass it up. */
        netif_receive_skb(skb);
        dev->last_rx = jiffies;
    }

    np->rx_resp_cons = i;

    /* If we get a callback with very few responses, reduce fill target. */
    /* NB. Note exponential increase, linear decrease. */
    if (((np->rx->req_prod - np->rx->resp_prod) > ((3*np->rx_target) / 4)) &&
        (--np->rx_target < RX_MIN_TARGET))
        np->rx_target = RX_MIN_TARGET;

    network_alloc_rx_buffers(dev);

    *pbudget   -= work_done;
    dev->quota -= work_done;

    if (work_done < budget) {
        local_irq_save(flags);

        np->rx->event = i + 1;

        /* Deal with hypervisor racing our resetting of rx_event. */
        mb();
        if (np->rx->resp_prod == i) {
            __netif_rx_complete(dev);
            more_to_do = 0;
        }

        local_irq_restore(flags);
    }

    spin_unlock(&np->rx_lock);

    return more_to_do;
}

static int network_close(struct net_device *dev)
{
    struct net_private *np = netdev_priv(dev);
    np->user_state = UST_CLOSED;
    netif_stop_queue(np->dev);
    return 0;
}

static struct net_device_stats *network_get_stats(struct net_device *dev)
{
    struct net_private *np = netdev_priv(dev);
    return &np->stats;
}
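/*
 * network_connect() below rebuilds the rings after a recovery by walking
 * tx_skbs[]/rx_skbs[]. A sketch of the trick it relies on (addresses
 * illustrative, e.g. i386 with __PAGE_OFFSET 0xc0000000):
 *
 *     np->tx_skbs[i] == (void *)5          -> free-chain index: skip
 *     np->tx_skbs[i] == (void *)0xc12af000 -> real skb pointer: requeue
 *
 * Free-chain entries are small integers (at most RING_SIZE+1), while
 * kernel direct-mapped pointers always sit at or above __PAGE_OFFSET, so
 * comparing against __PAGE_OFFSET cleanly separates the two cases.
 */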
static void network_connect(struct net_device *dev,
                            netif_fe_interface_status_t *status)
{
    struct net_private *np;
    int i, requeue_idx;
    netif_tx_request_t *tx;

    np = netdev_priv(dev);
    spin_lock_irq(&np->tx_lock);
    spin_lock(&np->rx_lock);

    /* Recovery procedure: */

    /* Step 1: Reinitialise variables. */
    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
    np->rx->event = np->tx->event = 1;

    /* Step 2: Rebuild the RX and TX ring contents.
     * NB. We could just free the queued TX packets now but we hope
     * that sending them out might do some good. We have to rebuild
     * the RX ring because some of our pages are currently flipped out
     * so we can't just free the RX skbs.
     * NB2. Freelist index entries are always going to be less than
     * __PAGE_OFFSET, whereas pointers to skbs will always be equal or
     * greater than __PAGE_OFFSET: we use this property to distinguish
     * them.
     */

    /* Rebuild the TX buffer freelist and the TX ring itself.
     * NB. This reorders packets. We could keep more private state
     * to avoid this but maybe it doesn't matter so much given the
     * interface has been down.
     */
    for (requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++) {
        if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) {
            struct sk_buff *skb = np->tx_skbs[i];

            tx = &np->tx->ring[requeue_idx++].req;

            tx->id   = i;
            tx->addr = virt_to_machine(skb->data);
            tx->size = skb->len;

            np->stats.tx_bytes += skb->len;
            np->stats.tx_packets++;
        }
    }
    wmb();
    np->tx->req_prod = requeue_idx;

    /* Rebuild the RX buffer freelist and the RX ring itself. */
    for (requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++)
        if ((unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET)
            np->rx->ring[requeue_idx++].req.id = i;
    wmb();
    np->rx->req_prod = requeue_idx;

    /* Step 3: All public and private state should now be sane. Get
     * ready to start sending and receiving packets and give the driver
     * domain a kick because we've probably just requeued some
     * packets.
     */
    np->backend_state = BEST_CONNECTED;
    wmb();
    notify_via_evtchn(status->evtchn);
    network_tx_buf_gc(dev);

    if (np->user_state == UST_OPEN)
        netif_start_queue(dev);

    spin_unlock(&np->rx_lock);
    spin_unlock_irq(&np->tx_lock);
}

static void vif_show(struct net_private *np)
{
#if DEBUG
    if (np) {
        IPRINTK("vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p\n",
                np->handle,
                be_state_name[np->backend_state],
                np->user_state ? "open" : "closed",
                np->evtchn,
                np->irq,
                np->tx,
                np->rx);
    } else {
        IPRINTK("vif NULL\n");
    }
#endif
}

/* Send a connect message to xend to tell it to bring up the interface. */
static void send_interface_connect(struct net_private *np)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_NETIF_FE,
        .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT,
        .length  = sizeof(netif_fe_interface_connect_t),
    };
    netif_fe_interface_connect_t *msg = (void *)cmsg.msg;

    msg->handle = np->handle;
    msg->tx_shmem_frame = (virt_to_machine(np->tx) >> PAGE_SHIFT);
    msg->rx_shmem_frame = (virt_to_machine(np->rx) >> PAGE_SHIFT);

    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}

/* Send a driver status notification to the domain controller. */
static int send_driver_status(int ok)
{
    int err = 0;
    ctrl_msg_t cmsg = {
        .type    = CMSG_NETIF_FE,
        .subtype = CMSG_NETIF_FE_DRIVER_STATUS,
        .length  = sizeof(netif_fe_driver_status_t),
    };
    netif_fe_driver_status_t *msg = (void *)cmsg.msg;

    msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN);
    err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);

    return err;
}
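/*
 * The two senders above share the same control-interface idiom: build a
 * ctrl_msg_t, overlay the typed payload on cmsg.msg, and send it
 * blocking. A minimal sketch of a further (hypothetical) message type,
 * shown only to illustrate the framing -- CMSG_NETIF_FE_FOO and
 * netif_fe_foo_t do not exist in the protocol:
 */
#if 0
static void send_foo_example(struct net_private *np)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_NETIF_FE,
        .subtype = CMSG_NETIF_FE_FOO,       /* hypothetical subtype */
        .length  = sizeof(netif_fe_foo_t),  /* hypothetical payload type */
    };
    netif_fe_foo_t *msg = (void *)cmsg.msg;

    msg->handle = np->handle;               /* identify the vif, as above */
    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}
#endif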
/* Stop network device and free tx/rx queues and irq. */
static void vif_release(struct net_private *np)
{
    /* Stop old i/f to prevent errors whilst we rebuild the state. */
    spin_lock_irq(&np->tx_lock);
    spin_lock(&np->rx_lock);
    netif_stop_queue(np->dev);
    /* np->backend_state = BEST_DISCONNECTED; */
    spin_unlock(&np->rx_lock);
    spin_unlock_irq(&np->tx_lock);

    /* Free resources. */
    if (np->tx != NULL) {
        free_irq(np->irq, np->dev);
        unbind_evtchn_from_irq(np->evtchn);
        free_page((unsigned long)np->tx);
        free_page((unsigned long)np->rx);
        np->irq = 0;
        np->evtchn = 0;
        np->tx = NULL;
        np->rx = NULL;
    }
}

/* Release vif resources and close it down completely. */
static void vif_close(struct net_private *np)
{
    WPRINTK("Unexpected netif-CLOSED message in state %s\n",
            be_state_name[np->backend_state]);
    vif_release(np);
    np->backend_state = BEST_CLOSED;
    /* todo: take dev down and free. */
    vif_show(np);
}

/* Move the vif into disconnected state.
 * Allocates tx/rx pages.
 * Sends connect message to xend.
 */
static void vif_disconnect(struct net_private *np)
{
    /* Normally np->tx and np->rx are already NULL here; free any stale
     * ring pages left over from a previous connection. */
    if (np->tx)
        free_page((unsigned long)np->tx);
    if (np->rx)
        free_page((unsigned long)np->rx);

    np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
    np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
    memset(np->tx, 0, PAGE_SIZE);
    memset(np->rx, 0, PAGE_SIZE);
    np->backend_state = BEST_DISCONNECTED;
    send_interface_connect(np);
    vif_show(np);
}

/* Begin interface recovery.
 *
 * NB. Whilst we're recovering, we turn the carrier state off. We
 * take measures to ensure that this device isn't used for
 * anything. We also stop the queue for this device. Various
 * different approaches (e.g. continuing to buffer packets) have
 * been tested but don't appear to improve the overall impact on
 * TCP connections.
 *
 * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
 * is initiated by a special "RESET" message - disconnect could
 * just mean we're not allowed to use this interface any more.
 */
static void vif_reset(struct net_private *np)
{
    IPRINTK("Attempting to reconnect network interface: handle=%u\n",
            np->handle);
    vif_release(np);
    vif_disconnect(np);
    vif_show(np);
}

/* Move the vif into connected state.
 * Sets the mac and event channel from the message.
 * Binds the irq to the event channel.
 */
static void vif_connect(struct net_private *np,
                        netif_fe_interface_status_t *status)
{
    struct net_device *dev = np->dev;

    memcpy(dev->dev_addr, status->mac, ETH_ALEN);
    network_connect(dev, status);
    np->evtchn = status->evtchn;
    np->irq = bind_evtchn_to_irq(np->evtchn);
    (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev);
    netctrl_connected_count();
    (void)send_fake_arp(dev);
    vif_show(np);
}
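/*
 * Summary of the backend-state transitions the helpers above implement,
 * as driven by netif_interface_status() below:
 *
 *     status msg      current state    action            new state
 *     ------------    -------------    ---------------   ------------
 *     CLOSED          any              vif_close()       CLOSED
 *     DISCONNECTED    CLOSED           vif_disconnect()  DISCONNECTED
 *     DISCONNECTED    DISC'D/CONN'D    vif_reset()       DISCONNECTED
 *     CONNECTED       DISCONNECTED     vif_connect()     CONNECTED
 *     CONNECTED       CLOSED           disconnect, then  CONNECTED
 *                                      connect (+ warn)
 */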
/** Create a network device.
 * @param handle device handle
 * @param val return parameter for created device
 * @return 0 on success, error code otherwise
 */
static int create_netdev(int handle, struct net_device **val)
{
    int i, err = 0;
    struct net_device *dev = NULL;
    struct net_private *np = NULL;

    if ((dev = alloc_etherdev(sizeof(struct net_private))) == NULL) {
        printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
        err = -ENOMEM;
        goto exit;
    }

    np = netdev_priv(dev);
    np->backend_state = BEST_CLOSED;
    np->user_state    = UST_CLOSED;
    np->handle        = handle;

    spin_lock_init(&np->tx_lock);
    spin_lock_init(&np->rx_lock);

    skb_queue_head_init(&np->rx_batch);
    np->rx_target = RX_MIN_TARGET;

    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
    for (i = 0; i <= NETIF_TX_RING_SIZE; i++)
        np->tx_skbs[i] = (void *)(i+1);
    for (i = 0; i <= NETIF_RX_RING_SIZE; i++)
        np->rx_skbs[i] = (void *)(i+1);

    dev->open            = network_open;
    dev->hard_start_xmit = network_start_xmit;
    dev->stop            = network_close;
    dev->get_stats       = network_get_stats;
    dev->poll            = netif_poll;
    dev->weight          = 64;

    if ((err = register_netdev(dev)) != 0) {
        printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
        goto exit;
    }
    np->dev = dev;
    list_add(&np->list, &dev_list);

 exit:
    if ((err != 0) && (dev != NULL))
        free_netdev(dev); /* allocated with alloc_etherdev(), so not kfree() */
    else if (val != NULL)
        *val = dev;
    return err;
}

/* Get the target interface for a status message.
 * Creates the interface when it makes sense.
 * The returned interface may be null when there is no error.
 *
 * @param status status message
 * @param np return parameter for interface state
 * @return 0 on success, error code otherwise
 */
static int target_vif(netif_fe_interface_status_t *status,
                      struct net_private **np)
{
    int err = 0;
    struct net_device *dev = NULL;

    DPRINTK("> handle=%d\n", status->handle);
    if ((int)status->handle < 0) {
        err = -EINVAL;
        goto exit;
    }

    if ((dev = find_dev_by_handle(status->handle)) != NULL)
        goto exit;

    if (status->status == NETIF_INTERFACE_STATUS_CLOSED)
        goto exit;
    if (status->status == NETIF_INTERFACE_STATUS_CHANGED)
        goto exit;

    /* It's a new interface in a good state - create it. */
    DPRINTK("> create device...\n");
    if ((err = create_netdev(status->handle, &dev)) != 0)
        goto exit;

    netctrl.interface_n++;

 exit:
    if (np != NULL)
        *np = ((dev && !err) ? netdev_priv(dev) : NULL);
    DPRINTK("< err=%d\n", err);
    return err;
}

/* Handle an interface status message. */
static void netif_interface_status(netif_fe_interface_status_t *status)
{
    int err = 0;
    struct net_private *np = NULL;

    DPRINTK("> status=%s handle=%d\n",
            status_name[status->status], status->handle);

    if ((err = target_vif(status, &np)) != 0) {
        WPRINTK("Invalid netif: handle=%u\n", status->handle);
        return;
    }

    if (np == NULL) {
        DPRINTK("> no vif\n");
        return;
    }

    switch (status->status) {
    case NETIF_INTERFACE_STATUS_CLOSED:
        switch (np->backend_state) {
        case BEST_CLOSED:
        case BEST_DISCONNECTED:
        case BEST_CONNECTED:
            vif_close(np);
            break;
        }
        break;

    case NETIF_INTERFACE_STATUS_DISCONNECTED:
        switch (np->backend_state) {
        case BEST_CLOSED:
            vif_disconnect(np);
            break;
        case BEST_DISCONNECTED:
        case BEST_CONNECTED:
            vif_reset(np);
            break;
        }
        break;

    case NETIF_INTERFACE_STATUS_CONNECTED:
        switch (np->backend_state) {
        case BEST_CLOSED:
            WPRINTK("Unexpected netif status %s in state %s\n",
                    status_name[status->status],
                    be_state_name[np->backend_state]);
            vif_disconnect(np);
            vif_connect(np, status);
            break;
        case BEST_DISCONNECTED:
            vif_connect(np, status);
            break;
        }
        break;

    case NETIF_INTERFACE_STATUS_CHANGED:
        /*
         * The domain controller is notifying us that a device has been
         * added or removed.
         */
        break;

    default:
        WPRINTK("Invalid netif status code %d\n", status->status);
        break;
    }

    vif_show(np);
}

/* Handle a driver status notification from the domain controller. */
static void netif_driver_status(netif_fe_driver_status_t *status)
{
    netctrl.up = status->status;
    netctrl_connected_count();
}

/* Receive handler for control messages. */
static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    switch (msg->subtype) {
    case CMSG_NETIF_FE_INTERFACE_STATUS:
        netif_interface_status((netif_fe_interface_status_t *) &msg->msg[0]);
        break;

    case CMSG_NETIF_FE_DRIVER_STATUS:
        netif_driver_status((netif_fe_driver_status_t *) &msg->msg[0]);
        break;

    default:
        msg->length = 0;
        break;
    }

    ctrl_if_send_response(msg);
}
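/*
 * Note on the wait loop below: each iteration sleeps for 10 jiffies, so
 * wait_n=100 iterations bound the wait at roughly 1000 jiffies -- about
 * 10 seconds with HZ=100, or 1 second with HZ=1000 (rough figures; the
 * loop normally exits early once netctrl_connected() reports success).
 */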
#if 1
/* Wait for all interfaces to be connected.
 *
 * This works OK, but we'd like to use the probing mode (see below).
 */
static int probe_interfaces(void)
{
    int err = 0, conn = 0;
    int wait_i, wait_n = 100;

    DPRINTK(">\n");

    for (wait_i = 0; wait_i < wait_n; wait_i++) {
        DPRINTK("> wait_i=%d\n", wait_i);
        conn = netctrl_connected();
        if (conn)
            break;
        DPRINTK("> schedule_timeout...\n");
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(10);
    }

    DPRINTK("> wait finished...\n");
    if (conn <= 0) {
        err = netctrl_err(-ENETDOWN);
        WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err);
    }

    DPRINTK("< err=%d\n", err);

    return err;
}
#else
/* Probe for interfaces until no more are found.
 *
 * This is the mode we'd like to use, but at the moment it panics the kernel.
 */
static int probe_interfaces(void)
{
    int err = 0;
    int wait_i, wait_n = 100;
    ctrl_msg_t cmsg = {
        .type    = CMSG_NETIF_FE,
        .subtype = CMSG_NETIF_FE_INTERFACE_STATUS,
        .length  = sizeof(netif_fe_interface_status_t),
    };
    netif_fe_interface_status_t msg = {};
    ctrl_msg_t rmsg = {};
    netif_fe_interface_status_t *reply = (void *)rmsg.msg;
    int state = TASK_UNINTERRUPTIBLE;
    u32 query = -1;

    DPRINTK(">\n");

    netctrl.interface_n = 0;
    for (wait_i = 0; wait_i < wait_n; wait_i++) {
        DPRINTK("> wait_i=%d query=%d\n", wait_i, query);
        msg.handle = query;
        memcpy(cmsg.msg, &msg, sizeof(msg));
        DPRINTK("> set_current_state...\n");
        set_current_state(state);
        DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply);
        DPRINTK("> sending...\n");
        err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state);
        DPRINTK("> err=%d\n", err);
        if (err)
            goto exit;
        DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply);
        if ((int)reply->handle < 0) {
            /* No more interfaces. */
            break;
        }
        query = -reply->handle - 2;
        DPRINTK("> netif_interface_status ...\n");
        netif_interface_status(reply);
    }

 exit:
    if (err) {
        err = netctrl_err(-ENETDOWN);
        WPRINTK("Connecting virtual network interfaces failed: err=%d\n",
                err);
    }

    DPRINTK("< err=%d\n", err);
    return err;
}
#endif

/*
 * We use this notifier to send out a fake ARP reply to reset switches and
 * router ARP caches when an IP interface is brought up on a VIF.
 */
static int
inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
{
    struct in_ifaddr   *ifa = (struct in_ifaddr *)ptr;
    struct net_device  *dev = ifa->ifa_dev->dev;
    struct list_head   *ent;
    struct net_private *np;

    if (event != NETDEV_UP)
        goto out;

    list_for_each (ent, &dev_list) {
        np = list_entry(ent, struct net_private, list);
        if (np->dev == dev)
            (void)send_fake_arp(dev);
    }

 out:
    return NOTIFY_DONE;
}

static struct notifier_block notifier_inetdev = {
    .notifier_call = inetdev_notify,
    .next          = NULL,
    .priority      = 0
};

static int __init netif_init(void)
{
    int err = 0;

    if (xen_start_info.flags & SIF_INITDOMAIN)
        return 0;

    IPRINTK("Initialising virtual ethernet driver.\n");
    INIT_LIST_HEAD(&dev_list);
    (void)register_inetaddr_notifier(&notifier_inetdev);
    netctrl_init();
    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
                                    CALLBACK_IN_BLOCKING_CONTEXT);
    send_driver_status(1);
    err = probe_interfaces();
    if (err)
        ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);

    DPRINTK("< err=%d\n", err);
    return err;
}

static void vif_suspend(struct net_private *np)
{
    /* Avoid having tx/rx stuff happen until we're ready. */
    free_irq(np->irq, np->dev);
    unbind_evtchn_from_irq(np->evtchn);
}
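/*
 * Sketch of the save/restore sequence these hooks serve (illustrative;
 * the calls are driven by the Xen suspend path rather than by this file):
 *
 *     netif_suspend() -> vif_suspend() per vif: quiesce irq/event channel
 *     ... domain is saved and later restored ...
 *     netif_resume()  -> vif_resume() per vif: reset rings, reconnect
 *
 * On resume the old event-channel binding is stale, so vif_resume() goes
 * back through the CONNECT handshake instead of reusing it.
 */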
static void vif_resume(struct net_private *np)
{
    /*
     * Connect regardless of whether IFF_UP flag set.
     * Stop bad things from happening until we're back up.
     */
    np->backend_state = BEST_DISCONNECTED;
    memset(np->tx, 0, PAGE_SIZE);
    memset(np->rx, 0, PAGE_SIZE);

    send_interface_connect(np);
}

void netif_suspend(void)
{
    struct list_head *ent;
    struct net_private *np;

    list_for_each (ent, &dev_list) {
        np = list_entry(ent, struct net_private, list);
        vif_suspend(np);
    }
}

void netif_resume(void)
{
    struct list_head *ent;
    struct net_private *np;

    list_for_each (ent, &dev_list) {
        np = list_entry(ent, struct net_private, list);
        vif_resume(np);
    }
}

module_init(netif_init);