X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fxen%2Fnetback%2Fnetback.c;h=89a196b73d94fd8ffc82c34ab497d461bd5a1a29;hb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;hp=d08c296a028c8ab4cf492e8f7b8953a03e0b1db5;hpb=27c597220375df29763c4fecc5130a5d0054261e;p=linux-2.6.git diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c index d08c296a0..89a196b73 100644 --- a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c @@ -8,26 +8,56 @@ * drivers/xen/netfront/netfront.c * * Copyright (c) 2002-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #include "common.h" -#include -#include +#include +#include +#include -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#include -#endif +/*#define NETBE_DEBUG_INTERRUPT*/ + +struct netbk_rx_meta { + skb_frag_t frag; + int id; + int copy:1; +}; static void netif_idx_release(u16 pending_idx); static void netif_page_release(struct page *page); static void make_tx_response(netif_t *netif, - u16 id, - s8 st); -static int make_rx_response(netif_t *netif, - u16 id, - s8 st, - memory_t addr, - u16 size); + netif_tx_request_t *txp, + s8 st); +static netif_rx_response_t *make_rx_response(netif_t *netif, + u16 id, + s8 st, + u16 offset, + u16 size, + u16 flags); static void net_tx_action(unsigned long unused); static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); @@ -37,24 +67,21 @@ static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); static struct timer_list net_timer; -static struct sk_buff_head rx_queue; -static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2+1]; -static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; -static struct mmuext_op rx_mmuext[NETIF_RX_RING_SIZE]; -static unsigned char rx_notify[NR_EVENT_CHANNELS]; +#define MAX_PENDING_REQS 256 -/* Don't currently gate addition of an interface to the tx scheduling list. 
*/ -#define tx_work_exists(_if) (1) +static struct sk_buff_head rx_queue; -#define MAX_PENDING_REQS 256 -static unsigned long mmap_vstart; -#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) +static struct page **mmap_pages; +static inline unsigned long idx_to_kaddr(unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx])); +} #define PKT_PROT_LEN 64 static struct { - netif_tx_request_t req; - netif_t *netif; + netif_tx_request_t req; + netif_t *netif; } pending_tx_info[MAX_PENDING_REQS]; static u16 pending_ring[MAX_PENDING_REQS]; typedef unsigned int PEND_RING_IDX; @@ -67,7 +94,10 @@ static u16 dealloc_ring[MAX_PENDING_REQS]; static PEND_RING_IDX dealloc_prod, dealloc_cons; static struct sk_buff_head tx_queue; -static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; + +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; static struct list_head net_schedule_list; static spinlock_t net_schedule_list_lock; @@ -75,39 +105,36 @@ static spinlock_t net_schedule_list_lock; #define MAX_MFN_ALLOC 64 static unsigned long mfn_list[MAX_MFN_ALLOC]; static unsigned int alloc_index = 0; -static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; -static unsigned long alloc_mfn(void) +static inline unsigned long alloc_mfn(void) { - unsigned long mfn = 0, flags; - spin_lock_irqsave(&mfn_lock, flags); - if ( unlikely(alloc_index == 0) ) - alloc_index = HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, mfn_list, MAX_MFN_ALLOC, 0); - if ( alloc_index != 0 ) - mfn = mfn_list[--alloc_index]; - spin_unlock_irqrestore(&mfn_lock, flags); - return mfn; + return mfn_list[--alloc_index]; } -static void free_mfn(unsigned long mfn) +static int check_mfn(int nr) { - unsigned long flags; - spin_lock_irqsave(&mfn_lock, flags); - if ( alloc_index != MAX_MFN_ALLOC ) - mfn_list[alloc_index++] = mfn; - else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, - &mfn, 1, 0) != 1 ) - BUG(); - spin_unlock_irqrestore(&mfn_lock, flags); + struct xen_memory_reservation reservation = { + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (likely(alloc_index >= nr)) + return 0; + + set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index); + reservation.nr_extents = MAX_MFN_ALLOC - alloc_index; + alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation, + &reservation); + + return alloc_index >= nr ? 0 : -ENOMEM; } static inline void maybe_schedule_tx_action(void) { - smp_mb(); - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&net_schedule_list) ) - tasklet_schedule(&net_tx_tasklet); + smp_mb(); + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list)) + tasklet_schedule(&net_tx_tasklet); } /* @@ -116,704 +143,1358 @@ static inline void maybe_schedule_tx_action(void) */ static inline int is_xen_skb(struct sk_buff *skb) { - extern kmem_cache_t *skbuff_cachep; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; -#else - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->list.next; -#endif - return (cp == skbuff_cachep); + extern kmem_cache_t *skbuff_cachep; + kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; + return (cp == skbuff_cachep); +} + +/* + * We can flip without copying the packet unless: + * 1. The data is not allocated from our special cache; or + * 2. The main data area is shared; or + * 3. 
One or more fragments are shared; or + * 4. There are chained fragments. + */ +static inline int is_flippable_skb(struct sk_buff *skb) +{ + int frag; + + if (!is_xen_skb(skb) || skb_cloned(skb)) + return 0; + + for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) { + if (page_count(skb_shinfo(skb)->frags[frag].page) > 1) + return 0; + } + + if (skb_shinfo(skb)->frag_list != NULL) + return 0; + + return 1; +} + +static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) +{ + struct skb_shared_info *ninfo; + struct sk_buff *nskb; + unsigned long offset; + int ret; + int len; + int headlen; + + BUG_ON(skb_shinfo(skb)->frag_list != NULL); + + nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, 16); + headlen = nskb->end - nskb->data; + if (headlen > skb_headlen(skb)) + headlen = skb_headlen(skb); + ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); + BUG_ON(ret); + + ninfo = skb_shinfo(nskb); + ninfo->gso_size = skb_shinfo(skb)->gso_size; + ninfo->gso_type = skb_shinfo(skb)->gso_type; + + offset = headlen; + len = skb->len - headlen; + + nskb->len = skb->len; + nskb->data_len = len; + nskb->truesize += len; + + while (len) { + struct page *page; + int copy; + int zero; + + if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { + dump_stack(); + goto err_free; + } + + copy = len >= PAGE_SIZE ? PAGE_SIZE : len; + zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); + if (unlikely(!page)) + goto err_free; + + ret = skb_copy_bits(skb, offset, page_address(page), copy); + BUG_ON(ret); + + ninfo->frags[ninfo->nr_frags].page = page; + ninfo->frags[ninfo->nr_frags].page_offset = 0; + ninfo->frags[ninfo->nr_frags].size = copy; + ninfo->nr_frags++; + + offset += copy; + len -= copy; + } + + offset = nskb->data - skb->data; + + nskb->h.raw = skb->h.raw + offset; + nskb->nh.raw = skb->nh.raw + offset; + nskb->mac.raw = skb->mac.raw + offset; + + return nskb; + + err_free: + kfree_skb(nskb); + err: + return NULL; +} + +static inline int netbk_max_required_rx_slots(netif_t *netif) +{ + if (netif->features & (NETIF_F_SG|NETIF_F_TSO)) + return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */ + return 1; /* all in one */ +} + +static inline int netbk_queue_full(netif_t *netif) +{ + RING_IDX peek = netif->rx_req_cons_peek; + RING_IDX needed = netbk_max_required_rx_slots(netif); + + return ((netif->rx.sring->req_prod - peek) < needed) || + ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); } int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - - ASSERT(skb->dev == dev); - - /* Drop the packet if the target domain has no receive buffers. */ - if ( !netif->active || - (netif->rx_req_cons == netif->rx->req_prod) || - ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) - goto drop; - - /* - * We do not copy the packet unless: - * 1. The data is shared; or - * 2. The data is not allocated from our special cache. - * NB. We also couldn't cope with fragmented packets, but we won't get - * any because we not advertise the NETIF_F_SG feature. 
- */ - if ( skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb) ) - { - int hlen = skb->data - skb->head; - struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len); - if ( unlikely(nskb == NULL) ) - goto drop; - skb_reserve(nskb, hlen); - __skb_put(nskb, skb->len); - (void)skb_copy_bits(skb, -hlen, nskb->data - hlen, skb->len + hlen); - nskb->dev = skb->dev; - dev_kfree_skb(skb); - skb = nskb; - } - - netif->rx_req_cons++; - netif_get(netif); - - skb_queue_tail(&rx_queue, skb); - tasklet_schedule(&net_rx_tasklet); - - return 0; + netif_t *netif = netdev_priv(dev); + + BUG_ON(skb->dev != dev); + + /* Drop the packet if the target domain has no receive buffers. */ + if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) + goto drop; + + if (unlikely(netbk_queue_full(netif))) { + /* Not a BUG_ON() -- misbehaving netfront can trigger this. */ + if (netbk_can_queue(dev)) + DPRINTK("Queue full but not stopped!\n"); + goto drop; + } + + /* Copy the packet here if it's destined for a flipping + interface but isn't flippable (e.g. extra references to + data) + */ + if (!netif->copying_receiver && !is_flippable_skb(skb)) { + struct sk_buff *nskb = netbk_copy_skb(skb); + if ( unlikely(nskb == NULL) ) + goto drop; + /* Copy only the header fields we use in this driver. */ + nskb->dev = skb->dev; + nskb->ip_summed = skb->ip_summed; + nskb->proto_data_valid = skb->proto_data_valid; + dev_kfree_skb(skb); + skb = nskb; + } + + netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 + + !!skb_shinfo(skb)->gso_size; + netif_get(netif); + + if (netbk_can_queue(dev) && netbk_queue_full(netif)) { + netif->rx.sring->req_event = netif->rx_req_cons_peek + + netbk_max_required_rx_slots(netif); + mb(); /* request notification /then/ check & stop the queue */ + if (netbk_queue_full(netif)) + netif_stop_queue(dev); + } + + skb_queue_tail(&rx_queue, skb); + tasklet_schedule(&net_rx_tasklet); + + return 0; drop: - netif->stats.tx_dropped++; - dev_kfree_skb(skb); - return 0; + netif->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; } #if 0 static void xen_network_done_notify(void) { - static struct net_device *eth0_dev = NULL; - if ( unlikely(eth0_dev == NULL) ) - eth0_dev = __dev_get_by_name("eth0"); - netif_rx_schedule(eth0_dev); + static struct net_device *eth0_dev = NULL; + if (unlikely(eth0_dev == NULL)) + eth0_dev = __dev_get_by_name("eth0"); + netif_rx_schedule(eth0_dev); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): * if ( xen_network_done() ) - * tg3_enable_ints(tp); + * tg3_enable_ints(tp); */ int xen_network_done(void) { - return skb_queue_empty(&rx_queue); + return skb_queue_empty(&rx_queue); } #endif +struct netrx_pending_operations { + unsigned trans_prod, trans_cons; + unsigned mmu_prod, mmu_cons; + unsigned mcl_prod, mcl_cons; + unsigned copy_prod, copy_cons; + unsigned meta_prod, meta_cons; + mmu_update_t *mmu; + gnttab_transfer_t *trans; + gnttab_copy_t *copy; + multicall_entry_t *mcl; + struct netbk_rx_meta *meta; +}; + +/* Set up the grant operations for this fragment. If it's a flipping + interface, we also set up the unmap request from here. 
*/ +static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta, + int i, struct netrx_pending_operations *npo, + struct page *page, unsigned long size, + unsigned long offset) +{ + mmu_update_t *mmu; + gnttab_transfer_t *gop; + gnttab_copy_t *copy_gop; + multicall_entry_t *mcl; + netif_rx_request_t *req; + unsigned long old_mfn, new_mfn; + + old_mfn = virt_to_mfn(page_address(page)); + + req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i); + if (netif->copying_receiver) { + /* The fragment needs to be copied rather than + flipped. */ + meta->copy = 1; + copy_gop = npo->copy + npo->copy_prod++; + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.offset = offset; + copy_gop->source.u.gmfn = old_mfn; + copy_gop->dest.domid = netif->domid; + copy_gop->dest.offset = 0; + copy_gop->dest.u.ref = req->gref; + copy_gop->len = size; + copy_gop->flags = GNTCOPY_dest_gref; + } else { + meta->copy = 0; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + new_mfn = alloc_mfn(); + + /* + * Set the new P2M table entry before + * reassigning the old data page. Heed the + * comment in pgtable-2level.h:pte_page(). :-) + */ + set_phys_to_machine(page_to_pfn(page), new_mfn); + + mcl = npo->mcl + npo->mcl_prod++; + MULTI_update_va_mapping(mcl, + (unsigned long)page_address(page), + pfn_pte_ma(new_mfn, PAGE_KERNEL), + 0); + + mmu = npo->mmu + npo->mmu_prod++; + mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE; + mmu->val = page_to_pfn(page); + } + + gop = npo->trans + npo->trans_prod++; + gop->mfn = old_mfn; + gop->domid = netif->domid; + gop->ref = req->gref; + } + return req->id; +} + +static void netbk_gop_skb(struct sk_buff *skb, + struct netrx_pending_operations *npo) +{ + netif_t *netif = netdev_priv(skb->dev); + int nr_frags = skb_shinfo(skb)->nr_frags; + int i; + int extra; + struct netbk_rx_meta *head_meta, *meta; + + head_meta = npo->meta + npo->meta_prod++; + head_meta->frag.page_offset = skb_shinfo(skb)->gso_type; + head_meta->frag.size = skb_shinfo(skb)->gso_size; + extra = !!head_meta->frag.size + 1; + + for (i = 0; i < nr_frags; i++) { + meta = npo->meta + npo->meta_prod++; + meta->frag = skb_shinfo(skb)->frags[i]; + meta->id = netbk_gop_frag(netif, meta, i + extra, npo, + meta->frag.page, + meta->frag.size, + meta->frag.page_offset); + } + + /* + * This must occur at the end to ensure that we don't trash + * skb_shinfo until we're done. + */ + head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo, + virt_to_page(skb->data), + skb_headlen(skb), + offset_in_page(skb->data)); + + netif->rx.req_cons += nr_frags + extra; +} + +static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta) +{ + int i; + + for (i = 0; i < nr_frags; i++) + put_page(meta[i].frag.page); +} + +/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was + used to set up the operations on the top of + netrx_pending_operations, which have since been done. Check that + they didn't give any errors and advance over them. 
*/ +static int netbk_check_gop(int nr_frags, domid_t domid, + struct netrx_pending_operations *npo) +{ + multicall_entry_t *mcl; + gnttab_transfer_t *gop; + gnttab_copy_t *copy_op; + int status = NETIF_RSP_OKAY; + int i; + + for (i = 0; i <= nr_frags; i++) { + if (npo->meta[npo->meta_cons + i].copy) { + copy_op = npo->copy + npo->copy_cons++; + if (copy_op->status != GNTST_okay) { + DPRINTK("Bad status %d from copy to DOM%d.\n", + gop->status, domid); + status = NETIF_RSP_ERROR; + } + } else { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + mcl = npo->mcl + npo->mcl_cons++; + /* The update_va_mapping() must not fail. */ + BUG_ON(mcl->result != 0); + } + + gop = npo->trans + npo->trans_cons++; + /* Check the reassignment error code. */ + if (gop->status != 0) { + DPRINTK("Bad status %d from grant transfer to DOM%u\n", + gop->status, domid); + /* + * Page no longer belongs to us unless + * GNTST_bad_page, but that should be + * a fatal error anyway. + */ + BUG_ON(gop->status == GNTST_bad_page); + status = NETIF_RSP_ERROR; + } + } + } + + return status; +} + +static void netbk_add_frag_responses(netif_t *netif, int status, + struct netbk_rx_meta *meta, int nr_frags) +{ + int i; + unsigned long offset; + + for (i = 0; i < nr_frags; i++) { + int id = meta[i].id; + int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data; + + if (meta[i].copy) + offset = 0; + else + offset = meta[i].frag.page_offset; + make_rx_response(netif, id, status, offset, + meta[i].frag.size, flags); + } +} + static void net_rx_action(unsigned long unused) { - netif_t *netif; - s8 status; - u16 size, id, evtchn; - multicall_entry_t *mcl; - mmu_update_t *mmu; - struct mmuext_op *mmuext; - unsigned long vdata, mdata, new_mfn; - struct sk_buff_head rxq; - struct sk_buff *skb; - u16 notify_list[NETIF_RX_RING_SIZE]; - int notify_nr = 0; - - skb_queue_head_init(&rxq); - - mcl = rx_mcl; - mmu = rx_mmu; - mmuext = rx_mmuext; - while ( (skb = skb_dequeue(&rx_queue)) != NULL ) - { - netif = netdev_priv(skb->dev); - vdata = (unsigned long)skb->data; - mdata = virt_to_machine(vdata); - - /* Memory squeeze? Back off for an arbitrary while. */ - if ( (new_mfn = alloc_mfn()) == 0 ) - { - if ( net_ratelimit() ) - printk(KERN_WARNING "Memory squeeze in netback driver.\n"); - mod_timer(&net_timer, jiffies + HZ); - skb_queue_head(&rx_queue, skb); - break; - } - - /* - * Set the new P2M table entry before reassigning the old data page. - * Heed the comment in pgtable-2level.h:pte_page(). :-) - */ - phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; - - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = vdata; - mcl->args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; - mcl->args[2] = 0; - mcl++; - - mcl->op = __HYPERVISOR_mmuext_op; - mcl->args[0] = (unsigned long)mmuext; - mcl->args[1] = 1; - mcl->args[2] = 0; - mcl->args[3] = netif->domid; - mcl++; - - mmuext->cmd = MMUEXT_REASSIGN_PAGE; - mmuext->mfn = mdata >> PAGE_SHIFT; - mmuext++; - - mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - mmu->val = __pa(vdata) >> PAGE_SHIFT; - mmu++; - - __skb_queue_tail(&rxq, skb); - - /* Filled the batch queue? 
*/ - if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) - break; - } - - if ( mcl == rx_mcl ) - return; - - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)rx_mmu; - mcl->args[1] = mmu - rx_mmu; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - mcl++; - - mcl[-3].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) ) - BUG(); - - mcl = rx_mcl; - mmuext = rx_mmuext; - while ( (skb = __skb_dequeue(&rxq)) != NULL ) - { - netif = netdev_priv(skb->dev); - size = skb->tail - skb->data; - - /* Rederive the machine addresses. */ - new_mfn = mcl[0].args[1] >> PAGE_SHIFT; - mdata = ((mmuext[0].mfn << PAGE_SHIFT) | - ((unsigned long)skb->data & ~PAGE_MASK)); - - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - - netif->stats.tx_bytes += size; - netif->stats.tx_packets++; - - /* The update_va_mapping() must not fail. */ - if ( unlikely(mcl[0].args[5] != 0) ) - BUG(); - - /* Check the reassignment error code. */ - status = NETIF_RSP_OKAY; - if ( unlikely(mcl[1].args[5] != 0) ) - { - DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); - free_mfn(mdata >> PAGE_SHIFT); - status = NETIF_RSP_ERROR; - } - - evtchn = netif->evtchn; - id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; - if ( make_rx_response(netif, id, status, mdata, size) && - (rx_notify[evtchn] == 0) ) - { - rx_notify[evtchn] = 1; - notify_list[notify_nr++] = evtchn; - } - - netif_put(netif); - dev_kfree_skb(skb); - - mcl += 2; - mmuext += 1; - } - - while ( notify_nr != 0 ) - { - evtchn = notify_list[--notify_nr]; - rx_notify[evtchn] = 0; - notify_via_evtchn(evtchn); - } - - /* More work to do? */ - if ( !skb_queue_empty(&rx_queue) && !timer_pending(&net_timer) ) - tasklet_schedule(&net_rx_tasklet); + netif_t *netif = NULL; + s8 status; + u16 id, irq, flags; + netif_rx_response_t *resp; + multicall_entry_t *mcl; + struct sk_buff_head rxq; + struct sk_buff *skb; + int notify_nr = 0; + int ret; + int nr_frags; + int count; + unsigned long offset; + + /* + * Putting hundreds of bytes on the stack is considered rude. + * Static works because a tasklet can only be on one CPU at any time. + */ + static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3]; + static mmu_update_t rx_mmu[NET_RX_RING_SIZE]; + static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE]; + static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE]; + static unsigned char rx_notify[NR_IRQS]; + static u16 notify_list[NET_RX_RING_SIZE]; + static struct netbk_rx_meta meta[NET_RX_RING_SIZE]; + + struct netrx_pending_operations npo = { + mmu: rx_mmu, + trans: grant_trans_op, + copy: grant_copy_op, + mcl: rx_mcl, + meta: meta}; + + skb_queue_head_init(&rxq); + + count = 0; + + while ((skb = skb_dequeue(&rx_queue)) != NULL) { + nr_frags = skb_shinfo(skb)->nr_frags; + *(int *)skb->cb = nr_frags; + + if (!xen_feature(XENFEAT_auto_translated_physmap) && + check_mfn(nr_frags + 1)) { + /* Memory squeeze? Back off for an arbitrary while. */ + if ( net_ratelimit() ) + WPRINTK("Memory squeeze in netback " + "driver.\n"); + mod_timer(&net_timer, jiffies + HZ); + skb_queue_head(&rx_queue, skb); + break; + } + + netbk_gop_skb(skb, &npo); + + count += nr_frags + 1; + + __skb_queue_tail(&rxq, skb); + + /* Filled the batch queue? 
*/ + if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE) + break; + } + + if (npo.mcl_prod && + !xen_feature(XENFEAT_auto_translated_physmap)) { + mcl = npo.mcl + npo.mcl_prod++; + + BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping); + mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = npo.mmu_prod; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + } + + if (npo.trans_prod) { + mcl = npo.mcl + npo.mcl_prod++; + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = GNTTABOP_transfer; + mcl->args[1] = (unsigned long)grant_trans_op; + mcl->args[2] = npo.trans_prod; + } + + if (npo.copy_prod) { + mcl = npo.mcl + npo.mcl_prod++; + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = GNTTABOP_copy; + mcl->args[1] = (unsigned long)grant_copy_op; + mcl->args[2] = npo.copy_prod; + } + + /* Nothing to do? */ + if (!npo.mcl_prod) + return; + + BUG_ON(npo.copy_prod > NET_RX_RING_SIZE); + BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE); + BUG_ON(npo.trans_prod > NET_RX_RING_SIZE); + BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3); + BUG_ON(npo.meta_prod > NET_RX_RING_SIZE); + + ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod); + BUG_ON(ret != 0); + + while ((skb = __skb_dequeue(&rxq)) != NULL) { + nr_frags = *(int *)skb->cb; + + netif = netdev_priv(skb->dev); + /* We can't rely on skb_release_data to release the + pages used by fragments for us, since it tries to + touch the pages in the fraglist. If we're in + flipping mode, that doesn't work. In copying mode, + we still have access to all of the pages, and so + it's safe to let release_data deal with it. */ + /* (Freeing the fragments is safe since we copy + non-linear skbs destined for flipping interfaces) */ + if (!netif->copying_receiver) { + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->nr_frags = 0; + netbk_free_pages(nr_frags, meta + npo.meta_cons + 1); + } + + netif->stats.tx_bytes += skb->len; + netif->stats.tx_packets++; + + status = netbk_check_gop(nr_frags, netif->domid, &npo); + + id = meta[npo.meta_cons].id; + flags = nr_frags ? NETRXF_more_data : 0; + + if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ + flags |= NETRXF_csum_blank | NETRXF_data_validated; + else if (skb->proto_data_valid) /* remote but checksummed? 
*/ + flags |= NETRXF_data_validated; + + if (meta[npo.meta_cons].copy) + offset = 0; + else + offset = offset_in_page(skb->data); + resp = make_rx_response(netif, id, status, offset, + skb_headlen(skb), flags); + + if (meta[npo.meta_cons].frag.size) { + struct netif_extra_info *gso = + (struct netif_extra_info *) + RING_GET_RESPONSE(&netif->rx, + netif->rx.rsp_prod_pvt++); + + resp->flags |= NETRXF_extra_info; + + gso->u.gso.size = meta[npo.meta_cons].frag.size; + gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + } + + netbk_add_frag_responses(netif, status, + meta + npo.meta_cons + 1, + nr_frags); + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret); + irq = netif->irq; + if (ret && !rx_notify[irq]) { + rx_notify[irq] = 1; + notify_list[notify_nr++] = irq; + } + + if (netif_queue_stopped(netif->dev) && + !netbk_queue_full(netif)) + netif_wake_queue(netif->dev); + + netif_put(netif); + dev_kfree_skb(skb); + npo.meta_cons += nr_frags + 1; + } + + while (notify_nr != 0) { + irq = notify_list[--notify_nr]; + rx_notify[irq] = 0; + notify_remote_via_irq(irq); + } + + /* More work to do? */ + if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer)) + tasklet_schedule(&net_rx_tasklet); #if 0 - else - xen_network_done_notify(); + else + xen_network_done_notify(); #endif } static void net_alarm(unsigned long unused) { - tasklet_schedule(&net_rx_tasklet); + tasklet_schedule(&net_rx_tasklet); } struct net_device_stats *netif_be_get_stats(struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - return &netif->stats; + netif_t *netif = netdev_priv(dev); + return &netif->stats; } static int __on_net_schedule_list(netif_t *netif) { - return netif->list.next != NULL; + return netif->list.next != NULL; } static void remove_from_net_schedule_list(netif_t *netif) { - spin_lock_irq(&net_schedule_list_lock); - if ( likely(__on_net_schedule_list(netif)) ) - { - list_del(&netif->list); - netif->list.next = NULL; - netif_put(netif); - } - spin_unlock_irq(&net_schedule_list_lock); + spin_lock_irq(&net_schedule_list_lock); + if (likely(__on_net_schedule_list(netif))) { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irq(&net_schedule_list_lock); } static void add_to_net_schedule_list_tail(netif_t *netif) { - if ( __on_net_schedule_list(netif) ) - return; - - spin_lock_irq(&net_schedule_list_lock); - if ( !__on_net_schedule_list(netif) && netif->active ) - { - list_add_tail(&netif->list, &net_schedule_list); - netif_get(netif); - } - spin_unlock_irq(&net_schedule_list_lock); + if (__on_net_schedule_list(netif)) + return; + + spin_lock_irq(&net_schedule_list_lock); + if (!__on_net_schedule_list(netif) && + likely(netif_running(netif->dev) && + netif_carrier_ok(netif->dev))) { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irq(&net_schedule_list_lock); } +/* + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER: + * If this driver is pipelining transmit requests then we can be very + * aggressive in avoiding new-packet notifications -- frontend only needs to + * send a notification if there are no outstanding unreceived responses. + * If we may be buffer transmit buffers for any reason then we must be rather + * more conservative and treat this as the final check for pending work. 
+ */ void netif_schedule_work(netif_t *netif) { - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } + int more_to_do; + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx); +#else + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); +#endif + + if (more_to_do) { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } } void netif_deschedule_work(netif_t *netif) { - remove_from_net_schedule_list(netif); + remove_from_net_schedule_list(netif); } -static void tx_credit_callback(unsigned long data) +static void tx_add_credit(netif_t *netif) { - netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; - netif_schedule_work(netif); + unsigned long max_burst; + + /* + * Allow a burst big enough to transmit a jumbo packet of up to 128kB. + * Otherwise the interface can seize up due to insufficient credit. + */ + max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size; + max_burst = min(max_burst, 131072UL); + max_burst = max(max_burst, netif->credit_bytes); + + netif->remaining_credit = min(netif->remaining_credit + + netif->credit_bytes, + max_burst); } -static void net_tx_action(unsigned long unused) +static void tx_credit_callback(unsigned long data) { - struct list_head *ent; - struct sk_buff *skb; - netif_t *netif; - netif_tx_request_t txreq; - u16 pending_idx; - NETIF_RING_IDX i; - multicall_entry_t *mcl; - PEND_RING_IDX dc, dp; - unsigned int data_len; - - if ( (dc = dealloc_cons) == (dp = dealloc_prod) ) - goto skip_dealloc; - - mcl = tx_mcl; - while ( dc != dp ) - { - pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - mcl[0].op = __HYPERVISOR_update_va_mapping; - mcl[0].args[0] = MMAP_VADDR(pending_idx); - mcl[0].args[1] = 0; - mcl[0].args[2] = 0; - mcl++; - } - - mcl[-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); - - mcl = tx_mcl; - while ( dealloc_cons != dp ) - { - /* The update_va_mapping() must not fail. */ - if ( unlikely(mcl[0].args[5] != 0) ) - BUG(); - - pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; - - netif = pending_tx_info[pending_idx].netif; - - make_tx_response(netif, pending_tx_info[pending_idx].req.id, - NETIF_RSP_OKAY); - - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - - /* - * Scheduling checks must happen after the above response is posted. - * This avoids a possible race with a guest OS on another CPU if that - * guest is testing against 'resp_prod' when deciding whether to notify - * us when it queues additional packets. - */ - mb(); - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - add_to_net_schedule_list_tail(netif); - - netif_put(netif); - - mcl++; - } - - skip_dealloc: - mcl = tx_mcl; - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&net_schedule_list) ) - { - /* Get a netif from the list with work to do. */ - ent = net_schedule_list.next; - netif = list_entry(ent, netif_t, list); - netif_get(netif); - remove_from_net_schedule_list(netif); - - /* Work to do? */ - i = netif->tx_req_cons; - if ( (i == netif->tx->req_prod) || - ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) - { - netif_put(netif); - continue; - } - - rmb(); /* Ensure that we see the request before we copy it. 
*/ - memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, - sizeof(txreq)); - - /* Credit-based scheduling. */ - if ( txreq.size > netif->remaining_credit ) - { - unsigned long now = jiffies; - unsigned long next_credit = - netif->credit_timeout.expires + - msecs_to_jiffies(netif->credit_usec / 1000); - - /* Timer could already be pending in some rare cases. */ - if ( timer_pending(&netif->credit_timeout) ) - break; - - /* Already passed the point at which we can replenish credit? */ - if ( time_after_eq(now, next_credit) ) - { - netif->credit_timeout.expires = now; - netif->remaining_credit = netif->credit_bytes; - } - - /* Still too big to send right now? Then set a timer callback. */ - if ( txreq.size > netif->remaining_credit ) - { - netif->remaining_credit = 0; - netif->credit_timeout.expires = next_credit; - netif->credit_timeout.data = (unsigned long)netif; - netif->credit_timeout.function = tx_credit_callback; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - add_timer_on(&netif->credit_timeout, smp_processor_id()); -#else - add_timer(&netif->credit_timeout); -#endif - break; - } - } - netif->remaining_credit -= txreq.size; - - /* - * Why the barrier? It ensures that the frontend sees updated req_cons - * before we check for more work to schedule. - */ - netif->tx->req_cons = ++netif->tx_req_cons; - mb(); - - netif_schedule_work(netif); - - if ( unlikely(txreq.size < ETH_HLEN) || - unlikely(txreq.size > ETH_FRAME_LEN) ) - { - DPRINTK("Bad packet size: %d\n", txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - /* No crossing a page boundary as the payload mustn't fragment. */ - if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) - { - DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", - txreq.addr, txreq.size, - (txreq.addr &~PAGE_MASK) + txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - - data_len = (txreq.size > PKT_PROT_LEN) ? PKT_PROT_LEN : txreq.size; - - if ( unlikely((skb = alloc_skb(data_len+16, GFP_ATOMIC)) == NULL) ) - { - DPRINTK("Can't allocate a skb in start_xmit.\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - break; - } - - /* Packets passed to netif_rx() must have some headroom. */ - skb_reserve(skb, 16); - - mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl[0].args[0] = MMAP_VADDR(pending_idx); - mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; - mcl[0].args[2] = 0; - mcl[0].args[3] = netif->domid; - mcl++; - - memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); - pending_tx_info[pending_idx].netif = netif; - *((u16 *)skb->data) = pending_idx; - - __skb_queue_tail(&tx_queue, skb); - - pending_cons++; - - /* Filled the batch queue? */ - if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) ) - break; - } - - if ( mcl == tx_mcl ) - return; - - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); - - mcl = tx_mcl; - while ( (skb = __skb_dequeue(&tx_queue)) != NULL ) - { - pending_idx = *((u16 *)skb->data); - netif = pending_tx_info[pending_idx].netif; - memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); - - /* Check the remap error code. 
*/ - if ( unlikely(mcl[0].args[5] != 0) ) - { - DPRINTK("Bad page frame\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - kfree_skb(skb); - mcl++; - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - continue; - } - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = - FOREIGN_FRAME(txreq.addr >> PAGE_SHIFT); - - data_len = (txreq.size > PKT_PROT_LEN) ? PKT_PROT_LEN : txreq.size; - - __skb_put(skb, data_len); - memcpy(skb->data, - (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), - data_len); - - if ( data_len < txreq.size ) - { - /* Append the packet payload as a fragment. */ - skb_shinfo(skb)->frags[0].page = - virt_to_page(MMAP_VADDR(pending_idx)); - skb_shinfo(skb)->frags[0].size = txreq.size - data_len; - skb_shinfo(skb)->frags[0].page_offset = - (txreq.addr + data_len) & ~PAGE_MASK; - skb_shinfo(skb)->nr_frags = 1; - } - else - { - /* Schedule a response immediately. */ - netif_idx_release(pending_idx); - } - - skb->data_len = txreq.size - data_len; - skb->len += skb->data_len; - - skb->dev = netif->dev; - skb->protocol = eth_type_trans(skb, skb->dev); - - netif->stats.rx_bytes += txreq.size; - netif->stats.rx_packets++; - - netif_rx(skb); - netif->dev->last_rx = jiffies; - - mcl++; - } + netif_t *netif = (netif_t *)data; + tx_add_credit(netif); + netif_schedule_work(netif); } -static void netif_idx_release(u16 pending_idx) +inline static void net_tx_action_dealloc(void) { - static spinlock_t _lock = SPIN_LOCK_UNLOCKED; - unsigned long flags; - - spin_lock_irqsave(&_lock, flags); - dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; - spin_unlock_irqrestore(&_lock, flags); - - tasklet_schedule(&net_tx_tasklet); + gnttab_unmap_grant_ref_t *gop; + u16 pending_idx; + PEND_RING_IDX dc, dp; + netif_t *netif; + int ret; + + dc = dealloc_cons; + dp = dealloc_prod; + + /* Ensure we see all indexes enqueued by netif_idx_release(). */ + smp_rmb(); + + /* + * Free up any grants we have finished using + */ + gop = tx_unmap_ops; + while (dc != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), + GNTMAP_host_map, + grant_tx_handle[pending_idx]); + gop++; + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); + BUG_ON(ret); + + while (dealloc_cons != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + make_tx_response(netif, &pending_tx_info[pending_idx].req, + NETIF_RSP_OKAY); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + netif_put(netif); + } } -static void netif_page_release(struct page *page) +static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end) { - u16 pending_idx = page - virt_to_page(mmap_vstart); - - /* Ready for next use. 
*/ - set_page_count(page, 1); - - netif_idx_release(pending_idx); + RING_IDX cons = netif->tx.req_cons; + + do { + make_tx_response(netif, txp, NETIF_RSP_ERROR); + if (cons >= end) + break; + txp = RING_GET_REQUEST(&netif->tx, cons++); + } while (1); + netif->tx.req_cons = cons; + netif_schedule_work(netif); + netif_put(netif); } -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp, + int work_to_do) { - netif_t *netif = dev_id; - if ( tx_work_exists(netif) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } - return IRQ_HANDLED; + netif_tx_request_t *first = txp; + RING_IDX cons = netif->tx.req_cons; + int frags = 0; + + while (txp->flags & NETTXF_more_data) { + if (frags >= work_to_do) { + DPRINTK("Need more frags\n"); + return -frags; + } + + txp = RING_GET_REQUEST(&netif->tx, cons + frags); + if (txp->size > first->size) { + DPRINTK("Frags galore\n"); + return -frags; + } + + first->size -= txp->size; + frags++; + + if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { + DPRINTK("txp->offset: %x, size: %u\n", + txp->offset, txp->size); + return -frags; + } + } + + return frags; } -static void make_tx_response(netif_t *netif, - u16 id, - s8 st) +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, + struct sk_buff *skb, + gnttab_map_grant_ref_t *mop) { - NETIF_RING_IDX i = netif->tx_resp_prod; - netif_tx_response_t *resp; - - resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; - resp->id = id; - resp->status = st; - wmb(); - netif->tx->resp_prod = netif->tx_resp_prod = ++i; - - mb(); /* Update producer before checking event threshold. */ - if ( i == netif->tx->event ) - notify_via_evtchn(netif->evtchn); + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frags = shinfo->frags; + netif_tx_request_t *txp; + unsigned long pending_idx = *((u16 *)skb->data); + RING_IDX cons = netif->tx.req_cons; + int i, start; + + /* Skip first skb fragment if it is on same page as header fragment. */ + start = ((unsigned long)shinfo->frags[0].page == pending_idx); + + for (i = start; i < shinfo->nr_frags; i++) { + txp = RING_GET_REQUEST(&netif->tx, cons++); + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)]; + + gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), + GNTMAP_host_map | GNTMAP_readonly, + txp->gref, netif->domid); + + memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); + netif_get(netif); + pending_tx_info[pending_idx].netif = netif; + frags[i].page = (void *)pending_idx; + } + + return mop; } -static int make_rx_response(netif_t *netif, - u16 id, - s8 st, - memory_t addr, - u16 size) +static int netbk_tx_check_mop(struct sk_buff *skb, + gnttab_map_grant_ref_t **mopp) { - NETIF_RING_IDX i = netif->rx_resp_prod; - netif_rx_response_t *resp; + gnttab_map_grant_ref_t *mop = *mopp; + int pending_idx = *((u16 *)skb->data); + netif_t *netif = pending_tx_info[pending_idx].netif; + netif_tx_request_t *txp; + struct skb_shared_info *shinfo = skb_shinfo(skb); + int nr_frags = shinfo->nr_frags; + int i, err, start; + + /* Check status of header. 
*/ + err = mop->status; + if (unlikely(err)) { + txp = &pending_tx_info[pending_idx].req; + make_tx_response(netif, txp, NETIF_RSP_ERROR); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + netif_put(netif); + } else { + set_phys_to_machine( + __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT, + FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); + grant_tx_handle[pending_idx] = mop->handle; + } + + /* Skip first skb fragment if it is on same page as header fragment. */ + start = ((unsigned long)shinfo->frags[0].page == pending_idx); + + for (i = start; i < nr_frags; i++) { + int j, newerr; + + pending_idx = (unsigned long)shinfo->frags[i].page; + + /* Check error status: if okay then remember grant handle. */ + newerr = (++mop)->status; + if (likely(!newerr)) { + set_phys_to_machine( + __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT, + FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT)); + grant_tx_handle[pending_idx] = mop->handle; + /* Had a previous error? Invalidate this fragment. */ + if (unlikely(err)) + netif_idx_release(pending_idx); + continue; + } + + /* Error on this fragment: respond to client with an error. */ + txp = &pending_tx_info[pending_idx].req; + make_tx_response(netif, txp, NETIF_RSP_ERROR); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + netif_put(netif); + + /* Not the first error? Preceding frags already invalidated. */ + if (err) + continue; + + /* First error: invalidate header and preceding fragments. */ + pending_idx = *((u16 *)skb->data); + netif_idx_release(pending_idx); + for (j = start; j < i; j++) { + pending_idx = (unsigned long)shinfo->frags[i].page; + netif_idx_release(pending_idx); + } + + /* Remember the error: invalidate all subsequent fragments. */ + err = newerr; + } + + *mopp = mop + 1; + return err; +} - resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; - resp->addr = addr; - resp->id = id; - resp->status = (s16)size; - if ( st < 0 ) - resp->status = (s16)st; - wmb(); - netif->rx->resp_prod = netif->rx_resp_prod = ++i; +static void netbk_fill_frags(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int nr_frags = shinfo->nr_frags; + int i; + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = shinfo->frags + i; + netif_tx_request_t *txp; + unsigned long pending_idx; + + pending_idx = (unsigned long)frag->page; + txp = &pending_tx_info[pending_idx].req; + frag->page = virt_to_page(idx_to_kaddr(pending_idx)); + frag->size = txp->size; + frag->page_offset = txp->offset; + + skb->len += txp->size; + skb->data_len += txp->size; + skb->truesize += txp->size; + } +} - mb(); /* Update producer before checking event threshold. 
*/ - return (i == netif->rx->event); +int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras, + int work_to_do) +{ + struct netif_extra_info *extra; + RING_IDX cons = netif->tx.req_cons; + + do { + if (unlikely(work_to_do-- <= 0)) { + DPRINTK("Missing extra info\n"); + return -EBADR; + } + + extra = (struct netif_extra_info *) + RING_GET_REQUEST(&netif->tx, cons); + if (unlikely(!extra->type || + extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + netif->tx.req_cons = ++cons; + DPRINTK("Invalid extra type: %d\n", extra->type); + return -EINVAL; + } + + memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); + netif->tx.req_cons = ++cons; + } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + + return work_to_do; } -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso) { - struct list_head *ent; - netif_t *netif; - int i = 0; + if (!gso->u.gso.size) { + DPRINTK("GSO size must not be zero.\n"); + return -EINVAL; + } - printk(KERN_ALERT "netif_schedule_list:\n"); - spin_lock_irq(&net_schedule_list_lock); + /* Currently only TCPv4 S.O. is supported. */ + if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { + DPRINTK("Bad GSO type %d.\n", gso->u.gso.type); + return -EINVAL; + } - list_for_each ( ent, &net_schedule_list ) - { - netif = list_entry(ent, netif_t, list); - printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", - i, netif->rx_req_cons, netif->rx_resp_prod); - printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", - netif->tx_req_cons, netif->tx_resp_prod); - printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", - netif->rx->req_prod, netif->rx->resp_prod); - printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", - netif->rx->event, netif->tx->req_prod); - printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", - netif->tx->resp_prod, netif->tx->event); - i++; - } + skb_shinfo(skb)->gso_size = gso->u.gso.size; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - spin_unlock_irq(&net_schedule_list_lock); - printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; - return IRQ_HANDLED; + return 0; } -static int __init netback_init(void) +/* Called after netfront has transmitted */ +static void net_tx_action(unsigned long unused) { - int i; - struct page *page; + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; + u16 pending_idx; + RING_IDX i; + gnttab_map_grant_ref_t *mop; + unsigned int data_len; + int ret, work_to_do; + + if (dealloc_cons != dealloc_prod) + net_tx_action_dealloc(); + + mop = tx_map_ops; + while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list)) { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); + if (!work_to_do) { + netif_put(netif); + continue; + } + + i = netif->tx.req_cons; + rmb(); /* Ensure that we see the request before we copy it. */ + memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + + /* Credit-based scheduling. 
*/ + if (txreq.size > netif->remaining_credit) { + unsigned long now = jiffies; + unsigned long next_credit = + netif->credit_timeout.expires + + msecs_to_jiffies(netif->credit_usec / 1000); + + /* Timer could already be pending in rare cases. */ + if (timer_pending(&netif->credit_timeout)) { + netif_put(netif); + continue; + } + + /* Passed the point where we can replenish credit? */ + if (time_after_eq(now, next_credit)) { + netif->credit_timeout.expires = now; + tx_add_credit(netif); + } + + /* Still too big to send right now? Set a callback. */ + if (txreq.size > netif->remaining_credit) { + netif->credit_timeout.data = + (unsigned long)netif; + netif->credit_timeout.function = + tx_credit_callback; + __mod_timer(&netif->credit_timeout, + next_credit); + netif_put(netif); + continue; + } + } + netif->remaining_credit -= txreq.size; + + work_to_do--; + netif->tx.req_cons = ++i; + + memset(extras, 0, sizeof(extras)); + if (txreq.flags & NETTXF_extra_info) { + work_to_do = netbk_get_extras(netif, extras, + work_to_do); + i = netif->tx.req_cons; + if (unlikely(work_to_do < 0)) { + netbk_tx_err(netif, &txreq, i); + continue; + } + } + + ret = netbk_count_requests(netif, &txreq, work_to_do); + if (unlikely(ret < 0)) { + netbk_tx_err(netif, &txreq, i - ret); + continue; + } + i += ret; + + if (unlikely(ret > MAX_SKB_FRAGS)) { + DPRINTK("Too many frags\n"); + netbk_tx_err(netif, &txreq, i); + continue; + } + + if (unlikely(txreq.size < ETH_HLEN)) { + DPRINTK("Bad packet size: %d\n", txreq.size); + netbk_tx_err(netif, &txreq, i); + continue; + } + + /* No crossing a page as the payload mustn't fragment. */ + if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { + DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", + txreq.offset, txreq.size, + (txreq.offset &~PAGE_MASK) + txreq.size); + netbk_tx_err(netif, &txreq, i); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + data_len = (txreq.size > PKT_PROT_LEN && + ret < MAX_SKB_FRAGS) ? + PKT_PROT_LEN : txreq.size; + + skb = alloc_skb(data_len+16, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + netbk_tx_err(netif, &txreq, i); + break; + } + + /* Packets passed to netif_rx() must have some headroom. */ + skb_reserve(skb, 16); + + if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { + struct netif_extra_info *gso; + gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; + + if (netbk_set_skb_gso(skb, gso)) { + kfree_skb(skb); + netbk_tx_err(netif, &txreq, i); + continue; + } + } + + gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), + GNTMAP_host_map | GNTMAP_readonly, + txreq.gref, netif->domid); + mop++; + + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((u16 *)skb->data) = pending_idx; + + __skb_put(skb, data_len); + + skb_shinfo(skb)->nr_frags = ret; + if (data_len < txreq.size) { + skb_shinfo(skb)->nr_frags++; + skb_shinfo(skb)->frags[0].page = + (void *)(unsigned long)pending_idx; + } else { + /* Discriminate from any valid pending_idx value. 
*/ + skb_shinfo(skb)->frags[0].page = (void *)~0UL; + } + + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + mop = netbk_get_requests(netif, skb, mop); + + netif->tx.req_cons = i; + netif_schedule_work(netif); + + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) + break; + } + + if (mop == tx_map_ops) + return; + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops); + BUG_ON(ret); + + mop = tx_map_ops; + while ((skb = __skb_dequeue(&tx_queue)) != NULL) { + netif_tx_request_t *txp; + + pending_idx = *((u16 *)skb->data); + netif = pending_tx_info[pending_idx].netif; + txp = &pending_tx_info[pending_idx].req; + + /* Check the remap error code. */ + if (unlikely(netbk_tx_check_mop(skb, &mop))) { + printk(KERN_ALERT "#### netback grant fails\n"); + skb_shinfo(skb)->nr_frags = 0; + kfree_skb(skb); + continue; + } + + data_len = skb->len; + memcpy(skb->data, + (void *)(idx_to_kaddr(pending_idx)|txp->offset), + data_len); + if (data_len < txp->size) { + /* Append the packet payload as a fragment. */ + txp->offset += data_len; + txp->size -= data_len; + } else { + /* Schedule a response immediately. */ + netif_idx_release(pending_idx); + } + + /* + * Old frontends do not assert data_validated but we + * can infer it from csum_blank so test both flags. + */ + if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->proto_data_valid = 1; + } else { + skb->ip_summed = CHECKSUM_NONE; + skb->proto_data_valid = 0; + } + skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank); + + netbk_fill_frags(skb); + + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + netif->stats.rx_bytes += skb->len; + netif->stats.rx_packets++; + + netif_rx(skb); + netif->dev->last_rx = jiffies; + } +} - if ( !(xen_start_info.flags & SIF_NET_BE_DOMAIN) && - !(xen_start_info.flags & SIF_INITDOMAIN) ) - return 0; +static void netif_idx_release(u16 pending_idx) +{ + static DEFINE_SPINLOCK(_lock); + unsigned long flags; - printk("Initialising Xen netif backend\n"); + spin_lock_irqsave(&_lock, flags); + dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx; + /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */ + smp_wmb(); + dealloc_prod++; + spin_unlock_irqrestore(&_lock, flags); - /* We can increase reservation by this much in net_rx_action(). */ - balloon_update_driver_allowance(NETIF_RX_RING_SIZE); + tasklet_schedule(&net_tx_tasklet); +} - skb_queue_head_init(&rx_queue); - skb_queue_head_init(&tx_queue); +static void netif_page_release(struct page *page) +{ + /* Ready for next use. 
*/ + init_page_count(page); + netif_idx_release(page->index); +} + +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); - init_timer(&net_timer); - net_timer.data = 0; - net_timer.function = net_alarm; - - netif_interface_init(); + if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif)) + netif_wake_queue(netif->dev); - if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) - BUG(); + return IRQ_HANDLED; +} - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - { - page = virt_to_page(MMAP_VADDR(i)); - set_page_count(page, 1); - SetPageForeign(page, netif_page_release); - } +static void make_tx_response(netif_t *netif, + netif_tx_request_t *txp, + s8 st) +{ + RING_IDX i = netif->tx.rsp_prod_pvt; + netif_tx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->tx, i); + resp->id = txp->id; + resp->status = st; + + if (txp->flags & NETTXF_extra_info) + RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL; + + netif->tx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); + if (notify) + notify_remote_via_irq(netif->irq); + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + if (i == netif->tx.req_cons) { + int more_to_do; + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); + if (more_to_do) + add_to_net_schedule_list_tail(netif); + } +#endif +} - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; +static netif_rx_response_t *make_rx_response(netif_t *netif, + u16 id, + s8 st, + u16 offset, + u16 size, + u16 flags) +{ + RING_IDX i = netif->rx.rsp_prod_pvt; + netif_rx_response_t *resp; - spin_lock_init(&net_schedule_list_lock); - INIT_LIST_HEAD(&net_schedule_list); + resp = RING_GET_RESPONSE(&netif->rx, i); + resp->offset = offset; + resp->flags = flags; + resp->id = id; + resp->status = (s16)size; + if (st < 0) + resp->status = (s16)st; - netif_ctrlif_init(); + netif->rx.rsp_prod_pvt = ++i; - (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), - netif_be_dbg, SA_SHIRQ, - "net-be-dbg", &netif_be_dbg); + return resp; +} - return 0; +#ifdef NETBE_DEBUG_INTERRUPT +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each (ent, &net_schedule_list) { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x " + "rx_resp_prod=%08x\n", + i, netif->rx.req_cons, netif->rx.rsp_prod_pvt); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx.req_cons, netif->tx.rsp_prod_pvt); + printk(KERN_ALERT " shared(rx_req_prod=%08x " + "rx_resp_prod=%08x\n", + netif->rx.sring->req_prod, netif->rx.sring->rsp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx.sring->rsp_event, netif->tx.sring->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + + return IRQ_HANDLED; } +#endif -static void netback_cleanup(void) +static int __init netback_init(void) { - BUG(); + int i; + struct page *page; + + if (!is_running_on_xen()) + return -ENODEV; + + /* We can increase reservation by this much in net_rx_action(). 
*/ + balloon_update_driver_allowance(NET_RX_RING_SIZE); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + init_timer(&net_timer); + net_timer.data = 0; + net_timer.function = net_alarm; + + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); + if (mmap_pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; + } + + for (i = 0; i < MAX_PENDING_REQS; i++) { + page = mmap_pages[i]; + SetPageForeign(page, netif_page_release); + page->index = i; + } + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for (i = 0; i < MAX_PENDING_REQS; i++) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_xenbus_init(); + +#ifdef NETBE_DEBUG_INTERRUPT + (void)bind_virq_to_irqhandler( + VIRQ_DEBUG, + 0, + netif_be_dbg, + SA_SHIRQ, + "net-be-dbg", + &netif_be_dbg); +#endif + + return 0; } module_init(netback_init); -module_exit(netback_cleanup); + +MODULE_LICENSE("Dual BSD/GPL");
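
Appendix (editorial): the sketches below are small, standalone C models of the arithmetic this patch introduces. All struct, macro and function names in them are illustrative stand-ins, not the driver's or Xen's own symbols, and the constants (page size, fragment limits, ring sizes) are assumed values. First, the fragmenting loop of netbk_copy_skb(): when a packet headed for a flipping interface cannot be flipped, its head stays linear and the remaining payload is copied into freshly allocated page-sized chunks, one per fragment slot.

/* Userspace model of netbk_copy_skb()'s chunking loop; PAGE_SIZE and
 * MAX_FRAGS are illustrative, not the kernel's definitions. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define MAX_FRAGS 18

struct frag { void *page; size_t size; };

/* Split everything past 'headlen' into page-sized fragments, as the
 * copying path does when a packet can't be flipped to the guest. */
static int chunk_payload(const char *data, size_t len, size_t headlen,
                         struct frag *frags)
{
    size_t offset = headlen;
    int nr = 0;

    while (offset < len) {
        size_t copy = len - offset;
        if (copy > PAGE_SIZE)
            copy = PAGE_SIZE;
        if (nr >= MAX_FRAGS)
            return -1;                 /* too many fragments: give up */
        frags[nr].page = malloc(PAGE_SIZE);
        if (!frags[nr].page)
            return -1;
        memcpy(frags[nr].page, data + offset, copy);
        frags[nr].size = copy;
        offset += copy;
        nr++;
    }
    return nr;
}

int main(void)
{
    static char pkt[10000];
    struct frag frags[MAX_FRAGS];
    int i, nr = chunk_payload(pkt, sizeof(pkt), 64, frags);

    printf("headlen=64, %d page fragment(s) for the remaining %zu bytes\n",
           nr, sizeof(pkt) - 64);
    for (i = 0; i < nr; i++)
        free(frags[i].page);
    return 0;
}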
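
The new netbk_queue_full()/netbk_max_required_rx_slots() pair decides whether the frontend's rx ring can still take a worst-case packet before the backend queues another skb. Because the ring indices are free-running unsigned counters, "slots posted but not yet claimed" is req_prod - req_cons_peek, and "slots left before responses wrap the ring" is rsp_prod + RING_SIZE - req_cons_peek; both must cover the worst case. A minimal model of that arithmetic follows, with assumed values for the ring size and MAX_SKB_FRAGS.

/* Userspace model of the netbk_queue_full() slot accounting. */
#include <stdio.h>

#define RING_SIZE     256
#define MAX_SKB_FRAGS 18

struct rx_ring_view {
    unsigned int req_prod;      /* requests posted by the frontend */
    unsigned int rsp_prod;      /* responses produced by the backend */
    unsigned int req_cons_peek; /* requests already claimed for queued skbs */
    int sg_or_tso;              /* does the frontend accept SG/TSO packets? */
};

static unsigned int max_required_slots(const struct rx_ring_view *r)
{
    /* header + extra_info + one slot per possible fragment, or all-in-one */
    return r->sg_or_tso ? MAX_SKB_FRAGS + 2 : 1;
}

static int queue_full(const struct rx_ring_view *r)
{
    unsigned int needed = max_required_slots(r);

    return (r->req_prod - r->req_cons_peek) < needed ||
           (r->rsp_prod + RING_SIZE - r->req_cons_peek) < needed;
}

int main(void)
{
    struct rx_ring_view r = {
        .req_prod = 300, .rsp_prod = 60, .req_cons_peek = 290, .sg_or_tso = 1,
    };

    printf("full=%d (needed=%u, posted=%u, unwrapped=%u)\n",
           queue_full(&r), max_required_slots(&r),
           r.req_prod - r.req_cons_peek,
           r.rsp_prod + RING_SIZE - r.req_cons_peek);
    return 0;
}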
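
On the guest-receive path, net_rx_action() and netbk_add_frag_responses() translate skb state into per-slot response flags: a locally generated packet (CHECKSUM_HW) is flagged checksum-blank and data-validated, a packet whose checksum was already verified is flagged data-validated only, and every slot except a packet's last carries the more-data flag. The sketch below reproduces just that decision; the bit values are placeholders rather than the NETRXF_* ABI constants.

/* Userspace model of the rx response flag selection. */
#include <stdio.h>

#define F_DATA_VALIDATED 0x1   /* stand-in for NETRXF_data_validated */
#define F_CSUM_BLANK     0x2   /* stand-in for NETRXF_csum_blank */
#define F_MORE_DATA      0x4   /* stand-in for NETRXF_more_data */

struct rx_skb {
    int csum_hw;          /* like skb->ip_summed == CHECKSUM_HW */
    int proto_data_valid; /* checksum already verified on receive */
    int nr_frags;         /* fragments that follow the header slot */
};

static unsigned int head_flags(const struct rx_skb *skb)
{
    unsigned int flags = skb->nr_frags ? F_MORE_DATA : 0;

    if (skb->csum_hw)
        flags |= F_CSUM_BLANK | F_DATA_VALIDATED;
    else if (skb->proto_data_valid)
        flags |= F_DATA_VALIDATED;
    return flags;
}

static unsigned int frag_flags(const struct rx_skb *skb, int i)
{
    return (i == skb->nr_frags - 1) ? 0 : F_MORE_DATA;
}

int main(void)
{
    struct rx_skb skb = { .csum_hw = 1, .proto_data_valid = 0, .nr_frags = 2 };
    int i;

    printf("head slot flags: 0x%x\n", head_flags(&skb));
    for (i = 0; i < skb.nr_frags; i++)
        printf("frag %d flags: 0x%x\n", i, frag_flags(&skb, i));
    return 0;
}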
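
The transmit path's credit scheduler (tx_add_credit(), tx_credit_callback() and the "Credit-based scheduling" branch of net_tx_action()) behaves like a token bucket: each replenish adds credit_bytes, the balance is capped at a burst large enough for the request at the head of the ring but no more than 128kB (and never less than one quantum), and a request that exceeds the balance is deferred until the timer refills it. A standalone model of that arithmetic, with made-up byte counts, follows.

/* Userspace model of the netback credit (token-bucket) arithmetic.
 * The names here (struct credit, credit_refill, ...) are illustrative. */
#include <stdio.h>

struct credit {
    unsigned long credit_bytes;      /* bytes added per replenish interval */
    unsigned long remaining_credit;  /* current balance */
};

/* Mirrors tx_add_credit(): cap the balance at a burst large enough for the
 * request at the head of the ring, at most 128kB, at least one quantum. */
static void credit_refill(struct credit *c, unsigned long head_req_size)
{
    unsigned long max_burst = head_req_size;

    if (max_burst > 131072UL)
        max_burst = 131072UL;
    if (max_burst < c->credit_bytes)
        max_burst = c->credit_bytes;

    c->remaining_credit += c->credit_bytes;
    if (c->remaining_credit > max_burst)
        c->remaining_credit = max_burst;
}

/* Returns 1 if the request may be sent now (and charges it), 0 if it must
 * wait for the next replenish, as in the net_tx_action() credit check. */
static int credit_consume(struct credit *c, unsigned long size)
{
    if (size > c->remaining_credit)
        return 0;
    c->remaining_credit -= size;
    return 1;
}

int main(void)
{
    struct credit c = { .credit_bytes = 10000, .remaining_credit = 10000 };
    unsigned long sizes[] = { 1500, 9000, 65536, 1500 };
    int i;

    for (i = 0; i < 4; i++) {
        while (!credit_consume(&c, sizes[i])) {
            /* Request too big for the current balance: pretend the
             * credit_timeout fired and replenish. */
            credit_refill(&c, sizes[i]);
        }
        printf("sent %lu bytes, %lu credit left\n",
               sizes[i], c.remaining_credit);
    }
    return 0;
}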
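
netbk_count_requests() walks a packet that the frontend split across several tx slots: the first slot's size field holds the whole packet length, each follow-on slot (flagged "more data") carves its bytes out of that total, and no slot may extend past a page. The model below keeps the same checks but simply returns -1 on a malformed chain instead of the driver's -frags convention; the struct is a stand-in for netif_tx_request_t.

/* Userspace model of the multi-slot tx request validation. */
#include <stdio.h>

#define PAGE_SIZE 4096
#define MORE_DATA 0x1

struct tx_req {
    unsigned int offset;
    unsigned int size;
    unsigned int flags;
};

/* Returns the number of extra fragments, or -1 if the chain is malformed. */
static int count_requests(struct tx_req *first, int slots_available)
{
    unsigned int remaining = first->size;
    struct tx_req *txp = first;
    int frags = 0;

    while (txp->flags & MORE_DATA) {
        if (frags >= slots_available)
            return -1;                 /* frontend posted too few slots */
        txp = first + 1 + frags;
        if (txp->size > remaining)
            return -1;                 /* fragment bigger than what's left */
        if (txp->offset + txp->size > PAGE_SIZE)
            return -1;                 /* fragment crosses a page */
        remaining -= txp->size;
        frags++;
    }
    return frags;
}

int main(void)
{
    struct tx_req chain[] = {
        { .offset = 0,   .size = 3000, .flags = MORE_DATA }, /* header slot */
        { .offset = 0,   .size = 1000, .flags = MORE_DATA },
        { .offset = 128, .size =  500, .flags = 0 },
    };

    printf("extra frags: %d\n", count_requests(chain, 8));
    return 0;
}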
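
Finally, the dealloc ring handshake: netif_idx_release() stores the freed pending index into dealloc_ring and only then advances dealloc_prod (smp_wmb), while net_tx_action_dealloc() reads dealloc_prod first and the entries afterwards (smp_rmb). The single-producer model below expresses the same ordering with C11 release/acquire atomics; the real driver additionally serialises producers with a spinlock because netif_idx_release() can run on any CPU.

/* Userspace model of the dealloc_ring publish/consume ordering.
 * The ring size must be a power of two so free-running indices can
 * simply be masked, as with MASK_PEND_IDX in the driver. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 256                      /* like MAX_PENDING_REQS */
#define MASK_IDX(i) ((i) & (RING_SIZE - 1))

static uint16_t ring[RING_SIZE];
static atomic_uint prod;                   /* written by the releasing side */
static unsigned int cons;                  /* private to the consuming side */

/* Producer: publish one freed index (cf. netif_idx_release()). */
static void idx_release(uint16_t idx)
{
    unsigned int p = atomic_load_explicit(&prod, memory_order_relaxed);

    ring[MASK_IDX(p)] = idx;
    /* Release ordering: the slot write is visible before the new producer. */
    atomic_store_explicit(&prod, p + 1, memory_order_release);
}

/* Consumer: drain everything published so far (cf. net_tx_action_dealloc()). */
static void drain(void)
{
    unsigned int p = atomic_load_explicit(&prod, memory_order_acquire);

    while (cons != p)
        printf("reclaim pending_idx %u\n", (unsigned)ring[MASK_IDX(cons++)]);
}

int main(void)
{
    idx_release(3);
    idx_release(7);
    drain();
    return 0;
}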