1 /******************************************************************************
2 * drivers/xen/netback/netback.c
4 * Back-end of the driver for virtual network devices. This portion of the
5 * driver exports a 'unified' network-device interface that can be accessed
6 * by any operating system that implements a compatible front end. A
7 * reference front-end implementation can be found in:
8 * drivers/xen/netfront/netfront.c
10 * Copyright (c) 2002-2005, K A Fraser
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 #include <xen/balloon.h>
39 #include <xen/interface/memory.h>
42 /*#define NETBE_DEBUG_INTERRUPT*/
44 struct netbk_rx_meta {
50 static void netif_idx_release(u16 pending_idx);
51 static void netif_page_release(struct page *page);
52 static void make_tx_response(netif_t *netif,
53 netif_tx_request_t *txp,
55 static netif_rx_response_t *make_rx_response(netif_t *netif,
62 static void net_tx_action(unsigned long unused);
63 static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
65 static void net_rx_action(unsigned long unused);
66 static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
68 static struct timer_list net_timer;
70 #define MAX_PENDING_REQS 256
72 static struct sk_buff_head rx_queue;
74 static struct page **mmap_pages;
75 static inline unsigned long idx_to_kaddr(unsigned int idx)
77 return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
80 #define PKT_PROT_LEN 64
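/*
 * Up to PKT_PROT_LEN bytes of each transmit request are copied into the
 * skb's linear area (see the data_len calculation in net_tx_action()) so
 * the protocol headers end up directly accessible; anything beyond that
 * remains in the mapped fragments.
 */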
83 netif_tx_request_t req;
85 } pending_tx_info[MAX_PENDING_REQS];
86 static u16 pending_ring[MAX_PENDING_REQS];
87 typedef unsigned int PEND_RING_IDX;
88 #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
89 static PEND_RING_IDX pending_prod, pending_cons;
90 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
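/*
 * pending_ring[] is a ring of free pending-request slot indices;
 * pending_prod and pending_cons are free-running, so MAX_PENDING_REQS
 * must remain a power of two for MASK_PEND_IDX() to wrap correctly.
 * NR_PENDING_REQS counts slots currently in use: for example, with
 * pending_prod == 260 and pending_cons == 6, 256 - 260 + 6 = 2 requests
 * are still outstanding.
 */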
92 /* Freed TX SKBs get batched on this ring before return to pending_ring. */
93 static u16 dealloc_ring[MAX_PENDING_REQS];
94 static PEND_RING_IDX dealloc_prod, dealloc_cons;
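/*
 * netif_idx_release() produces onto dealloc_ring (possibly from interrupt
 * context, hence its irqsave lock), while net_tx_action_dealloc() consumes
 * from the TX tasklet: it unmaps the grant, sends the TX response and
 * returns the slot to pending_ring.
 */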
96 static struct sk_buff_head tx_queue;
98 static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
99 static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
100 static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
102 static struct list_head net_schedule_list;
103 static spinlock_t net_schedule_list_lock;
105 #define MAX_MFN_ALLOC 64
106 static unsigned long mfn_list[MAX_MFN_ALLOC];
107 static unsigned int alloc_index = 0;
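/*
 * A small pool of machine frames for the page-flipping receive path:
 * check_mfn() tops the pool up via XENMEM_increase_reservation and
 * alloc_mfn() hands frames to netbk_gop_frag() as replacements for pages
 * transferred to the guest.
 */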
109 static inline unsigned long alloc_mfn(void)
111 return mfn_list[--alloc_index];
114 static int check_mfn(int nr)
116 struct xen_memory_reservation reservation = {
121 if (likely(alloc_index >= nr))
124 set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
125 reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
126 alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
129 return alloc_index >= nr ? 0 : -ENOMEM;
132 static inline void maybe_schedule_tx_action(void)
135 if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
136 !list_empty(&net_schedule_list))
137 tasklet_schedule(&net_tx_tasklet);
141 * A gross way of confirming the origin of an skb data page. The slab
142 * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
144 static inline int is_xen_skb(struct sk_buff *skb)
146 extern kmem_cache_t *skbuff_cachep;
147 kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
148 return (cp == skbuff_cachep);
152 * We can flip without copying the packet unless:
153 * 1. The data is not allocated from our special cache; or
154 * 2. The main data area is shared; or
155 * 3. One or more fragments are shared; or
156 * 4. There are chained fragments.
158 static inline int is_flippable_skb(struct sk_buff *skb)
162 if (!is_xen_skb(skb) || skb_cloned(skb))
165 for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
166 if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
170 if (skb_shinfo(skb)->frag_list != NULL)
176 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
178 struct skb_shared_info *ninfo;
179 struct sk_buff *nskb;
180 unsigned long offset;
185 BUG_ON(skb_shinfo(skb)->frag_list != NULL);
187 nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC);
191 skb_reserve(nskb, 16);
192 headlen = nskb->end - nskb->data;
193 if (headlen > skb_headlen(skb))
194 headlen = skb_headlen(skb);
195 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
198 ninfo = skb_shinfo(nskb);
199 ninfo->gso_size = skb_shinfo(skb)->gso_size;
200 ninfo->gso_type = skb_shinfo(skb)->gso_type;
203 len = skb->len - headlen;
205 nskb->len = skb->len;
206 nskb->data_len = len;
207 nskb->truesize += len;
214 if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
219 copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
220 zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
222 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
226 ret = skb_copy_bits(skb, offset, page_address(page), copy);
229 ninfo->frags[ninfo->nr_frags].page = page;
230 ninfo->frags[ninfo->nr_frags].page_offset = 0;
231 ninfo->frags[ninfo->nr_frags].size = copy;
238 offset = nskb->data - skb->data;
240 nskb->h.raw = skb->h.raw + offset;
241 nskb->nh.raw = skb->nh.raw + offset;
242 nskb->mac.raw = skb->mac.raw + offset;
252 static inline int netbk_max_required_rx_slots(netif_t *netif)
254 if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
255 return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
256 return 1; /* all in one */
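/*
 * The RX queue is considered full either when the frontend has not posted
 * enough request slots beyond rx_req_cons_peek, or when the slots already
 * spoken for but not yet answered (peek - rsp_prod_pvt) leave less than
 * the required headroom in the response ring.
 */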
259 static inline int netbk_queue_full(netif_t *netif)
261 RING_IDX peek = netif->rx_req_cons_peek;
262 RING_IDX needed = netbk_max_required_rx_slots(netif);
264 return ((netif->rx.sring->req_prod - peek) < needed) ||
265 ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
268 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
270 netif_t *netif = netdev_priv(dev);
272 BUG_ON(skb->dev != dev);
274 /* Drop the packet if the target domain has no receive buffers. */
275 if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev)))
278 if (unlikely(netbk_queue_full(netif))) {
279 /* Not a BUG_ON() -- misbehaving netfront can trigger this. */
280 if (netbk_can_queue(dev))
281 DPRINTK("Queue full but not stopped!\n");
285 /* Copy the packet here if it's destined for a flipping
286 interface but isn't flippable (e.g. extra references to
289 if (!netif->copying_receiver && !is_flippable_skb(skb)) {
290 struct sk_buff *nskb = netbk_copy_skb(skb);
291 if (unlikely(nskb == NULL))
293 /* Copy only the header fields we use in this driver. */
294 nskb->dev = skb->dev;
295 nskb->ip_summed = skb->ip_summed;
296 nskb->proto_data_valid = skb->proto_data_valid;
301 netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
302 !!skb_shinfo(skb)->gso_size;
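/*
 * One request slot is reserved per fragment plus one for the linear
 * header and, via the !!gso_size term above, one more for the GSO
 * extra_info segment when the skb carries GSO metadata.
 */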
305 if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
306 netif->rx.sring->req_event = netif->rx_req_cons_peek +
307 netbk_max_required_rx_slots(netif);
308 mb(); /* request notification /then/ check & stop the queue */
309 if (netbk_queue_full(netif))
310 netif_stop_queue(dev);
313 skb_queue_tail(&rx_queue, skb);
314 tasklet_schedule(&net_rx_tasklet);
319 netif->stats.tx_dropped++;
325 static void xen_network_done_notify(void)
327 static struct net_device *eth0_dev = NULL;
328 if (unlikely(eth0_dev == NULL))
329 eth0_dev = __dev_get_by_name("eth0");
330 netif_rx_schedule(eth0_dev);
333 * Add following to poll() function in NAPI driver (Tigon3 is example):
334 * if ( xen_network_done() )
335 * tg3_enable_ints(tp);
337 int xen_network_done(void)
339 return skb_queue_empty(&rx_queue);
343 struct netrx_pending_operations {
344 unsigned trans_prod, trans_cons;
345 unsigned mmu_prod, mmu_cons;
346 unsigned mcl_prod, mcl_cons;
347 unsigned copy_prod, copy_cons;
348 unsigned meta_prod, meta_cons;
350 gnttab_transfer_t *trans;
352 multicall_entry_t *mcl;
353 struct netbk_rx_meta *meta;
356 /* Set up the grant operations for this fragment. If it's a flipping
357 interface, we also set up the unmap request from here. */
358 static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
359 int i, struct netrx_pending_operations *npo,
360 struct page *page, unsigned long size,
361 unsigned long offset)
364 gnttab_transfer_t *gop;
365 gnttab_copy_t *copy_gop;
366 multicall_entry_t *mcl;
367 netif_rx_request_t *req;
368 unsigned long old_mfn, new_mfn;
370 old_mfn = virt_to_mfn(page_address(page));
372 req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
373 if (netif->copying_receiver) {
374 /* The fragment needs to be copied rather than
377 copy_gop = npo->copy + npo->copy_prod++;
378 copy_gop->source.domid = DOMID_SELF;
379 copy_gop->source.offset = offset;
380 copy_gop->source.u.gmfn = old_mfn;
381 copy_gop->dest.domid = netif->domid;
382 copy_gop->dest.offset = 0;
383 copy_gop->dest.u.ref = req->gref;
384 copy_gop->len = size;
385 copy_gop->flags = GNTCOPY_dest_gref;
388 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
389 new_mfn = alloc_mfn();
392 * Set the new P2M table entry before
393 * reassigning the old data page. Heed the
394 * comment in pgtable-2level.h:pte_page(). :-)
396 set_phys_to_machine(page_to_pfn(page), new_mfn);
398 mcl = npo->mcl + npo->mcl_prod++;
399 MULTI_update_va_mapping(mcl,
400 (unsigned long)page_address(page),
401 pfn_pte_ma(new_mfn, PAGE_KERNEL),
404 mmu = npo->mmu + npo->mmu_prod++;
405 mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
407 mmu->val = page_to_pfn(page);
410 gop = npo->trans + npo->trans_prod++;
412 gop->domid = netif->domid;
413 gop->ref = req->gref;
418 static void netbk_gop_skb(struct sk_buff *skb,
419 struct netrx_pending_operations *npo)
421 netif_t *netif = netdev_priv(skb->dev);
422 int nr_frags = skb_shinfo(skb)->nr_frags;
425 struct netbk_rx_meta *head_meta, *meta;
427 head_meta = npo->meta + npo->meta_prod++;
428 head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
429 head_meta->frag.size = skb_shinfo(skb)->gso_size;
430 extra = !!head_meta->frag.size + 1;
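/*
 * The head meta entry borrows its frag fields to carry the GSO type and
 * size until responses are generated in net_rx_action(); 'extra' is 2
 * when a GSO extra_info slot will follow the header and 1 otherwise, so
 * the fragment requests start after those slots.
 */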
432 for (i = 0; i < nr_frags; i++) {
433 meta = npo->meta + npo->meta_prod++;
434 meta->frag = skb_shinfo(skb)->frags[i];
435 meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
438 meta->frag.page_offset);
442 * This must occur at the end to ensure that we don't trash
443 * skb_shinfo until we're done.
445 head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
446 virt_to_page(skb->data),
448 offset_in_page(skb->data));
450 netif->rx.req_cons += nr_frags + extra;
453 static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
457 for (i = 0; i < nr_frags; i++)
458 put_page(meta[i].frag.page);
461 /* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
462 used to set up the operations on the top of
463 netrx_pending_operations, which have since been done. Check that
464 they didn't give any errors and advance over them. */
465 static int netbk_check_gop(int nr_frags, domid_t domid,
466 struct netrx_pending_operations *npo)
468 multicall_entry_t *mcl;
469 gnttab_transfer_t *gop;
470 gnttab_copy_t *copy_op;
471 int status = NETIF_RSP_OKAY;
474 for (i = 0; i <= nr_frags; i++) {
475 if (npo->meta[npo->meta_cons + i].copy) {
476 copy_op = npo->copy + npo->copy_cons++;
477 if (copy_op->status != GNTST_okay) {
478 DPRINTK("Bad status %d from copy to DOM%d.\n",
480 status = NETIF_RSP_ERROR;
483 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
484 mcl = npo->mcl + npo->mcl_cons++;
485 /* The update_va_mapping() must not fail. */
486 BUG_ON(mcl->result != 0);
489 gop = npo->trans + npo->trans_cons++;
490 /* Check the reassignment error code. */
491 if (gop->status != 0) {
492 DPRINTK("Bad status %d from grant transfer to DOM%u\n",
495 * Page no longer belongs to us unless
496 * GNTST_bad_page, but that should be
497 * a fatal error anyway.
499 BUG_ON(gop->status == GNTST_bad_page);
500 status = NETIF_RSP_ERROR;
508 static void netbk_add_frag_responses(netif_t *netif, int status,
509 struct netbk_rx_meta *meta, int nr_frags)
512 unsigned long offset;
514 for (i = 0; i < nr_frags; i++) {
516 int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
521 offset = meta[i].frag.page_offset;
522 make_rx_response(netif, id, status, offset,
523 meta[i].frag.size, flags);
527 static void net_rx_action(unsigned long unused)
529 netif_t *netif = NULL;
532 netif_rx_response_t *resp;
533 multicall_entry_t *mcl;
534 struct sk_buff_head rxq;
540 unsigned long offset;
543 * Putting hundreds of bytes on the stack is considered rude.
544 * Static works because a tasklet can only be on one CPU at any time.
546 static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
547 static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
548 static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
549 static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
550 static unsigned char rx_notify[NR_IRQS];
551 static u16 notify_list[NET_RX_RING_SIZE];
552 static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
554 struct netrx_pending_operations npo = {
556 trans: grant_trans_op,
561 skb_queue_head_init(&rxq);
565 while ((skb = skb_dequeue(&rx_queue)) != NULL) {
566 nr_frags = skb_shinfo(skb)->nr_frags;
567 *(int *)skb->cb = nr_frags;
569 if (!xen_feature(XENFEAT_auto_translated_physmap) &&
570 check_mfn(nr_frags + 1)) {
571 /* Memory squeeze? Back off for an arbitrary while. */
572 if (net_ratelimit())
573 WPRINTK("Memory squeeze in netback "
575 mod_timer(&net_timer, jiffies + HZ);
576 skb_queue_head(&rx_queue, skb);
580 netbk_gop_skb(skb, &npo);
582 count += nr_frags + 1;
584 __skb_queue_tail(&rxq, skb);
586 /* Filled the batch queue? */
587 if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
592 !xen_feature(XENFEAT_auto_translated_physmap)) {
593 mcl = npo.mcl + npo.mcl_prod++;
595 BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
596 mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
598 mcl->op = __HYPERVISOR_mmu_update;
599 mcl->args[0] = (unsigned long)rx_mmu;
600 mcl->args[1] = npo.mmu_prod;
602 mcl->args[3] = DOMID_SELF;
605 if (npo.trans_prod) {
606 mcl = npo.mcl + npo.mcl_prod++;
607 mcl->op = __HYPERVISOR_grant_table_op;
608 mcl->args[0] = GNTTABOP_transfer;
609 mcl->args[1] = (unsigned long)grant_trans_op;
610 mcl->args[2] = npo.trans_prod;
614 mcl = npo.mcl + npo.mcl_prod++;
615 mcl->op = __HYPERVISOR_grant_table_op;
616 mcl->args[0] = GNTTABOP_copy;
617 mcl->args[1] = (unsigned long)grant_copy_op;
618 mcl->args[2] = npo.copy_prod;
625 BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
626 BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
627 BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
628 BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
629 BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
631 ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
634 while ((skb = __skb_dequeue(&rxq)) != NULL) {
635 nr_frags = *(int *)skb->cb;
637 netif = netdev_priv(skb->dev);
638 /* We can't rely on skb_release_data to release the
639 pages used by fragments for us, since it tries to
640 touch the pages in the fraglist. If we're in
641 flipping mode, that doesn't work. In copying mode,
642 we still have access to all of the pages, and so
643 it's safe to let release_data deal with it. */
644 /* (Freeing the fragments is safe since we copy
645 non-linear skbs destined for flipping interfaces) */
646 if (!netif->copying_receiver) {
647 atomic_set(&(skb_shinfo(skb)->dataref), 1);
648 skb_shinfo(skb)->frag_list = NULL;
649 skb_shinfo(skb)->nr_frags = 0;
650 netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
653 netif->stats.tx_bytes += skb->len;
654 netif->stats.tx_packets++;
656 status = netbk_check_gop(nr_frags, netif->domid, &npo);
658 id = meta[npo.meta_cons].id;
659 flags = nr_frags ? NETRXF_more_data : 0;
661 if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
662 flags |= NETRXF_csum_blank | NETRXF_data_validated;
663 else if (skb->proto_data_valid) /* remote but checksummed? */
664 flags |= NETRXF_data_validated;
666 if (meta[npo.meta_cons].copy)
669 offset = offset_in_page(skb->data);
670 resp = make_rx_response(netif, id, status, offset,
671 skb_headlen(skb), flags);
673 if (meta[npo.meta_cons].frag.size) {
674 struct netif_extra_info *gso =
675 (struct netif_extra_info *)
676 RING_GET_RESPONSE(&netif->rx,
677 netif->rx.rsp_prod_pvt++);
679 resp->flags |= NETRXF_extra_info;
681 gso->u.gso.size = meta[npo.meta_cons].frag.size;
682 gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
684 gso->u.gso.features = 0;
686 gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
690 netbk_add_frag_responses(netif, status,
691 meta + npo.meta_cons + 1,
694 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
696 if (ret && !rx_notify[irq]) {
698 notify_list[notify_nr++] = irq;
701 if (netif_queue_stopped(netif->dev) &&
702 !netbk_queue_full(netif))
703 netif_wake_queue(netif->dev);
707 npo.meta_cons += nr_frags + 1;
710 while (notify_nr != 0) {
711 irq = notify_list[--notify_nr];
713 notify_remote_via_irq(irq);
716 /* More work to do? */
717 if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
718 tasklet_schedule(&net_rx_tasklet);
721 xen_network_done_notify();
725 static void net_alarm(unsigned long unused)
727 tasklet_schedule(&net_rx_tasklet);
730 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
732 netif_t *netif = netdev_priv(dev);
733 return &netif->stats;
736 static int __on_net_schedule_list(netif_t *netif)
738 return netif->list.next != NULL;
741 static void remove_from_net_schedule_list(netif_t *netif)
743 spin_lock_irq(&net_schedule_list_lock);
744 if (likely(__on_net_schedule_list(netif))) {
745 list_del(&netif->list);
746 netif->list.next = NULL;
749 spin_unlock_irq(&net_schedule_list_lock);
752 static void add_to_net_schedule_list_tail(netif_t *netif)
754 if (__on_net_schedule_list(netif))
757 spin_lock_irq(&net_schedule_list_lock);
758 if (!__on_net_schedule_list(netif) &&
759 likely(netif_running(netif->dev) &&
760 netif_carrier_ok(netif->dev))) {
761 list_add_tail(&netif->list, &net_schedule_list);
764 spin_unlock_irq(&net_schedule_list_lock);
768 * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
769 * If this driver is pipelining transmit requests then we can be very
770 * aggressive in avoiding new-packet notifications -- frontend only needs to
771 * send a notification if there are no outstanding unreceived responses.
772 * If we are buffering transmit requests for any reason then we must be rather
773 * more conservative and treat this as the final check for pending work.
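 * (RING_FINAL_CHECK_FOR_REQUESTS differs from RING_HAS_UNCONSUMED_REQUESTS
 * in that, when no work is found, it also updates req_event so that the
 * frontend notifies us as soon as it posts another request.)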
775 void netif_schedule_work(netif_t *netif)
779 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
780 more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
782 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
786 add_to_net_schedule_list_tail(netif);
787 maybe_schedule_tx_action();
791 void netif_deschedule_work(netif_t *netif)
793 remove_from_net_schedule_list(netif);
797 static void tx_add_credit(netif_t *netif)
799 unsigned long max_burst;
802 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
803 * Otherwise the interface can seize up due to insufficient credit.
805 max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
806 max_burst = min(max_burst, 131072UL);
807 max_burst = max(max_burst, netif->credit_bytes);
809 netif->remaining_credit = min(netif->remaining_credit +
814 static void tx_credit_callback(unsigned long data)
816 netif_t *netif = (netif_t *)data;
817 tx_add_credit(netif);
818 netif_schedule_work(netif);
821 static inline void net_tx_action_dealloc(void)
823 gnttab_unmap_grant_ref_t *gop;
825 PEND_RING_IDX dc, dp;
832 /* Ensure we see all indexes enqueued by netif_idx_release(). */
836 * Free up any grants we have finished using
840 pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
841 gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
843 grant_tx_handle[pending_idx]);
846 ret = HYPERVISOR_grant_table_op(
847 GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
850 while (dealloc_cons != dp) {
851 pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
853 netif = pending_tx_info[pending_idx].netif;
855 make_tx_response(netif, &pending_tx_info[pending_idx].req,
858 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
864 static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
866 RING_IDX cons = netif->tx.req_cons;
869 make_tx_response(netif, txp, NETIF_RSP_ERROR);
872 txp = RING_GET_REQUEST(&netif->tx, cons++);
874 netif->tx.req_cons = cons;
875 netif_schedule_work(netif);
879 static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp,
882 netif_tx_request_t *first = txp;
883 RING_IDX cons = netif->tx.req_cons;
886 while (txp->flags & NETTXF_more_data) {
887 if (frags >= work_to_do) {
888 DPRINTK("Need more frags\n");
892 txp = RING_GET_REQUEST(&netif->tx, cons + frags);
893 if (txp->size > first->size) {
894 DPRINTK("Frags galore\n");
898 first->size -= txp->size;
901 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
902 DPRINTK("txp->offset: %x, size: %u\n",
903 txp->offset, txp->size);
911 static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
913 gnttab_map_grant_ref_t *mop)
915 struct skb_shared_info *shinfo = skb_shinfo(skb);
916 skb_frag_t *frags = shinfo->frags;
917 netif_tx_request_t *txp;
918 unsigned long pending_idx = *((u16 *)skb->data);
919 RING_IDX cons = netif->tx.req_cons;
922 /* Skip first skb fragment if it is on same page as header fragment. */
923 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
925 for (i = start; i < shinfo->nr_frags; i++) {
926 txp = RING_GET_REQUEST(&netif->tx, cons++);
927 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
929 gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
930 GNTMAP_host_map | GNTMAP_readonly,
931 txp->gref, netif->domid);
933 memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
935 pending_tx_info[pending_idx].netif = netif;
936 frags[i].page = (void *)pending_idx;
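/* Stash the pending-ring index in the frag's page pointer for now;
 * netbk_fill_frags() substitutes the real struct page, size and offset
 * once the grant map has completed successfully. */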
942 static int netbk_tx_check_mop(struct sk_buff *skb,
943 gnttab_map_grant_ref_t **mopp)
945 gnttab_map_grant_ref_t *mop = *mopp;
946 int pending_idx = *((u16 *)skb->data);
947 netif_t *netif = pending_tx_info[pending_idx].netif;
948 netif_tx_request_t *txp;
949 struct skb_shared_info *shinfo = skb_shinfo(skb);
950 int nr_frags = shinfo->nr_frags;
953 /* Check status of header. */
956 txp = &pending_tx_info[pending_idx].req;
957 make_tx_response(netif, txp, NETIF_RSP_ERROR);
958 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
962 __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
963 FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
964 grant_tx_handle[pending_idx] = mop->handle;
967 /* Skip first skb fragment if it is on same page as header fragment. */
968 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
970 for (i = start; i < nr_frags; i++) {
973 pending_idx = (unsigned long)shinfo->frags[i].page;
975 /* Check error status: if okay then remember grant handle. */
976 newerr = (++mop)->status;
977 if (likely(!newerr)) {
979 __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
980 FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
981 grant_tx_handle[pending_idx] = mop->handle;
982 /* Had a previous error? Invalidate this fragment. */
984 netif_idx_release(pending_idx);
988 /* Error on this fragment: respond to client with an error. */
989 txp = &pending_tx_info[pending_idx].req;
990 make_tx_response(netif, txp, NETIF_RSP_ERROR);
991 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
994 /* Not the first error? Preceding frags already invalidated. */
998 /* First error: invalidate header and preceding fragments. */
999 pending_idx = *((u16 *)skb->data);
1000 netif_idx_release(pending_idx);
1001 for (j = start; j < i; j++) {
1002 pending_idx = (unsigned long)shinfo->frags[j].page;
1003 netif_idx_release(pending_idx);
1006 /* Remember the error: invalidate all subsequent fragments. */
1014 static void netbk_fill_frags(struct sk_buff *skb)
1016 struct skb_shared_info *shinfo = skb_shinfo(skb);
1017 int nr_frags = shinfo->nr_frags;
1020 for (i = 0; i < nr_frags; i++) {
1021 skb_frag_t *frag = shinfo->frags + i;
1022 netif_tx_request_t *txp;
1023 unsigned long pending_idx;
1025 pending_idx = (unsigned long)frag->page;
1026 txp = &pending_tx_info[pending_idx].req;
1027 frag->page = virt_to_page(idx_to_kaddr(pending_idx));
1028 frag->size = txp->size;
1029 frag->page_offset = txp->offset;
1031 skb->len += txp->size;
1032 skb->data_len += txp->size;
1033 skb->truesize += txp->size;
1037 int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
1040 struct netif_extra_info *extra;
1041 RING_IDX cons = netif->tx.req_cons;
1044 if (unlikely(work_to_do-- <= 0)) {
1045 DPRINTK("Missing extra info\n");
1049 extra = (struct netif_extra_info *)
1050 RING_GET_REQUEST(&netif->tx, cons);
1051 if (unlikely(!extra->type ||
1052 extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
1053 netif->tx.req_cons = ++cons;
1054 DPRINTK("Invalid extra type: %d\n", extra->type);
1058 memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
1059 netif->tx.req_cons = ++cons;
1060 } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
1065 static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
1067 if (!gso->u.gso.size) {
1068 DPRINTK("GSO size must not be zero.\n");
1072 /* Currently only TCPv4 S.O. is supported. */
1073 if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
1074 DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
1078 skb_shinfo(skb)->gso_size = gso->u.gso.size;
1079 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1081 /* Header must be checked, and gso_segs computed. */
1082 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1083 skb_shinfo(skb)->gso_segs = 0;
1088 /* Called after netfront has transmitted */
1089 static void net_tx_action(unsigned long unused)
1091 struct list_head *ent;
1092 struct sk_buff *skb;
1094 netif_tx_request_t txreq;
1095 struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
1098 gnttab_map_grant_ref_t *mop;
1099 unsigned int data_len;
1100 int ret, work_to_do;
1102 if (dealloc_cons != dealloc_prod)
1103 net_tx_action_dealloc();
1106 while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
1107 !list_empty(&net_schedule_list)) {
1108 /* Get a netif from the list with work to do. */
1109 ent = net_schedule_list.next;
1110 netif = list_entry(ent, netif_t, list);
1112 remove_from_net_schedule_list(netif);
1114 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
1120 i = netif->tx.req_cons;
1121 rmb(); /* Ensure that we see the request before we copy it. */
1122 memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
1124 /* Credit-based scheduling. */
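/*
 * Each interface may send credit_bytes of traffic per credit_usec window:
 * remaining_credit is drained by the packet size below and replenished by
 * tx_add_credit() when credit_timeout fires.  As a purely illustrative
 * setting, credit_bytes = 125000 with credit_usec = 1000000 would cap the
 * interface at roughly 1 Mbit/s.
 */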
1125 if (txreq.size > netif->remaining_credit) {
1126 unsigned long now = jiffies;
1127 unsigned long next_credit =
1128 netif->credit_timeout.expires +
1129 msecs_to_jiffies(netif->credit_usec / 1000);
1131 /* Timer could already be pending in rare cases. */
1132 if (timer_pending(&netif->credit_timeout)) {
1137 /* Passed the point where we can replenish credit? */
1138 if (time_after_eq(now, next_credit)) {
1139 netif->credit_timeout.expires = now;
1140 tx_add_credit(netif);
1143 /* Still too big to send right now? Set a callback. */
1144 if (txreq.size > netif->remaining_credit) {
1145 netif->credit_timeout.data =
1146 (unsigned long)netif;
1147 netif->credit_timeout.function =
1149 __mod_timer(&netif->credit_timeout,
1155 netif->remaining_credit -= txreq.size;
1158 netif->tx.req_cons = ++i;
1160 memset(extras, 0, sizeof(extras));
1161 if (txreq.flags & NETTXF_extra_info) {
1162 work_to_do = netbk_get_extras(netif, extras,
1164 i = netif->tx.req_cons;
1165 if (unlikely(work_to_do < 0)) {
1166 netbk_tx_err(netif, &txreq, i);
1171 ret = netbk_count_requests(netif, &txreq, work_to_do);
1172 if (unlikely(ret < 0)) {
1173 netbk_tx_err(netif, &txreq, i - ret);
1178 if (unlikely(ret > MAX_SKB_FRAGS)) {
1179 DPRINTK("Too many frags\n");
1180 netbk_tx_err(netif, &txreq, i);
1184 if (unlikely(txreq.size < ETH_HLEN)) {
1185 DPRINTK("Bad packet size: %d\n", txreq.size);
1186 netbk_tx_err(netif, &txreq, i);
1190 /* The payload must not cross a page boundary, as it cannot be fragmented. */
1191 if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
1192 DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
1193 txreq.offset, txreq.size,
1194 (txreq.offset &~PAGE_MASK) + txreq.size);
1195 netbk_tx_err(netif, &txreq, i);
1199 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
1201 data_len = (txreq.size > PKT_PROT_LEN &&
1202 ret < MAX_SKB_FRAGS) ?
1203 PKT_PROT_LEN : txreq.size;
1205 skb = alloc_skb(data_len+16, GFP_ATOMIC);
1206 if (unlikely(skb == NULL)) {
1207 DPRINTK("Can't allocate a skb in start_xmit.\n");
1208 netbk_tx_err(netif, &txreq, i);
1212 /* Packets passed to netif_rx() must have some headroom. */
1213 skb_reserve(skb, 16);
1215 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
1216 struct netif_extra_info *gso;
1217 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
1219 if (netbk_set_skb_gso(skb, gso)) {
1221 netbk_tx_err(netif, &txreq, i);
1226 gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
1227 GNTMAP_host_map | GNTMAP_readonly,
1228 txreq.gref, netif->domid);
1231 memcpy(&pending_tx_info[pending_idx].req,
1232 &txreq, sizeof(txreq));
1233 pending_tx_info[pending_idx].netif = netif;
1234 *((u16 *)skb->data) = pending_idx;
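/* The pending index is stored in the first two bytes of the skb data area
 * so netbk_tx_check_mop() can recover it when the grant-map results are
 * checked. */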
1236 __skb_put(skb, data_len);
1238 skb_shinfo(skb)->nr_frags = ret;
1239 if (data_len < txreq.size) {
1240 skb_shinfo(skb)->nr_frags++;
1241 skb_shinfo(skb)->frags[0].page =
1242 (void *)(unsigned long)pending_idx;
1244 /* Discriminate from any valid pending_idx value. */
1245 skb_shinfo(skb)->frags[0].page = (void *)~0UL;
1248 __skb_queue_tail(&tx_queue, skb);
1252 mop = netbk_get_requests(netif, skb, mop);
1254 netif->tx.req_cons = i;
1255 netif_schedule_work(netif);
1257 if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
1261 if (mop == tx_map_ops)
1264 ret = HYPERVISOR_grant_table_op(
1265 GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
1269 while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
1270 netif_tx_request_t *txp;
1272 pending_idx = *((u16 *)skb->data);
1273 netif = pending_tx_info[pending_idx].netif;
1274 txp = &pending_tx_info[pending_idx].req;
1276 /* Check the remap error code. */
1277 if (unlikely(netbk_tx_check_mop(skb, &mop))) {
1278 printk(KERN_ALERT "#### netback grant fails\n");
1279 skb_shinfo(skb)->nr_frags = 0;
1284 data_len = skb->len;
1286 (void *)(idx_to_kaddr(pending_idx)|txp->offset),
1288 if (data_len < txp->size) {
1289 /* Append the packet payload as a fragment. */
1290 txp->offset += data_len;
1291 txp->size -= data_len;
1293 /* Schedule a response immediately. */
1294 netif_idx_release(pending_idx);
1298 * Old frontends do not assert data_validated but we
1299 * can infer it from csum_blank so test both flags.
1301 if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
1302 skb->ip_summed = CHECKSUM_UNNECESSARY;
1303 skb->proto_data_valid = 1;
1305 skb->ip_summed = CHECKSUM_NONE;
1306 skb->proto_data_valid = 0;
1308 skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
1310 netbk_fill_frags(skb);
1312 skb->dev = netif->dev;
1313 skb->protocol = eth_type_trans(skb, skb->dev);
1315 netif->stats.rx_bytes += skb->len;
1316 netif->stats.rx_packets++;
1319 netif->dev->last_rx = jiffies;
1323 static void netif_idx_release(u16 pending_idx)
1325 static DEFINE_SPINLOCK(_lock);
1326 unsigned long flags;
1328 spin_lock_irqsave(&_lock, flags);
1329 dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
1330 /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
1333 spin_unlock_irqrestore(&_lock, flags);
1335 tasklet_schedule(&net_tx_tasklet);
1338 static void netif_page_release(struct page *page)
1340 /* Ready for next use. */
1341 init_page_count(page);
1342 netif_idx_release(page->index);
1345 irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1347 netif_t *netif = dev_id;
1349 add_to_net_schedule_list_tail(netif);
1350 maybe_schedule_tx_action();
1352 if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif))
1353 netif_wake_queue(netif->dev);
1358 static void make_tx_response(netif_t *netif,
1359 netif_tx_request_t *txp,
1362 RING_IDX i = netif->tx.rsp_prod_pvt;
1363 netif_tx_response_t *resp;
1366 resp = RING_GET_RESPONSE(&netif->tx, i);
1370 if (txp->flags & NETTXF_extra_info)
1371 RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
1373 netif->tx.rsp_prod_pvt = ++i;
1374 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
1376 notify_remote_via_irq(netif->irq);
1378 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
1379 if (i == netif->tx.req_cons) {
1381 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
1383 add_to_net_schedule_list_tail(netif);
1388 static netif_rx_response_t *make_rx_response(netif_t *netif,
1395 RING_IDX i = netif->rx.rsp_prod_pvt;
1396 netif_rx_response_t *resp;
1398 resp = RING_GET_RESPONSE(&netif->rx, i);
1399 resp->offset = offset;
1400 resp->flags = flags;
1402 resp->status = (s16)size;
1404 resp->status = (s16)st;
1406 netif->rx.rsp_prod_pvt = ++i;
1411 #ifdef NETBE_DEBUG_INTERRUPT
1412 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
1414 struct list_head *ent;
1418 printk(KERN_ALERT "netif_schedule_list:\n");
1419 spin_lock_irq(&net_schedule_list_lock);
1421 list_for_each (ent, &net_schedule_list) {
1422 netif = list_entry(ent, netif_t, list);
1423 printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
1424 "rx_resp_prod=%08x\n",
1425 i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
1426 printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
1427 netif->tx.req_cons, netif->tx.rsp_prod_pvt);
1428 printk(KERN_ALERT " shared(rx_req_prod=%08x "
1429 "rx_resp_prod=%08x\n",
1430 netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
1431 printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
1432 netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
1433 printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
1434 netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
1438 spin_unlock_irq(&net_schedule_list_lock);
1439 printk(KERN_ALERT " ** End of netif_schedule_list **\n");
1445 static int __init netback_init(void)
1450 if (!is_running_on_xen())
1453 /* We can increase reservation by this much in net_rx_action(). */
1454 balloon_update_driver_allowance(NET_RX_RING_SIZE);
1456 skb_queue_head_init(&rx_queue);
1457 skb_queue_head_init(&tx_queue);
1459 init_timer(&net_timer);
1461 net_timer.function = net_alarm;
1463 mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
1464 if (mmap_pages == NULL) {
1465 printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
1469 for (i = 0; i < MAX_PENDING_REQS; i++) {
1470 page = mmap_pages[i];
1471 SetPageForeign(page, netif_page_release);
1476 pending_prod = MAX_PENDING_REQS;
1477 for (i = 0; i < MAX_PENDING_REQS; i++)
1478 pending_ring[i] = i;
1480 spin_lock_init(&net_schedule_list_lock);
1481 INIT_LIST_HEAD(&net_schedule_list);
1483 netif_xenbus_init();
1485 #ifdef NETBE_DEBUG_INTERRUPT
1486 (void)bind_virq_to_irqhandler(
1498 module_init(netback_init);
1500 MODULE_LICENSE("Dual BSD/GPL");