1 /******************************************************************************
2 * drivers/xen/netback/netback.c
4 * Back-end of the driver for virtual network devices. This portion of the
5 * driver exports a 'unified' network-device interface that can be accessed
6 * by any operating system that implements a compatible front end. A
7 * reference front-end implementation can be found in:
8 * drivers/xen/netfront/netfront.c
10 * Copyright (c) 2002-2005, K A Fraser
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 #include <xen/balloon.h>
39 #include <xen/interface/memory.h>
42 /*#define NETBE_DEBUG_INTERRUPT*/
44 struct netbk_rx_meta {
50 static void netif_idx_release(u16 pending_idx);
51 static void netif_page_release(struct page *page);
52 static void make_tx_response(netif_t *netif,
53 netif_tx_request_t *txp,
55 static netif_rx_response_t *make_rx_response(netif_t *netif,
62 static void net_tx_action(unsigned long unused);
63 static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
65 static void net_rx_action(unsigned long unused);
66 static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
68 static struct timer_list net_timer;
70 #define MAX_PENDING_REQS 256
72 static struct sk_buff_head rx_queue;
74 static struct page **mmap_pages;
75 static inline unsigned long idx_to_kaddr(unsigned int idx)
77 return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
80 #define PKT_PROT_LEN 64
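/*
 * Editorial note (not in the original source): on the transmit path at most
 * PKT_PROT_LEN bytes of a request are copied into the skb's linear area,
 * enough for typical Ethernet/IP/TCP headers to be parsed without touching
 * the mapped grant page; any remainder is attached as a fragment.
 */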
82 static struct pending_tx_info {
83 netif_tx_request_t req;
85 } pending_tx_info[MAX_PENDING_REQS];
86 static u16 pending_ring[MAX_PENDING_REQS];
87 typedef unsigned int PEND_RING_IDX;
88 #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
89 static PEND_RING_IDX pending_prod, pending_cons;
90 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
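/*
 * Illustrative sketch (editorial, not part of the original driver): the
 * pending ring is indexed by free-running counters, so MASK_PEND_IDX()
 * relies on MAX_PENDING_REQS being a power of two.  With 256 entries an
 * index of 260 selects slot 4, and NR_PENDING_REQS is the number of entries
 * currently handed out to in-flight transmit requests.  The hypothetical
 * helper below merely restates the slot arithmetic.
 */
static inline unsigned int example_pending_slot(PEND_RING_IDX idx)
{
	/* Equivalent to idx % MAX_PENDING_REQS for a power-of-two size. */
	return MASK_PEND_IDX(idx);
}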
92 /* Freed TX SKBs get batched on this ring before return to pending_ring. */
93 static u16 dealloc_ring[MAX_PENDING_REQS];
94 static PEND_RING_IDX dealloc_prod, dealloc_cons;
96 static struct sk_buff_head tx_queue;
98 static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
99 static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
100 static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
102 static struct list_head net_schedule_list;
103 static spinlock_t net_schedule_list_lock;
105 #define MAX_MFN_ALLOC 64
106 static unsigned long mfn_list[MAX_MFN_ALLOC];
107 static unsigned int alloc_index = 0;
109 static inline unsigned long alloc_mfn(void)
111 return mfn_list[--alloc_index];
114 static int check_mfn(int nr)
116 struct xen_memory_reservation reservation = {
121 if (likely(alloc_index >= nr))
124 set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
125 reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
126 alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
129 return alloc_index >= nr ? 0 : -ENOMEM;
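/*
 * Editorial note (not in the original source): alloc_mfn() pops a machine
 * frame from the small mfn_list pool, and check_mfn() tops that pool up via
 * XENMEM_increase_reservation.  The rx path needs one fresh MFN per flipped
 * page so a replacement can be installed in the P2M table before the old
 * frame is handed to the guest.
 */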
132 static inline void maybe_schedule_tx_action(void)
135 if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
136 !list_empty(&net_schedule_list))
137 tasklet_schedule(&net_tx_tasklet);
141 * A gross way of confirming the origin of an skb data page. The slab
142 * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
144 static inline int is_xen_skb(struct sk_buff *skb)
146 extern kmem_cache_t *skbuff_cachep;
147 kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
148 return (cp == skbuff_cachep);
152 * We can flip without copying the packet unless:
153 * 1. The data is not allocated from our special cache; or
154 * 2. The main data area is shared; or
155 * 3. One or more fragments are shared; or
156 * 4. There are chained fragments.
158 static inline int is_flippable_skb(struct sk_buff *skb)
162 if (!is_xen_skb(skb) || skb_cloned(skb))
165 for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
166 if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
170 if (skb_shinfo(skb)->frag_list != NULL)
176 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
178 struct skb_shared_info *ninfo;
179 struct sk_buff *nskb;
180 unsigned long offset;
185 BUG_ON(skb_shinfo(skb)->frag_list != NULL);
187 nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC);
191 skb_reserve(nskb, 16);
192 headlen = nskb->end - nskb->data;
193 if (headlen > skb_headlen(skb))
194 headlen = skb_headlen(skb);
195 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
198 ninfo = skb_shinfo(nskb);
199 ninfo->gso_size = skb_shinfo(skb)->gso_size;
200 ninfo->gso_type = skb_shinfo(skb)->gso_type;
203 len = skb->len - headlen;
205 nskb->len = skb->len;
206 nskb->data_len = len;
207 nskb->truesize += len;
214 if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
219 copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
220 zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
222 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
226 ret = skb_copy_bits(skb, offset, page_address(page), copy);
229 ninfo->frags[ninfo->nr_frags].page = page;
230 ninfo->frags[ninfo->nr_frags].page_offset = 0;
231 ninfo->frags[ninfo->nr_frags].size = copy;
238 offset = nskb->data - skb->data;
240 nskb->h.raw = skb->h.raw + offset;
241 nskb->nh.raw = skb->nh.raw + offset;
242 nskb->mac.raw = skb->mac.raw + offset;
252 static inline int netbk_max_required_rx_slots(netif_t *netif)
254 if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
255 return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
256 return 1; /* all in one */
259 static inline int netbk_queue_full(netif_t *netif)
261 RING_IDX peek = netif->rx_req_cons_peek;
262 RING_IDX needed = netbk_max_required_rx_slots(netif);
264 return ((netif->rx.sring->req_prod - peek) < needed) ||
265 ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
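/*
 * Worked example (editorial, not from the original source): with
 * NET_RX_RING_SIZE == 256, rx_req_cons_peek == 1000, req_prod == 1003 and
 * rsp_prod_pvt == 760, an SG/TSO frontend sees only 3 unconsumed requests
 * and 16 free response slots -- both below the MAX_SKB_FRAGS + 2 it may
 * need -- so netbk_queue_full() reports the ring as full and the transmit
 * queue is stopped until the frontend posts more buffers.
 */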
268 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
270 netif_t *netif = netdev_priv(dev);
272 BUG_ON(skb->dev != dev);
274 /* Drop the packet if the target domain has no receive buffers. */
275 if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev)))
278 if (unlikely(netbk_queue_full(netif))) {
279 /* Not a BUG_ON() -- misbehaving netfront can trigger this. */
280 if (netbk_can_queue(dev))
281 DPRINTK("Queue full but not stopped!\n");
285 /* Copy the packet here if it's destined for a flipping
286 interface but isn't flippable (e.g. extra references to data). */
289 if (!netif->copying_receiver && !is_flippable_skb(skb)) {
290 struct sk_buff *nskb = netbk_copy_skb(skb);
291 if (unlikely(nskb == NULL))
293 /* Copy only the header fields we use in this driver. */
294 nskb->dev = skb->dev;
295 nskb->ip_summed = skb->ip_summed;
296 nskb->proto_data_valid = skb->proto_data_valid;
301 netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
302 !!skb_shinfo(skb)->gso_size;
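/*
 * Worked example (editorial note): a GSO skb carrying three frags reserves
 * 3 + 1 + 1 = 5 ring slots here -- one per frag, one for the linear header,
 * and one extra_info slot advertising gso_size.
 */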
305 if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
306 netif->rx.sring->req_event = netif->rx_req_cons_peek +
307 netbk_max_required_rx_slots(netif);
308 mb(); /* request notification /then/ check & stop the queue */
309 if (netbk_queue_full(netif))
310 netif_stop_queue(dev);
313 skb_queue_tail(&rx_queue, skb);
314 tasklet_schedule(&net_rx_tasklet);
319 netif->stats.tx_dropped++;
325 static void xen_network_done_notify(void)
327 static struct net_device *eth0_dev = NULL;
328 if (unlikely(eth0_dev == NULL))
329 eth0_dev = __dev_get_by_name("eth0");
330 netif_rx_schedule(eth0_dev);
333 * Add the following to the poll() function in a NAPI driver (Tigon3 is an example):
334 * if ( xen_network_done() )
335 * tg3_enable_ints(tp);
337 int xen_network_done(void)
339 return skb_queue_empty(&rx_queue);
343 struct netrx_pending_operations {
344 unsigned trans_prod, trans_cons;
345 unsigned mmu_prod, mmu_cons;
346 unsigned mcl_prod, mcl_cons;
347 unsigned copy_prod, copy_cons;
348 unsigned meta_prod, meta_cons;
350 gnttab_transfer_t *trans;
352 multicall_entry_t *mcl;
353 struct netbk_rx_meta *meta;
356 /* Set up the grant operations for this fragment. If it's a flipping
357 interface, we also set up the unmap request from here. */
358 static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
359 int i, struct netrx_pending_operations *npo,
360 struct page *page, unsigned long size,
361 unsigned long offset)
364 gnttab_transfer_t *gop;
365 gnttab_copy_t *copy_gop;
366 multicall_entry_t *mcl;
367 netif_rx_request_t *req;
368 unsigned long old_mfn, new_mfn;
370 old_mfn = virt_to_mfn(page_address(page));
372 req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
373 if (netif->copying_receiver) {
374 /* The fragment needs to be copied rather than flipped. */
377 copy_gop = npo->copy + npo->copy_prod++;
378 copy_gop->flags = GNTCOPY_dest_gref;
379 if (PageForeign(page)) {
380 struct pending_tx_info *src_pend =
381 &pending_tx_info[page->index];
382 copy_gop->source.domid = src_pend->netif->domid;
383 copy_gop->source.u.ref = src_pend->req.gref;
384 copy_gop->flags |= GNTCOPY_source_gref;
386 copy_gop->source.domid = DOMID_SELF;
387 copy_gop->source.u.gmfn = old_mfn;
389 copy_gop->source.offset = offset;
390 copy_gop->dest.domid = netif->domid;
391 copy_gop->dest.offset = 0;
392 copy_gop->dest.u.ref = req->gref;
393 copy_gop->len = size;
396 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
397 new_mfn = alloc_mfn();
400 * Set the new P2M table entry before
401 * reassigning the old data page. Heed the
402 * comment in pgtable-2level.h:pte_page(). :-)
404 set_phys_to_machine(page_to_pfn(page), new_mfn);
406 mcl = npo->mcl + npo->mcl_prod++;
407 MULTI_update_va_mapping(mcl,
408 (unsigned long)page_address(page),
409 pfn_pte_ma(new_mfn, PAGE_KERNEL),
412 mmu = npo->mmu + npo->mmu_prod++;
413 mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
415 mmu->val = page_to_pfn(page);
418 gop = npo->trans + npo->trans_prod++;
420 gop->domid = netif->domid;
421 gop->ref = req->gref;
426 static void netbk_gop_skb(struct sk_buff *skb,
427 struct netrx_pending_operations *npo)
429 netif_t *netif = netdev_priv(skb->dev);
430 int nr_frags = skb_shinfo(skb)->nr_frags;
433 struct netbk_rx_meta *head_meta, *meta;
435 head_meta = npo->meta + npo->meta_prod++;
436 head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
437 head_meta->frag.size = skb_shinfo(skb)->gso_size;
438 extra = !!head_meta->frag.size + 1;
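/*
 * Editorial note (not in the original source): head_meta borrows its frag
 * fields to stash gso_type/gso_size for the response path, and "extra"
 * counts the rx request slots consumed ahead of the frags -- one for the
 * packet header plus one more when a GSO extra_info slot will follow it.
 */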
440 for (i = 0; i < nr_frags; i++) {
441 meta = npo->meta + npo->meta_prod++;
442 meta->frag = skb_shinfo(skb)->frags[i];
443 meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
446 meta->frag.page_offset);
450 * This must occur at the end to ensure that we don't trash
451 * skb_shinfo until we're done.
453 head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
454 virt_to_page(skb->data),
456 offset_in_page(skb->data));
458 netif->rx.req_cons += nr_frags + extra;
461 static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
465 for (i = 0; i < nr_frags; i++)
466 put_page(meta[i].frag.page);
469 /* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
470 used to set up the operations on the top of
471 netrx_pending_operations, which have since been done. Check that
472 they didn't give any errors and advance over them. */
473 static int netbk_check_gop(int nr_frags, domid_t domid,
474 struct netrx_pending_operations *npo)
476 multicall_entry_t *mcl;
477 gnttab_transfer_t *gop;
478 gnttab_copy_t *copy_op;
479 int status = NETIF_RSP_OKAY;
482 for (i = 0; i <= nr_frags; i++) {
483 if (npo->meta[npo->meta_cons + i].copy) {
484 copy_op = npo->copy + npo->copy_cons++;
485 if (copy_op->status != GNTST_okay) {
486 DPRINTK("Bad status %d from copy to DOM%d.\n",
488 status = NETIF_RSP_ERROR;
491 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
492 mcl = npo->mcl + npo->mcl_cons++;
493 /* The update_va_mapping() must not fail. */
494 BUG_ON(mcl->result != 0);
497 gop = npo->trans + npo->trans_cons++;
498 /* Check the reassignment error code. */
499 if (gop->status != 0) {
500 DPRINTK("Bad status %d from grant transfer to DOM%u\n",
503 * Page no longer belongs to us unless
504 * GNTST_bad_page, but that should be
505 * a fatal error anyway.
507 BUG_ON(gop->status == GNTST_bad_page);
508 status = NETIF_RSP_ERROR;
516 static void netbk_add_frag_responses(netif_t *netif, int status,
517 struct netbk_rx_meta *meta, int nr_frags)
520 unsigned long offset;
522 for (i = 0; i < nr_frags; i++) {
524 int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
529 offset = meta[i].frag.page_offset;
530 make_rx_response(netif, id, status, offset,
531 meta[i].frag.size, flags);
535 static void net_rx_action(unsigned long unused)
537 netif_t *netif = NULL;
540 netif_rx_response_t *resp;
541 multicall_entry_t *mcl;
542 struct sk_buff_head rxq;
548 unsigned long offset;
551 * Putting hundreds of bytes on the stack is considered rude.
552 * Static works because a tasklet can only be on one CPU at any time.
554 static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
555 static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
556 static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
557 static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
558 static unsigned char rx_notify[NR_IRQS];
559 static u16 notify_list[NET_RX_RING_SIZE];
560 static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
562 struct netrx_pending_operations npo = {
564 .trans = grant_trans_op,
569 skb_queue_head_init(&rxq);
573 while ((skb = skb_dequeue(&rx_queue)) != NULL) {
574 nr_frags = skb_shinfo(skb)->nr_frags;
575 *(int *)skb->cb = nr_frags;
577 if (!xen_feature(XENFEAT_auto_translated_physmap) &&
578 check_mfn(nr_frags + 1)) {
579 /* Memory squeeze? Back off for an arbitrary while. */
580 if (net_ratelimit())
581 WPRINTK("Memory squeeze in netback "
583 mod_timer(&net_timer, jiffies + HZ);
584 skb_queue_head(&rx_queue, skb);
588 netbk_gop_skb(skb, &npo);
590 count += nr_frags + 1;
592 __skb_queue_tail(&rxq, skb);
594 /* Filled the batch queue? */
595 if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
600 !xen_feature(XENFEAT_auto_translated_physmap)) {
601 mcl = npo.mcl + npo.mcl_prod++;
603 BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
604 mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
606 mcl->op = __HYPERVISOR_mmu_update;
607 mcl->args[0] = (unsigned long)rx_mmu;
608 mcl->args[1] = npo.mmu_prod;
610 mcl->args[3] = DOMID_SELF;
613 if (npo.trans_prod) {
614 mcl = npo.mcl + npo.mcl_prod++;
615 mcl->op = __HYPERVISOR_grant_table_op;
616 mcl->args[0] = GNTTABOP_transfer;
617 mcl->args[1] = (unsigned long)grant_trans_op;
618 mcl->args[2] = npo.trans_prod;
622 mcl = npo.mcl + npo.mcl_prod++;
623 mcl->op = __HYPERVISOR_grant_table_op;
624 mcl->args[0] = GNTTABOP_copy;
625 mcl->args[1] = (unsigned long)grant_copy_op;
626 mcl->args[2] = npo.copy_prod;
633 BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
634 BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
635 BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
636 BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
637 BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
639 ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
642 while ((skb = __skb_dequeue(&rxq)) != NULL) {
643 nr_frags = *(int *)skb->cb;
645 netif = netdev_priv(skb->dev);
646 /* We can't rely on skb_release_data to release the
647 pages used by fragments for us, since it tries to
648 touch the pages in the fraglist. If we're in
649 flipping mode, that doesn't work. In copying mode,
650 we still have access to all of the pages, and so
651 it's safe to let release_data deal with it. */
652 /* (Freeing the fragments is safe since we copy
653 non-linear skbs destined for flipping interfaces) */
654 if (!netif->copying_receiver) {
655 atomic_set(&(skb_shinfo(skb)->dataref), 1);
656 skb_shinfo(skb)->frag_list = NULL;
657 skb_shinfo(skb)->nr_frags = 0;
658 netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
661 netif->stats.tx_bytes += skb->len;
662 netif->stats.tx_packets++;
664 status = netbk_check_gop(nr_frags, netif->domid, &npo);
666 id = meta[npo.meta_cons].id;
667 flags = nr_frags ? NETRXF_more_data : 0;
669 if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
670 flags |= NETRXF_csum_blank | NETRXF_data_validated;
671 else if (skb->proto_data_valid) /* remote but checksummed? */
672 flags |= NETRXF_data_validated;
674 if (meta[npo.meta_cons].copy)
677 offset = offset_in_page(skb->data);
678 resp = make_rx_response(netif, id, status, offset,
679 skb_headlen(skb), flags);
681 if (meta[npo.meta_cons].frag.size) {
682 struct netif_extra_info *gso =
683 (struct netif_extra_info *)
684 RING_GET_RESPONSE(&netif->rx,
685 netif->rx.rsp_prod_pvt++);
687 resp->flags |= NETRXF_extra_info;
689 gso->u.gso.size = meta[npo.meta_cons].frag.size;
690 gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
692 gso->u.gso.features = 0;
694 gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
698 netbk_add_frag_responses(netif, status,
699 meta + npo.meta_cons + 1,
702 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
704 if (ret && !rx_notify[irq]) {
706 notify_list[notify_nr++] = irq;
709 if (netif_queue_stopped(netif->dev) &&
710 !netbk_queue_full(netif))
711 netif_wake_queue(netif->dev);
715 npo.meta_cons += nr_frags + 1;
718 while (notify_nr != 0) {
719 irq = notify_list[--notify_nr];
721 notify_remote_via_irq(irq);
724 /* More work to do? */
725 if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
726 tasklet_schedule(&net_rx_tasklet);
729 xen_network_done_notify();
733 static void net_alarm(unsigned long unused)
735 tasklet_schedule(&net_rx_tasklet);
738 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
740 netif_t *netif = netdev_priv(dev);
741 return &netif->stats;
744 static int __on_net_schedule_list(netif_t *netif)
746 return netif->list.next != NULL;
749 static void remove_from_net_schedule_list(netif_t *netif)
751 spin_lock_irq(&net_schedule_list_lock);
752 if (likely(__on_net_schedule_list(netif))) {
753 list_del(&netif->list);
754 netif->list.next = NULL;
757 spin_unlock_irq(&net_schedule_list_lock);
760 static void add_to_net_schedule_list_tail(netif_t *netif)
762 if (__on_net_schedule_list(netif))
765 spin_lock_irq(&net_schedule_list_lock);
766 if (!__on_net_schedule_list(netif) &&
767 likely(netif_running(netif->dev) &&
768 netif_carrier_ok(netif->dev))) {
769 list_add_tail(&netif->list, &net_schedule_list);
772 spin_unlock_irq(&net_schedule_list_lock);
776 * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
777 * If this driver is pipelining transmit requests then we can be very
778 * aggressive in avoiding new-packet notifications -- frontend only needs to
779 * send a notification if there are no outstanding unreceived responses.
780 * If we may buffer transmit requests for any reason then we must be rather
781 * more conservative and treat this as the final check for pending work.
783 void netif_schedule_work(netif_t *netif)
787 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
788 more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
790 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
794 add_to_net_schedule_list_tail(netif);
795 maybe_schedule_tx_action();
799 void netif_deschedule_work(netif_t *netif)
801 remove_from_net_schedule_list(netif);
805 static void tx_add_credit(netif_t *netif)
807 unsigned long max_burst;
810 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
811 * Otherwise the interface can seize up due to insufficient credit.
813 max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
814 max_burst = min(max_burst, 131072UL);
815 max_burst = max(max_burst, netif->credit_bytes);
817 netif->remaining_credit = min(netif->remaining_credit +
822 static void tx_credit_callback(unsigned long data)
824 netif_t *netif = (netif_t *)data;
825 tx_add_credit(netif);
826 netif_schedule_work(netif);
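/*
 * Illustrative sketch of the credit scheduler (editorial, not part of the
 * original driver): a vif configured with credit_bytes = 100000 and
 * credit_usec = 100000 may transmit roughly 100kB per 100ms.  The
 * hypothetical helper below mirrors the replenish step in tx_add_credit():
 * top the credit up by one period's allowance, but never beyond max_burst.
 * E.g. remaining = 20000, credit_bytes = 100000, max_burst = 131072 gives
 * a new credit of 120000.
 */
static inline unsigned long example_replenish_credit(unsigned long remaining,
						     unsigned long credit_bytes,
						     unsigned long max_burst)
{
	return min(remaining + credit_bytes, max_burst);
}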
829 static inline void net_tx_action_dealloc(void)
831 gnttab_unmap_grant_ref_t *gop;
833 PEND_RING_IDX dc, dp;
840 /* Ensure we see all indexes enqueued by netif_idx_release(). */
844 * Free up any grants we have finished using
848 pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
849 gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
851 grant_tx_handle[pending_idx]);
854 ret = HYPERVISOR_grant_table_op(
855 GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
858 while (dealloc_cons != dp) {
859 pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
861 netif = pending_tx_info[pending_idx].netif;
863 make_tx_response(netif, &pending_tx_info[pending_idx].req,
866 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
872 static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
874 RING_IDX cons = netif->tx.req_cons;
877 make_tx_response(netif, txp, NETIF_RSP_ERROR);
880 txp = RING_GET_REQUEST(&netif->tx, cons++);
882 netif->tx.req_cons = cons;
883 netif_schedule_work(netif);
887 static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
888 netif_tx_request_t *txp, int work_to_do)
890 RING_IDX cons = netif->tx.req_cons;
893 if (!(first->flags & NETTXF_more_data))
897 if (frags >= work_to_do) {
898 DPRINTK("Need more frags\n");
902 if (unlikely(frags >= MAX_SKB_FRAGS)) {
903 DPRINTK("Too many frags\n");
907 memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
909 if (txp->size > first->size) {
910 DPRINTK("Frags galore\n");
914 first->size -= txp->size;
917 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
918 DPRINTK("txp->offset: %x, size: %u\n",
919 txp->offset, txp->size);
922 } while ((txp++)->flags & NETTXF_more_data);
927 static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
929 netif_tx_request_t *txp,
930 gnttab_map_grant_ref_t *mop)
932 struct skb_shared_info *shinfo = skb_shinfo(skb);
933 skb_frag_t *frags = shinfo->frags;
934 unsigned long pending_idx = *((u16 *)skb->data);
937 /* Skip first skb fragment if it is on same page as header fragment. */
938 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
940 for (i = start; i < shinfo->nr_frags; i++, txp++) {
941 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
943 gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
944 GNTMAP_host_map | GNTMAP_readonly,
945 txp->gref, netif->domid);
947 memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
949 pending_tx_info[pending_idx].netif = netif;
950 frags[i].page = (void *)pending_idx;
956 static int netbk_tx_check_mop(struct sk_buff *skb,
957 gnttab_map_grant_ref_t **mopp)
959 gnttab_map_grant_ref_t *mop = *mopp;
960 int pending_idx = *((u16 *)skb->data);
961 netif_t *netif = pending_tx_info[pending_idx].netif;
962 netif_tx_request_t *txp;
963 struct skb_shared_info *shinfo = skb_shinfo(skb);
964 int nr_frags = shinfo->nr_frags;
967 /* Check status of header. */
970 txp = &pending_tx_info[pending_idx].req;
971 make_tx_response(netif, txp, NETIF_RSP_ERROR);
972 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
976 __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
977 FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
978 grant_tx_handle[pending_idx] = mop->handle;
981 /* Skip first skb fragment if it is on same page as header fragment. */
982 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
984 for (i = start; i < nr_frags; i++) {
987 pending_idx = (unsigned long)shinfo->frags[i].page;
989 /* Check error status: if okay then remember grant handle. */
990 newerr = (++mop)->status;
991 if (likely(!newerr)) {
993 __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
994 FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
995 grant_tx_handle[pending_idx] = mop->handle;
996 /* Had a previous error? Invalidate this fragment. */
998 netif_idx_release(pending_idx);
1002 /* Error on this fragment: respond to client with an error. */
1003 txp = &pending_tx_info[pending_idx].req;
1004 make_tx_response(netif, txp, NETIF_RSP_ERROR);
1005 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
1008 /* Not the first error? Preceding frags already invalidated. */
1012 /* First error: invalidate header and preceding fragments. */
1013 pending_idx = *((u16 *)skb->data);
1014 netif_idx_release(pending_idx);
1015 for (j = start; j < i; j++) {
1016 pending_idx = (unsigned long)shinfo->frags[j].page;
1017 netif_idx_release(pending_idx);
1020 /* Remember the error: invalidate all subsequent fragments. */
1028 static void netbk_fill_frags(struct sk_buff *skb)
1030 struct skb_shared_info *shinfo = skb_shinfo(skb);
1031 int nr_frags = shinfo->nr_frags;
1034 for (i = 0; i < nr_frags; i++) {
1035 skb_frag_t *frag = shinfo->frags + i;
1036 netif_tx_request_t *txp;
1037 unsigned long pending_idx;
1039 pending_idx = (unsigned long)frag->page;
1040 txp = &pending_tx_info[pending_idx].req;
1041 frag->page = virt_to_page(idx_to_kaddr(pending_idx));
1042 frag->size = txp->size;
1043 frag->page_offset = txp->offset;
1045 skb->len += txp->size;
1046 skb->data_len += txp->size;
1047 skb->truesize += txp->size;
1051 int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
1054 struct netif_extra_info extra;
1055 RING_IDX cons = netif->tx.req_cons;
1058 if (unlikely(work_to_do-- <= 0)) {
1059 DPRINTK("Missing extra info\n");
1063 memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
1065 if (unlikely(!extra.type ||
1066 extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
1067 netif->tx.req_cons = ++cons;
1068 DPRINTK("Invalid extra type: %d\n", extra.type);
1072 memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
1073 netif->tx.req_cons = ++cons;
1074 } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
1079 static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
1081 if (!gso->u.gso.size) {
1082 DPRINTK("GSO size must not be zero.\n");
1086 /* Currently only TCPv4 segmentation offload is supported. */
1087 if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
1088 DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
1092 skb_shinfo(skb)->gso_size = gso->u.gso.size;
1093 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1095 /* Header must be checked, and gso_segs computed. */
1096 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1097 skb_shinfo(skb)->gso_segs = 0;
1102 /* Called after netfront has transmitted */
1103 static void net_tx_action(unsigned long unused)
1105 struct list_head *ent;
1106 struct sk_buff *skb;
1108 netif_tx_request_t txreq;
1109 netif_tx_request_t txfrags[MAX_SKB_FRAGS];
1110 struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
1113 gnttab_map_grant_ref_t *mop;
1114 unsigned int data_len;
1115 int ret, work_to_do;
1117 if (dealloc_cons != dealloc_prod)
1118 net_tx_action_dealloc();
1121 while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
1122 !list_empty(&net_schedule_list)) {
1123 /* Get a netif from the list with work to do. */
1124 ent = net_schedule_list.next;
1125 netif = list_entry(ent, netif_t, list);
1127 remove_from_net_schedule_list(netif);
1129 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
1135 i = netif->tx.req_cons;
1136 rmb(); /* Ensure that we see the request before we copy it. */
1137 memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
1139 /* Credit-based scheduling. */
1140 if (txreq.size > netif->remaining_credit) {
1141 unsigned long now = jiffies;
1142 unsigned long next_credit =
1143 netif->credit_timeout.expires +
1144 msecs_to_jiffies(netif->credit_usec / 1000);
1146 /* Timer could already be pending in rare cases. */
1147 if (timer_pending(&netif->credit_timeout)) {
1152 /* Passed the point where we can replenish credit? */
1153 if (time_after_eq(now, next_credit)) {
1154 netif->credit_timeout.expires = now;
1155 tx_add_credit(netif);
1158 /* Still too big to send right now? Set a callback. */
1159 if (txreq.size > netif->remaining_credit) {
1160 netif->credit_timeout.data =
1161 (unsigned long)netif;
1162 netif->credit_timeout.function =
1164 __mod_timer(&netif->credit_timeout,
1170 netif->remaining_credit -= txreq.size;
1173 netif->tx.req_cons = ++i;
1175 memset(extras, 0, sizeof(extras));
1176 if (txreq.flags & NETTXF_extra_info) {
1177 work_to_do = netbk_get_extras(netif, extras,
1179 i = netif->tx.req_cons;
1180 if (unlikely(work_to_do < 0)) {
1181 netbk_tx_err(netif, &txreq, i);
1186 ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
1187 if (unlikely(ret < 0)) {
1188 netbk_tx_err(netif, &txreq, i - ret);
1193 if (unlikely(txreq.size < ETH_HLEN)) {
1194 DPRINTK("Bad packet size: %d\n", txreq.size);
1195 netbk_tx_err(netif, &txreq, i);
1199 /* A single request's payload must not cross a page boundary. */
1200 if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
1201 DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
1202 txreq.offset, txreq.size,
1203 (txreq.offset &~PAGE_MASK) + txreq.size);
1204 netbk_tx_err(netif, &txreq, i);
1208 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
1210 data_len = (txreq.size > PKT_PROT_LEN &&
1211 ret < MAX_SKB_FRAGS) ?
1212 PKT_PROT_LEN : txreq.size;
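/*
 * Worked example (editorial note): for a 1500-byte request only the first
 * PKT_PROT_LEN (64) bytes are copied into the skb's linear area; the
 * remainder stays in the granted page and is attached later as frag 0 by
 * netbk_fill_frags().
 */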
1214 skb = alloc_skb(data_len+16, GFP_ATOMIC);
1215 if (unlikely(skb == NULL)) {
1216 DPRINTK("Can't allocate a skb in start_xmit.\n");
1217 netbk_tx_err(netif, &txreq, i);
1221 /* Packets passed to netif_rx() must have some headroom. */
1222 skb_reserve(skb, 16);
1224 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
1225 struct netif_extra_info *gso;
1226 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
1228 if (netbk_set_skb_gso(skb, gso)) {
1230 netbk_tx_err(netif, &txreq, i);
1235 gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
1236 GNTMAP_host_map | GNTMAP_readonly,
1237 txreq.gref, netif->domid);
1240 memcpy(&pending_tx_info[pending_idx].req,
1241 &txreq, sizeof(txreq));
1242 pending_tx_info[pending_idx].netif = netif;
1243 *((u16 *)skb->data) = pending_idx;
1245 __skb_put(skb, data_len);
1247 skb_shinfo(skb)->nr_frags = ret;
1248 if (data_len < txreq.size) {
1249 skb_shinfo(skb)->nr_frags++;
1250 skb_shinfo(skb)->frags[0].page =
1251 (void *)(unsigned long)pending_idx;
1253 /* Discriminate from any valid pending_idx value. */
1254 skb_shinfo(skb)->frags[0].page = (void *)~0UL;
1257 __skb_queue_tail(&tx_queue, skb);
1261 mop = netbk_get_requests(netif, skb, txfrags, mop);
1263 netif->tx.req_cons = i;
1264 netif_schedule_work(netif);
1266 if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
1270 if (mop == tx_map_ops)
1273 ret = HYPERVISOR_grant_table_op(
1274 GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
1278 while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
1279 netif_tx_request_t *txp;
1281 pending_idx = *((u16 *)skb->data);
1282 netif = pending_tx_info[pending_idx].netif;
1283 txp = &pending_tx_info[pending_idx].req;
1285 /* Check the remap error code. */
1286 if (unlikely(netbk_tx_check_mop(skb, &mop))) {
1287 printk(KERN_ALERT "#### netback grant fails\n");
1288 skb_shinfo(skb)->nr_frags = 0;
1293 data_len = skb->len;
1295 (void *)(idx_to_kaddr(pending_idx)|txp->offset),
1297 if (data_len < txp->size) {
1298 /* Append the packet payload as a fragment. */
1299 txp->offset += data_len;
1300 txp->size -= data_len;
1302 /* Schedule a response immediately. */
1303 netif_idx_release(pending_idx);
1307 * Old frontends do not assert data_validated but we
1308 * can infer it from csum_blank so test both flags.
1310 if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
1311 skb->ip_summed = CHECKSUM_UNNECESSARY;
1312 skb->proto_data_valid = 1;
1314 skb->ip_summed = CHECKSUM_NONE;
1315 skb->proto_data_valid = 0;
1317 skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
1319 netbk_fill_frags(skb);
1321 skb->dev = netif->dev;
1322 skb->protocol = eth_type_trans(skb, skb->dev);
1324 netif->stats.rx_bytes += skb->len;
1325 netif->stats.rx_packets++;
1328 netif->dev->last_rx = jiffies;
1332 static void netif_idx_release(u16 pending_idx)
1334 static DEFINE_SPINLOCK(_lock);
1335 unsigned long flags;
1337 spin_lock_irqsave(&_lock, flags);
1338 dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
1339 /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
1342 spin_unlock_irqrestore(&_lock, flags);
1344 tasklet_schedule(&net_tx_tasklet);
1347 static void netif_page_release(struct page *page)
1349 /* Ready for next use. */
1350 init_page_count(page);
1352 netif_idx_release(page->index);
1355 irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1357 netif_t *netif = dev_id;
1359 add_to_net_schedule_list_tail(netif);
1360 maybe_schedule_tx_action();
1362 if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif))
1363 netif_wake_queue(netif->dev);
1368 static void make_tx_response(netif_t *netif,
1369 netif_tx_request_t *txp,
1372 RING_IDX i = netif->tx.rsp_prod_pvt;
1373 netif_tx_response_t *resp;
1376 resp = RING_GET_RESPONSE(&netif->tx, i);
1380 if (txp->flags & NETTXF_extra_info)
1381 RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
1383 netif->tx.rsp_prod_pvt = ++i;
1384 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
1386 notify_remote_via_irq(netif->irq);
1388 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
1389 if (i == netif->tx.req_cons) {
1391 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
1393 add_to_net_schedule_list_tail(netif);
1398 static netif_rx_response_t *make_rx_response(netif_t *netif,
1405 RING_IDX i = netif->rx.rsp_prod_pvt;
1406 netif_rx_response_t *resp;
1408 resp = RING_GET_RESPONSE(&netif->rx, i);
1409 resp->offset = offset;
1410 resp->flags = flags;
1412 resp->status = (s16)size;
1414 resp->status = (s16)st;
1416 netif->rx.rsp_prod_pvt = ++i;
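/*
 * Editorial note (not in the original source): the rx response "status"
 * field is overloaded -- a non-negative value is the byte count delivered
 * for this slot, while a negative NETIF_RSP_* value reports an error such
 * as NETIF_RSP_ERROR or NETIF_RSP_DROPPED.
 */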
1421 #ifdef NETBE_DEBUG_INTERRUPT
1422 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
1424 struct list_head *ent;
1428 printk(KERN_ALERT "netif_schedule_list:\n");
1429 spin_lock_irq(&net_schedule_list_lock);
1431 list_for_each (ent, &net_schedule_list) {
1432 netif = list_entry(ent, netif_t, list);
1433 printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
1434 "rx_resp_prod=%08x\n",
1435 i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
1436 printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
1437 netif->tx.req_cons, netif->tx.rsp_prod_pvt);
1438 printk(KERN_ALERT " shared(rx_req_prod=%08x "
1439 "rx_resp_prod=%08x\n",
1440 netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
1441 printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
1442 netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
1443 printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
1444 netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
1448 spin_unlock_irq(&net_schedule_list_lock);
1449 printk(KERN_ALERT " ** End of netif_schedule_list **\n");
1455 static int __init netback_init(void)
1460 if (!is_running_on_xen())
1463 /* We can increase reservation by this much in net_rx_action(). */
1464 balloon_update_driver_allowance(NET_RX_RING_SIZE);
1466 skb_queue_head_init(&rx_queue);
1467 skb_queue_head_init(&tx_queue);
1469 init_timer(&net_timer);
1471 net_timer.function = net_alarm;
1473 mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
1474 if (mmap_pages == NULL) {
1475 printk("%s: out of memory\n", __FUNCTION__);
1479 for (i = 0; i < MAX_PENDING_REQS; i++) {
1480 page = mmap_pages[i];
1481 SetPageForeign(page, netif_page_release);
1486 pending_prod = MAX_PENDING_REQS;
1487 for (i = 0; i < MAX_PENDING_REQS; i++)
1488 pending_ring[i] = i;
1490 spin_lock_init(&net_schedule_list_lock);
1491 INIT_LIST_HEAD(&net_schedule_list);
1493 netif_xenbus_init();
1495 #ifdef NETBE_DEBUG_INTERRUPT
1496 (void)bind_virq_to_irqhandler(
1508 module_init(netback_init);
1510 MODULE_LICENSE("Dual BSD/GPL");