1 /******************************************************************************
2 * drivers/xen/netback/netback.c
4 * Back-end of the driver for virtual network devices. This portion of the
5 * driver exports a 'unified' network-device interface that can be accessed
6 * by any operating system that implements a compatible front end. A
7 * reference front-end implementation can be found in:
8 * drivers/xen/netfront/netfront.c
10 * Copyright (c) 2002-2005, K A Fraser
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 #include <xen/balloon.h>
39 #include <xen/interface/memory.h>
42 /*#define NETBE_DEBUG_INTERRUPT*/
44 struct netbk_rx_meta {
50 static void netif_idx_release(u16 pending_idx);
51 static void netif_page_release(struct page *page);
52 static void make_tx_response(netif_t *netif,
53 netif_tx_request_t *txp,
55 static netif_rx_response_t *make_rx_response(netif_t *netif,
62 static void net_tx_action(unsigned long unused);
63 static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
65 static void net_rx_action(unsigned long unused);
66 static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
68 static struct timer_list net_timer;
70 #define MAX_PENDING_REQS 256
72 static struct sk_buff_head rx_queue;
74 static struct page **mmap_pages;
75 static inline unsigned long idx_to_kaddr(unsigned int idx)
77 return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
80 #define PKT_PROT_LEN 64
82 static struct pending_tx_info {
83 netif_tx_request_t req;
85 } pending_tx_info[MAX_PENDING_REQS];
86 static u16 pending_ring[MAX_PENDING_REQS];
87 typedef unsigned int PEND_RING_IDX;
88 #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
89 static PEND_RING_IDX pending_prod, pending_cons;
90 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
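/*
 * Illustrative sketch (not compiled code): pending_ring[] is a
 * producer/consumer ring of free slot indices.  MASK_PEND_IDX() relies on
 * MAX_PENDING_REQS being a power of two, so the free-running PEND_RING_IDX
 * counters reduce modulo the ring size with a single AND:
 *
 *	idx = pending_ring[MASK_PEND_IDX(pending_cons++)];	take a free slot
 *	...
 *	pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;	return the slot
 *
 * pending_prod starts at MAX_PENDING_REQS and pending_cons at 0, so
 * NR_PENDING_REQS counts the slots currently in flight.
 */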
92 /* Freed TX SKBs get batched on this ring before return to pending_ring. */
93 static u16 dealloc_ring[MAX_PENDING_REQS];
94 static PEND_RING_IDX dealloc_prod, dealloc_cons;
96 static struct sk_buff_head tx_queue;
98 static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
99 static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
100 static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
102 static struct list_head net_schedule_list;
103 static spinlock_t net_schedule_list_lock;
105 #define MAX_MFN_ALLOC 64
106 static unsigned long mfn_list[MAX_MFN_ALLOC];
107 static unsigned int alloc_index = 0;
109 static inline unsigned long alloc_mfn(void)
111 return mfn_list[--alloc_index];
114 static int check_mfn(int nr)
116 struct xen_memory_reservation reservation = {
121 if (likely(alloc_index >= nr))
124 set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
125 reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
126 alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
129 return alloc_index >= nr ? 0 : -ENOMEM;
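/*
 * Usage sketch for the MFN pool above: the flipping receive path tops the
 * pool up before handing pages to the guest, then pops one MFN per page:
 *
 *	if (!xen_feature(XENFEAT_auto_translated_physmap) &&
 *	    check_mfn(nr_frags + 1))
 *		back off and retry later;
 *	...
 *	new_mfn = alloc_mfn();
 *
 * check_mfn() refills mfn_list[] via XENMEM_increase_reservation and
 * returns -ENOMEM if Xen could not supply enough extents, which is how
 * net_rx_action() detects a memory squeeze.
 */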
132 static inline void maybe_schedule_tx_action(void)
135 if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
136 !list_empty(&net_schedule_list))
137 tasklet_schedule(&net_tx_tasklet);
141 * A gross way of confirming the origin of an skb data page. The slab
142 * allocator abuses a field in the page struct to cache the owning struct kmem_cache pointer.
144 static inline int is_xen_skb(struct sk_buff *skb)
146 extern struct kmem_cache *skbuff_cachep;
147 struct kmem_cache *cp = (struct kmem_cache *)virt_to_page(skb->head)->lru.next;
148 return (cp == skbuff_cachep);
152 * We can flip without copying the packet unless:
153 * 1. The data is not allocated from our special cache; or
154 * 2. The main data area is shared; or
155 * 3. One or more fragments are shared; or
156 * 4. There are chained fragments.
158 static inline int is_flippable_skb(struct sk_buff *skb)
162 if (!is_xen_skb(skb) || skb_cloned(skb))
165 for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
166 if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
170 if (skb_shinfo(skb)->frag_list != NULL)
176 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
178 struct skb_shared_info *ninfo;
179 struct sk_buff *nskb;
180 unsigned long offset;
185 BUG_ON(skb_shinfo(skb)->frag_list != NULL);
187 nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC);
191 skb_reserve(nskb, 16);
192 headlen = nskb->end - nskb->data;
193 if (headlen > skb_headlen(skb))
194 headlen = skb_headlen(skb);
195 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
198 ninfo = skb_shinfo(nskb);
199 ninfo->gso_size = skb_shinfo(skb)->gso_size;
200 ninfo->gso_type = skb_shinfo(skb)->gso_type;
203 len = skb->len - headlen;
205 nskb->len = skb->len;
206 nskb->data_len = len;
207 nskb->truesize += len;
214 if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
219 copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
220 zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
222 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
226 ret = skb_copy_bits(skb, offset, page_address(page), copy);
229 ninfo->frags[ninfo->nr_frags].page = page;
230 ninfo->frags[ninfo->nr_frags].page_offset = 0;
231 ninfo->frags[ninfo->nr_frags].size = copy;
238 offset = nskb->data - skb->data;
240 nskb->h.raw = skb->h.raw + offset;
241 nskb->nh.raw = skb->nh.raw + offset;
242 nskb->mac.raw = skb->mac.raw + offset;
252 static inline int netbk_max_required_rx_slots(netif_t *netif)
254 if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
255 return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
256 return 1; /* all in one */
259 static inline int netbk_queue_full(netif_t *netif)
261 RING_IDX peek = netif->rx_req_cons_peek;
262 RING_IDX needed = netbk_max_required_rx_slots(netif);
264 return ((netif->rx.sring->req_prod - peek) < needed) ||
265 ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
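/*
 * Worked note: for an SG/TSO-capable frontend the worst-case skb needs
 * MAX_SKB_FRAGS + 2 ring slots (header + extra_info + frags).  The queue is
 * treated as full if either the frontend has not yet posted that many
 * unconsumed requests (sring->req_prod - rx_req_cons_peek) or fewer than
 * that many response slots remain ahead of rsp_prod_pvt, so any skb we
 * accept can always be completed without waiting on the frontend.
 */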
268 static void tx_queue_callback(unsigned long data)
270 netif_t *netif = (netif_t *)data;
271 if (netif_schedulable(netif->dev))
272 netif_wake_queue(netif->dev);
275 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
277 netif_t *netif = netdev_priv(dev);
279 BUG_ON(skb->dev != dev);
281 /* Drop the packet if the target domain has no receive buffers. */
282 if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif)))
286 * Copy the packet here if it's destined for a flipping interface
287 * but isn't flippable (e.g. extra references to data).
289 if (!netif->copying_receiver && !is_flippable_skb(skb)) {
290 struct sk_buff *nskb = netbk_copy_skb(skb);
291 if (unlikely(nskb == NULL))
293 /* Copy only the header fields we use in this driver. */
294 nskb->dev = skb->dev;
295 nskb->ip_summed = skb->ip_summed;
296 nskb->proto_data_valid = skb->proto_data_valid;
301 netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
302 !!skb_shinfo(skb)->gso_size;
305 if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
306 netif->rx.sring->req_event = netif->rx_req_cons_peek +
307 netbk_max_required_rx_slots(netif);
308 mb(); /* request notification /then/ check & stop the queue */
309 if (netbk_queue_full(netif)) {
310 netif_stop_queue(dev);
312 * Schedule 500ms timeout to restart the queue, thus
313 * ensuring that an inactive queue will be drained.
314 * Packets will be dropped immediately until more
315 * receive buffers become available (see
316 * netbk_queue_full() check above).
318 netif->tx_queue_timeout.data = (unsigned long)netif;
319 netif->tx_queue_timeout.function = tx_queue_callback;
320 __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
324 skb_queue_tail(&rx_queue, skb);
325 tasklet_schedule(&net_rx_tasklet);
330 netif->stats.tx_dropped++;
336 static void xen_network_done_notify(void)
338 static struct net_device *eth0_dev = NULL;
339 if (unlikely(eth0_dev == NULL))
340 eth0_dev = __dev_get_by_name("eth0");
341 netif_rx_schedule(eth0_dev);
344 * Add the following to the poll() function of a NAPI driver (Tigon3 is an example):
345 * if ( xen_network_done() )
346 * tg3_enable_ints(tp);
348 int xen_network_done(void)
350 return skb_queue_empty(&rx_queue);
354 struct netrx_pending_operations {
355 unsigned trans_prod, trans_cons;
356 unsigned mmu_prod, mmu_cons;
357 unsigned mcl_prod, mcl_cons;
358 unsigned copy_prod, copy_cons;
359 unsigned meta_prod, meta_cons;
361 gnttab_transfer_t *trans;
363 multicall_entry_t *mcl;
364 struct netbk_rx_meta *meta;
367 /* Set up the grant operations for this fragment. If it's a flipping
368 interface, we also set up the unmap request from here. */
369 static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
370 int i, struct netrx_pending_operations *npo,
371 struct page *page, unsigned long size,
372 unsigned long offset)
375 gnttab_transfer_t *gop;
376 gnttab_copy_t *copy_gop;
377 multicall_entry_t *mcl;
378 netif_rx_request_t *req;
379 unsigned long old_mfn, new_mfn;
381 old_mfn = virt_to_mfn(page_address(page));
383 req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
384 if (netif->copying_receiver) {
385 /* The fragment needs to be copied rather than
388 copy_gop = npo->copy + npo->copy_prod++;
389 copy_gop->flags = GNTCOPY_dest_gref;
390 if (PageForeign(page)) {
391 struct pending_tx_info *src_pend =
392 &pending_tx_info[page->index];
393 copy_gop->source.domid = src_pend->netif->domid;
394 copy_gop->source.u.ref = src_pend->req.gref;
395 copy_gop->flags |= GNTCOPY_source_gref;
397 copy_gop->source.domid = DOMID_SELF;
398 copy_gop->source.u.gmfn = old_mfn;
400 copy_gop->source.offset = offset;
401 copy_gop->dest.domid = netif->domid;
402 copy_gop->dest.offset = 0;
403 copy_gop->dest.u.ref = req->gref;
404 copy_gop->len = size;
407 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
408 new_mfn = alloc_mfn();
411 * Set the new P2M table entry before
412 * reassigning the old data page. Heed the
413 * comment in pgtable-2level.h:pte_page(). :-)
415 set_phys_to_machine(page_to_pfn(page), new_mfn);
417 mcl = npo->mcl + npo->mcl_prod++;
418 MULTI_update_va_mapping(mcl,
419 (unsigned long)page_address(page),
420 pfn_pte_ma(new_mfn, PAGE_KERNEL),
423 mmu = npo->mmu + npo->mmu_prod++;
424 mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
426 mmu->val = page_to_pfn(page);
429 gop = npo->trans + npo->trans_prod++;
431 gop->domid = netif->domid;
432 gop->ref = req->gref;
437 static void netbk_gop_skb(struct sk_buff *skb,
438 struct netrx_pending_operations *npo)
440 netif_t *netif = netdev_priv(skb->dev);
441 int nr_frags = skb_shinfo(skb)->nr_frags;
444 struct netbk_rx_meta *head_meta, *meta;
446 head_meta = npo->meta + npo->meta_prod++;
447 head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
448 head_meta->frag.size = skb_shinfo(skb)->gso_size;
449 extra = !!head_meta->frag.size + 1;
451 for (i = 0; i < nr_frags; i++) {
452 meta = npo->meta + npo->meta_prod++;
453 meta->frag = skb_shinfo(skb)->frags[i];
454 meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
457 meta->frag.page_offset);
461 * This must occur at the end to ensure that we don't trash
462 * skb_shinfo until we're done.
464 head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
465 virt_to_page(skb->data),
467 offset_in_page(skb->data));
469 netif->rx.req_cons += nr_frags + extra;
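/*
 * Note on the meta layout built above: the head meta entry is not a real
 * fragment -- its frag.page_offset/frag.size fields are reused to carry the
 * skb's gso_type/gso_size so that net_rx_action() can emit a GSO extra_info
 * response later.  Hence "extra" is one slot for the header plus one more
 * only when a GSO size is present, i.e.
 *
 *	extra = !!skb_shinfo(skb)->gso_size + 1;
 */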
472 static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
476 for (i = 0; i < nr_frags; i++)
477 put_page(meta[i].frag.page);
480 /* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
481 used to set up the operations on the top of
482 netrx_pending_operations, which have since been done. Check that
483 they didn't give any errors and advance over them. */
484 static int netbk_check_gop(int nr_frags, domid_t domid,
485 struct netrx_pending_operations *npo)
487 multicall_entry_t *mcl;
488 gnttab_transfer_t *gop;
489 gnttab_copy_t *copy_op;
490 int status = NETIF_RSP_OKAY;
493 for (i = 0; i <= nr_frags; i++) {
494 if (npo->meta[npo->meta_cons + i].copy) {
495 copy_op = npo->copy + npo->copy_cons++;
496 if (copy_op->status != GNTST_okay) {
497 DPRINTK("Bad status %d from copy to DOM%d.\n",
498 copy_op->status, domid);
499 status = NETIF_RSP_ERROR;
502 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
503 mcl = npo->mcl + npo->mcl_cons++;
504 /* The update_va_mapping() must not fail. */
505 BUG_ON(mcl->result != 0);
508 gop = npo->trans + npo->trans_cons++;
509 /* Check the reassignment error code. */
510 if (gop->status != 0) {
511 DPRINTK("Bad status %d from grant transfer to DOM%u\n",
514 * Page no longer belongs to us unless
515 * GNTST_bad_page, but that should be
516 * a fatal error anyway.
518 BUG_ON(gop->status == GNTST_bad_page);
519 status = NETIF_RSP_ERROR;
527 static void netbk_add_frag_responses(netif_t *netif, int status,
528 struct netbk_rx_meta *meta, int nr_frags)
531 unsigned long offset;
533 for (i = 0; i < nr_frags; i++) {
535 int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
540 offset = meta[i].frag.page_offset;
541 make_rx_response(netif, id, status, offset,
542 meta[i].frag.size, flags);
546 static void net_rx_action(unsigned long unused)
548 netif_t *netif = NULL;
551 netif_rx_response_t *resp;
552 multicall_entry_t *mcl;
553 struct sk_buff_head rxq;
559 unsigned long offset;
562 * Putting hundreds of bytes on the stack is considered rude.
563 * Static variables work because a tasklet can only run on one CPU at a time.
565 static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
566 static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
567 static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
568 static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
569 static unsigned char rx_notify[NR_IRQS];
570 static u16 notify_list[NET_RX_RING_SIZE];
571 static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
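/*
 * Sizing note: one entry per receive ring slot bounds every array above;
 * the multicall array needs up to three extra slots for the trailing
 * mmu_update, grant-transfer and grant-copy batches.  The BUG_ON()s after
 * the batch has been assembled assert exactly these limits.
 */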
573 struct netrx_pending_operations npo = {
575 trans: grant_trans_op,
580 skb_queue_head_init(&rxq);
584 while ((skb = skb_dequeue(&rx_queue)) != NULL) {
585 nr_frags = skb_shinfo(skb)->nr_frags;
586 *(int *)skb->cb = nr_frags;
588 if (!xen_feature(XENFEAT_auto_translated_physmap) &&
589 check_mfn(nr_frags + 1)) {
590 /* Memory squeeze? Back off for an arbitrary while. */
591 if (net_ratelimit())
592 WPRINTK("Memory squeeze in netback "
594 mod_timer(&net_timer, jiffies + HZ);
595 skb_queue_head(&rx_queue, skb);
599 netbk_gop_skb(skb, &npo);
601 count += nr_frags + 1;
603 __skb_queue_tail(&rxq, skb);
605 /* Filled the batch queue? */
606 if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
611 !xen_feature(XENFEAT_auto_translated_physmap)) {
612 mcl = npo.mcl + npo.mcl_prod++;
614 BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
615 mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
617 mcl->op = __HYPERVISOR_mmu_update;
618 mcl->args[0] = (unsigned long)rx_mmu;
619 mcl->args[1] = npo.mmu_prod;
621 mcl->args[3] = DOMID_SELF;
624 if (npo.trans_prod) {
625 mcl = npo.mcl + npo.mcl_prod++;
626 mcl->op = __HYPERVISOR_grant_table_op;
627 mcl->args[0] = GNTTABOP_transfer;
628 mcl->args[1] = (unsigned long)grant_trans_op;
629 mcl->args[2] = npo.trans_prod;
633 mcl = npo.mcl + npo.mcl_prod++;
634 mcl->op = __HYPERVISOR_grant_table_op;
635 mcl->args[0] = GNTTABOP_copy;
636 mcl->args[1] = (unsigned long)grant_copy_op;
637 mcl->args[2] = npo.copy_prod;
644 BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
645 BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
646 BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
647 BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
648 BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
650 ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
653 while ((skb = __skb_dequeue(&rxq)) != NULL) {
654 nr_frags = *(int *)skb->cb;
656 netif = netdev_priv(skb->dev);
657 /* We can't rely on skb_release_data to release the
658 pages used by fragments for us, since it tries to
659 touch the pages in the fraglist. If we're in
660 flipping mode, that doesn't work. In copying mode,
661 we still have access to all of the pages, and so
662 it's safe to let release_data deal with it. */
663 /* (Freeing the fragments is safe since we copy
664 non-linear skbs destined for flipping interfaces) */
665 if (!netif->copying_receiver) {
666 atomic_set(&(skb_shinfo(skb)->dataref), 1);
667 skb_shinfo(skb)->frag_list = NULL;
668 skb_shinfo(skb)->nr_frags = 0;
669 netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
672 netif->stats.tx_bytes += skb->len;
673 netif->stats.tx_packets++;
675 status = netbk_check_gop(nr_frags, netif->domid, &npo);
677 id = meta[npo.meta_cons].id;
678 flags = nr_frags ? NETRXF_more_data : 0;
680 if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
681 flags |= NETRXF_csum_blank | NETRXF_data_validated;
682 else if (skb->proto_data_valid) /* remote but checksummed? */
683 flags |= NETRXF_data_validated;
685 if (meta[npo.meta_cons].copy)
688 offset = offset_in_page(skb->data);
689 resp = make_rx_response(netif, id, status, offset,
690 skb_headlen(skb), flags);
692 if (meta[npo.meta_cons].frag.size) {
693 struct netif_extra_info *gso =
694 (struct netif_extra_info *)
695 RING_GET_RESPONSE(&netif->rx,
696 netif->rx.rsp_prod_pvt++);
698 resp->flags |= NETRXF_extra_info;
700 gso->u.gso.size = meta[npo.meta_cons].frag.size;
701 gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
703 gso->u.gso.features = 0;
705 gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
709 netbk_add_frag_responses(netif, status,
710 meta + npo.meta_cons + 1,
713 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
715 if (ret && !rx_notify[irq]) {
717 notify_list[notify_nr++] = irq;
720 if (netif_queue_stopped(netif->dev) &&
721 netif_schedulable(netif->dev) &&
722 !netbk_queue_full(netif))
723 netif_wake_queue(netif->dev);
727 npo.meta_cons += nr_frags + 1;
730 while (notify_nr != 0) {
731 irq = notify_list[--notify_nr];
733 notify_remote_via_irq(irq);
736 /* More work to do? */
737 if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
738 tasklet_schedule(&net_rx_tasklet);
741 xen_network_done_notify();
745 static void net_alarm(unsigned long unused)
747 tasklet_schedule(&net_rx_tasklet);
750 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
752 netif_t *netif = netdev_priv(dev);
753 return &netif->stats;
756 static int __on_net_schedule_list(netif_t *netif)
758 return netif->list.next != NULL;
761 static void remove_from_net_schedule_list(netif_t *netif)
763 spin_lock_irq(&net_schedule_list_lock);
764 if (likely(__on_net_schedule_list(netif))) {
765 list_del(&netif->list);
766 netif->list.next = NULL;
769 spin_unlock_irq(&net_schedule_list_lock);
772 static void add_to_net_schedule_list_tail(netif_t *netif)
774 if (__on_net_schedule_list(netif))
777 spin_lock_irq(&net_schedule_list_lock);
778 if (!__on_net_schedule_list(netif) &&
779 likely(netif_schedulable(netif->dev))) {
780 list_add_tail(&netif->list, &net_schedule_list);
783 spin_unlock_irq(&net_schedule_list_lock);
787 * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
788 * If this driver is pipelining transmit requests then we can be very
789 * aggressive in avoiding new-packet notifications -- frontend only needs to
790 * send a notification if there are no outstanding unreceived responses.
791 * If we may be buffering transmit requests for any reason then we must be rather
792 * more conservative and treat this as the final check for pending work.
794 void netif_schedule_work(netif_t *netif)
798 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
799 more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
801 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
805 add_to_net_schedule_list_tail(netif);
806 maybe_schedule_tx_action();
810 void netif_deschedule_work(netif_t *netif)
812 remove_from_net_schedule_list(netif);
816 static void tx_add_credit(netif_t *netif)
818 unsigned long max_burst, max_credit;
821 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
822 * Otherwise the interface can seize up due to insufficient credit.
824 max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
825 max_burst = min(max_burst, 131072UL);
826 max_burst = max(max_burst, netif->credit_bytes);
828 /* Take care that adding a new chunk of credit doesn't wrap to zero. */
829 max_credit = netif->remaining_credit + netif->credit_bytes;
830 if (max_credit < netif->remaining_credit)
831 max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
833 netif->remaining_credit = min(max_credit, max_burst);
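/*
 * Worked example (sketch): with credit_bytes = 100kB per period and a 120kB
 * request at the head of the ring, max_burst = max(min(120kB, 128kB), 100kB)
 * = 120kB.  Each replenish adds credit_bytes but clamps remaining_credit to
 * max_burst, so starting from zero the credit reaches 100kB after one period
 * and 120kB after two, at which point the jumbo request can be sent instead
 * of the interface seizing up on a 100kB budget.  The ULONG_MAX clamp only
 * guards against the addition wrapping.
 */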
836 static void tx_credit_callback(unsigned long data)
838 netif_t *netif = (netif_t *)data;
839 tx_add_credit(netif);
840 netif_schedule_work(netif);
843 static inline void net_tx_action_dealloc(void)
845 gnttab_unmap_grant_ref_t *gop;
847 PEND_RING_IDX dc, dp;
854 /* Ensure we see all indexes enqueued by netif_idx_release(). */
858 * Free up any grants we have finished using
862 pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
863 gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
865 grant_tx_handle[pending_idx]);
868 ret = HYPERVISOR_grant_table_op(
869 GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
872 while (dealloc_cons != dp) {
873 pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
875 netif = pending_tx_info[pending_idx].netif;
877 make_tx_response(netif, &pending_tx_info[pending_idx].req,
880 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
886 static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
888 RING_IDX cons = netif->tx.req_cons;
891 make_tx_response(netif, txp, NETIF_RSP_ERROR);
894 txp = RING_GET_REQUEST(&netif->tx, cons++);
896 netif->tx.req_cons = cons;
897 netif_schedule_work(netif);
901 static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
902 netif_tx_request_t *txp, int work_to_do)
904 RING_IDX cons = netif->tx.req_cons;
907 if (!(first->flags & NETTXF_more_data))
911 if (frags >= work_to_do) {
912 DPRINTK("Need more frags\n");
916 if (unlikely(frags >= MAX_SKB_FRAGS)) {
917 DPRINTK("Too many frags\n");
921 memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
923 if (txp->size > first->size) {
924 DPRINTK("Frag is bigger than frame.\n");
928 first->size -= txp->size;
931 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
932 DPRINTK("txp->offset: %x, size: %u\n",
933 txp->offset, txp->size);
936 } while ((txp++)->flags & NETTXF_more_data);
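/*
 * Layout note: the first tx request carries the whole packet size and sets
 * NETTXF_more_data; each follow-on request describes one fragment whose
 * size is deducted from first->size as the chain is walked.  The chain is
 * rejected (netbk_tx_err() on the whole packet) if it needs more slots than
 * the ring currently holds, exceeds MAX_SKB_FRAGS, claims more bytes than
 * remain, or contains a fragment crossing a page boundary.
 */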
941 static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
943 netif_tx_request_t *txp,
944 gnttab_map_grant_ref_t *mop)
946 struct skb_shared_info *shinfo = skb_shinfo(skb);
947 skb_frag_t *frags = shinfo->frags;
948 unsigned long pending_idx = *((u16 *)skb->data);
951 /* Skip first skb fragment if it is on same page as header fragment. */
952 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
954 for (i = start; i < shinfo->nr_frags; i++, txp++) {
955 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
957 gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
958 GNTMAP_host_map | GNTMAP_readonly,
959 txp->gref, netif->domid);
961 memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
963 pending_tx_info[pending_idx].netif = netif;
964 frags[i].page = (void *)pending_idx;
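/*
 * Note: at this point frags[i].page holds a pending_idx rather than a real
 * struct page pointer.  The index is carried through the grant-map phase:
 * netbk_tx_check_mop() reads it back to locate the slot to verify, and
 * netbk_fill_frags() finally substitutes
 * virt_to_page(idx_to_kaddr(pending_idx)) together with the offset and size
 * from the saved request.
 */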
970 static int netbk_tx_check_mop(struct sk_buff *skb,
971 gnttab_map_grant_ref_t **mopp)
973 gnttab_map_grant_ref_t *mop = *mopp;
974 int pending_idx = *((u16 *)skb->data);
975 netif_t *netif = pending_tx_info[pending_idx].netif;
976 netif_tx_request_t *txp;
977 struct skb_shared_info *shinfo = skb_shinfo(skb);
978 int nr_frags = shinfo->nr_frags;
981 /* Check status of header. */
984 txp = &pending_tx_info[pending_idx].req;
985 make_tx_response(netif, txp, NETIF_RSP_ERROR);
986 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
990 __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
991 FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
992 grant_tx_handle[pending_idx] = mop->handle;
995 /* Skip first skb fragment if it is on same page as header fragment. */
996 start = ((unsigned long)shinfo->frags[0].page == pending_idx);
998 for (i = start; i < nr_frags; i++) {
1001 pending_idx = (unsigned long)shinfo->frags[i].page;
1003 /* Check error status: if okay then remember grant handle. */
1004 newerr = (++mop)->status;
1005 if (likely(!newerr)) {
1006 set_phys_to_machine(
1007 __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
1008 FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
1009 grant_tx_handle[pending_idx] = mop->handle;
1010 /* Had a previous error? Invalidate this fragment. */
1012 netif_idx_release(pending_idx);
1016 /* Error on this fragment: respond to client with an error. */
1017 txp = &pending_tx_info[pending_idx].req;
1018 make_tx_response(netif, txp, NETIF_RSP_ERROR);
1019 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
1022 /* Not the first error? Preceding frags already invalidated. */
1026 /* First error: invalidate header and preceding fragments. */
1027 pending_idx = *((u16 *)skb->data);
1028 netif_idx_release(pending_idx);
1029 for (j = start; j < i; j++) {
1030 pending_idx = (unsigned long)shinfo->frags[i].page;
1031 netif_idx_release(pending_idx);
1034 /* Remember the error: invalidate all subsequent fragments. */
1042 static void netbk_fill_frags(struct sk_buff *skb)
1044 struct skb_shared_info *shinfo = skb_shinfo(skb);
1045 int nr_frags = shinfo->nr_frags;
1048 for (i = 0; i < nr_frags; i++) {
1049 skb_frag_t *frag = shinfo->frags + i;
1050 netif_tx_request_t *txp;
1051 unsigned long pending_idx;
1053 pending_idx = (unsigned long)frag->page;
1054 txp = &pending_tx_info[pending_idx].req;
1055 frag->page = virt_to_page(idx_to_kaddr(pending_idx));
1056 frag->size = txp->size;
1057 frag->page_offset = txp->offset;
1059 skb->len += txp->size;
1060 skb->data_len += txp->size;
1061 skb->truesize += txp->size;
1065 int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
1068 struct netif_extra_info extra;
1069 RING_IDX cons = netif->tx.req_cons;
1072 if (unlikely(work_to_do-- <= 0)) {
1073 DPRINTK("Missing extra info\n");
1077 memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
1079 if (unlikely(!extra.type ||
1080 extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
1081 netif->tx.req_cons = ++cons;
1082 DPRINTK("Invalid extra type: %d\n", extra.type);
1086 memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
1087 netif->tx.req_cons = ++cons;
1088 } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
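/*
 * Protocol note: a tx request flagged NETTXF_extra_info is followed on the
 * ring by one or more struct netif_extra_info slots rather than further
 * netif_tx_request_t entries.  Each extra names its own type (e.g.
 * XEN_NETIF_EXTRA_TYPE_GSO), is stored into extras[type - 1] by the loop
 * above, and sets XEN_NETIF_EXTRA_FLAG_MORE when another extra follows.
 */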
1093 static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
1095 if (!gso->u.gso.size) {
1096 DPRINTK("GSO size must not be zero.\n");
1100 /* Currently only TCPv4 S.O. is supported. */
1101 if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
1102 DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
1106 skb_shinfo(skb)->gso_size = gso->u.gso.size;
1107 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1109 /* Header must be checked, and gso_segs computed. */
1110 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1111 skb_shinfo(skb)->gso_segs = 0;
1116 /* Called after netfront has transmitted */
1117 static void net_tx_action(unsigned long unused)
1119 struct list_head *ent;
1120 struct sk_buff *skb;
1122 netif_tx_request_t txreq;
1123 netif_tx_request_t txfrags[MAX_SKB_FRAGS];
1124 struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
1127 gnttab_map_grant_ref_t *mop;
1128 unsigned int data_len;
1129 int ret, work_to_do;
1131 if (dealloc_cons != dealloc_prod)
1132 net_tx_action_dealloc();
1135 while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
1136 !list_empty(&net_schedule_list)) {
1137 /* Get a netif from the list with work to do. */
1138 ent = net_schedule_list.next;
1139 netif = list_entry(ent, netif_t, list);
1141 remove_from_net_schedule_list(netif);
1143 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
1149 i = netif->tx.req_cons;
1150 rmb(); /* Ensure that we see the request before we copy it. */
1151 memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
1153 /* Credit-based scheduling. */
1154 if (txreq.size > netif->remaining_credit) {
1155 unsigned long now = jiffies;
1156 unsigned long next_credit =
1157 netif->credit_timeout.expires +
1158 msecs_to_jiffies(netif->credit_usec / 1000);
1160 /* Timer could already be pending in rare cases. */
1161 if (timer_pending(&netif->credit_timeout)) {
1166 /* Passed the point where we can replenish credit? */
1167 if (time_after_eq(now, next_credit)) {
1168 netif->credit_timeout.expires = now;
1169 tx_add_credit(netif);
1172 /* Still too big to send right now? Set a callback. */
1173 if (txreq.size > netif->remaining_credit) {
1174 netif->credit_timeout.data =
1175 (unsigned long)netif;
1176 netif->credit_timeout.function =
1178 __mod_timer(&netif->credit_timeout,
1184 netif->remaining_credit -= txreq.size;
1187 netif->tx.req_cons = ++i;
1189 memset(extras, 0, sizeof(extras));
1190 if (txreq.flags & NETTXF_extra_info) {
1191 work_to_do = netbk_get_extras(netif, extras,
1193 i = netif->tx.req_cons;
1194 if (unlikely(work_to_do < 0)) {
1195 netbk_tx_err(netif, &txreq, i);
1200 ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
1201 if (unlikely(ret < 0)) {
1202 netbk_tx_err(netif, &txreq, i - ret);
1207 if (unlikely(txreq.size < ETH_HLEN)) {
1208 DPRINTK("Bad packet size: %d\n", txreq.size);
1209 netbk_tx_err(netif, &txreq, i);
1213 /* The payload must not cross a page boundary, as it cannot be fragmented. */
1214 if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
1215 DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
1216 txreq.offset, txreq.size,
1217 (txreq.offset &~PAGE_MASK) + txreq.size);
1218 netbk_tx_err(netif, &txreq, i);
1222 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
1224 data_len = (txreq.size > PKT_PROT_LEN &&
1225 ret < MAX_SKB_FRAGS) ?
1226 PKT_PROT_LEN : txreq.size;
1228 skb = alloc_skb(data_len+16, GFP_ATOMIC);
1229 if (unlikely(skb == NULL)) {
1230 DPRINTK("Can't allocate a skb in net_tx_action.\n");
1231 netbk_tx_err(netif, &txreq, i);
1235 /* Packets passed to netif_rx() must have some headroom. */
1236 skb_reserve(skb, 16);
1238 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
1239 struct netif_extra_info *gso;
1240 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
1242 if (netbk_set_skb_gso(skb, gso)) {
1244 netbk_tx_err(netif, &txreq, i);
1249 gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
1250 GNTMAP_host_map | GNTMAP_readonly,
1251 txreq.gref, netif->domid);
1254 memcpy(&pending_tx_info[pending_idx].req,
1255 &txreq, sizeof(txreq));
1256 pending_tx_info[pending_idx].netif = netif;
1257 *((u16 *)skb->data) = pending_idx;
1259 __skb_put(skb, data_len);
1261 skb_shinfo(skb)->nr_frags = ret;
1262 if (data_len < txreq.size) {
1263 skb_shinfo(skb)->nr_frags++;
1264 skb_shinfo(skb)->frags[0].page =
1265 (void *)(unsigned long)pending_idx;
1267 /* Discriminate from any valid pending_idx value. */
1268 skb_shinfo(skb)->frags[0].page = (void *)~0UL;
1271 __skb_queue_tail(&tx_queue, skb);
1275 mop = netbk_get_requests(netif, skb, txfrags, mop);
1277 netif->tx.req_cons = i;
1278 netif_schedule_work(netif);
1280 if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
1284 if (mop == tx_map_ops)
1287 ret = HYPERVISOR_grant_table_op(
1288 GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
1292 while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
1293 netif_tx_request_t *txp;
1295 pending_idx = *((u16 *)skb->data);
1296 netif = pending_tx_info[pending_idx].netif;
1297 txp = &pending_tx_info[pending_idx].req;
1299 /* Check the remap error code. */
1300 if (unlikely(netbk_tx_check_mop(skb, &mop))) {
1301 printk(KERN_ALERT "netback: grant map operation failed\n");
1302 skb_shinfo(skb)->nr_frags = 0;
1307 data_len = skb->len;
1309 (void *)(idx_to_kaddr(pending_idx)|txp->offset),
1311 if (data_len < txp->size) {
1312 /* Append the packet payload as a fragment. */
1313 txp->offset += data_len;
1314 txp->size -= data_len;
1316 /* Schedule a response immediately. */
1317 netif_idx_release(pending_idx);
1321 * Old frontends do not assert data_validated but we
1322 * can infer it from csum_blank so test both flags.
1324 if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
1325 skb->ip_summed = CHECKSUM_UNNECESSARY;
1326 skb->proto_data_valid = 1;
1328 skb->ip_summed = CHECKSUM_NONE;
1329 skb->proto_data_valid = 0;
1331 skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
1333 netbk_fill_frags(skb);
1335 skb->dev = netif->dev;
1336 skb->protocol = eth_type_trans(skb, skb->dev);
1338 netif->stats.rx_bytes += skb->len;
1339 netif->stats.rx_packets++;
1342 netif->dev->last_rx = jiffies;
1346 static void netif_idx_release(u16 pending_idx)
1348 static DEFINE_SPINLOCK(_lock);
1349 unsigned long flags;
1351 spin_lock_irqsave(&_lock, flags);
1352 dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
1353 /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
1356 spin_unlock_irqrestore(&_lock, flags);
1358 tasklet_schedule(&net_tx_tasklet);
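/*
 * Synchronisation sketch: netif_idx_release() may be reached from the
 * page-free path via netif_page_release(), so it publishes the slot under
 * its own lock and only bumps dealloc_prod once the index has been written
 * (see the comment above).  net_tx_action_dealloc() pairs with this by
 * sampling dealloc_prod before walking dealloc_cons..dealloc_prod, so it
 * never consumes an entry that has not yet been stored.
 */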
1361 static void netif_page_release(struct page *page)
1363 /* Ready for next use. */
1364 init_page_count(page);
1366 netif_idx_release(page->index);
1369 irqreturn_t netif_be_int(int irq, void *dev_id)
1371 netif_t *netif = dev_id;
1373 add_to_net_schedule_list_tail(netif);
1374 maybe_schedule_tx_action();
1376 if (netif_schedulable(netif->dev) && !netbk_queue_full(netif))
1377 netif_wake_queue(netif->dev);
1382 static void make_tx_response(netif_t *netif,
1383 netif_tx_request_t *txp,
1386 RING_IDX i = netif->tx.rsp_prod_pvt;
1387 netif_tx_response_t *resp;
1390 resp = RING_GET_RESPONSE(&netif->tx, i);
1394 if (txp->flags & NETTXF_extra_info)
1395 RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
1397 netif->tx.rsp_prod_pvt = ++i;
1398 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
1400 notify_remote_via_irq(netif->irq);
1402 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
1403 if (i == netif->tx.req_cons) {
1405 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
1407 add_to_net_schedule_list_tail(netif);
1412 static netif_rx_response_t *make_rx_response(netif_t *netif,
1419 RING_IDX i = netif->rx.rsp_prod_pvt;
1420 netif_rx_response_t *resp;
1422 resp = RING_GET_RESPONSE(&netif->rx, i);
1423 resp->offset = offset;
1424 resp->flags = flags;
1426 resp->status = (s16)size;
1428 resp->status = (s16)st;
1430 netif->rx.rsp_prod_pvt = ++i;
1435 #ifdef NETBE_DEBUG_INTERRUPT
1436 static irqreturn_t netif_be_dbg(int irq, void *dev_id)
1438 struct list_head *ent;
1442 printk(KERN_ALERT "netif_schedule_list:\n");
1443 spin_lock_irq(&net_schedule_list_lock);
1445 list_for_each (ent, &net_schedule_list) {
1446 netif = list_entry(ent, netif_t, list);
1447 printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
1448 "rx_resp_prod=%08x\n",
1449 i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
1450 printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
1451 netif->tx.req_cons, netif->tx.rsp_prod_pvt);
1452 printk(KERN_ALERT " shared(rx_req_prod=%08x "
1453 "rx_resp_prod=%08x\n",
1454 netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
1455 printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
1456 netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
1457 printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
1458 netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
1462 spin_unlock_irq(&net_schedule_list_lock);
1463 printk(KERN_ALERT " ** End of netif_schedule_list **\n");
1469 static int __init netback_init(void)
1474 if (!is_running_on_xen())
1477 /* We can increase reservation by this much in net_rx_action(). */
1478 balloon_update_driver_allowance(NET_RX_RING_SIZE);
1480 skb_queue_head_init(&rx_queue);
1481 skb_queue_head_init(&tx_queue);
1483 init_timer(&net_timer);
1485 net_timer.function = net_alarm;
1487 mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
1488 if (mmap_pages == NULL) {
1489 printk(KERN_ERR "%s: out of memory\n", __func__);
1493 for (i = 0; i < MAX_PENDING_REQS; i++) {
1494 page = mmap_pages[i];
1495 SetPageForeign(page, netif_page_release);
1500 pending_prod = MAX_PENDING_REQS;
1501 for (i = 0; i < MAX_PENDING_REQS; i++)
1502 pending_ring[i] = i;
1504 spin_lock_init(&net_schedule_list_lock);
1505 INIT_LIST_HEAD(&net_schedule_list);
1507 netif_xenbus_init();
1509 #ifdef NETBE_DEBUG_INTERRUPT
1510 (void)bind_virq_to_irqhandler(
1522 module_init(netback_init);
1524 MODULE_LICENSE("Dual BSD/GPL");