/*
 * Copyright (c) 2010, 2011 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)

#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/list.h>
#include <linux/net.h>
#include <linux/skbuff.h>

#include <net/icmp.h>
#include <net/inet_frag.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/udp.h>

#include "tunnel.h"
#include "vport.h"
#include "vport-generic.h"
/* Fixed UDP ports used by the CAPWAP encapsulation (see capwap_build_header
 * and capwap_init below). */
#define CAPWAP_SRC_PORT 58881
#define CAPWAP_DST_PORT 58882

/* Tuning for the inet_frag-based reassembly of fragmented tunnel packets. */
#define CAPWAP_FRAG_TIMEOUT (30 * HZ)
#define CAPWAP_FRAG_MAX_MEM (256 * 1024)
#define CAPWAP_FRAG_PRUNE_MEM (192 * 1024)
#define CAPWAP_FRAG_SECRET_INTERVAL (10 * 60 * HZ)
/*
 * The CAPWAP header is a mess, with all kinds of odd size bit fields that
 * cross byte boundaries, which are difficult to represent correctly in
 * various byte orderings.  Luckily we only care about a few permutations, so
 * statically create them and we can do very fast parsing by checking all 12
 * fields in one go.
 */
#define CAPWAP_PREAMBLE_MASK __cpu_to_be32(0xFF000000)
#define CAPWAP_HLEN_SHIFT    17
#define CAPWAP_HLEN_MASK     __cpu_to_be32(0x00F80000)
#define CAPWAP_RID_MASK      __cpu_to_be32(0x0007C000)
#define CAPWAP_WBID_MASK     __cpu_to_be32(0x00003E00)
#define CAPWAP_F_MASK        __cpu_to_be32(0x000001FF)

/* Individual flag bits within CAPWAP_F_MASK. */
#define CAPWAP_F_FRAG        __cpu_to_be32(0x00000080)
#define CAPWAP_F_LASTFRAG    __cpu_to_be32(0x00000040)
#define CAPWAP_F_WSI         __cpu_to_be32(0x00000020)
#define CAPWAP_F_RMAC        __cpu_to_be32(0x00000010)

/* Length of the optional radio-MAC field when CAPWAP_F_RMAC is set. */
#define CAPWAP_RMAC_LEN      4
/* Standard CAPWAP looks for a WBID value of 2.
 * When we insert WSI field, use WBID value of 30, which has been
 * proposed for all "experimental" usage - users with no reserved WBID value
 * of their own.
 */
#define CAPWAP_WBID_30   __cpu_to_be32(0x00003C00)
#define CAPWAP_WBID_2    __cpu_to_be32(0x00000200)

#define FRAG_HDR (CAPWAP_F_FRAG)
#define FRAG_LAST_HDR (FRAG_HDR | CAPWAP_F_LASTFRAG)

/* Keyed packet, WBID 30, and length long enough to include WSI key.
 * Use __cpu_to_be32 (not htonl) for consistency with the other header
 * constants above; both are compile-time byte swaps of the same value. */
#define CAPWAP_KEYED (CAPWAP_WBID_30 | CAPWAP_F_WSI | __cpu_to_be32(20 << CAPWAP_HLEN_SHIFT))
/* A backward-compatible packet, WBID 2 and length of 2 words (no WSI fields) */
#define CAPWAP_NO_WSI (CAPWAP_WBID_2 | __cpu_to_be32(8 << CAPWAP_HLEN_SHIFT))

/* Mask for all parts of header that must be 0. */
#define CAPWAP_ZERO_MASK (CAPWAP_PREAMBLE_MASK | \
		(CAPWAP_F_MASK ^ (CAPWAP_F_WSI | CAPWAP_F_FRAG | CAPWAP_F_LASTFRAG | CAPWAP_F_RMAC)))
82 /* low 3 bits of frag_off are reserved */
87 * We use the WSI field to hold additional tunnel data.
88 * The first eight bits store the size of the wsi data in bytes.
90 struct capwaphdr_wsi {
93 __be16 reserved_padding;
96 struct capwaphdr_wsi_key {
100 /* Flag indicating a 64bit key is stored in WSI data field */
101 #define CAPWAP_WSI_F_KEY64 0x80
/* Returns the CAPWAP header, which directly follows the UDP header. */
static inline struct capwaphdr *capwap_hdr(const struct sk_buff *skb)
{
	return (struct capwaphdr *)(udp_hdr(skb) + 1);
}
/*
 * The fragment offset is actually the high 13 bits of the last 16 bit field,
 * so we would normally need to right shift 3 places. However, it stores the
 * offset in 8 byte chunks, which would involve a 3 place left shift. So we
 * just mask off the last 3 bits and be done with it.
 */
#define FRAG_OFF_MASK (~0x7U)

/*
 * The minimum header length. The header may be longer if the optional
 * WSI field is used.
 */
#define CAPWAP_MIN_HLEN (sizeof(struct udphdr) + sizeof(struct capwaphdr))
129 struct inet_frag_queue ifq;
130 struct frag_match match;
136 #define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)
138 static struct sk_buff *fragment(struct sk_buff *, const struct vport *,
139 struct dst_entry *dst, unsigned int hlen);
140 static void defrag_init(void);
141 static void defrag_exit(void);
142 static struct sk_buff *defrag(struct sk_buff *, bool frag_last);
144 static void capwap_frag_init(struct inet_frag_queue *, void *match);
145 static unsigned int capwap_frag_hash(struct inet_frag_queue *);
146 static int capwap_frag_match(struct inet_frag_queue *, void *match);
147 static void capwap_frag_expire(unsigned long ifq);
149 static struct inet_frags frag_state = {
150 .constructor = capwap_frag_init,
151 .qsize = sizeof(struct frag_queue),
152 .hashfn = capwap_frag_hash,
153 .match = capwap_frag_match,
154 .frag_expire = capwap_frag_expire,
155 .secret_interval = CAPWAP_FRAG_SECRET_INTERVAL,
157 static struct netns_frags frag_netns_state = {
158 .timeout = CAPWAP_FRAG_TIMEOUT,
159 .high_thresh = CAPWAP_FRAG_MAX_MEM,
160 .low_thresh = CAPWAP_FRAG_PRUNE_MEM,
163 static struct socket *capwap_rcv_socket;
165 static int capwap_hdr_len(const struct tnl_mutable_config *mutable)
167 int size = CAPWAP_MIN_HLEN;
169 /* CAPWAP has no checksums. */
170 if (mutable->flags & TNL_F_CSUM)
173 /* if keys are specified, then add WSI field */
174 if (mutable->out_key || (mutable->flags & TNL_F_OUT_KEY_ACTION)) {
175 size += sizeof(struct capwaphdr_wsi) +
176 sizeof(struct capwaphdr_wsi_key);
182 static void capwap_build_header(const struct vport *vport,
183 const struct tnl_mutable_config *mutable,
186 struct udphdr *udph = header;
187 struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
189 udph->source = htons(CAPWAP_SRC_PORT);
190 udph->dest = htons(CAPWAP_DST_PORT);
196 if (mutable->out_key || (mutable->flags & TNL_F_OUT_KEY_ACTION)) {
197 struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
199 cwh->begin = CAPWAP_KEYED;
201 /* -1 for wsi_len byte, not included in length as per spec */
202 wsi->wsi_len = sizeof(struct capwaphdr_wsi) - 1
203 + sizeof(struct capwaphdr_wsi_key);
204 wsi->flags = CAPWAP_WSI_F_KEY64;
205 wsi->reserved_padding = 0;
207 if (mutable->out_key) {
208 struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
209 opt->key = mutable->out_key;
212 /* make packet readable by old capwap code */
213 cwh->begin = CAPWAP_NO_WSI;
217 static struct sk_buff *capwap_update_header(const struct vport *vport,
218 const struct tnl_mutable_config *mutable,
219 struct dst_entry *dst,
222 struct udphdr *udph = udp_hdr(skb);
224 if (mutable->flags & TNL_F_OUT_KEY_ACTION) {
225 /* first field in WSI is key */
226 struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
227 struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
228 struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
230 opt->key = OVS_CB(skb)->tun_id;
233 udph->len = htons(skb->len - skb_transport_offset(skb));
235 if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst))) {
236 unsigned int hlen = skb_transport_offset(skb) + capwap_hdr_len(mutable);
237 skb = fragment(skb, vport, dst, hlen);
243 static int process_capwap_wsi(struct sk_buff *skb, __be64 *key)
245 struct capwaphdr *cwh = capwap_hdr(skb);
246 struct capwaphdr_wsi *wsi;
251 if (((cwh->begin & CAPWAP_WBID_MASK) != CAPWAP_WBID_30))
254 if (cwh->begin & CAPWAP_F_RMAC)
255 rmac_len = CAPWAP_RMAC_LEN;
257 hdr_len = ntohl(cwh->begin & CAPWAP_HLEN_MASK) >> CAPWAP_HLEN_SHIFT;
259 if (unlikely(sizeof(struct capwaphdr) + rmac_len + sizeof(struct capwaphdr_wsi) > hdr_len))
262 /* read wsi header to find out how big it really is */
263 wsi = (struct capwaphdr_wsi *)((u8 *)(cwh + 1) + rmac_len);
264 /* +1 for length byte not included in wsi_len */
265 wsi_len = 1 + wsi->wsi_len;
267 if (unlikely(sizeof(struct capwaphdr) + rmac_len + wsi_len != hdr_len))
270 wsi_len -= sizeof(struct capwaphdr_wsi);
272 if (wsi->flags & CAPWAP_WSI_F_KEY64) {
273 struct capwaphdr_wsi_key *opt;
275 if (unlikely(wsi_len < sizeof(struct capwaphdr_wsi_key)))
278 opt = (struct capwaphdr_wsi_key *)(wsi + 1);
285 static inline struct sk_buff *process_capwap_proto(struct sk_buff *skb,
288 struct capwaphdr *cwh = capwap_hdr(skb);
289 int hdr_len = sizeof(struct udphdr);
291 if (unlikely((cwh->begin & CAPWAP_ZERO_MASK) != 0))
294 hdr_len += ntohl(cwh->begin & CAPWAP_HLEN_MASK) >> CAPWAP_HLEN_SHIFT;
295 if (unlikely(hdr_len < CAPWAP_MIN_HLEN))
298 if (unlikely(!pskb_may_pull(skb, hdr_len + ETH_HLEN)))
301 cwh = capwap_hdr(skb);
302 __skb_pull(skb, hdr_len);
303 skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
305 if (cwh->begin & CAPWAP_F_FRAG) {
306 skb = defrag(skb, (__force bool)(cwh->begin & CAPWAP_F_LASTFRAG));
309 cwh = capwap_hdr(skb);
312 if ((cwh->begin & CAPWAP_F_WSI) && process_capwap_wsi(skb, key))
321 /* Called with rcu_read_lock and BH disabled. */
322 static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
325 const struct tnl_mutable_config *mutable;
329 if (unlikely(!pskb_may_pull(skb, CAPWAP_MIN_HLEN + ETH_HLEN)))
332 skb = process_capwap_proto(skb, &key);
337 vport = tnl_find_port(iph->daddr, iph->saddr, key,
338 TNL_T_PROTO_CAPWAP | TNL_T_KEY_EITHER, &mutable);
339 if (unlikely(!vport)) {
340 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
344 if (mutable->flags & TNL_F_IN_KEY_MATCH)
345 OVS_CB(skb)->tun_id = key;
347 OVS_CB(skb)->tun_id = 0;
349 tnl_rcv(vport, skb, iph->tos);
358 static const struct tnl_ops capwap_tnl_ops = {
359 .tunnel_type = TNL_T_PROTO_CAPWAP,
360 .ipproto = IPPROTO_UDP,
361 .hdr_len = capwap_hdr_len,
362 .build_header = capwap_build_header,
363 .update_header = capwap_update_header,
366 static struct vport *capwap_create(const struct vport_parms *parms)
368 return tnl_create(parms, &capwap_vport_ops, &capwap_tnl_ops);
371 /* Random value. Irrelevant as long as it's not 0 since we set the handler. */
372 #define UDP_ENCAP_CAPWAP 10
373 static int capwap_init(void)
376 struct sockaddr_in sin;
378 err = sock_create(AF_INET, SOCK_DGRAM, 0, &capwap_rcv_socket);
382 sin.sin_family = AF_INET;
383 sin.sin_addr.s_addr = htonl(INADDR_ANY);
384 sin.sin_port = htons(CAPWAP_DST_PORT);
386 err = kernel_bind(capwap_rcv_socket, (struct sockaddr *)&sin,
387 sizeof(struct sockaddr_in));
391 udp_sk(capwap_rcv_socket->sk)->encap_type = UDP_ENCAP_CAPWAP;
392 udp_sk(capwap_rcv_socket->sk)->encap_rcv = capwap_rcv;
399 sock_release(capwap_rcv_socket);
401 pr_warn("cannot register capwap protocol handler\n");
405 static void capwap_exit(void)
408 sock_release(capwap_rcv_socket);
411 static void copy_skb_metadata(struct sk_buff *from, struct sk_buff *to)
413 to->pkt_type = from->pkt_type;
414 to->priority = from->priority;
415 to->protocol = from->protocol;
416 skb_dst_set(to, dst_clone(skb_dst(from)));
418 to->mark = from->mark;
421 skb_set_owner_w(to, from->sk);
423 #ifdef CONFIG_NET_SCHED
424 to->tc_index = from->tc_index;
426 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
427 to->ipvs_property = from->ipvs_property;
429 skb_copy_secmark(to, from);
432 static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
433 struct dst_entry *dst, unsigned int hlen)
435 struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
436 unsigned int headroom;
437 unsigned int max_frame_len = dst_mtu(dst) + skb_network_offset(skb);
438 struct sk_buff *result = NULL, *list_cur = NULL;
439 unsigned int remaining;
443 if (hlen + ~FRAG_OFF_MASK + 1 > max_frame_len) {
445 pr_warn("capwap link mtu (%d) is less than minimum packet (%d)\n",
447 hlen - skb_network_offset(skb) + ~FRAG_OFF_MASK + 1);
451 remaining = skb->len - hlen;
453 frag_id = htons(atomic_inc_return(&tnl_vport->frag_id));
455 headroom = dst->header_len + 16;
456 if (!skb_network_offset(skb))
457 headroom += LL_RESERVED_SPACE(dst->dev);
460 struct sk_buff *skb2;
463 struct capwaphdr *cwh;
465 frag_size = min(remaining, max_frame_len - hlen);
466 if (remaining > frag_size)
467 frag_size &= FRAG_OFF_MASK;
469 skb2 = alloc_skb(headroom + hlen + frag_size, GFP_ATOMIC);
473 skb_reserve(skb2, headroom);
474 __skb_put(skb2, hlen + frag_size);
476 if (skb_network_offset(skb))
477 skb_reset_mac_header(skb2);
478 skb_set_network_header(skb2, skb_network_offset(skb));
479 skb_set_transport_header(skb2, skb_transport_offset(skb));
481 /* Copy (Ethernet)/IP/UDP/CAPWAP header. */
482 copy_skb_metadata(skb, skb2);
483 skb_copy_from_linear_data(skb, skb2->data, hlen);
485 /* Copy this data chunk. */
486 if (skb_copy_bits(skb, hlen + offset, skb2->data + hlen, frag_size))
489 udph = udp_hdr(skb2);
490 udph->len = htons(skb2->len - skb_transport_offset(skb2));
492 cwh = capwap_hdr(skb2);
493 if (remaining > frag_size)
494 cwh->begin |= FRAG_HDR;
496 cwh->begin |= FRAG_LAST_HDR;
497 cwh->frag_id = frag_id;
498 cwh->frag_off = htons(offset);
501 list_cur->next = skb2;
504 result = list_cur = skb2;
507 remaining -= frag_size;
513 tnl_free_linked_skbs(result);
519 /* All of the following functions relate to fragmentation reassembly. */
521 static inline struct frag_queue *ifq_cast(struct inet_frag_queue *ifq)
523 return container_of(ifq, struct frag_queue, ifq);
526 static u32 frag_hash(struct frag_match *match)
528 return jhash_3words((__force u16)match->id, (__force u32)match->saddr,
529 (__force u32)match->daddr,
530 frag_state.rnd) & (INETFRAGS_HASHSZ - 1);
533 static struct frag_queue *queue_find(struct frag_match *match)
535 struct inet_frag_queue *ifq;
537 read_lock(&frag_state.lock);
539 ifq = inet_frag_find(&frag_netns_state, &frag_state, match, frag_hash(match));
543 /* Unlock happens inside inet_frag_find(). */
545 return ifq_cast(ifq);
548 static struct sk_buff *frag_reasm(struct frag_queue *fq, struct net_device *dev)
550 struct sk_buff *head = fq->ifq.fragments;
551 struct sk_buff *frag;
553 /* Succeed or fail, we're done with this queue. */
554 inet_frag_kill(&fq->ifq, &frag_state);
556 if (fq->ifq.len > 65535)
559 /* Can't have the head be a clone. */
560 if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
564 * We're about to build frag list for this SKB. If it already has a
565 * frag list, alloc a new SKB and put the existing frag list there.
567 if (skb_shinfo(head)->frag_list) {
571 frag = alloc_skb(0, GFP_ATOMIC);
575 frag->next = head->next;
577 skb_shinfo(frag)->frag_list = skb_shinfo(head)->frag_list;
578 skb_shinfo(head)->frag_list = NULL;
580 for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
581 paged_len += skb_shinfo(head)->frags[i].size;
582 frag->len = frag->data_len = head->data_len - paged_len;
583 head->data_len -= frag->len;
584 head->len -= frag->len;
586 frag->ip_summed = head->ip_summed;
587 atomic_add(frag->truesize, &fq->ifq.net->mem);
590 skb_shinfo(head)->frag_list = head->next;
591 atomic_sub(head->truesize, &fq->ifq.net->mem);
593 /* Properly account for data in various packets. */
594 for (frag = head->next; frag; frag = frag->next) {
595 head->data_len += frag->len;
596 head->len += frag->len;
598 if (head->ip_summed != frag->ip_summed)
599 head->ip_summed = CHECKSUM_NONE;
600 else if (head->ip_summed == CHECKSUM_COMPLETE)
601 head->csum = csum_add(head->csum, frag->csum);
603 head->truesize += frag->truesize;
604 atomic_sub(frag->truesize, &fq->ifq.net->mem);
609 head->tstamp = fq->ifq.stamp;
610 fq->ifq.fragments = NULL;
615 static struct sk_buff *frag_queue(struct frag_queue *fq, struct sk_buff *skb,
616 u16 offset, bool frag_last)
618 struct sk_buff *prev, *next;
619 struct net_device *dev;
622 if (fq->ifq.last_in & INET_FRAG_COMPLETE)
628 end = offset + skb->len;
632 * Last fragment, shouldn't already have data past our end or
633 * have another last fragment.
635 if (end < fq->ifq.len || fq->ifq.last_in & INET_FRAG_LAST_IN)
638 fq->ifq.last_in |= INET_FRAG_LAST_IN;
641 /* Fragments should align to 8 byte chunks. */
642 if (end & ~FRAG_OFF_MASK)
645 if (end > fq->ifq.len) {
647 * Shouldn't have data past the end, if we already
650 if (fq->ifq.last_in & INET_FRAG_LAST_IN)
657 /* Find where we fit in. */
659 for (next = fq->ifq.fragments; next != NULL; next = next->next) {
660 if (FRAG_CB(next)->offset >= offset)
666 * Overlapping fragments aren't allowed. We shouldn't start before
667 * the end of the previous fragment.
669 if (prev && FRAG_CB(prev)->offset + prev->len > offset)
672 /* We also shouldn't end after the beginning of the next fragment. */
673 if (next && end > FRAG_CB(next)->offset)
676 FRAG_CB(skb)->offset = offset;
678 /* Link into list. */
683 fq->ifq.fragments = skb;
688 fq->ifq.stamp = skb->tstamp;
689 fq->ifq.meat += skb->len;
690 atomic_add(skb->truesize, &fq->ifq.net->mem);
692 fq->ifq.last_in |= INET_FRAG_FIRST_IN;
694 /* If we have all fragments do reassembly. */
695 if (fq->ifq.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
696 fq->ifq.meat == fq->ifq.len)
697 return frag_reasm(fq, dev);
699 write_lock(&frag_state.lock);
700 list_move_tail(&fq->ifq.lru_list, &fq->ifq.net->lru_list);
701 write_unlock(&frag_state.lock);
710 static struct sk_buff *defrag(struct sk_buff *skb, bool frag_last)
712 struct iphdr *iph = ip_hdr(skb);
713 struct capwaphdr *cwh = capwap_hdr(skb);
714 struct frag_match match;
716 struct frag_queue *fq;
718 if (atomic_read(&frag_netns_state.mem) > frag_netns_state.high_thresh)
719 inet_frag_evictor(&frag_netns_state, &frag_state);
721 match.daddr = iph->daddr;
722 match.saddr = iph->saddr;
723 match.id = cwh->frag_id;
724 frag_off = ntohs(cwh->frag_off) & FRAG_OFF_MASK;
726 fq = queue_find(&match);
728 spin_lock(&fq->ifq.lock);
729 skb = frag_queue(fq, skb, frag_off, frag_last);
730 spin_unlock(&fq->ifq.lock);
732 inet_frag_put(&fq->ifq, &frag_state);
741 static void defrag_init(void)
743 inet_frags_init(&frag_state);
744 inet_frags_init_net(&frag_netns_state);
747 static void defrag_exit(void)
749 inet_frags_exit_net(&frag_netns_state, &frag_state);
750 inet_frags_fini(&frag_state);
753 static void capwap_frag_init(struct inet_frag_queue *ifq, void *match_)
755 struct frag_match *match = match_;
757 ifq_cast(ifq)->match = *match;
760 static unsigned int capwap_frag_hash(struct inet_frag_queue *ifq)
762 return frag_hash(&ifq_cast(ifq)->match);
765 static int capwap_frag_match(struct inet_frag_queue *ifq, void *a_)
767 struct frag_match *a = a_;
768 struct frag_match *b = &ifq_cast(ifq)->match;
770 return a->id == b->id && a->saddr == b->saddr && a->daddr == b->daddr;
773 /* Run when the timeout for a given queue expires. */
774 static void capwap_frag_expire(unsigned long ifq)
776 struct frag_queue *fq;
778 fq = ifq_cast((struct inet_frag_queue *)ifq);
780 spin_lock(&fq->ifq.lock);
782 if (!(fq->ifq.last_in & INET_FRAG_COMPLETE))
783 inet_frag_kill(&fq->ifq, &frag_state);
785 spin_unlock(&fq->ifq.lock);
786 inet_frag_put(&fq->ifq, &frag_state);
789 const struct vport_ops capwap_vport_ops = {
790 .type = OVS_VPORT_TYPE_CAPWAP,
791 .flags = VPORT_F_GEN_STATS | VPORT_F_TUN_ID,
794 .create = capwap_create,
795 .destroy = tnl_destroy,
796 .set_addr = tnl_set_addr,
797 .get_name = tnl_get_name,
798 .get_addr = tnl_get_addr,
799 .get_options = tnl_get_options,
800 .set_options = tnl_set_options,
801 .get_dev_flags = vport_gen_get_dev_flags,
802 .is_running = vport_gen_is_running,
803 .get_operstate = vport_gen_get_operstate,
807 #warning CAPWAP tunneling will not be available on kernels before 2.6.26
808 #endif /* Linux kernel < 2.6.26 */