3 * Distributed under the terms of the GNU GPL version 2.
4 * Copyright (c) 2008 Nicira Networks
7 #include <linux/etherdevice.h>
8 #include <linux/netdevice.h>
9 #include <linux/netfilter.h>
10 #include <linux/netfilter_bridge.h>
11 #include <linux/netfilter_ipv4.h>
14 #include <linux/icmp.h>
15 #include <linux/if_ether.h>
17 #include <net/route.h>
21 #include "nx_act_snat.h"
24 /* We need these fake structures to make netfilter happy --
25 * lots of places assume that skb->dst != NULL, which isn't
26 * all that unreasonable.
28 * Currently, we fill in the PMTU entry because netfilter
29 * refragmentation needs it, and the rt_flags entry because
30 * ipt_REJECT needs it. Future netfilter modules might
31 * require us to fill additional fields. */
/* Minimal fake device: only hard_header_len is filled in, which is all
 * the netfilter paths we traverse appear to need. */
32 static struct net_device __fake_net_device = {
33 .hard_header_len = ETH_HLEN
/* Fake route attached to skbs before they enter the netfilter hooks
 * (see snat_pre_route_finish), satisfying code that assumes a non-NULL
 * skb->dst.  Refcount starts at 1 and the structure is never freed. */
36 static struct rtable __fake_rtable = {
39 .__refcnt = ATOMIC_INIT(1),
40 .dev = &__fake_net_device,
41 .path = &__fake_rtable.u.dst,
42 .metrics = {[RTAX_MTU - 1] = 1500}, /* PMTU, needed for refragmentation */
49 /* Define ARP for IP since the Linux headers don't do it cleanly.
 * Layout must match an Ethernet/IPv4 ARP packet exactly, hence the
 * packed attribute and the size assertion below. */
56 uint8_t ar_sha[ETH_ALEN]; /* sender hardware (Ethernet) address */
58 uint8_t ar_tha[ETH_ALEN]; /* target hardware (Ethernet) address */
60 } __attribute__((packed));
61 OFP_ASSERT(sizeof(struct ip_arphdr) == 28);
/* Attach a zeroed nf_bridge_info to 'skb' with its use count set to 1.
 * Returns the new structure, or NULL if the atomic allocation failed. */
63 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
65 skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
66 if (likely(skb->nf_bridge))
67 atomic_set(&(skb->nf_bridge->use), 1);
69 return skb->nf_bridge;
72 /* Save a copy of the original Ethernet header (plus any bridge
 * encapsulation header) into skb->nf_bridge->data so that
 * snat_copy_header() can restore it after the netfilter hooks run. */
73 void snat_save_header(struct sk_buff *skb)
75 int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
/* The header has already been pulled, so it sits at a negative offset
 * from skb->data. */
80 skb_copy_from_linear_data_offset(skb, -header_size,
81 skb->nf_bridge->data, header_size);
84 /* Restore a saved Ethernet header (saved by snat_save_header()) in
 * front of skb->data, re-pushing any encapsulation header as well.
 * Returns the result of skb_cow_head() on failure. */
85 int snat_copy_header(struct sk_buff *skb)
88 int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
/* Make sure there is writable headroom for the restored header. */
93 err = skb_cow_head(skb, header_size);
97 skb_copy_to_linear_data_offset(skb, -header_size,
98 skb->nf_bridge->data, header_size);
99 __skb_push(skb, nf_bridge_encap_header_len(skb));
103 /* Push the Ethernet header back on and transmit the packet. */
105 dp_xmit_skb_push(struct sk_buff *skb)
107 skb_push(skb, ETH_HLEN);
108 return dp_xmit_skb(skb);
111 /* Perform maintenance related to a SNAT'd interface. Currently, this only
112 * checks whether MAC->IP bindings have expired.
114 * Called with the RCU read lock */
116 snat_maint(struct net_bridge_port *p)
118 struct snat_conf *sc;
119 struct snat_mapping *m, *n;
121 unsigned long timeout;
/* p->lock protects the snat configuration and its mapping list. */
123 spin_lock_irqsave(&p->lock, flags);
128 timeout = sc->mac_timeout * HZ;
/* Safe iteration: entries may be removed while walking the list. */
130 list_for_each_entry_safe (m, n, &sc->mappings, node) {
131 if (time_after(jiffies, m->used + timeout)) {
138 spin_unlock_irqrestore(&p->lock, flags);
141 /* When the packet is bound for a local interface, strip off the fake
 * route that snat_pre_route_finish() attached, dropping the reference
 * taken on __fake_rtable. */
144 void snat_local_in(struct sk_buff *skb)
146 if (skb->dst == (struct dst_entry *)&__fake_rtable) {
147 dst_release(skb->dst);
152 /* Check whether destination IP's address is in the IP->MAC mappings.
153 * If it is, then overwrite the destination MAC with the value from the
 * mapping.  Caller must hold p->lock (see snat_pre_route_finish()).
156 * Returns -1 if there is a problem, otherwise 0. */
158 dnat_mac(struct net_bridge_port *p, struct sk_buff *skb)
160 struct snat_conf *sc = p->snat;
161 struct iphdr *iph = ip_hdr(skb);
162 struct ethhdr *eh = eth_hdr(skb);
163 struct snat_mapping *m;
/* Only IP packets carry mappings we track. */
165 if (skb->protocol != htons(ETH_P_IP))
168 list_for_each_entry (m, &sc->mappings, node) {
169 if (m->ip_addr == iph->daddr){
/* The skb may be shared; get a private copy before writing. */
171 if (!make_writable(&skb)) {
173 printk("make_writable failed\n");
177 memcpy(eh->h_dest, m->hw_addr, ETH_ALEN);
/* Returns nonzero if network-order 'ip_addr' falls inside the SNAT
 * address range configured in 'sc'.  Caller must hold the port lock
 * (see snat_this_address() and snat_pre_route_finish()). */
186 __snat_this_address(struct snat_conf *sc, u32 ip_addr)
/* Range bounds are stored in host byte order. */
189 u32 h_ip_addr = ntohl(ip_addr);
190 return (h_ip_addr >= sc->ip_addr_start &&
191 h_ip_addr <= sc->ip_addr_end);
/* Locking wrapper around __snat_this_address(): checks whether
 * network-order 'ip_addr' is SNAT'd on port 'p'. */
197 snat_this_address(struct net_bridge_port *p, u32 ip_addr)
199 unsigned long int flags;
202 spin_lock_irqsave(&p->lock, flags);
203 retval = __snat_this_address(p->snat, ip_addr);
204 spin_unlock_irqrestore(&p->lock, flags);
209 /* Must hold RCU lock.
 * Walks the datapath's port list and returns the first port whose SNAT
 * address range contains 'ip_addr' (network byte order). */
210 static struct net_bridge_port *
211 get_nbp_by_ip_addr(struct datapath *dp, u32 ip_addr)
213 struct net_bridge_port *p;
215 list_for_each_entry_rcu (p, &dp->port_list, node)
216 if (snat_this_address(p, ip_addr))
/* Continuation ("okfn") invoked after the NF_INET_PRE_ROUTING hook in
 * snat_pre_route(): attaches the fake route, applies DNAT of the
 * destination MAC where needed, and hands the packet to the OpenFlow
 * forwarding path. */
223 snat_pre_route_finish(struct sk_buff *skb)
225 struct net_bridge_port *p = skb->dev->br_port;
226 struct snat_conf *sc;
227 struct iphdr *iph = ip_hdr(skb);
/* Netfilter code assumes a non-NULL skb->dst; give it our fake one. */
230 skb->dst = (struct dst_entry *)&__fake_rtable;
233 /* Don't process packets that were not translated due to NAT */
234 spin_lock_irqsave(&p->lock, flags);
236 if (!__snat_this_address(sc, iph->daddr)) {
237 /* If SNAT is configured for this input device, check the
238 * IP->MAC mappings to see if we should update the destination
 * MAC. */
241 dnat_mac(skb->dev->br_port, skb);
244 spin_unlock_irqrestore(&p->lock, flags);
246 /* Pass the translated packet as input to the OpenFlow stack, which
 * consumes it.  The Ethernet header is pushed back on first. */
248 skb_push(skb, ETH_HLEN);
249 skb_reset_mac_header(skb);
250 fwd_port_input(p->dp->chain, skb, p);
255 /* Checks whether 'skb' is an ARP request for an SNAT'd interface. If
256 * so, it will generate a response.
258 * Returns 0 if the packet was not handled. Otherwise, -1 is returned
259 * and the caller is responsible for freeing 'skb'. */
261 handle_arp_snat(struct sk_buff *skb)
263 struct net_bridge_port *s_nbp = skb->dev->br_port;
264 struct net_bridge_port *nat_nbp;
265 struct ip_arphdr *ah;
266 uint8_t mac_addr[ETH_ALEN];
/* Make sure the full ARP header is in the linear data area. */
268 if (!pskb_may_pull(skb, sizeof *ah))
271 ah = (struct ip_arphdr *)arp_hdr(skb);
/* Only answer Ethernet/IPv4 ARP requests. */
272 if ((ah->ar_op != htons(ARPOP_REQUEST))
273 || ah->ar_hln != ETH_ALEN
274 || ah->ar_pro != htons(ETH_P_IP)
/* Find the port (if any) whose SNAT range covers the target IP. */
279 nat_nbp = get_nbp_by_ip_addr(s_nbp->dp, ah->ar_tip);
/* Reply with the datapath's own MAC when the request arrived on the
 * SNAT'd port itself; otherwise use the configured SNAT MAC if set. */
284 if (s_nbp == nat_nbp)
285 memcpy(mac_addr, s_nbp->dp->netdev->dev_addr, sizeof(mac_addr));
286 else if (!is_zero_ether_addr(nat_nbp->snat->mac_addr))
287 memcpy(mac_addr, nat_nbp->snat->mac_addr, sizeof(mac_addr));
294 arp_send(ARPOP_REPLY, ETH_P_ARP, ah->ar_sip, skb->dev, ah->ar_tip,
295 ah->ar_sha, mac_addr, ah->ar_sha);
300 /* Checks whether 'skb' is a ping request for an SNAT'd interface. If
301 * so, it will generate a response.
303 * Returns 0 if the packet was not handled. Otherwise, -1 is returned
304 * and the caller is responsible for freeing 'skb'. */
306 handle_icmp_snat(struct sk_buff *skb)
308 struct net_bridge_port *p = skb->dev->br_port;
311 struct icmphdr *icmph;
312 uint8_t tmp_eth[ETH_ALEN];
314 struct sk_buff *nskb;
316 /* We're only interested in addresses we rewrite. */
318 if (!snat_this_address(p, iph->daddr)) {
322 /* Drop fragments and packets not long enough to hold the ICMP
 * header. */
324 if ((ntohs(iph->frag_off) & IP_OFFSET) != 0 ||
325 !pskb_may_pull(skb, skb_transport_offset(skb) + 4))
328 /* We only respond to echo requests to our address. Continue
329 * processing replies and other ICMP messages since they may be
330 * intended for NAT'd hosts. */
331 icmph = icmp_hdr(skb);
332 if (icmph->type != ICMP_ECHO)
335 /* Send an echo reply in response */
336 nskb = skb_copy(skb, GFP_ATOMIC);
339 printk("skb copy failed for icmp reply\n");
343 /* Update Ethernet header: swap source and destination MACs. */
345 memcpy(tmp_eth, eh->h_dest, ETH_ALEN);
346 memcpy(eh->h_dest, eh->h_source, ETH_ALEN);
347 memcpy(eh->h_source, tmp_eth, ETH_ALEN);
/* Update IP header by echoing it back with addresses swapped.
350 * This is kind of busted, at least in that it doesn't check that the
351 * echoed IP options make sense. */
358 iph->daddr = iph->saddr;
/* Recompute the IP header checksum after the address swap. */
360 iph->check = ip_fast_csum(iph, iph->ihl);
362 /* Update ICMP header. */
363 icmph = icmp_hdr(nskb);
364 icmph->type = ICMP_ECHOREPLY;
/* Checksum covers the ICMP header plus payload to the end of data. */
366 icmph->checksum = ip_compute_csum(icmph,
367 nskb->tail - nskb->transport_header);
369 dp_xmit_skb_push(nskb);
374 /* Check if any SNAT maintenance needs to be done on 'skb' before it's
375 * checked against the datapath's tables. This includes DNAT
376 * modification based on prior SNAT action and responding to ARP and
377 * echo requests for the SNAT interface.
379 * Returns -1 if the packet was handled and consumed, 0 if the caller
380 * should continue to process 'skb'.
 */
383 snat_pre_route(struct sk_buff *skb)
388 WARN_ON_ONCE(skb_network_offset(skb));
/* ARP requests for SNAT'd addresses are answered locally. */
389 if (skb->protocol == htons(ETH_P_ARP)) {
390 if (handle_arp_snat(skb))
/* Anything that is neither ARP nor IP passes through untouched. */
394 else if (skb->protocol != htons(ETH_P_IP))
/* Basic IP header sanity checks, mirroring ip_rcv(). */
397 if (!pskb_may_pull(skb, sizeof *iph))
401 if (iph->ihl < 5 || iph->version != 4)
404 if (!pskb_may_pull(skb, ip_hdrlen(skb)))
406 skb_set_transport_header(skb, ip_hdrlen(skb));
408 /* Check if we need to echo reply for this address */
410 if ((iph->protocol == IPPROTO_ICMP) && (handle_icmp_snat(skb)))
/* Drop packets with a bad IP checksum or inconsistent length. */
414 if (unlikely(ip_fast_csum(iph, iph->ihl)))
417 len = ntohs(iph->tot_len);
418 if ((skb->len < len) || len < (iph->ihl*4))
421 if (pskb_trim_rcsum(skb, len))
/* Replace any existing bridge info with a fresh one for our use. */
424 nf_bridge_put(skb->nf_bridge);
425 if (!nf_bridge_alloc(skb))
/* Run the PRE_ROUTING hook; snat_pre_route_finish() continues. */
428 NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
429 snat_pre_route_finish);
/* Continuation for snat_skb(): runs the packet through the
 * POST_ROUTING netfilter hook on its way out. */
439 snat_skb_finish(struct sk_buff *skb)
441 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
447 /* Update the MAC->IP mappings for the private side of the SNAT'd
 * interface: refresh the entry for the packet's source IP, or create a
 * new one if none exists. */
450 update_mapping(struct net_bridge_port *p, const struct sk_buff *skb)
453 struct snat_conf *sc;
454 const struct iphdr *iph = ip_hdr(skb);
455 const struct ethhdr *eh = eth_hdr(skb);
456 struct snat_mapping *m;
458 spin_lock_irqsave(&p->lock, flags);
/* Refresh an existing entry for this source IP if we have one. */
463 list_for_each_entry (m, &sc->mappings, node) {
464 if (m->ip_addr == iph->saddr){
465 memcpy(m->hw_addr, eh->h_source, ETH_ALEN);
/* No entry found: allocate and record a new mapping. */
471 m = kmalloc(sizeof *m, GFP_ATOMIC);
474 m->ip_addr = iph->saddr;
475 memcpy(m->hw_addr, eh->h_source, ETH_ALEN);
478 list_add(&m->node, &sc->mappings);
481 spin_unlock_irqrestore(&p->lock, flags);
484 /* Perform SNAT modification on 'skb' and send out 'out_port'. If the
485 * port was not configured for SNAT, it will be sent through the interface
486 * unmodified. 'skb' is not consumed, so caller will need to free it.
 */
489 snat_skb(struct datapath *dp, const struct sk_buff *skb, int out_port)
491 struct net_bridge_port *p = dp->ports[out_port];
492 struct sk_buff *nskb;
497 /* FIXME: Expensive. Just need to skb_clone() here?
498 * (However, the skb_copy() does linearize and ensure that the headers
499 * are accessible.) */
500 nskb = skb_copy(skb, GFP_ATOMIC);
506 /* We only SNAT IP, so just send it on its way if not */
507 if (skb->protocol != htons(ETH_P_IP)) {
512 /* Set the source MAC to the OF interface */
513 memcpy(eth_hdr(nskb)->h_source, dp->netdev->dev_addr, ETH_ALEN);
/* Remember the sender so replies can be DNAT'd back (dnat_mac()). */
515 update_mapping(p, skb);
517 /* Take the Ethernet header back off for netfilter hooks. */
518 skb_pull(nskb, ETH_HLEN);
/* Run FORWARD hook; snat_skb_finish() handles POST_ROUTING and xmit. */
520 NF_HOOK(PF_INET, NF_INET_FORWARD, nskb, skb->dev, nskb->dev,
524 /* Remove SNAT configuration on port 'p'.
526 * NB: The caller must hold the port's spinlock. */
528 snat_free_conf(struct net_bridge_port *p)
530 struct snat_conf *sc = p->snat;
535 /* Free existing mapping entries */
536 while (!list_empty(&sc->mappings)) {
537 struct snat_mapping *m = list_entry(sc->mappings.next,
538 struct snat_mapping, node);
549 /* Remove SNAT configuration from an interface. */
551 snat_del_port(struct datapath *dp, const struct nx_snat_config *nsc)
554 uint16_t port = ntohs(nsc->port);
555 struct net_bridge_port *p = dp->ports[port];
/* Reject requests for ports that don't exist on this datapath. */
559 printk("Attempt to remove snat on non-existent port: %d\n", port);
563 spin_lock_irqsave(&p->lock, flags);
564 if (snat_free_conf(p)) {
565 /* SNAT not configured on this port */
566 spin_unlock_irqrestore(&p->lock, flags);
568 printk("Attempt to remove snat on non-snat port: %d\n", port);
572 spin_unlock_irqrestore(&p->lock, flags);
577 /* Add SNAT configuration to an interface. */
579 snat_add_port(struct datapath *dp, const struct nx_snat_config *nsc)
582 uint16_t port = ntohs(nsc->port);
583 struct net_bridge_port *p = dp->ports[port];
584 uint16_t mac_timeout = ntohs(nsc->mac_timeout);
585 struct snat_conf *sc;
/* A zero timeout from the controller means "use the default". */
587 if (mac_timeout == 0)
588 mac_timeout = MAC_TIMEOUT_DEFAULT;
592 printk("Attempt to add snat on non-existent port: %d\n", port);
596 /* If SNAT is already configured on the port, check whether the same
597 * IP addresses are used. If so, just update the mac timeout
598 * configuration. Otherwise, drop all SNAT configuration and
 * replace it. */
600 spin_lock_irqsave(&p->lock, flags);
602 if ((p->snat->ip_addr_start == ntohl(nsc->ip_addr_start))
603 && (p->snat->ip_addr_end == ntohl(nsc->ip_addr_end))) {
604 p->snat->mac_timeout = mac_timeout;
605 spin_unlock_irqrestore(&p->lock, flags);
609 /* Free the existing configuration and mappings. */
/* GFP_ATOMIC: allocating under the port spinlock with IRQs off. */
613 sc = kzalloc(sizeof *sc, GFP_ATOMIC);
615 spin_unlock_irqrestore(&p->lock, flags);
/* Address range is stored in host byte order (see __snat_this_address). */
619 sc->ip_addr_start = ntohl(nsc->ip_addr_start);
620 sc->ip_addr_end = ntohl(nsc->ip_addr_end);
621 sc->mac_timeout = mac_timeout;
622 memcpy(sc->mac_addr, nsc->mac_addr, sizeof(sc->mac_addr));
623 INIT_LIST_HEAD(&sc->mappings);
626 spin_unlock_irqrestore(&p->lock, flags);
631 /* Handle a SNAT configuration message: applies each add/delete entry
 * in the message in order.
633 * Returns 0 if no problems are found. Otherwise, a negative errno. */
635 snat_mod_config(struct datapath *dp, const struct nx_act_config *nac)
/* Number of snat_config entries is derived from the message length. */
637 int n_entries = (ntohs(nac->header.header.length) - sizeof *nac)
638 / sizeof (struct nx_snat_config);
642 for (i=0; i<n_entries; i++) {
643 const struct nx_snat_config *nsc = &nac->snat[i];
646 if (nsc->command == NXSC_ADD)
647 r = snat_add_port(dp, nsc);
649 r = snat_del_port(dp, nsc);