/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>


/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        old_dst = dest->dst_cache;
        dest->dst_cache = dst;
        dest->dst_rtos = rtos;
        dst_release(old_dst);
}

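/*
 *      Return the cached route if it is still usable: an entry that is
 *      marked obsolete, or that was looked up with a different TOS, is
 *      revalidated via dst->ops->check(); if the check fails, the entry
 *      is dropped from the cache and NULL is returned.
 */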
static inline struct dst_entry *
__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
{
        struct dst_entry *dst = dest->dst_cache;

        if (!dst)
                return NULL;
        if ((dst->obsolete || rtos != dest->dst_rtos) &&
            dst->ops->check(dst, cookie) == NULL) {
                dest->dst_cache = NULL;
                return NULL;
        }
        dst_hold(dst);
        return dst;
}

static inline struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
        struct rtable *rt;                      /* Route to the other host */
        struct ip_vs_dest *dest = cp->dest;

        if (dest) {
                spin_lock(&dest->dst_lock);
                if (!(rt = (struct rtable *)
                      __ip_vs_dst_check(dest, rtos, 0))) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
                                        .ip4_u = {
                                                .daddr = dest->addr,
                                                .saddr = 0,
                                                .tos = rtos, } },
                        };

                        if (ip_route_output_key(&rt, &fl)) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip_route_output error, "
                                             "dest: %u.%u.%u.%u\n",
                                             NIPQUAD(dest->addr));
                                return NULL;
                        }
                        __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
                        IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
                                  NIPQUAD(dest->addr),
                                  atomic_read(&rt->u.dst.__refcnt), rtos);
                }
                spin_unlock(&dest->dst_lock);
        } else {
                struct flowi fl = {
                        .oif = 0,
                        .nl_u = {
                                .ip4_u = {
                                        .daddr = cp->daddr,
                                        .saddr = 0,
                                        .tos = rtos, } },
                };

                if (ip_route_output_key(&rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: "
                                     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
                        return NULL;
                }
        }

        return rt;
}
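
/*
 *      The NAT, tunnel, DR and ICMP transmitters below obtain a held
 *      route from __ip_vs_get_out_rt(), release it with ip_rt_put() on
 *      their error paths, and otherwise hand the reference over to the
 *      skb via skb->dst, which then owns it until the packet is freed.
 */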


/*
 *      Release dest->dst_cache before a dest is removed
 */
void
ip_vs_dst_reset(struct ip_vs_dest *dest)
{
        struct dst_entry *old_dst;

        old_dst = dest->dst_cache;
        dest->dst_cache = NULL;
        dst_release(old_dst);
}

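/*
 *      IP_VS_XMIT re-injects the packet into the stack at LOCAL_OUT
 *      (through dst_output) and marks it with NFC_IPVS_PROPERTY so the
 *      IPVS hooks recognise it as already handled; the transmitters
 *      return NF_STOLEN once this macro has been invoked.
 */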
#define IP_VS_XMIT(skb, rt)                             \
do {                                                    \
        (skb)->nfcache |= NFC_IPVS_PROPERTY;            \
        NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,  \
                (rt)->u.dst.dev, dst_output);           \
} while (0)


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp)
{
        /* we do not touch skb and do not need pskb ptr */
        return NF_ACCEPT;
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it is intended only for use in a transparent cache
 *      cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = skb->nh.iph;
        u8     tos = iph->tos;
        int    mtu;
        struct flowi fl = {
                .oif = 0,
                .nl_u = {
                        .ip4_u = {
                                .daddr = iph->daddr,
                                .saddr = 0,
                                .tos = RT_TOS(tos), } },
        };

        EnterFunction(10);

        if (ip_route_output_key(&rt, &fl)) {
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
                             "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
                goto tx_error_icmp;
        }

        /* MTU checking */
        mtu = dst_pmtu(&rt->u.dst);
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(skb->nh.iph);

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

 tx_error_icmp:
        dst_link_failure(skb);
 tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}


/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
               struct ip_vs_protocol *pp)
{
        struct rtable *rt;              /* Route to the other host */
        int mtu;
        struct iphdr *iph = skb->nh.iph;

        EnterFunction(10);

        /* check if it is a connection of no-client-port */
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
                __u16 pt;
                if (skb_copy_bits(skb, iph->ihl*4, &pt, sizeof(pt)) < 0)
                        goto tx_error;
                ip_vs_conn_fill_cport(cp, pt);
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(pt));
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_pmtu(&rt->u.dst);
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
                goto tx_error_put;

        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* mangle the packet */
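        /* For TCP and UDP the protocol's dnat_handler rewrites the
         * transport header (destination port and checksum); the IP
         * destination address and IP checksum are rewritten below. */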
        if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
                goto tx_error;
        skb->nh.iph->daddr = cp->daddr;
        ip_send_check(skb->nh.iph);

        IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

        /* FIXME: if an application helper enlarges the packet so that its
           length exceeds the MTU of the outgoing device, there will still
           be an MTU problem. */

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        LeaveFunction(10);
        kfree_skb(skb);
        return NF_STOLEN;
  tx_error_put:
        ip_rt_put(rt);
        goto tx_error;
}


/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet whose
 *   destination is set to cp->daddr. Most of the code in this function
 *   is taken from ipip.c.
 *
 *   It is used in a VS/TUN cluster. The load balancer selects a real
 *   server from the cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When a server receives
 *   the encapsulated packet, it decapsulates the packet, processes
 *   the request and returns the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *old_iph = skb->nh.iph;
        u8     tos = old_iph->tos;
        u16    df = old_iph->frag_off;
        struct iphdr  *iph;                     /* Our new IP header */
        int    max_headroom;                    /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        if (skb->protocol != __constant_htons(ETH_P_IP)) {
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
                             "ETH_P_IP: %d, skb protocol: %d\n",
                             __constant_htons(ETH_P_IP), skb->protocol);
                goto tx_error;
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->u.dst.dev;

        mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
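        /* 68 is the smallest datagram every IPv4 module must be able to
         * forward without further fragmentation (RFC 791), i.e. the
         * minimum usable path MTU after the 20-byte outer IPIP header
         * has been subtracted above. */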
        if (mtu < 68) {
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
                goto tx_error;
        }
        if (skb->dst)
                skb->dst->ops->update_pmtu(skb->dst, mtu);

        df |= (old_iph->frag_off&__constant_htons(IP_DF));

        if ((old_iph->frag_off&__constant_htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        kfree_skb(skb);
                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
                        return NF_STOLEN;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = skb->nh.iph;
        }

        skb->h.raw = (void *) old_iph;

        /* fix old IP header checksum */
        ip_send_check(old_iph);

        skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       skb->nh.iph;
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_IPIP;
        iph->tos                =       tos;
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;
        iph->ttl                =       old_iph->ttl;
        iph->tot_len            =       htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, NULL);
        ip_send_check(iph);

        skb->ip_summed = CHECKSUM_NONE;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */

        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);

        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
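/*
 *      The packet is forwarded unmodified: only the route (and thus the
 *      next hop) changes, so the chosen real server must be configured
 *      to accept the virtual service address locally, typically on a
 *      non-ARPing loopback or dummy interface.
 */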
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
              struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = skb->nh.iph;
        int    mtu;

        EnterFunction(10);

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_pmtu(&rt->u.dst);
        if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(skb->nh.iph);

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}


/*
 *      ICMP packet transmitter
 *      called by ip_vs_in_icmp()
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                struct ip_vs_protocol *pp, int offset)
{
        struct rtable   *rt;    /* Route to the other host */
        int mtu;
        int rc;

        EnterFunction(10);

        /* ICMP packets for VS/TUN, VS/DR and LOCALNODE are forwarded
           directly here, because there is no need to translate the
           address/port back */
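        /* cp->packet_xmit is the transmitter bound to this connection's
         * forwarding method (e.g. ip_vs_tunnel_xmit or ip_vs_dr_xmit
         * above), so the ICMP packet follows the same path as the
         * connection's data packets. */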
        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
                if (cp->packet_xmit)
                        rc = cp->packet_xmit(skb, cp, pp);
                else
                        rc = NF_ACCEPT;
                /* do not touch skb anymore */
                atomic_inc(&cp->in_pkts);
                __ip_vs_conn_put(cp);
                goto out;
        }

        /*
         * mangle and send the packet here (only for VS/NAT)
         */

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_pmtu(&rt->u.dst);
        if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("ip_vs_icmp_xmit(): frag needed\n");
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!ip_vs_make_skb_writable(&skb, offset))
                goto tx_error_put;

        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop the old route when skb is not shared */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        ip_vs_nat_icmp(skb, pp, cp, 0);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
        IP_VS_XMIT(skb, rt);

        rc = NF_STOLEN;
        goto out;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        rc = NF_STOLEN;
  out:
        LeaveFunction(10);
        return rc;
  tx_error_put:
        ip_rt_put(rt);
        goto tx_error;
}