vserver 2.0 rc7
[linux-2.6.git] / net / ipv4 / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5  *
6  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
7  *              Julian Anastasov <ja@ssi.bg>
8  *
9  *              This program is free software; you can redistribute it and/or
10  *              modify it under the terms of the GNU General Public License
11  *              as published by the Free Software Foundation; either version
12  *              2 of the License, or (at your option) any later version.
13  *
14  * Changes:
15  *
16  */
17
18 #include <linux/kernel.h>
19 #include <linux/ip.h>
20 #include <linux/tcp.h>                  /* for tcphdr */
21 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
22 #include <net/udp.h>
23 #include <net/icmp.h>                   /* for icmp_send */
24 #include <net/route.h>                  /* for ip_route_output */
25 #include <linux/netfilter.h>
26 #include <linux/netfilter_ipv4.h>
27
28 #include <net/ip_vs.h>
29
30
31 /*
32  *      Destination cache to speed up outgoing route lookup
33  */
34 static inline void
35 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36 {
37         struct dst_entry *old_dst;
38
39         old_dst = dest->dst_cache;
40         dest->dst_cache = dst;
41         dest->dst_rtos = rtos;
42         dst_release(old_dst);
43 }
44
45 static inline struct dst_entry *
46 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47 {
48         struct dst_entry *dst = dest->dst_cache;
49
50         if (!dst)
51                 return NULL;
52         if ((dst->obsolete || rtos != dest->dst_rtos) &&
53             dst->ops->check(dst, cookie) == NULL) {
54                 dest->dst_cache = NULL;
55                 dst_release(dst);
56                 return NULL;
57         }
58         dst_hold(dst);
59         return dst;
60 }
61
62 static inline struct rtable *
63 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64 {
65         struct rtable *rt;                      /* Route to the other host */
66         struct ip_vs_dest *dest = cp->dest;
67
68         if (dest) {
69                 spin_lock(&dest->dst_lock);
70                 if (!(rt = (struct rtable *)
71                       __ip_vs_dst_check(dest, rtos, 0))) {
72                         struct flowi fl = {
73                                 .oif = 0,
74                                 .nl_u = {
75                                         .ip4_u = {
76                                                 .daddr = dest->addr,
77                                                 .saddr = 0,
78                                                 .tos = rtos, } },
79                         };
80
81                         if (ip_route_output_key(&rt, &fl)) {
82                                 spin_unlock(&dest->dst_lock);
83                                 IP_VS_DBG_RL("ip_route_output error, "
84                                              "dest: %u.%u.%u.%u\n",
85                                              NIPQUAD(dest->addr));
86                                 return NULL;
87                         }
88                         __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89                         IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90                                   NIPQUAD(dest->addr),
91                                   atomic_read(&rt->u.dst.__refcnt), rtos);
92                 }
93                 spin_unlock(&dest->dst_lock);
94         } else {
95                 struct flowi fl = {
96                         .oif = 0,
97                         .nl_u = {
98                                 .ip4_u = {
99                                         .daddr = cp->daddr,
100                                         .saddr = 0,
101                                         .tos = rtos, } },
102                 };
103
104                 if (ip_route_output_key(&rt, &fl)) {
105                         IP_VS_DBG_RL("ip_route_output error, dest: "
106                                      "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107                         return NULL;
108                 }
109         }
110
111         return rt;
112 }
113
114
115 /*
116  *      Release dest->dst_cache before a dest is removed
117  */
118 void
119 ip_vs_dst_reset(struct ip_vs_dest *dest)
120 {
121         struct dst_entry *old_dst;
122
123         old_dst = dest->dst_cache;
124         dest->dst_cache = NULL;
125         dst_release(old_dst);
126 }
127
/*
 *      Common last step of the transmitters below: flag the packet with
 *      NFC_IPVS_PROPERTY, set ip_summed to CHECKSUM_NONE and pass it
 *      through the NF_IP_LOCAL_OUT hook towards the device of the given
 *      route, with dst_output as the continuation.
 */
#define IP_VS_XMIT(pkt, route)					\
do {								\
	nf_reset_debug(pkt);					\
	(pkt)->ip_summed = CHECKSUM_NONE;			\
	(pkt)->nfcache |= NFC_IPVS_PROPERTY;			\
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (pkt), NULL,		\
		(route)->u.dst.dev, dst_output);		\
} while (0)
136
137
138 /*
139  *      NULL transmitter (do nothing except return NF_ACCEPT)
140  */
141 int
142 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
143                 struct ip_vs_protocol *pp)
144 {
145         /* we do not touch skb and do not need pskb ptr */
146         return NF_ACCEPT;
147 }
148
149
150 /*
151  *      Bypass transmitter
152  *      Let packets bypass the destination when the destination is not
153  *      available, it may be only used in transparent cache cluster.
154  */
155 int
156 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
157                   struct ip_vs_protocol *pp)
158 {
159         struct rtable *rt;                      /* Route to the other host */
160         struct iphdr  *iph = skb->nh.iph;
161         u8     tos = iph->tos;
162         int    mtu;
163         struct flowi fl = {
164                 .oif = 0,
165                 .nl_u = {
166                         .ip4_u = {
167                                 .daddr = iph->daddr,
168                                 .saddr = 0,
169                                 .tos = RT_TOS(tos), } },
170         };
171
172         EnterFunction(10);
173
174         if (ip_route_output_key(&rt, &fl)) {
175                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
176                              "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
177                 goto tx_error_icmp;
178         }
179
180         /* MTU checking */
181         mtu = dst_mtu(&rt->u.dst);
182         if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
183                 ip_rt_put(rt);
184                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
185                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
186                 goto tx_error;
187         }
188
189         /*
190          * Call ip_send_check because we are not sure it is called
191          * after ip_defrag. Is copy-on-write needed?
192          */
193         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
194                 ip_rt_put(rt);
195                 return NF_STOLEN;
196         }
197         ip_send_check(skb->nh.iph);
198
199         /* drop old route */
200         dst_release(skb->dst);
201         skb->dst = &rt->u.dst;
202
203         /* Another hack: avoid icmp_send in ip_fragment */
204         skb->local_df = 1;
205
206         IP_VS_XMIT(skb, rt);
207
208         LeaveFunction(10);
209         return NF_STOLEN;
210
211  tx_error_icmp:
212         dst_link_failure(skb);
213  tx_error:
214         kfree_skb(skb);
215         LeaveFunction(10);
216         return NF_STOLEN;
217 }
218
219
220 /*
221  *      NAT transmitter (only for outside-to-inside nat forwarding)
222  *      Not used for related ICMP
223  */
224 int
225 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
226                struct ip_vs_protocol *pp)
227 {
228         struct rtable *rt;              /* Route to the other host */
229         int mtu;
230         struct iphdr *iph = skb->nh.iph;
231
232         EnterFunction(10);
233
234         /* check if it is a connection of no-client-port */
235         if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
236                 __u16 _pt, *p;
237                 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
238                 if (p == NULL)
239                         goto tx_error;
240                 ip_vs_conn_fill_cport(cp, *p);
241                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
242         }
243
244         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
245                 goto tx_error_icmp;
246
247         /* MTU checking */
248         mtu = dst_mtu(&rt->u.dst);
249         if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
250                 ip_rt_put(rt);
251                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
252                 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
253                 goto tx_error;
254         }
255
256         /* copy-on-write the packet before mangling it */
257         if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
258                 goto tx_error_put;
259
260         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
261                 goto tx_error_put;
262
263         /* drop old route */
264         dst_release(skb->dst);
265         skb->dst = &rt->u.dst;
266
267         /* mangle the packet */
268         if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
269                 goto tx_error;
270         skb->nh.iph->daddr = cp->daddr;
271         ip_send_check(skb->nh.iph);
272
273         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
274
275         /* FIXME: when application helper enlarges the packet and the length
276            is larger than the MTU of outgoing device, there will be still
277            MTU problem. */
278
279         /* Another hack: avoid icmp_send in ip_fragment */
280         skb->local_df = 1;
281
282         IP_VS_XMIT(skb, rt);
283
284         LeaveFunction(10);
285         return NF_STOLEN;
286
287   tx_error_icmp:
288         dst_link_failure(skb);
289   tx_error:
290         LeaveFunction(10);
291         kfree_skb(skb);
292         return NF_STOLEN;
293   tx_error_put:
294         ip_rt_put(rt);
295         goto tx_error;
296 }
297
298
299 /*
300  *   IP Tunneling transmitter
301  *
302  *   This function encapsulates the packet in a new IP packet, its
303  *   destination will be set to cp->daddr. Most code of this function
304  *   is taken from ipip.c.
305  *
306  *   It is used in VS/TUN cluster. The load balancer selects a real
307  *   server from a cluster based on a scheduling algorithm,
308  *   encapsulates the request packet and forwards it to the selected
309  *   server. For example, all real servers are configured with
310  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
311  *   the encapsulated packet, it will decapsulate the packet, processe
312  *   the request and return the response packets directly to the client
313  *   without passing the load balancer. This can greatly increase the
314  *   scalability of virtual server.
315  *
316  *   Used for ANY protocol
317  */
318 int
319 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
320                   struct ip_vs_protocol *pp)
321 {
322         struct rtable *rt;                      /* Route to the other host */
323         struct net_device *tdev;                /* Device to other host */
324         struct iphdr  *old_iph = skb->nh.iph;
325         u8     tos = old_iph->tos;
326         u16    df = old_iph->frag_off;
327         struct iphdr  *iph;                     /* Our new IP header */
328         int    max_headroom;                    /* The extra header space needed */
329         int    mtu;
330
331         EnterFunction(10);
332
333         if (skb->protocol != __constant_htons(ETH_P_IP)) {
334                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335                              "ETH_P_IP: %d, skb protocol: %d\n",
336                              __constant_htons(ETH_P_IP), skb->protocol);
337                 goto tx_error;
338         }
339
340         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341                 goto tx_error_icmp;
342
343         tdev = rt->u.dst.dev;
344
345         mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346         if (mtu < 68) {
347                 ip_rt_put(rt);
348                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349                 goto tx_error;
350         }
351         if (skb->dst)
352                 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354         df |= (old_iph->frag_off&__constant_htons(IP_DF));
355
356         if ((old_iph->frag_off&__constant_htons(IP_DF))
357             && mtu < ntohs(old_iph->tot_len)) {
358                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359                 ip_rt_put(rt);
360                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361                 goto tx_error;
362         }
363
364         /*
365          * Okay, now see if we can stuff it in the buffer as-is.
366          */
367         max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369         if (skb_headroom(skb) < max_headroom
370             || skb_cloned(skb) || skb_shared(skb)) {
371                 struct sk_buff *new_skb =
372                         skb_realloc_headroom(skb, max_headroom);
373                 if (!new_skb) {
374                         ip_rt_put(rt);
375                         kfree_skb(skb);
376                         IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377                         return NF_STOLEN;
378                 }
379                 kfree_skb(skb);
380                 skb = new_skb;
381                 old_iph = skb->nh.iph;
382         }
383
384         skb->h.raw = (void *) old_iph;
385
386         /* fix old IP header checksum */
387         ip_send_check(old_iph);
388
389         skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
390         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
391
392         /* drop old route */
393         dst_release(skb->dst);
394         skb->dst = &rt->u.dst;
395
396         /*
397          *      Push down and install the IPIP header.
398          */
399         iph                     =       skb->nh.iph;
400         iph->version            =       4;
401         iph->ihl                =       sizeof(struct iphdr)>>2;
402         iph->frag_off           =       df;
403         iph->protocol           =       IPPROTO_IPIP;
404         iph->tos                =       tos;
405         iph->daddr              =       rt->rt_dst;
406         iph->saddr              =       rt->rt_src;
407         iph->ttl                =       old_iph->ttl;
408         iph->tot_len            =       htons(skb->len);
409         ip_select_ident(iph, &rt->u.dst, NULL);
410         ip_send_check(iph);
411
412         /* Another hack: avoid icmp_send in ip_fragment */
413         skb->local_df = 1;
414
415         IP_VS_XMIT(skb, rt);
416
417         LeaveFunction(10);
418
419         return NF_STOLEN;
420
421   tx_error_icmp:
422         dst_link_failure(skb);
423   tx_error:
424         kfree_skb(skb);
425         LeaveFunction(10);
426         return NF_STOLEN;
427 }
428
429
430 /*
431  *      Direct Routing transmitter
432  *      Used for ANY protocol
433  */
434 int
435 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
436               struct ip_vs_protocol *pp)
437 {
438         struct rtable *rt;                      /* Route to the other host */
439         struct iphdr  *iph = skb->nh.iph;
440         int    mtu;
441
442         EnterFunction(10);
443
444         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
445                 goto tx_error_icmp;
446
447         /* MTU checking */
448         mtu = dst_mtu(&rt->u.dst);
449         if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
450                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
451                 ip_rt_put(rt);
452                 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
453                 goto tx_error;
454         }
455
456         /*
457          * Call ip_send_check because we are not sure it is called
458          * after ip_defrag. Is copy-on-write needed?
459          */
460         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
461                 ip_rt_put(rt);
462                 return NF_STOLEN;
463         }
464         ip_send_check(skb->nh.iph);
465
466         /* drop old route */
467         dst_release(skb->dst);
468         skb->dst = &rt->u.dst;
469
470         /* Another hack: avoid icmp_send in ip_fragment */
471         skb->local_df = 1;
472
473         IP_VS_XMIT(skb, rt);
474
475         LeaveFunction(10);
476         return NF_STOLEN;
477
478   tx_error_icmp:
479         dst_link_failure(skb);
480   tx_error:
481         kfree_skb(skb);
482         LeaveFunction(10);
483         return NF_STOLEN;
484 }
485
486
487 /*
488  *      ICMP packet transmitter
489  *      called by the ip_vs_in_icmp
490  */
491 int
492 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493                 struct ip_vs_protocol *pp, int offset)
494 {
495         struct rtable   *rt;    /* Route to the other host */
496         int mtu;
497         int rc;
498
499         EnterFunction(10);
500
501         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502            forwarded directly here, because there is no need to
503            translate address/port back */
504         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505                 if (cp->packet_xmit)
506                         rc = cp->packet_xmit(skb, cp, pp);
507                 else
508                         rc = NF_ACCEPT;
509                 /* do not touch skb anymore */
510                 atomic_inc(&cp->in_pkts);
511                 goto out;
512         }
513
514         /*
515          * mangle and send the packet here (only for VS/NAT)
516          */
517
518         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
519                 goto tx_error_icmp;
520
521         /* MTU checking */
522         mtu = dst_mtu(&rt->u.dst);
523         if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
524                 ip_rt_put(rt);
525                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
526                 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
527                 goto tx_error;
528         }
529
530         /* copy-on-write the packet before mangling it */
531         if (!ip_vs_make_skb_writable(&skb, offset))
532                 goto tx_error_put;
533
534         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
535                 goto tx_error_put;
536
537         /* drop the old route when skb is not shared */
538         dst_release(skb->dst);
539         skb->dst = &rt->u.dst;
540
541         ip_vs_nat_icmp(skb, pp, cp, 0);
542
543         /* Another hack: avoid icmp_send in ip_fragment */
544         skb->local_df = 1;
545
546         IP_VS_XMIT(skb, rt);
547
548         rc = NF_STOLEN;
549         goto out;
550
551   tx_error_icmp:
552         dst_link_failure(skb);
553   tx_error:
554         dev_kfree_skb(skb);
555         rc = NF_STOLEN;
556   out:
557         LeaveFunction(10);
558         return rc;
559   tx_error_put:
560         ip_rt_put(rt);
561         goto tx_error;
562 }