vserver 1.9.3
[linux-2.6.git] / net / ipv4 / netfilter / ip_nat_helper.c
1 /* ip_nat_helper.c - generic support functions for NAT helpers 
2  *
3  * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4  * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  *
10  *      14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
11  *              - add support for SACK adjustment 
12  *      14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
13  *              - merge SACK support into newnat API
14  *      16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
15  *              - make ip_nat_resize_packet more generic (TCP and UDP)
16  *              - add ip_nat_mangle_udp_packet
17  */
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/kmod.h>
21 #include <linux/types.h>
22 #include <linux/timer.h>
23 #include <linux/skbuff.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <net/checksum.h>
26 #include <net/icmp.h>
27 #include <net/ip.h>
28 #include <net/tcp.h>
29 #include <net/udp.h>
30
31 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
32 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
33
34 #include <linux/netfilter_ipv4/ip_conntrack.h>
35 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36 #include <linux/netfilter_ipv4/ip_nat.h>
37 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
38 #include <linux/netfilter_ipv4/ip_nat_core.h>
39 #include <linux/netfilter_ipv4/ip_nat_helper.h>
40 #include <linux/netfilter_ipv4/listhelp.h>
41
/* Debug tracing.  Change "#if 0" to "#if 1" to route DEBUGP to printk and
 * to make DUMP_OFFSET print the three fields of the record it is given. */
#if 0
#define DEBUGP printk
/* The argument is parenthesized and the expansion has no trailing
 * semicolon, so "DUMP_OFFSET(p);" behaves like one ordinary statement. */
#define DUMP_OFFSET(x)  printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", (x)->offset_before, (x)->offset_after, (x)->correction_pos)
#else
#define DEBUGP(format, args...)
#define DUMP_OFFSET(x)
#endif

/* Registered NAT helpers.  Registration and unregistration modify the list
 * under the ip_nat_lock write lock; ip_nat_find_helper() searches it under
 * the read lock, while __ip_nat_find_helper() takes no lock itself. */
static LIST_HEAD(helpers);
/* Taken (BH-safe) around updates of the per-direction sequence offsets in
 * adjust_tcp_sequence(). */
DECLARE_LOCK(ip_nat_seqofs_lock);
52
53 /* Setup TCP sequence correction given this change at this sequence */
54 static inline void 
55 adjust_tcp_sequence(u32 seq,
56                     int sizediff,
57                     struct ip_conntrack *ct, 
58                     enum ip_conntrack_info ctinfo)
59 {
60         int dir;
61         struct ip_nat_seq *this_way, *other_way;
62
63         DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n",
64                 (*skb)->len, new_size);
65
66         dir = CTINFO2DIR(ctinfo);
67
68         this_way = &ct->nat.info.seq[dir];
69         other_way = &ct->nat.info.seq[!dir];
70
71         DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
72         DUMP_OFFSET(this_way);
73
74         LOCK_BH(&ip_nat_seqofs_lock);
75
76         /* SYN adjust. If it's uninitialized, or this is after last
77          * correction, record it: we don't handle more than one
78          * adjustment in the window, but do deal with common case of a
79          * retransmit */
80         if (this_way->offset_before == this_way->offset_after
81             || before(this_way->correction_pos, seq)) {
82                     this_way->correction_pos = seq;
83                     this_way->offset_before = this_way->offset_after;
84                     this_way->offset_after += sizediff;
85         }
86         UNLOCK_BH(&ip_nat_seqofs_lock);
87
88         DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
89         DUMP_OFFSET(this_way);
90 }
91
92 /* Frobs data inside this packet, which is linear. */
93 static void mangle_contents(struct sk_buff *skb,
94                             unsigned int dataoff,
95                             unsigned int match_offset,
96                             unsigned int match_len,
97                             const char *rep_buffer,
98                             unsigned int rep_len)
99 {
100         unsigned char *data;
101
102         BUG_ON(skb_is_nonlinear(skb));
103         data = (unsigned char *)skb->nh.iph + dataoff;
104
105         /* move post-replacement */
106         memmove(data + match_offset + rep_len,
107                 data + match_offset + match_len,
108                 skb->tail - (data + match_offset + match_len));
109
110         /* insert data from buffer */
111         memcpy(data + match_offset, rep_buffer, rep_len);
112
113         /* update skb info */
114         if (rep_len > match_len) {
115                 DEBUGP("ip_nat_mangle_packet: Extending packet by "
116                         "%u from %u bytes\n", rep_len - match_len,
117                        skb->len);
118                 skb_put(skb, rep_len - match_len);
119         } else {
120                 DEBUGP("ip_nat_mangle_packet: Shrinking packet from "
121                         "%u from %u bytes\n", match_len - rep_len,
122                        skb->len);
123                 __skb_trim(skb, skb->len + rep_len - match_len);
124         }
125
126         /* fix IP hdr checksum information */
127         skb->nh.iph->tot_len = htons(skb->len);
128         ip_send_check(skb->nh.iph);
129 }
130
131 /* Unusual, but possible case. */
132 static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
133 {
134         struct sk_buff *nskb;
135
136         if ((*pskb)->len + extra > 65535)
137                 return 0;
138
139         nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
140         if (!nskb)
141                 return 0;
142
143         /* Transfer socket to new skb. */
144         if ((*pskb)->sk)
145                 skb_set_owner_w(nskb, (*pskb)->sk);
146 #ifdef CONFIG_NETFILTER_DEBUG
147         nskb->nf_debug = (*pskb)->nf_debug;
148 #endif
149         kfree_skb(*pskb);
150         *pskb = nskb;
151         return 1;
152 }
153
154 /* Generic function for mangling variable-length address changes inside
155  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
156  * command in FTP).
157  *
158  * Takes care about all the nasty sequence number changes, checksumming,
159  * skb enlargement, ...
160  *
161  * */
162 int 
163 ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
164                          struct ip_conntrack *ct,
165                          enum ip_conntrack_info ctinfo,
166                          unsigned int match_offset,
167                          unsigned int match_len,
168                          const char *rep_buffer,
169                          unsigned int rep_len)
170 {
171         struct iphdr *iph;
172         struct tcphdr *tcph;
173         int datalen;
174
175         if (!skb_ip_make_writable(pskb, (*pskb)->len))
176                 return 0;
177
178         if (rep_len > match_len
179             && rep_len - match_len > skb_tailroom(*pskb)
180             && !enlarge_skb(pskb, rep_len - match_len))
181                 return 0;
182
183         SKB_LINEAR_ASSERT(*pskb);
184
185         iph = (*pskb)->nh.iph;
186         tcph = (void *)iph + iph->ihl*4;
187
188         mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
189                         match_offset, match_len, rep_buffer, rep_len);
190
191         datalen = (*pskb)->len - iph->ihl*4;
192         tcph->check = 0;
193         tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
194                                    csum_partial((char *)tcph, datalen, 0));
195
196         adjust_tcp_sequence(ntohl(tcph->seq),
197                             (int)rep_len - (int)match_len,
198                             ct, ctinfo);
199         return 1;
200 }
201                         
202 /* Generic function for mangling variable-length address changes inside
203  * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
204  * command in the Amanda protocol)
205  *
206  * Takes care about all the nasty sequence number changes, checksumming,
207  * skb enlargement, ...
208  *
209  * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
210  *       should be fairly easy to do.
211  */
212 int 
213 ip_nat_mangle_udp_packet(struct sk_buff **pskb,
214                          struct ip_conntrack *ct,
215                          enum ip_conntrack_info ctinfo,
216                          unsigned int match_offset,
217                          unsigned int match_len,
218                          const char *rep_buffer,
219                          unsigned int rep_len)
220 {
221         struct iphdr *iph;
222         struct udphdr *udph;
223
224         /* UDP helpers might accidentally mangle the wrong packet */
225         iph = (*pskb)->nh.iph;
226         if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + 
227                                match_offset + match_len)
228                 return 0;
229
230         if (!skb_ip_make_writable(pskb, (*pskb)->len))
231                 return 0;
232
233         if (rep_len > match_len
234             && rep_len - match_len > skb_tailroom(*pskb)
235             && !enlarge_skb(pskb, rep_len - match_len))
236                 return 0;
237
238         iph = (*pskb)->nh.iph;
239         udph = (void *)iph + iph->ihl*4;
240         mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
241                         match_offset, match_len, rep_buffer, rep_len);
242
243         /* update the length of the UDP packet */
244         udph->len = htons((*pskb)->len - iph->ihl*4);
245
246         /* fix udp checksum if udp checksum was previously calculated */
247         if (udph->check) {
248                 int datalen = (*pskb)->len - iph->ihl * 4;
249                 udph->check = 0;
250                 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
251                                                 datalen, IPPROTO_UDP,
252                                                 csum_partial((char *)udph,
253                                                              datalen, 0));
254         }
255
256         return 1;
257 }
258
259 /* Adjust one found SACK option including checksum correction */
260 static void
261 sack_adjust(struct sk_buff *skb,
262             struct tcphdr *tcph, 
263             unsigned int sackoff,
264             unsigned int sackend,
265             struct ip_nat_seq *natseq)
266 {
267         while (sackoff < sackend) {
268                 struct tcp_sack_block *sack;
269                 u_int32_t new_start_seq, new_end_seq;
270
271                 sack = (void *)skb->data + sackoff;
272                 if (after(ntohl(sack->start_seq) - natseq->offset_before,
273                           natseq->correction_pos))
274                         new_start_seq = ntohl(sack->start_seq) 
275                                         - natseq->offset_after;
276                 else
277                         new_start_seq = ntohl(sack->start_seq) 
278                                         - natseq->offset_before;
279                 new_start_seq = htonl(new_start_seq);
280
281                 if (after(ntohl(sack->end_seq) - natseq->offset_before,
282                           natseq->correction_pos))
283                         new_end_seq = ntohl(sack->end_seq)
284                                       - natseq->offset_after;
285                 else
286                         new_end_seq = ntohl(sack->end_seq)
287                                       - natseq->offset_before;
288                 new_end_seq = htonl(new_end_seq);
289
290                 DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
291                         ntohl(sack->start_seq), new_start_seq,
292                         ntohl(sack->end_seq), new_end_seq);
293
294                 tcph->check = 
295                         ip_nat_cheat_check(~sack->start_seq, new_start_seq,
296                                            ip_nat_cheat_check(~sack->end_seq, 
297                                                               new_end_seq,
298                                                               tcph->check));
299                 sack->start_seq = new_start_seq;
300                 sack->end_seq = new_end_seq;
301                 sackoff += sizeof(*sack);
302         }
303 }
304
305 /* TCP SACK sequence number adjustment */
306 static inline unsigned int
307 ip_nat_sack_adjust(struct sk_buff **pskb,
308                    struct tcphdr *tcph,
309                    struct ip_conntrack *ct,
310                    enum ip_conntrack_info ctinfo)
311 {
312         unsigned int dir, optoff, optend;
313
314         optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
315         optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
316
317         if (!skb_ip_make_writable(pskb, optend))
318                 return 0;
319
320         dir = CTINFO2DIR(ctinfo);
321
322         while (optoff < optend) {
323                 /* Usually: option, length. */
324                 unsigned char *op = (*pskb)->data + optoff;
325
326                 switch (op[0]) {
327                 case TCPOPT_EOL:
328                         return 1;
329                 case TCPOPT_NOP:
330                         optoff++;
331                         continue;
332                 default:
333                         /* no partial options */
334                         if (optoff + 1 == optend
335                             || optoff + op[1] > optend
336                             || op[1] < 2)
337                                 return 0;
338                         if (op[0] == TCPOPT_SACK
339                             && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
340                             && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
341                                 sack_adjust(*pskb, tcph, optoff+2,
342                                             optoff+op[1],
343                                             &ct->nat.info.seq[!dir]);
344                         optoff += op[1];
345                 }
346         }
347         return 1;
348 }
349
350 /* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
351 int
352 ip_nat_seq_adjust(struct sk_buff **pskb, 
353                   struct ip_conntrack *ct, 
354                   enum ip_conntrack_info ctinfo)
355 {
356         struct tcphdr *tcph;
357         int dir, newseq, newack;
358         struct ip_nat_seq *this_way, *other_way;        
359
360         dir = CTINFO2DIR(ctinfo);
361
362         this_way = &ct->nat.info.seq[dir];
363         other_way = &ct->nat.info.seq[!dir];
364
365         /* No adjustments to make?  Very common case. */
366         if (!this_way->offset_before && !this_way->offset_after
367             && !other_way->offset_before && !other_way->offset_after)
368                 return 1;
369
370         if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
371                 return 0;
372
373         tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
374         if (after(ntohl(tcph->seq), this_way->correction_pos))
375                 newseq = ntohl(tcph->seq) + this_way->offset_after;
376         else
377                 newseq = ntohl(tcph->seq) + this_way->offset_before;
378         newseq = htonl(newseq);
379
380         if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
381                   other_way->correction_pos))
382                 newack = ntohl(tcph->ack_seq) - other_way->offset_after;
383         else
384                 newack = ntohl(tcph->ack_seq) - other_way->offset_before;
385         newack = htonl(newack);
386
387         tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
388                                          ip_nat_cheat_check(~tcph->ack_seq, 
389                                                             newack, 
390                                                             tcph->check));
391
392         DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
393                 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
394                 ntohl(newack));
395
396         tcph->seq = newseq;
397         tcph->ack_seq = newack;
398
399         if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
400                 return 0;
401
402         ip_conntrack_tcp_update(*pskb, ct, dir);
403
404         return 1;
405 }
406
407 static inline int
408 helper_cmp(const struct ip_nat_helper *helper,
409            const struct ip_conntrack_tuple *tuple)
410 {
411         return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
412 }
413
414 int ip_nat_helper_register(struct ip_nat_helper *me)
415 {
416         int ret = 0;
417
418         WRITE_LOCK(&ip_nat_lock);
419         if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,&me->tuple))
420                 ret = -EBUSY;
421         else
422                 list_prepend(&helpers, me);
423         WRITE_UNLOCK(&ip_nat_lock);
424
425         return ret;
426 }
427
428 struct ip_nat_helper *
429 __ip_nat_find_helper(const struct ip_conntrack_tuple *tuple)
430 {
431         return LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, tuple);
432 }
433
434 struct ip_nat_helper *
435 ip_nat_find_helper(const struct ip_conntrack_tuple *tuple)
436 {
437         struct ip_nat_helper *h;
438
439         READ_LOCK(&ip_nat_lock);
440         h = __ip_nat_find_helper(tuple);
441         READ_UNLOCK(&ip_nat_lock);
442
443         return h;
444 }
445
446 static int
447 kill_helper(const struct ip_conntrack *i, void *helper)
448 {
449         int ret;
450
451         READ_LOCK(&ip_nat_lock);
452         ret = (i->nat.info.helper == helper);
453         READ_UNLOCK(&ip_nat_lock);
454
455         return ret;
456 }
457
458 void ip_nat_helper_unregister(struct ip_nat_helper *me)
459 {
460         WRITE_LOCK(&ip_nat_lock);
461         /* Autoloading conntrack helper might have failed */
462         if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,&me->tuple)) {
463                 LIST_DELETE(&helpers, me);
464         }
465         WRITE_UNLOCK(&ip_nat_lock);
466
467         /* Someone could be still looking at the helper in a bh. */
468         synchronize_net();
469
470         /* Find anything using it, and umm, kill them.  We can't turn
471            them into normal connections: if we've adjusted SYNs, then
472            they'll ackstorm.  So we just drop it.  We used to just
473            bump module count when a connection existed, but that
474            forces admins to gen fake RSTs or bounce box, either of
475            which is just a long-winded way of making things
476            worse. --RR */
477         ip_ct_selective_cleanup(kill_helper, me);
478 }