net/ipv4/ipvs/ip_vs_ctl.c  (linux-2.6.git, tag patch-2_6_7-vs1_9_1_12)
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/fs.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/timer.h>
30 #include <linux/swap.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36
37 #include <net/ip.h>
38 #include <net/sock.h>
39
40 #include <asm/uaccess.h>
41
42 #include <net/ip_vs.h>
43
44 /* semaphore for IPVS sockopts; [gs]etsockopt may sleep */
45 static DECLARE_MUTEX(__ip_vs_mutex);
46
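/*
 * Illustrative sketch (not part of the original file): how the sockopt
 * handlers further down in this file are expected to serialize on
 * __ip_vs_mutex.  The exact error handling below is an assumption.
 */
#if 0
static int example_sockopt_path(void)
{
	int ret = 0;

	/* [gs]etsockopt may sleep, hence a semaphore rather than a spinlock */
	if (down_interruptible(&__ip_vs_mutex))
		return -ERESTARTSYS;

	/* ... manipulate the service/destination tables here ... */

	up(&__ip_vs_mutex);
	return ret;
}
#endif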
47 /* lock for service table */
48 static rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;
49
50 /* lock for table with the real services */
51 static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;
52
53 /* lock for state and timeout tables */
54 static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;
55
56 /* lock for drop entry handling */
57 static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;
58
59 /* lock for drop packet handling */
60 static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;
61
62 /* 1/rate drop and drop-entry variables */
63 int ip_vs_drop_rate = 0;
64 int ip_vs_drop_counter = 0;
65 atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67 /* number of virtual services */
68 static int ip_vs_num_services = 0;
69
70 /* sysctl variables */
71 static int sysctl_ip_vs_drop_entry = 0;
72 static int sysctl_ip_vs_drop_packet = 0;
73 static int sysctl_ip_vs_secure_tcp = 0;
74 static int sysctl_ip_vs_amemthresh = 1024;
75 static int sysctl_ip_vs_am_droprate = 10;
76 int sysctl_ip_vs_cache_bypass = 0;
77 int sysctl_ip_vs_expire_nodest_conn = 0;
78 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
79 int sysctl_ip_vs_nat_icmp_send = 0;
80
81
82 #ifdef CONFIG_IP_VS_DEBUG
83 static int sysctl_ip_vs_debug_level = 0;
84
85 int ip_vs_get_debug_level(void)
86 {
87         return sysctl_ip_vs_debug_level;
88 }
89 #endif
90
91 /*
92  *      update_defense_level is called from timer bh and from sysctl.
93  */
94 static void update_defense_level(void)
95 {
96         struct sysinfo i;
97         static int old_secure_tcp = 0;
98         int availmem;
99         int nomem;
100         int to_change = -1;
101
102         /* we only count free and buffered memory (in pages) */
103         si_meminfo(&i);
104         availmem = i.freeram + i.bufferram;
105         /* however, in Linux 2.5 i.bufferram is the total page cache size,
106            so we would need to adjust it */
107         /* si_swapinfo(&i); */
108         /* availmem = availmem - (i.totalswap - i.freeswap); */
109
110         nomem = (availmem < sysctl_ip_vs_amemthresh);
111
112         /* drop_entry */
113         spin_lock(&__ip_vs_dropentry_lock);
114         switch (sysctl_ip_vs_drop_entry) {
115         case 0:
116                 atomic_set(&ip_vs_dropentry, 0);
117                 break;
118         case 1:
119                 if (nomem) {
120                         atomic_set(&ip_vs_dropentry, 1);
121                         sysctl_ip_vs_drop_entry = 2;
122                 } else {
123                         atomic_set(&ip_vs_dropentry, 0);
124                 }
125                 break;
126         case 2:
127                 if (nomem) {
128                         atomic_set(&ip_vs_dropentry, 1);
129                 } else {
130                         atomic_set(&ip_vs_dropentry, 0);
131                         sysctl_ip_vs_drop_entry = 1;
132                 }
133                 break;
134         case 3:
135                 atomic_set(&ip_vs_dropentry, 1);
136                 break;
137         }
138         spin_unlock(&__ip_vs_dropentry_lock);
139
140         /* drop_packet */
141         spin_lock(&__ip_vs_droppacket_lock);
142         switch (sysctl_ip_vs_drop_packet) {
143         case 0:
144                 ip_vs_drop_rate = 0;
145                 break;
146         case 1:
147                 if (nomem) {
148                         ip_vs_drop_rate = ip_vs_drop_counter
149                                 = sysctl_ip_vs_amemthresh /
150                                 (sysctl_ip_vs_amemthresh-availmem);
151                         sysctl_ip_vs_drop_packet = 2;
152                 } else {
153                         ip_vs_drop_rate = 0;
154                 }
155                 break;
156         case 2:
157                 if (nomem) {
158                         ip_vs_drop_rate = ip_vs_drop_counter
159                                 = sysctl_ip_vs_amemthresh /
160                                 (sysctl_ip_vs_amemthresh-availmem);
161                 } else {
162                         ip_vs_drop_rate = 0;
163                         sysctl_ip_vs_drop_packet = 1;
164                 }
165                 break;
166         case 3:
167                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
168                 break;
169         }
170         spin_unlock(&__ip_vs_droppacket_lock);
171
172         /* secure_tcp */
173         write_lock(&__ip_vs_securetcp_lock);
174         switch (sysctl_ip_vs_secure_tcp) {
175         case 0:
176                 if (old_secure_tcp >= 2)
177                         to_change = 0;
178                 break;
179         case 1:
180                 if (nomem) {
181                         if (old_secure_tcp < 2)
182                                 to_change = 1;
183                         sysctl_ip_vs_secure_tcp = 2;
184                 } else {
185                         if (old_secure_tcp >= 2)
186                                 to_change = 0;
187                 }
188                 break;
189         case 2:
190                 if (nomem) {
191                         if (old_secure_tcp < 2)
192                                 to_change = 1;
193                 } else {
194                         if (old_secure_tcp >= 2)
195                                 to_change = 0;
196                         sysctl_ip_vs_secure_tcp = 1;
197                 }
198                 break;
199         case 3:
200                 if (old_secure_tcp < 2)
201                         to_change = 1;
202                 break;
203         }
204         old_secure_tcp = sysctl_ip_vs_secure_tcp;
205         if (to_change >= 0)
206                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
207         write_unlock(&__ip_vs_securetcp_lock);
208 }
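/*
 * Illustrative sketch (not part of the original file): how the 1/rate
 * variables set above are expected to be consumed on the packet path
 * (the real consumer, ip_vs_todrop(), is believed to live in ip_vs.h).
 * The body below is an assumption based on the semantics of
 * ip_vs_drop_rate and ip_vs_drop_counter.
 */
#if 0
static inline int example_todrop(void)
{
	if (!ip_vs_drop_rate)
		return 0;		/* dropping disabled */
	if (--ip_vs_drop_counter > 0)
		return 0;		/* not yet the Nth packet */
	ip_vs_drop_counter = ip_vs_drop_rate;
	return 1;			/* drop one in every "rate" packets */
}
#endif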
209
210
211 /*
212  *      Timer for checking the defense
213  */
214 static struct timer_list defense_timer;
215 #define DEFENSE_TIMER_PERIOD    1*HZ
216
217 static void defense_timer_handler(unsigned long data)
218 {
219         update_defense_level();
220         if (atomic_read(&ip_vs_dropentry))
221                 ip_vs_random_dropentry();
222
223         mod_timer(&defense_timer, jiffies + DEFENSE_TIMER_PERIOD);
224 }
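/*
 * Illustrative sketch (not part of the original file): the timer above is
 * presumably armed in ip_vs_control_init() later in this file, roughly
 * with the 2.6-era timer API shown here.
 */
#if 0
static void example_start_defense_timer(void)
{
	init_timer(&defense_timer);
	defense_timer.function = defense_timer_handler;
	defense_timer.data = 0;
	defense_timer.expires = jiffies + DEFENSE_TIMER_PERIOD;
	add_timer(&defense_timer);
}
#endif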
225
226
227 int
228 ip_vs_use_count_inc(void)
229 {
230         return try_module_get(THIS_MODULE);
231 }
232
233 void
234 ip_vs_use_count_dec(void)
235 {
236         module_put(THIS_MODULE);
237 }
238
239
240 /*
241  *      Hash table: for virtual service lookups
242  */
243 #define IP_VS_SVC_TAB_BITS 8
244 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
245 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
246
247 /* the service table hashed by <protocol, addr, port> */
248 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
249 /* the service table hashed by fwmark */
250 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
251
252 /*
253  *      Hash table: for real service lookups
254  */
255 #define IP_VS_RTAB_BITS 4
256 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
257 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
258
259 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
260
261 /*
262  *      Trash for destinations
263  */
264 static LIST_HEAD(ip_vs_dest_trash);
265
266 /*
267  *      FTP & NULL virtual service counters
268  */
269 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
270 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
271
272
273 /*
274  *      Returns hash value for virtual service
275  */
276 static __inline__ unsigned
277 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
278 {
279         register unsigned porth = ntohs(port);
280
281         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
282                 & IP_VS_SVC_TAB_MASK;
283 }
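/*
 * Worked example (added for clarity): for TCP 192.168.0.1:80,
 * proto = 6, ntohl(addr) = 0xC0A80001, porth = 80 (0x50), porth>>8 = 0,
 * so the key is (0x06 ^ 0xC0A80001 ^ 0x00 ^ 0x50) & 0xFF = 0x57 = 87,
 * i.e. bucket 87 of the 256-entry ip_vs_svc_table.
 */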
284
285 /*
286  *      Returns hash value of fwmark for virtual service lookup
287  */
288 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
289 {
290         return fwmark & IP_VS_SVC_TAB_MASK;
291 }
292
293 /*
294  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
295  *      or in the ip_vs_svc_fwm_table by fwmark.
296  *      Should be called with locked tables.
297  */
298 static int ip_vs_svc_hash(struct ip_vs_service *svc)
299 {
300         unsigned hash;
301
302         if (svc->flags & IP_VS_SVC_F_HASHED) {
303                 IP_VS_ERR("ip_vs_svc_hash(): request to hash already hashed "
304                           "service, called from %p\n", __builtin_return_address(0));
305                 return 0;
306         }
307
308         if (svc->fwmark == 0) {
309                 /*
310                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
311                  */
312                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
313                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
314         } else {
315                 /*
316                  *  Hash it by fwmark in ip_vs_svc_fwm_table
317                  */
318                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
319                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
320         }
321
322         svc->flags |= IP_VS_SVC_F_HASHED;
323         /* increase its refcnt because it is referenced by the svc table */
324         atomic_inc(&svc->refcnt);
325         return 1;
326 }
327
328
329 /*
330  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
331  *      Should be called with locked tables.
332  */
333 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
334 {
335         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
336                 IP_VS_ERR("ip_vs_svc_unhash(): request to unhash an unhashed "
337                           "service, called from %p\n", __builtin_return_address(0));
338                 return 0;
339         }
340
341         if (svc->fwmark == 0) {
342                 /* Remove it from the ip_vs_svc_table */
343                 list_del(&svc->s_list);
344         } else {
345                 /* Remove it from the ip_vs_svc_fwm_table */
346                 list_del(&svc->f_list);
347         }
348
349         svc->flags &= ~IP_VS_SVC_F_HASHED;
350         atomic_dec(&svc->refcnt);
351         return 1;
352 }
353
354
355 /*
356  *      Get service by {proto,addr,port} in the service table.
357  */
358 static __inline__ struct ip_vs_service *
359 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
360 {
361         unsigned hash;
362         struct ip_vs_service *svc;
363
364         /* Check for "full" addressed entries */
365         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
366
367         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
368                 if ((svc->addr == vaddr)
369                     && (svc->port == vport)
370                     && (svc->protocol == protocol)) {
371                         /* HIT */
372                         atomic_inc(&svc->usecnt);
373                         return svc;
374                 }
375         }
376
377         return NULL;
378 }
379
380
381 /*
382  *      Get service by {fwmark} in the service table.
383  */
384 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
385 {
386         unsigned hash;
387         struct ip_vs_service *svc;
388
389         /* Check for fwmark addressed entries */
390         hash = ip_vs_svc_fwm_hashkey(fwmark);
391
392         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
393                 if (svc->fwmark == fwmark) {
394                         /* HIT */
395                         atomic_inc(&svc->usecnt);
396                         return svc;
397                 }
398         }
399
400         return NULL;
401 }
402
403 struct ip_vs_service *
404 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
405 {
406         struct ip_vs_service *svc;
407
408         read_lock(&__ip_vs_svc_lock);
409
410         /*
411          *      Check the table hashed by fwmark first
412          */
413         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
414                 goto out;
415
416         /*
417          *      Check the table hashed by <protocol,addr,port>
418          *      for "full" addressed entries
419          */
420         svc = __ip_vs_service_get(protocol, vaddr, vport);
421
422         if (svc == NULL
423             && protocol == IPPROTO_TCP
424             && atomic_read(&ip_vs_ftpsvc_counter)
425             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
426                 /*
427                  * Check if ftp service entry exists, the packet
428                  * might belong to FTP data connections.
429                  */
430                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
431         }
432
433         if (svc == NULL
434             && atomic_read(&ip_vs_nullsvc_counter)) {
435                 /*
436                  * Check if the catch-all port (port zero) exists
437                  */
438                 svc = __ip_vs_service_get(protocol, vaddr, 0);
439         }
440
441   out:
442         read_unlock(&__ip_vs_svc_lock);
443
444         IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
445                   fwmark, ip_vs_proto_name(protocol),
446                   NIPQUAD(vaddr), ntohs(vport),
447                   svc?"hit":"not hit");
448
449         return svc;
450 }
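/*
 * Illustrative sketch (not part of the original file): typical caller-side
 * use of ip_vs_service_get().  The lookup takes a usecnt reference which
 * the caller must drop again; ip_vs_service_put() is assumed to be the
 * inline in ip_vs.h that decrements svc->usecnt.
 */
#if 0
static void example_service_lookup(struct sk_buff *skb, struct iphdr *iph,
				   __u16 ports[2])
{
	struct ip_vs_service *svc;

	svc = ip_vs_service_get(skb->nfmark, iph->protocol,
				iph->daddr, ports[1]);
	if (!svc)
		return;

	/* ... schedule a connection to one of svc->destinations ... */

	ip_vs_service_put(svc);	/* drop the usecnt taken by the lookup */
}
#endif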
451
452
453 static inline void
454 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
455 {
456         atomic_inc(&svc->refcnt);
457         dest->svc = svc;
458 }
459
460 static inline void
461 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
462 {
463         struct ip_vs_service *svc = dest->svc;
464
465         dest->svc = NULL;
466         if (atomic_dec_and_test(&svc->refcnt))
467                 kfree(svc);
468 }
469
470
471 /*
472  *      Returns hash value for real service
473  */
474 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
475 {
476         register unsigned porth = ntohs(port);
477
478         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
479                 & IP_VS_RTAB_MASK;
480 }
481
482 /*
483  *      Hashes ip_vs_dest in ip_vs_rtable by <addr,port>.
484  *      Should be called with locked tables.
485  */
486 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
487 {
488         unsigned hash;
489
490         if (!list_empty(&dest->d_list)) {
491                 return 0;
492         }
493
494         /*
495          *      Hash by proto,addr,port,
496          *      which are the parameters of the real service.
497          */
498         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
499         list_add(&dest->d_list, &ip_vs_rtable[hash]);
500
501         return 1;
502 }
503
504 /*
505  *      Unhashes ip_vs_dest from ip_vs_rtable.
506  *      Should be called with locked tables.
507  */
508 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
509 {
510         /*
511          * Remove it from the ip_vs_rtable table.
512          */
513         if (!list_empty(&dest->d_list)) {
514                 list_del(&dest->d_list);
515                 INIT_LIST_HEAD(&dest->d_list);
516         }
517
518         return 1;
519 }
520
521 /*
522  *      Lookup real service by <proto,addr,port> in the real service table.
523  */
524 struct ip_vs_dest *
525 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
526 {
527         unsigned hash;
528         struct ip_vs_dest *dest;
529
530         /*
531          *      Check for "full" addressed entries
532          *      Return the first found entry
533          */
534         hash = ip_vs_rs_hashkey(daddr, dport);
535
536         read_lock(&__ip_vs_rs_lock);
537         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
538                 if ((dest->addr == daddr)
539                     && (dest->port == dport)
540                     && ((dest->protocol == protocol) ||
541                         dest->vfwmark)) {
542                         /* HIT */
543                         read_unlock(&__ip_vs_rs_lock);
544                         return dest;
545                 }
546         }
547         read_unlock(&__ip_vs_rs_lock);
548
549         return NULL;
550 }
551
552 /*
553  *      Lookup destination by {addr,port} in the given service
554  */
555 static struct ip_vs_dest *
556 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
557 {
558         struct ip_vs_dest *dest;
559
560         /*
561          * Find the destination for the given service
562          */
563         list_for_each_entry(dest, &svc->destinations, n_list) {
564                 if ((dest->addr == daddr) && (dest->port == dport)) {
565                         /* HIT */
566                         return dest;
567                 }
568         }
569
570         return NULL;
571 }
572
573
574 /*
575  *  Lookup dest by {svc,addr,port} in the destination trash.
576  *  The destination trash is used to hold the destinations that are removed
577  *  from the service table but are still referenced by some conn entries.
578  *  The reason for the trash is that when a dest is temporarily taken
579  *  down (either by the administrator or by a monitoring program), it can
580  *  be picked back from the trash; the remaining connections to the dest
581  *  can then continue, and the counting information of the dest is still
582  *  useful for scheduling.
583  */
584 static struct ip_vs_dest *
585 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
586 {
587         struct ip_vs_dest *dest, *nxt;
588
589         /*
590          * Find the destination in trash
591          */
592         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
593                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
594                           "refcnt=%d\n",
595                           dest->vfwmark,
596                           NIPQUAD(dest->addr), ntohs(dest->port),
597                           atomic_read(&dest->refcnt));
598                 if (dest->addr == daddr &&
599                     dest->port == dport &&
600                     dest->vfwmark == svc->fwmark &&
601                     dest->protocol == svc->protocol &&
602                     (svc->fwmark ||
603                      (dest->vaddr == svc->addr &&
604                       dest->vport == svc->port))) {
605                         /* HIT */
606                         return dest;
607                 }
608
609                 /*
610                  * Try to purge the destination from trash if not referenced
611                  */
612                 if (atomic_read(&dest->refcnt) == 1) {
613                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
614                                   "from trash\n",
615                                   dest->vfwmark,
616                                   NIPQUAD(dest->addr), ntohs(dest->port));
617                         list_del(&dest->n_list);
618                         ip_vs_dst_reset(dest);
619                         __ip_vs_unbind_svc(dest);
620                         kfree(dest);
621                 }
622         }
623
624         return NULL;
625 }
626
627
628 /*
629  *  Clean up all the destinations in the trash
630  *  Called by ip_vs_control_cleanup().
631  *
632  *  When ip_vs_control_cleanup() is invoked on ipvs module exit, the
633  *  service tables have already been flushed and all the connections have
634  *  expired, so the refcnt of each destination in the trash must be 1 and
635  *  we can simply release them here.
636  */
637 static void ip_vs_trash_cleanup(void)
638 {
639         struct ip_vs_dest *dest, *nxt;
640
641         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
642                 list_del(&dest->n_list);
643                 ip_vs_dst_reset(dest);
644                 __ip_vs_unbind_svc(dest);
645                 kfree(dest);
646         }
647 }
648
649
650 static void
651 ip_vs_zero_stats(struct ip_vs_stats *stats)
652 {
653         spin_lock_bh(&stats->lock);
654         memset(stats, 0, (char *)&stats->lock - (char *)stats);
655         spin_unlock_bh(&stats->lock);
656         ip_vs_zero_estimator(stats);
657 }
658
659 /*
660  *      Update a destination in the given service
661  */
662 static void
663 __ip_vs_update_dest(struct ip_vs_service *svc,
664                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
665 {
666         int conn_flags;
667
668         /* set the weight and the flags */
669         atomic_set(&dest->weight, udest->weight);
670         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
671
672         /* check if local node and update the flags */
673         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
674                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
675                         | IP_VS_CONN_F_LOCALNODE;
676         }
677
678         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
679         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
680                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
681         } else {
682                 /*
683                  *    Put the real service in ip_vs_rtable if not present.
684                  *    For now only for NAT!
685                  */
686                 write_lock_bh(&__ip_vs_rs_lock);
687                 ip_vs_rs_hash(dest);
688                 write_unlock_bh(&__ip_vs_rs_lock);
689         }
690         atomic_set(&dest->conn_flags, conn_flags);
691
692         /* bind the service */
693         if (!dest->svc) {
694                 __ip_vs_bind_svc(dest, svc);
695         } else {
696                 if (dest->svc != svc) {
697                         __ip_vs_unbind_svc(dest);
698                         ip_vs_zero_stats(&dest->stats);
699                         __ip_vs_bind_svc(dest, svc);
700                 }
701         }
702
703         /* set the dest status flags */
704         dest->flags |= IP_VS_DEST_F_AVAILABLE;
705
706         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
707                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
708         dest->u_threshold = udest->u_threshold;
709         dest->l_threshold = udest->l_threshold;
710 }
711
712
713 /*
714  *      Create a destination for the given service
715  */
716 static int
717 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
718                struct ip_vs_dest **dest_p)
719 {
720         struct ip_vs_dest *dest;
721         unsigned atype;
722
723         EnterFunction(2);
724
725         atype = inet_addr_type(udest->addr);
726         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
727                 return -EINVAL;
728
729         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
730         if (dest == NULL) {
731                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
732                 return -ENOMEM;
733         }
734         memset(dest, 0, sizeof(struct ip_vs_dest));
735
736         dest->protocol = svc->protocol;
737         dest->vaddr = svc->addr;
738         dest->vport = svc->port;
739         dest->vfwmark = svc->fwmark;
740         dest->addr = udest->addr;
741         dest->port = udest->port;
742
743         atomic_set(&dest->activeconns, 0);
744         atomic_set(&dest->inactconns, 0);
745         atomic_set(&dest->persistconns, 0);
746         atomic_set(&dest->refcnt, 0);
747
748         INIT_LIST_HEAD(&dest->d_list);
749         dest->dst_lock = SPIN_LOCK_UNLOCKED;
750         dest->stats.lock = SPIN_LOCK_UNLOCKED;
751         __ip_vs_update_dest(svc, dest, udest);
752         ip_vs_new_estimator(&dest->stats);
753
754         *dest_p = dest;
755
756         LeaveFunction(2);
757         return 0;
758 }
759
760
761 /*
762  *      Add a destination into an existing service
763  */
764 static int
765 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
766 {
767         struct ip_vs_dest *dest;
768         __u32 daddr = udest->addr;
769         __u16 dport = udest->port;
770         int ret;
771
772         EnterFunction(2);
773
774         if (udest->weight < 0) {
775                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
776                 return -ERANGE;
777         }
778
779         if (udest->l_threshold > udest->u_threshold) {
780                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
781                           "upper threshold\n");
782                 return -ERANGE;
783         }
784
785         /*
786          * Check if the dest already exists in the list
787          */
788         dest = ip_vs_lookup_dest(svc, daddr, dport);
789         if (dest != NULL) {
790                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
791                 return -EEXIST;
792         }
793
794         /*
795          * Check if the dest already exists in the trash and
796          * is from the same service
797          */
798         dest = ip_vs_trash_get_dest(svc, daddr, dport);
799         if (dest != NULL) {
800                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
801                           "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
802                           NIPQUAD(daddr), ntohs(dport),
803                           atomic_read(&dest->refcnt),
804                           dest->vfwmark,
805                           NIPQUAD(dest->vaddr),
806                           ntohs(dest->vport));
807                 __ip_vs_update_dest(svc, dest, udest);
808
809                 /*
810                  * Get the destination from the trash
811                  */
812                 list_del(&dest->n_list);
813
814                 ip_vs_new_estimator(&dest->stats);
815
816                 write_lock_bh(&__ip_vs_svc_lock);
817
818                 /*
819                  * Wait until all other svc users go away.
820                  */
821                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
822
823                 list_add(&dest->n_list, &svc->destinations);
824                 svc->num_dests++;
825
826                 /* call the update_service function of its scheduler */
827                 svc->scheduler->update_service(svc);
828
829                 write_unlock_bh(&__ip_vs_svc_lock);
830                 return 0;
831         }
832
833         /*
834          * Allocate and initialize the dest structure
835          */
836         ret = ip_vs_new_dest(svc, udest, &dest);
837         if (ret) {
838                 return ret;
839         }
840
841         /*
842          * Add the dest entry into the list
843          */
844         atomic_inc(&dest->refcnt);
845
846         write_lock_bh(&__ip_vs_svc_lock);
847
848         /*
849          * Wait until all other svc users go away.
850          */
851         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
852
853         list_add(&dest->n_list, &svc->destinations);
854         svc->num_dests++;
855
856         /* call the update_service function of its scheduler */
857         svc->scheduler->update_service(svc);
858
859         write_unlock_bh(&__ip_vs_svc_lock);
860
861         LeaveFunction(2);
862
863         return 0;
864 }
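/*
 * Illustrative sketch (not part of the original file): what a caller
 * (normally the sockopt handler later in this file, which runs under
 * __ip_vs_mutex) might pass in to add a NAT real server with weight 1
 * and no connection thresholds.  The concrete values are assumptions.
 */
#if 0
static int example_add_nat_dest(struct ip_vs_service *svc)
{
	struct ip_vs_dest_user udest;

	memset(&udest, 0, sizeof(udest));
	udest.addr = htonl(0x0A000001);		/* 10.0.0.1 (example) */
	udest.port = htons(80);
	udest.conn_flags = IP_VS_CONN_F_MASQ;	/* NAT (masquerading) forwarding */
	udest.weight = 1;
	udest.u_threshold = 0;			/* 0 == no upper connection limit */
	udest.l_threshold = 0;

	return ip_vs_add_dest(svc, &udest);
}
#endif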
865
866
867 /*
868  *      Edit a destination in the given service
869  */
870 static int
871 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
872 {
873         struct ip_vs_dest *dest;
874         __u32 daddr = udest->addr;
875         __u16 dport = udest->port;
876
877         EnterFunction(2);
878
879         if (udest->weight < 0) {
880                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
881                 return -ERANGE;
882         }
883
884         if (udest->l_threshold > udest->u_threshold) {
885                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
886                           "upper threshold\n");
887                 return -ERANGE;
888         }
889
890         /*
891          *  Lookup the destination list
892          */
893         dest = ip_vs_lookup_dest(svc, daddr, dport);
894         if (dest == NULL) {
895                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
896                 return -ENOENT;
897         }
898
899         __ip_vs_update_dest(svc, dest, udest);
900
901         write_lock_bh(&__ip_vs_svc_lock);
902
903         /* Wait until all other svc users go away */
904         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
905
906         /* call the update_service, because server weight may be changed */
907         svc->scheduler->update_service(svc);
908
909         write_unlock_bh(&__ip_vs_svc_lock);
910
911         LeaveFunction(2);
912
913         return 0;
914 }
915
916
917 /*
918  *      Delete a destination (must be already unlinked from the service)
919  */
920 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
921 {
922         ip_vs_kill_estimator(&dest->stats);
923
924         /*
925          *  Remove it from the d-linked list with the real services.
926          */
927         write_lock_bh(&__ip_vs_rs_lock);
928         ip_vs_rs_unhash(dest);
929         write_unlock_bh(&__ip_vs_rs_lock);
930
931         /*
932          *  Decrease the refcnt of the dest, and free the dest
933          *  if nobody refers to it (refcnt=0). Otherwise, throw
934          *  the destination into the trash.
935          */
936         if (atomic_dec_and_test(&dest->refcnt)) {
937                 ip_vs_dst_reset(dest);
938                 /* simply decrease svc->refcnt here, let the caller check
939                    and release the service if nobody refers to it.
940                    Only user context can release destination and service,
941                    and only one user context can update virtual service at a
942                    time, so the operation here is OK */
943                 atomic_dec(&dest->svc->refcnt);
944                 kfree(dest);
945         } else {
946                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
947                           NIPQUAD(dest->addr), ntohs(dest->port),
948                           atomic_read(&dest->refcnt));
949                 list_add(&dest->n_list, &ip_vs_dest_trash);
950                 atomic_inc(&dest->refcnt);
951         }
952 }
953
954
955 /*
956  *      Unlink a destination from the given service
957  */
958 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
959                                 struct ip_vs_dest *dest,
960                                 int svcupd)
961 {
962         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
963
964         /*
965          *  Remove it from the d-linked destination list.
966          */
967         list_del(&dest->n_list);
968         svc->num_dests--;
969         if (svcupd) {
970                 /*
971                  *  Call the update_service function of its scheduler
972                  */
973                 svc->scheduler->update_service(svc);
974         }
975 }
976
977
978 /*
979  *      Delete a destination server in the given service
980  */
981 static int
982 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
983 {
984         struct ip_vs_dest *dest;
985         __u32 daddr = udest->addr;
986         __u16 dport = udest->port;
987
988         EnterFunction(2);
989
990         dest = ip_vs_lookup_dest(svc, daddr, dport);
991         if (dest == NULL) {
992                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
993                 return -ENOENT;
994         }
995
996         write_lock_bh(&__ip_vs_svc_lock);
997
998         /*
999          *      Wait until all other svc users go away.
1000          */
1001         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1002
1003         /*
1004          *      Unlink dest from the service
1005          */
1006         __ip_vs_unlink_dest(svc, dest, 1);
1007
1008         write_unlock_bh(&__ip_vs_svc_lock);
1009
1010         /*
1011          *      Delete the destination
1012          */
1013         __ip_vs_del_dest(dest);
1014
1015         LeaveFunction(2);
1016
1017         return 0;
1018 }
1019
1020
1021 /*
1022  *      Add a service into the service hash table
1023  */
1024 static int
1025 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1026 {
1027         int ret = 0;
1028         struct ip_vs_scheduler *sched = NULL;
1029         struct ip_vs_service *svc = NULL;
1030
1031         /* increase the module use count */
1032         ip_vs_use_count_inc();
1033
1034         /* Lookup the scheduler by 'u->sched_name' */
1035         sched = ip_vs_scheduler_get(u->sched_name);
1036         if (sched == NULL) {
1037                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1038                            u->sched_name);
1039                 ret = -ENOENT;
1040                 goto out_mod_dec;
1041         }
1042
1043         svc = (struct ip_vs_service *)
1044                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1045         if (svc == NULL) {
1046                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1047                 ret = -ENOMEM;
1048                 goto out_err;
1049         }
1050         memset(svc, 0, sizeof(struct ip_vs_service));
1051
1052         /* I'm the first user of the service */
1053         atomic_set(&svc->usecnt, 1);
1054         atomic_set(&svc->refcnt, 0);
1055
1056         svc->protocol = u->protocol;
1057         svc->addr = u->addr;
1058         svc->port = u->port;
1059         svc->fwmark = u->fwmark;
1060         svc->flags = u->flags;
1061         svc->timeout = u->timeout * HZ;
1062         svc->netmask = u->netmask;
1063
1064         INIT_LIST_HEAD(&svc->destinations);
1065         svc->sched_lock = RW_LOCK_UNLOCKED;
1066         svc->stats.lock = SPIN_LOCK_UNLOCKED;
1067
1068         /* Bind the scheduler */
1069         ret = ip_vs_bind_scheduler(svc, sched);
1070         if (ret)
1071                 goto out_err;
1072         sched = NULL;
1073
1074         /* Update the virtual service counters */
1075         if (svc->port == FTPPORT)
1076                 atomic_inc(&ip_vs_ftpsvc_counter);
1077         else if (svc->port == 0)
1078                 atomic_inc(&ip_vs_nullsvc_counter);
1079
1080         ip_vs_new_estimator(&svc->stats);
1081         ip_vs_num_services++;
1082
1083         /* Hash the service into the service table */
1084         write_lock_bh(&__ip_vs_svc_lock);
1085         ip_vs_svc_hash(svc);
1086         write_unlock_bh(&__ip_vs_svc_lock);
1087
1088         *svc_p = svc;
1089         return 0;
1090
1091   out_err:
1092         if (svc != NULL) {
1093                 if (svc->scheduler)
1094                         ip_vs_unbind_scheduler(svc);
1095                 if (svc->inc) {
1096                         local_bh_disable();
1097                         ip_vs_app_inc_put(svc->inc);
1098                         local_bh_enable();
1099                 }
1100                 kfree(svc);
1101         }
1102         ip_vs_scheduler_put(sched);
1103
1104   out_mod_dec:
1105         /* decrease the module use count */
1106         ip_vs_use_count_dec();
1107
1108         return ret;
1109 }
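/*
 * Illustrative sketch (not part of the original file): the kind of
 * ip_vs_service_user a caller might fill in for a plain TCP virtual
 * service on port 80 using the "wlc" scheduler.  The concrete values
 * are assumptions.
 */
#if 0
static int example_add_service(struct ip_vs_service **svc_p)
{
	struct ip_vs_service_user usvc;

	memset(&usvc, 0, sizeof(usvc));
	usvc.protocol = IPPROTO_TCP;
	usvc.addr = htonl(0xC0A80001);		/* 192.168.0.1 (example) */
	usvc.port = htons(80);
	usvc.fwmark = 0;			/* addressed by <proto,addr,port> */
	strcpy(usvc.sched_name, "wlc");
	usvc.flags = 0;				/* no persistence */
	usvc.timeout = 0;
	usvc.netmask = 0xffffffff;

	return ip_vs_add_service(&usvc, svc_p);
}
#endif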
1110
1111
1112 /*
1113  *      Edit a service and bind it to a new scheduler
1114  */
1115 static int
1116 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1117 {
1118         struct ip_vs_scheduler *sched, *old_sched;
1119         int ret = 0;
1120
1121         /*
1122          * Lookup the scheduler, by 'u->sched_name'
1123          */
1124         sched = ip_vs_scheduler_get(u->sched_name);
1125         if (sched == NULL) {
1126                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1127                            u->sched_name);
1128                 return -ENOENT;
1129         }
1130         old_sched = sched;
1131
1132         write_lock_bh(&__ip_vs_svc_lock);
1133
1134         /*
1135          * Wait until all other svc users go away.
1136          */
1137         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1138
1139         /*
1140          * Set the flags and timeout value
1141          */
1142         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1143         svc->timeout = u->timeout * HZ;
1144         svc->netmask = u->netmask;
1145
1146         old_sched = svc->scheduler;
1147         if (sched != old_sched) {
1148                 /*
1149                  * Unbind the old scheduler
1150                  */
1151                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1152                         old_sched = sched;
1153                         goto out;
1154                 }
1155
1156                 /*
1157                  * Bind the new scheduler
1158                  */
1159                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1160                         /*
1161                          * If ip_vs_bind_scheduler fails, restore the old
1162                          * scheduler.
1163                          * The main reason of failure is out of memory.
1164                          *
1165                          * The question is if the old scheduler can be
1166                          * restored all the time. TODO: if it cannot be
1167                          * restored some time, we must delete the service,
1168                          * otherwise the system may crash.
1169                          */
1170                         ip_vs_bind_scheduler(svc, old_sched);
1171                         old_sched = sched;
1172                         goto out;
1173                 }
1174         }
1175
1176   out:
1177         write_unlock_bh(&__ip_vs_svc_lock);
1178
1179         if (old_sched)
1180                 ip_vs_scheduler_put(old_sched);
1181
1182         return ret;
1183 }
1184
1185
1186 /*
1187  *      Delete a service from the service list
1188  *      - The service must be unlinked, unlocked and not referenced!
1189  *      - We are called under _bh lock
1190  */
1191 static void __ip_vs_del_service(struct ip_vs_service *svc)
1192 {
1193         struct ip_vs_dest *dest, *nxt;
1194         struct ip_vs_scheduler *old_sched;
1195
1196         ip_vs_num_services--;
1197         ip_vs_kill_estimator(&svc->stats);
1198
1199         /* Unbind scheduler */
1200         old_sched = svc->scheduler;
1201         ip_vs_unbind_scheduler(svc);
1202         if (old_sched)
1203                 ip_vs_scheduler_put(old_sched);
1204
1205         /* Unbind app inc */
1206         if (svc->inc) {
1207                 ip_vs_app_inc_put(svc->inc);
1208                 svc->inc = NULL;
1209         }
1210
1211         /*
1212          *    Unlink the whole destination list
1213          */
1214         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1215                 __ip_vs_unlink_dest(svc, dest, 0);
1216                 __ip_vs_del_dest(dest);
1217         }
1218
1219         /*
1220          *    Update the virtual service counters
1221          */
1222         if (svc->port == FTPPORT)
1223                 atomic_dec(&ip_vs_ftpsvc_counter);
1224         else if (svc->port == 0)
1225                 atomic_dec(&ip_vs_nullsvc_counter);
1226
1227         /*
1228          *    Free the service if nobody refers to it
1229          */
1230         if (atomic_read(&svc->refcnt) == 0)
1231                 kfree(svc);
1232
1233         /* decrease the module use count */
1234         ip_vs_use_count_dec();
1235 }
1236
1237 /*
1238  *      Delete a service from the service list
1239  */
1240 static int ip_vs_del_service(struct ip_vs_service *svc)
1241 {
1242         if (svc == NULL)
1243                 return -EEXIST;
1244
1245         /*
1246          * Unhash it from the service table
1247          */
1248         write_lock_bh(&__ip_vs_svc_lock);
1249
1250         ip_vs_svc_unhash(svc);
1251
1252         /*
1253          * Wait until all the svc users go away.
1254          */
1255         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1256
1257         __ip_vs_del_service(svc);
1258
1259         write_unlock_bh(&__ip_vs_svc_lock);
1260
1261         return 0;
1262 }
1263
1264
1265 /*
1266  *      Flush all the virtual services
1267  */
1268 static int ip_vs_flush(void)
1269 {
1270         int idx;
1271         struct ip_vs_service *svc, *nxt;
1272
1273         /*
1274          * Flush the service table hashed by <protocol,addr,port>
1275          */
1276         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1277                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1278                         write_lock_bh(&__ip_vs_svc_lock);
1279                         ip_vs_svc_unhash(svc);
1280                         /*
1281                          * Wait until all the svc users go away.
1282                          */
1283                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1284                         __ip_vs_del_service(svc);
1285                         write_unlock_bh(&__ip_vs_svc_lock);
1286                 }
1287         }
1288
1289         /*
1290          * Flush the service table hashed by fwmark
1291          */
1292         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1293                 list_for_each_entry_safe(svc, nxt,
1294                                          &ip_vs_svc_fwm_table[idx], f_list) {
1295                         write_lock_bh(&__ip_vs_svc_lock);
1296                         ip_vs_svc_unhash(svc);
1297                         /*
1298                          * Wait until all the svc users go away.
1299                          */
1300                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1301                         __ip_vs_del_service(svc);
1302                         write_unlock_bh(&__ip_vs_svc_lock);
1303                 }
1304         }
1305
1306         return 0;
1307 }
1308
1309
1310 /*
1311  *      Zero counters in a service or all services
1312  */
1313 static int ip_vs_zero_service(struct ip_vs_service *svc)
1314 {
1315         struct ip_vs_dest *dest;
1316
1317         write_lock_bh(&__ip_vs_svc_lock);
1318         list_for_each_entry(dest, &svc->destinations, n_list) {
1319                 ip_vs_zero_stats(&dest->stats);
1320         }
1321         ip_vs_zero_stats(&svc->stats);
1322         write_unlock_bh(&__ip_vs_svc_lock);
1323         return 0;
1324 }
1325
1326 static int ip_vs_zero_all(void)
1327 {
1328         int idx;
1329         struct ip_vs_service *svc;
1330
1331         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1332                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1333                         ip_vs_zero_service(svc);
1334                 }
1335         }
1336
1337         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1338                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1339                         ip_vs_zero_service(svc);
1340                 }
1341         }
1342
1343         ip_vs_zero_stats(&ip_vs_stats);
1344         return 0;
1345 }
1346
1347
1348 static int
1349 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1350                      void __user *buffer, size_t *lenp)
1351 {
1352         int *valp = table->data;
1353         int val = *valp;
1354         int rc;
1355
1356         rc = proc_dointvec(table, write, filp, buffer, lenp);
1357         if (write && (*valp != val)) {
1358                 if ((*valp < 0) || (*valp > 3)) {
1359                         /* Restore the correct value */
1360                         *valp = val;
1361                 } else {
1362                         local_bh_disable();
1363                         update_defense_level();
1364                         local_bh_enable();
1365                 }
1366         }
1367         return rc;
1368 }
1369
1370
1371 static int
1372 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1373                        void __user *buffer, size_t *lenp)
1374 {
1375         int *valp = table->data;
1376         int val[2];
1377         int rc;
1378
1379         /* backup the value first */
1380         memcpy(val, valp, sizeof(val));
1381
1382         rc = proc_dointvec(table, write, filp, buffer, lenp);
1383         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1384                 /* Restore the correct value */
1385                 memcpy(valp, val, sizeof(val));
1386         }
1387         return rc;
1388 }
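/*
 * Worked example (added for clarity): sync_threshold defaults to "3 50".
 * Writing "10 50" is accepted (0 <= 10 < 50), while writing "50 10" or
 * "-1 50" trips the check above and the previous pair is silently restored.
 */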
1389
1390
1391 /*
1392  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1393  */
1394
1395 static struct ctl_table vs_vars[] = {
1396         {
1397                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1398                 .procname       = "amemthresh",
1399                 .data           = &sysctl_ip_vs_amemthresh,
1400                 .maxlen         = sizeof(int),
1401                 .mode           = 0644,
1402                 .proc_handler   = &proc_dointvec,
1403         },
1404 #ifdef CONFIG_IP_VS_DEBUG
1405         {
1406                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1407                 .procname       = "debug_level",
1408                 .data           = &sysctl_ip_vs_debug_level,
1409                 .maxlen         = sizeof(int),
1410                 .mode           = 0644,
1411                 .proc_handler   = &proc_dointvec,
1412         },
1413 #endif
1414         {
1415                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1416                 .procname       = "am_droprate",
1417                 .data           = &sysctl_ip_vs_am_droprate,
1418                 .maxlen         = sizeof(int),
1419                 .mode           = 0644,
1420                 .proc_handler   = &proc_dointvec,
1421         },
1422         {
1423                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1424                 .procname       = "drop_entry",
1425                 .data           = &sysctl_ip_vs_drop_entry,
1426                 .maxlen         = sizeof(int),
1427                 .mode           = 0644,
1428                 .proc_handler   = &proc_do_defense_mode,
1429         },
1430         {
1431                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1432                 .procname       = "drop_packet",
1433                 .data           = &sysctl_ip_vs_drop_packet,
1434                 .maxlen         = sizeof(int),
1435                 .mode           = 0644,
1436                 .proc_handler   = &proc_do_defense_mode,
1437         },
1438         {
1439                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1440                 .procname       = "secure_tcp",
1441                 .data           = &sysctl_ip_vs_secure_tcp,
1442                 .maxlen         = sizeof(int),
1443                 .mode           = 0644,
1444                 .proc_handler   = &proc_do_defense_mode,
1445         },
1446 #if 0
1447         {
1448                 .ctl_name       = NET_IPV4_VS_TO_ES,
1449                 .procname       = "timeout_established",
1450                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1451                 .maxlen         = sizeof(int),
1452                 .mode           = 0644, 
1453                 .proc_handler   = &proc_dointvec_jiffies,
1454         },
1455         {
1456                 .ctl_name       = NET_IPV4_VS_TO_SS,
1457                 .procname       = "timeout_synsent",
1458                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1459                 .maxlen         = sizeof(int),
1460                 .mode           = 0644, 
1461                 .proc_handler   = &proc_dointvec_jiffies,
1462         },
1463         {
1464                 .ctl_name       = NET_IPV4_VS_TO_SR,
1465                 .procname       = "timeout_synrecv",
1466                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1467                 .maxlen         = sizeof(int),
1468                 .mode           = 0644, 
1469                 .proc_handler   = &proc_dointvec_jiffies,
1470         },
1471         {
1472                 .ctl_name       = NET_IPV4_VS_TO_FW,
1473                 .procname       = "timeout_finwait",
1474                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1475                 .maxlen         = sizeof(int),
1476                 .mode           = 0644, 
1477                 .proc_handler   = &proc_dointvec_jiffies,
1478         },
1479         {
1480                 .ctl_name       = NET_IPV4_VS_TO_TW,
1481                 .procname       = "timeout_timewait",
1482                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1483                 .maxlen         = sizeof(int),
1484                 .mode           = 0644,
1485                 .proc_handler   = &proc_dointvec_jiffies,
1486         },
1487         {
1488                 .ctl_name       = NET_IPV4_VS_TO_CL,
1489                 .procname       = "timeout_close",
1490                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1491                 .maxlen         = sizeof(int),
1492                 .mode           = 0644, 
1493                 .proc_handler   = &proc_dointvec_jiffies,
1494         },
1495         {
1496                 .ctl_name       = NET_IPV4_VS_TO_CW,
1497                 .procname       = "timeout_closewait",
1498                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1499                 .maxlen         = sizeof(int),
1500                 .mode           = 0644, 
1501                 .proc_handler   = &proc_dointvec_jiffies,
1502         },
1503         {
1504                 .ctl_name       = NET_IPV4_VS_TO_LA,
1505                 .procname       = "timeout_lastack",
1506                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1507                 .maxlen         = sizeof(int),
1508                 .mode           = 0644, 
1509                 .proc_handler   = &proc_dointvec_jiffies,
1510         },
1511         {
1512                 .ctl_name       = NET_IPV4_VS_TO_LI,
1513                 .procname       = "timeout_listen",
1514                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1515                 .maxlen         = sizeof(int),
1516                 .mode           = 0644, 
1517                 .proc_handler   = &proc_dointvec_jiffies,
1518         },
1519         {
1520                 .ctl_name       = NET_IPV4_VS_TO_SA,
1521                 .procname       = "timeout_synack",
1522                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1523                 .maxlen         = sizeof(int),
1524                 .mode           = 0644, 
1525                 .proc_handler   = &proc_dointvec_jiffies,
1526         },
1527         {
1528                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1529                 .procname       = "timeout_udp",
1530                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1531                 .maxlen         = sizeof(int),
1532                 .mode           = 0644, 
1533                 .proc_handler   = &proc_dointvec_jiffies,
1534         },
1535         {
1536                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1537                 .procname       = "timeout_icmp",
1538                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1539                 .maxlen         = sizeof(int),
1540                 .mode           = 0644,
1541                 .proc_handler   = &proc_dointvec_jiffies,
1542         },
1543 #endif
1544         {
1545                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1546                 .procname       = "cache_bypass",
1547                 .data           = &sysctl_ip_vs_cache_bypass,
1548                 .maxlen         = sizeof(int),
1549                 .mode           = 0644,
1550                 .proc_handler   = &proc_dointvec,
1551         },
1552         {
1553                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1554                 .procname       = "expire_nodest_conn",
1555                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1556                 .maxlen         = sizeof(int),
1557                 .mode           = 0644,
1558                 .proc_handler   = &proc_dointvec,
1559         },
1560         {
1561                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1562                 .procname       = "sync_threshold",
1563                 .data           = &sysctl_ip_vs_sync_threshold,
1564                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1565                 .mode           = 0644,
1566                 .proc_handler   = &proc_do_sync_threshold,
1567         },
1568         {
1569                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1570                 .procname       = "nat_icmp_send",
1571                 .data           = &sysctl_ip_vs_nat_icmp_send,
1572                 .maxlen         = sizeof(int),
1573                 .mode           = 0644,
1574                 .proc_handler   = &proc_dointvec,
1575         },
1576         { .ctl_name = 0 }
1577 };
1578
1579 static ctl_table vs_table[] = {
1580         {
1581                 .ctl_name       = NET_IPV4_VS,
1582                 .procname       = "vs",
1583                 .mode           = 0555,
1584                 .child          = vs_vars
1585         },
1586         { .ctl_name = 0 }
1587 };
1588
1589 static ctl_table ipv4_table[] = {
1590         {
1591                 .ctl_name       = NET_IPV4,
1592                 .procname       = "ipv4",
1593                 .mode           = 0555,
1594                 .child          = vs_table,
1595         },
1596         { .ctl_name = 0 }
1597 };
1598
1599 static ctl_table vs_root_table[] = {
1600         {
1601                 .ctl_name       = CTL_NET,
1602                 .procname       = "net",
1603                 .mode           = 0555,
1604                 .child          = ipv4_table,
1605         },
1606         { .ctl_name = 0 }
1607 };
1608
1609 static struct ctl_table_header * sysctl_header;
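/*
 * Illustrative sketch (not part of the original file): the table tree above
 * is presumably registered from ip_vs_control_init() further down in this
 * file, roughly as shown (the 2.6.7-era register_sysctl_table() still takes
 * an insert_at_head flag as its second argument).
 */
#if 0
static void example_register_vs_sysctls(void)
{
	sysctl_header = register_sysctl_table(vs_root_table, 0);
}
#endif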
1610
1611 #ifdef CONFIG_PROC_FS
1612
1613 struct ip_vs_iter {
1614         struct list_head *table;
1615         int bucket;
1616 };
1617
1618 /*
1619  *      Write the contents of the VS rule table to a PROCfs file.
1620  *      (It is kept just for backward compatibility)
1621  */
1622 static inline const char *ip_vs_fwd_name(unsigned flags)
1623 {
1624         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1625         case IP_VS_CONN_F_LOCALNODE:
1626                 return "Local";
1627         case IP_VS_CONN_F_TUNNEL:
1628                 return "Tunnel";
1629         case IP_VS_CONN_F_DROUTE:
1630                 return "Route";
1631         default:
1632                 return "Masq";
1633         }
1634 }
1635
1636
1637 /* Get the Nth entry in the two lists */
1638 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1639 {
1640         struct ip_vs_iter *iter = seq->private;
1641         int idx;
1642         struct ip_vs_service *svc;
1643
1644         /* look in hash by protocol */
1645         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1646                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1647                         if (pos-- == 0){
1648                                 iter->table = ip_vs_svc_table;
1649                                 iter->bucket = idx;
1650                                 return svc;
1651                         }
1652                 }
1653         }
1654
1655         /* keep looking in the table hashed by fwmark */
1656         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1657                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1658                         if (pos-- == 0) {
1659                                 iter->table = ip_vs_svc_fwm_table;
1660                                 iter->bucket = idx;
1661                                 return svc;
1662                         }
1663                 }
1664         }
1665
1666         return NULL;
1667 }
1668
1669 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1670 {
1671
1672         read_lock_bh(&__ip_vs_svc_lock);
1673         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1674 }
1675
1676
1677 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1678 {
1679         struct list_head *e;
1680         struct ip_vs_iter *iter;
1681         struct ip_vs_service *svc;
1682
1683         ++*pos;
1684         if (v == SEQ_START_TOKEN)
1685                 return ip_vs_info_array(seq,0);
1686
1687         svc = v;
1688         iter = seq->private;
1689
1690         if (iter->table == ip_vs_svc_table) {
1691                 /* next service in table hashed by protocol */
1692                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1693                         return list_entry(e, struct ip_vs_service, s_list);
1694
1695
1696                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1697                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1698                                             s_list) {
1699                                 return svc;
1700                         }
1701                 }
1702
1703                 iter->table = ip_vs_svc_fwm_table;
1704                 iter->bucket = -1;
1705                 goto scan_fwmark;
1706         }
1707
1708         /* next service in table hashed by fwmark */
1709         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1710                 return list_entry(e, struct ip_vs_service, f_list);
1711
1712  scan_fwmark:
1713         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1714                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1715                                     f_list)
1716                         return svc;
1717         }
1718
1719         return NULL;
1720 }
1721
1722 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1723 {
1724         read_unlock_bh(&__ip_vs_svc_lock);
1725 }
1726
1727
1728 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1729 {
1730         if (v == SEQ_START_TOKEN) {
1731                 seq_printf(seq,
1732                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1733                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1734                 seq_puts(seq,
1735                          "Prot LocalAddress:Port Scheduler Flags\n");
1736                 seq_puts(seq,
1737                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1738         } else {
1739                 const struct ip_vs_service *svc = v;
1740                 const struct ip_vs_iter *iter = seq->private;
1741                 const struct ip_vs_dest *dest;
1742
1743                 if (iter->table == ip_vs_svc_table)
1744                         seq_printf(seq, "%s  %08X:%04X %s ",
1745                                    ip_vs_proto_name(svc->protocol),
1746                                    ntohl(svc->addr),
1747                                    ntohs(svc->port),
1748                                    svc->scheduler->name);
1749                 else
1750                         seq_printf(seq, "FWM  %08X %s ",
1751                                    svc->fwmark, svc->scheduler->name);
1752
1753                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1754                         seq_printf(seq, "persistent %d %08X\n",
1755                                 svc->timeout,
1756                                 ntohl(svc->netmask));
1757                 else
1758                         seq_putc(seq, '\n');
1759
1760                 list_for_each_entry(dest, &svc->destinations, n_list) {
1761                         seq_printf(seq,
1762                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1763                                    ntohl(dest->addr), ntohs(dest->port),
1764                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1765                                    atomic_read(&dest->weight),
1766                                    atomic_read(&dest->activeconns),
1767                                    atomic_read(&dest->inactconns));
1768                 }
1769         }
1770         return 0;
1771 }
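
/*
 * For reference, ip_vs_info_seq_show() above renders /proc/net/ip_vs in
 * roughly the following shape (version, addresses, schedulers and counters
 * here are made-up illustrative values, not real output):
 *
 *	IP Virtual Server version 1.2.1 (size=4096)
 *	Prot LocalAddress:Port Scheduler Flags
 *	  -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *	TCP  C0A80001:0050 rr persistent 300 FFFFFF00
 *	  -> 0A000001:0050      Masq    1      0          0
 *	FWM  00000001 wlc
 *	  -> 0A000002:0000      Tunnel  1      0          0
 */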
1772
1773 static struct seq_operations ip_vs_info_seq_ops = {
1774         .start = ip_vs_info_seq_start,
1775         .next  = ip_vs_info_seq_next,
1776         .stop  = ip_vs_info_seq_stop,
1777         .show  = ip_vs_info_seq_show,
1778 };
1779
1780 static int ip_vs_info_open(struct inode *inode, struct file *file)
1781 {
1782         struct seq_file *seq;
1783         int rc = -ENOMEM;
1784         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1785
1786         if (!s)
1787                 goto out;
1788
1789         rc = seq_open(file, &ip_vs_info_seq_ops);
1790         if (rc)
1791                 goto out_kfree;
1792
1793         seq          = file->private_data;
1794         seq->private = s;
1795         memset(s, 0, sizeof(*s));
1796 out:
1797         return rc;
1798 out_kfree:
1799         kfree(s);
1800         goto out;
1801 }
1802
1803 static struct file_operations ip_vs_info_fops = {
1804         .owner   = THIS_MODULE,
1805         .open    = ip_vs_info_open,
1806         .read    = seq_read,
1807         .llseek  = seq_lseek,
1808         .release = seq_release_private,
1809 };
1810
1811 #endif
1812
1813 struct ip_vs_stats ip_vs_stats;
1814
1815 #ifdef CONFIG_PROC_FS
1816 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1817 {
1818
1819 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1820         seq_puts(seq,
1821                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1822         seq_printf(seq,
1823                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1824
1825         spin_lock_bh(&ip_vs_stats.lock);
1826         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1827                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1828                    (unsigned long long) ip_vs_stats.inbytes,
1829                    (unsigned long long) ip_vs_stats.outbytes);
1830
1831 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1832         seq_puts(seq,
1833                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1834         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1835                         ip_vs_stats.cps,
1836                         ip_vs_stats.inpps,
1837                         ip_vs_stats.outpps,
1838                         ip_vs_stats.inbps,
1839                         ip_vs_stats.outbps);
1840         spin_unlock_bh(&ip_vs_stats.lock);
1841
1842         return 0;
1843 }
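
/*
 * ip_vs_stats_show() above gives /proc/net/ip_vs_stats the rough shape
 * below; note that every value is printed in hexadecimal (the counters
 * shown here are fabricated for illustration):
 *
 *	   Total Incoming Outgoing         Incoming         Outgoing
 *	   Conns  Packets  Packets            Bytes            Bytes
 *	      1A      29C      1F4            2D186            19A28
 *
 *	 Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s
 *	       0        2        1              4B0              3E8
 */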
1844
1845 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1846 {
1847         return single_open(file, ip_vs_stats_show, NULL);
1848 }
1849
1850 static struct file_operations ip_vs_stats_fops = {
1851         .owner = THIS_MODULE,
1852         .open = ip_vs_stats_seq_open,
1853         .read = seq_read,
1854         .llseek = seq_lseek,
1855         .release = single_release,
1856 };
1857
1858 #endif
1859
1860 /*
1861  *      Set timeout values for tcp, tcpfin and udp in the timeout_table.
1862  */
1863 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1864 {
1865         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1866                   u->tcp_timeout,
1867                   u->tcp_fin_timeout,
1868                   u->udp_timeout);
1869
1870 #ifdef CONFIG_IP_VS_PROTO_TCP
1871         if (u->tcp_timeout) {
1872                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1873                         = u->tcp_timeout * HZ;
1874         }
1875
1876         if (u->tcp_fin_timeout) {
1877                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1878                         = u->tcp_fin_timeout * HZ;
1879         }
1880 #endif
1881
1882 #ifdef CONFIG_IP_VS_PROTO_UDP
1883         if (u->udp_timeout) {
1884                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1885                         = u->udp_timeout * HZ;
1886         }
1887 #endif
1888         return 0;
1889 }
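
/*
 * Rough userspace sketch (illustrative only, not part of this file): a
 * management tool such as ipvsadm hands ip_vs_set_timeout() its argument
 * through the IP_VS_SO_SET_TIMEOUT sockopt.  Timeouts are given in seconds
 * and zero fields are left unchanged, as the code above shows.  sockfd is
 * assumed to be an AF_INET socket opened by a CAP_NET_ADMIN process:
 *
 *	struct ip_vs_timeout_user to = { .tcp_timeout = 900,
 *					 .tcp_fin_timeout = 120,
 *					 .udp_timeout = 0 };
 *	setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, &to, sizeof(to));
 */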
1890
1891
1892 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1893 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1894 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1895                                  sizeof(struct ip_vs_dest_user))
1896 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1897 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1898 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1899
1900 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1901         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1902         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1903         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1904         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1905         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1906         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1907         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1908         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1909         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1910         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1911         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1912 };
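
/*
 * set_arglen[] above maps each IP_VS_SO_SET_* command to the exact number
 * of bytes it expects from userspace; do_ip_vs_set_ctl() below rejects any
 * request whose length does not match before copying the argument in.
 */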
1913
1914 static int
1915 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1916 {
1917         int ret;
1918         unsigned char arg[MAX_ARG_LEN];
1919         struct ip_vs_service_user *usvc;
1920         struct ip_vs_service *svc;
1921         struct ip_vs_dest_user *udest;
1922
1923         if (!capable(CAP_NET_ADMIN))
1924                 return -EPERM;
1925
1926         if (len != set_arglen[SET_CMDID(cmd)]) {
1927                 IP_VS_ERR("set_ctl: len %u != %u\n",
1928                           len, set_arglen[SET_CMDID(cmd)]);
1929                 return -EINVAL;
1930         }
1931
1932         if (copy_from_user(arg, user, len) != 0)
1933                 return -EFAULT;
1934
1935         /* increase the module use count */
1936         ip_vs_use_count_inc();
1937
1938         if (down_interruptible(&__ip_vs_mutex)) {
1939                 ret = -ERESTARTSYS;
1940                 goto out_dec;
1941         }
1942
1943         if (cmd == IP_VS_SO_SET_FLUSH) {
1944                 /* Flush all the virtual services */
1945                 ret = ip_vs_flush();
1946                 goto out_unlock;
1947         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1948                 /* Set timeout values for (tcp, tcpfin, udp) */
1949                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1950                 goto out_unlock;
1951         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1952                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1953                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1954                 goto out_unlock;
1955         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1956                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1957                 ret = stop_sync_thread(dm->state);
1958                 goto out_unlock;
1959         }
1960
1961         usvc = (struct ip_vs_service_user *)arg;
1962         udest = (struct ip_vs_dest_user *)(usvc + 1);
1963
1964         if (cmd == IP_VS_SO_SET_ZERO) {
1965                 /* if no service address is given, zero the counters in all services */
1966                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1967                         ret = ip_vs_zero_all();
1968                         goto out_unlock;
1969                 }
1970         }
1971
1972         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1973         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1974                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1975                           usvc->protocol, NIPQUAD(usvc->addr),
1976                           ntohs(usvc->port), usvc->sched_name);
1977                 ret = -EFAULT;
1978                 goto out_unlock;
1979         }
1980
1981         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1982         if (usvc->fwmark == 0)
1983                 svc = __ip_vs_service_get(usvc->protocol,
1984                                           usvc->addr, usvc->port);
1985         else
1986                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1987
1988         if (cmd != IP_VS_SO_SET_ADD
1989             && (svc == NULL || svc->protocol != usvc->protocol)) {
1990                 ret = -ESRCH;
1991                 goto out_unlock;
1992         }
1993
1994         switch (cmd) {
1995         case IP_VS_SO_SET_ADD:
1996                 if (svc != NULL)
1997                         ret = -EEXIST;
1998                 else
1999                         ret = ip_vs_add_service(usvc, &svc);
2000                 break;
2001         case IP_VS_SO_SET_EDIT:
2002                 ret = ip_vs_edit_service(svc, usvc);
2003                 break;
2004         case IP_VS_SO_SET_DEL:
2005                 ret = ip_vs_del_service(svc);
2006                 if (!ret)
2007                         goto out_unlock;
2008                 break;
2009         case IP_VS_SO_SET_ZERO:
2010                 ret = ip_vs_zero_service(svc);
2011                 break;
2012         case IP_VS_SO_SET_ADDDEST:
2013                 ret = ip_vs_add_dest(svc, udest);
2014                 break;
2015         case IP_VS_SO_SET_EDITDEST:
2016                 ret = ip_vs_edit_dest(svc, udest);
2017                 break;
2018         case IP_VS_SO_SET_DELDEST:
2019                 ret = ip_vs_del_dest(svc, udest);
2020                 break;
2021         default:
2022                 ret = -EINVAL;
2023         }
2024
2025         if (svc)
2026                 ip_vs_service_put(svc);
2027
2028   out_unlock:
2029         up(&__ip_vs_mutex);
2030   out_dec:
2031         /* decrease the module use count */
2032         ip_vs_use_count_dec();
2033
2034         return ret;
2035 }
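
/*
 * Minimal userspace sketch of driving do_ip_vs_set_ctl() (illustrative
 * only: it uses just the ip_vs_service_user fields referenced above, the
 * address and scheduler name are example values, error handling is omitted
 * and sockfd is again an assumed AF_INET socket of a CAP_NET_ADMIN
 * process).  Zero-initializing the structure leaves fwmark at 0, so the
 * <protocol, addr, port> lookup path above is taken:
 *
 *	struct ip_vs_service_user svc = { 0 };
 *
 *	svc.protocol = IPPROTO_TCP;
 *	svc.addr = inet_addr("192.168.0.1");
 *	svc.port = htons(80);
 *	strcpy(svc.sched_name, "rr");
 *	setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ADD, &svc, sizeof(svc));
 */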
2036
2037
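/*
 * Copy the user-visible counters out of a kernel struct ip_vs_stats: the
 * memcpy below deliberately stops at the embedded spinlock, i.e. it copies
 * every field laid out before ->lock and nothing after it, relying on
 * struct ip_vs_stats_user matching that leading layout.
 */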
2038 static void
2039 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2040 {
2041         spin_lock_bh(&src->lock);
2042         memcpy(dst, src, (char*)&src->lock - (char*)src);
2043         spin_unlock_bh(&src->lock);
2044 }
2045
2046 static void
2047 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2048 {
2049         dst->protocol = src->protocol;
2050         dst->addr = src->addr;
2051         dst->port = src->port;
2052         dst->fwmark = src->fwmark;
2053         strcpy(dst->sched_name, src->scheduler->name);
2054         dst->flags = src->flags;
2055         dst->timeout = src->timeout / HZ;
2056         dst->netmask = src->netmask;
2057         dst->num_dests = src->num_dests;
2058         ip_vs_copy_stats(&dst->stats, &src->stats);
2059 }
2060
2061 static inline int
2062 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2063                             struct ip_vs_get_services __user *uptr)
2064 {
2065         int idx, count=0;
2066         struct ip_vs_service *svc;
2067         struct ip_vs_service_entry entry;
2068         int ret = 0;
2069
2070         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2071                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2072                         if (count >= get->num_services)
2073                                 goto out;
2074                         ip_vs_copy_service(&entry, svc);
2075                         if (copy_to_user(&uptr->entrytable[count],
2076                                          &entry, sizeof(entry))) {
2077                                 ret = -EFAULT;
2078                                 goto out;
2079                         }
2080                         count++;
2081                 }
2082         }
2083
2084         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2085                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2086                         if (count >= get->num_services)
2087                                 goto out;
2088                         ip_vs_copy_service(&entry, svc);
2089                         if (copy_to_user(&uptr->entrytable[count],
2090                                          &entry, sizeof(entry))) {
2091                                 ret = -EFAULT;
2092                                 goto out;
2093                         }
2094                         count++;
2095                 }
2096         }
2097   out:
2098         return ret;
2099 }
2100
2101 static inline int
2102 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2103                          struct ip_vs_get_dests __user *uptr)
2104 {
2105         struct ip_vs_service *svc;
2106         int ret = 0;
2107
2108         if (get->fwmark)
2109                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2110         else
2111                 svc = __ip_vs_service_get(get->protocol,
2112                                           get->addr, get->port);
2113         if (svc) {
2114                 int count = 0;
2115                 struct ip_vs_dest *dest;
2116                 struct ip_vs_dest_entry entry;
2117
2118                 list_for_each_entry(dest, &svc->destinations, n_list) {
2119                         if (count >= get->num_dests)
2120                                 break;
2121
2122                         entry.addr = dest->addr;
2123                         entry.port = dest->port;
2124                         entry.conn_flags = atomic_read(&dest->conn_flags);
2125                         entry.weight = atomic_read(&dest->weight);
2126                         entry.u_threshold = dest->u_threshold;
2127                         entry.l_threshold = dest->l_threshold;
2128                         entry.activeconns = atomic_read(&dest->activeconns);
2129                         entry.inactconns = atomic_read(&dest->inactconns);
2130                         entry.persistconns = atomic_read(&dest->persistconns);
2131                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2132                         if (copy_to_user(&uptr->entrytable[count],
2133                                          &entry, sizeof(entry))) {
2134                                 ret = -EFAULT;
2135                                 break;
2136                         }
2137                         count++;
2138                 }
2139                 ip_vs_service_put(svc);
2140         } else
2141                 ret = -ESRCH;
2142         return ret;
2143 }
2144
2145 static inline void
2146 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2147 {
2148 #ifdef CONFIG_IP_VS_PROTO_TCP
2149         u->tcp_timeout =
2150                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2151         u->tcp_fin_timeout =
2152                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2153 #endif
2154 #ifdef CONFIG_IP_VS_PROTO_UDP
2155         u->udp_timeout =
2156                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2157 #endif
2158 }
2159
2160
2161 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2162 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2163 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2164 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2165 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2166 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2167 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2168
2169 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2170         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2171         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2172         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2173         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2174         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2175         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2176         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2177 };
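
/*
 * Unlike set_arglen[], which must match the request length exactly,
 * get_arglen[] above is treated as a minimum by do_ip_vs_get_ctl() below,
 * since GET_SERVICES and GET_DESTS replies carry a variable number of
 * entries after the fixed header (their full size is checked separately).
 */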
2178
2179 static int
2180 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2181 {
2182         unsigned char arg[128];
2183         int ret = 0;
2184
2185         if (!capable(CAP_NET_ADMIN))
2186                 return -EPERM;
2187
2188         if (*len < get_arglen[GET_CMDID(cmd)]) {
2189                 IP_VS_ERR("get_ctl: len %u < %u\n",
2190                           *len, get_arglen[GET_CMDID(cmd)]);
2191                 return -EINVAL;
2192         }
2193
2194         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2195                 return -EFAULT;
2196
2197         if (down_interruptible(&__ip_vs_mutex))
2198                 return -ERESTARTSYS;
2199
2200         switch (cmd) {
2201         case IP_VS_SO_GET_VERSION:
2202         {
2203                 char buf[64];
2204
2205                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2206                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2207                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2208                         ret = -EFAULT;
2209                         goto out;
2210                 }
2211                 *len = strlen(buf)+1;
2212         }
2213         break;
2214
2215         case IP_VS_SO_GET_INFO:
2216         {
2217                 struct ip_vs_getinfo info;
2218                 info.version = IP_VS_VERSION_CODE;
2219                 info.size = IP_VS_CONN_TAB_SIZE;
2220                 info.num_services = ip_vs_num_services;
2221                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2222                         ret = -EFAULT;
2223         }
2224         break;
2225
2226         case IP_VS_SO_GET_SERVICES:
2227         {
2228                 struct ip_vs_get_services *get;
2229                 int size;
2230
2231                 get = (struct ip_vs_get_services *)arg;
2232                 size = sizeof(*get) +
2233                         sizeof(struct ip_vs_service_entry) * get->num_services;
2234                 if (*len != size) {
2235                         IP_VS_ERR("length: %u != %u\n", *len, size);
2236                         ret = -EINVAL;
2237                         goto out;
2238                 }
2239                 ret = __ip_vs_get_service_entries(get, user);
2240         }
2241         break;
2242
2243         case IP_VS_SO_GET_SERVICE:
2244         {
2245                 struct ip_vs_service_entry *entry;
2246                 struct ip_vs_service *svc;
2247
2248                 entry = (struct ip_vs_service_entry *)arg;
2249                 if (entry->fwmark)
2250                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2251                 else
2252                         svc = __ip_vs_service_get(entry->protocol,
2253                                                   entry->addr, entry->port);
2254                 if (svc) {
2255                         ip_vs_copy_service(entry, svc);
2256                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2257                                 ret = -EFAULT;
2258                         ip_vs_service_put(svc);
2259                 } else
2260                         ret = -ESRCH;
2261         }
2262         break;
2263
2264         case IP_VS_SO_GET_DESTS:
2265         {
2266                 struct ip_vs_get_dests *get;
2267                 int size;
2268
2269                 get = (struct ip_vs_get_dests *)arg;
2270                 size = sizeof(*get) +
2271                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2272                 if (*len != size) {
2273                         IP_VS_ERR("length: %u != %u\n", *len, size);
2274                         ret = -EINVAL;
2275                         goto out;
2276                 }
2277                 ret = __ip_vs_get_dest_entries(get, user);
2278         }
2279         break;
2280
2281         case IP_VS_SO_GET_TIMEOUT:
2282         {
2283                 struct ip_vs_timeout_user t;
2284
2285                 __ip_vs_get_timeouts(&t);
2286                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2287                         ret = -EFAULT;
2288         }
2289         break;
2290
2291         case IP_VS_SO_GET_DAEMON:
2292         {
2293                 struct ip_vs_daemon_user d[2];
2294
2295                 memset(&d, 0, sizeof(d));
2296                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2297                         d[0].state = IP_VS_STATE_MASTER;
2298                         strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2299                         d[0].syncid = ip_vs_master_syncid;
2300                 }
2301                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2302                         d[1].state = IP_VS_STATE_BACKUP;
2303                         strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2304                         d[1].syncid = ip_vs_backup_syncid;
2305                 }
2306                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2307                         ret = -EFAULT;
2308         }
2309         break;
2310
2311         default:
2312                 ret = -EINVAL;
2313         }
2314
2315   out:
2316         up(&__ip_vs_mutex);
2317         return ret;
2318 }
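
/*
 * Illustrative userspace counterpart of the GET path (a sketch, not real
 * ipvsadm code; sockfd is assumed to be an AF_INET socket of a
 * CAP_NET_ADMIN process):
 *
 *	struct ip_vs_getinfo info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(sockfd, IPPROTO_IP, IP_VS_SO_GET_INFO,
 *		       &info, &len) == 0)
 *		printf("IPVS version %x, conn table size %u, %u services\n",
 *		       info.version, info.size, info.num_services);
 */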
2319
2320
2321 static struct nf_sockopt_ops ip_vs_sockopts = {
2322         .pf             = PF_INET,
2323         .set_optmin     = IP_VS_BASE_CTL,
2324         .set_optmax     = IP_VS_SO_SET_MAX+1,
2325         .set            = do_ip_vs_set_ctl,
2326         .get_optmin     = IP_VS_BASE_CTL,
2327         .get_optmax     = IP_VS_SO_GET_MAX+1,
2328         .get            = do_ip_vs_get_ctl,
2329 };
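
/*
 * Registering this nf_sockopt_ops (done in ip_vs_control_init() below)
 * plugs do_ip_vs_set_ctl/do_ip_vs_get_ctl into the PF_INET [gs]etsockopt
 * path for option numbers in the IP_VS_BASE_CTL..IP_VS_SO_SET_MAX and
 * IP_VS_BASE_CTL..IP_VS_SO_GET_MAX ranges.
 */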
2330
2331
2332 int ip_vs_control_init(void)
2333 {
2334         int ret;
2335         int idx;
2336
2337         EnterFunction(2);
2338
2339         ret = nf_register_sockopt(&ip_vs_sockopts);
2340         if (ret) {
2341                 IP_VS_ERR("cannot register sockopt.\n");
2342                 return ret;
2343         }
2344
2345         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2346         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2347
2348         sysctl_header = register_sysctl_table(vs_root_table, 0);
2349
2350         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2351         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2352                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2353                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2354         }
2355         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2356                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2357         }
2358
2359         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2360         ip_vs_stats.lock = SPIN_LOCK_UNLOCKED;
2361         ip_vs_new_estimator(&ip_vs_stats);
2362
2363         /* Hook the defense timer */
2364         init_timer(&defense_timer);
2365         defense_timer.function = defense_timer_handler;
2366         defense_timer.expires = jiffies + DEFENSE_TIMER_PERIOD;
2367         add_timer(&defense_timer);
2368
2369         LeaveFunction(2);
2370         return 0;
2371 }
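
/*
 * ip_vs_control_cleanup() below undoes the above largely in reverse order:
 * it stops the defense timer, kills the stats estimator, unregisters the
 * sysctl table, removes both /proc entries and finally drops the sockopt
 * registration.
 */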
2372
2373
2374 void ip_vs_control_cleanup(void)
2375 {
2376         EnterFunction(2);
2377         ip_vs_trash_cleanup();
2378         del_timer_sync(&defense_timer);
2379         ip_vs_kill_estimator(&ip_vs_stats);
2380         unregister_sysctl_table(sysctl_header);
2381         proc_net_remove("ip_vs_stats");
2382         proc_net_remove("ip_vs");
2383         nf_unregister_sockopt(&ip_vs_sockopts);
2384         LeaveFunction(2);
2385 }