vserver 2.0 rc7
[linux-2.6.git] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/fs.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36
37 #include <net/ip.h>
38 #include <net/sock.h>
39
40 #include <asm/uaccess.h>
41
42 #include <net/ip_vs.h>
43
44 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45 static DECLARE_MUTEX(__ip_vs_mutex);
46
47 /* lock for service table */
48 static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50 /* lock for table with the real services */
51 static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53 /* lock for state and timeout tables */
54 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56 /* lock for drop entry handling */
57 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59 /* lock for drop packet handling */
60 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62 /* 1/rate drop and drop-entry variables */
63 int ip_vs_drop_rate = 0;
64 int ip_vs_drop_counter = 0;
65 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67 /* number of virtual services */
68 static int ip_vs_num_services = 0;
69
70 /* sysctl variables */
71 static int sysctl_ip_vs_drop_entry = 0;
72 static int sysctl_ip_vs_drop_packet = 0;
73 static int sysctl_ip_vs_secure_tcp = 0;
74 static int sysctl_ip_vs_amemthresh = 1024;
75 static int sysctl_ip_vs_am_droprate = 10;
76 int sysctl_ip_vs_cache_bypass = 0;
77 int sysctl_ip_vs_expire_nodest_conn = 0;
78 int sysctl_ip_vs_expire_quiescent_template = 0;
79 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80 int sysctl_ip_vs_nat_icmp_send = 0;
81
82
#ifdef CONFIG_IP_VS_DEBUG
/* Current debug level; implicitly zero until changed. */
static int sysctl_ip_vs_debug_level;

/*
 *	Return the current IPVS debug level.
 *	Only built when CONFIG_IP_VS_DEBUG is defined.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif
91
92 /*
93  *      update_defense_level is called from keventd and from sysctl.
94  */
95 static void update_defense_level(void)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < sysctl_ip_vs_amemthresh);
112
113         /* drop_entry */
114         spin_lock(&__ip_vs_dropentry_lock);
115         switch (sysctl_ip_vs_drop_entry) {
116         case 0:
117                 atomic_set(&ip_vs_dropentry, 0);
118                 break;
119         case 1:
120                 if (nomem) {
121                         atomic_set(&ip_vs_dropentry, 1);
122                         sysctl_ip_vs_drop_entry = 2;
123                 } else {
124                         atomic_set(&ip_vs_dropentry, 0);
125                 }
126                 break;
127         case 2:
128                 if (nomem) {
129                         atomic_set(&ip_vs_dropentry, 1);
130                 } else {
131                         atomic_set(&ip_vs_dropentry, 0);
132                         sysctl_ip_vs_drop_entry = 1;
133                 };
134                 break;
135         case 3:
136                 atomic_set(&ip_vs_dropentry, 1);
137                 break;
138         }
139         spin_unlock(&__ip_vs_dropentry_lock);
140
141         /* drop_packet */
142         spin_lock(&__ip_vs_droppacket_lock);
143         switch (sysctl_ip_vs_drop_packet) {
144         case 0:
145                 ip_vs_drop_rate = 0;
146                 break;
147         case 1:
148                 if (nomem) {
149                         ip_vs_drop_rate = ip_vs_drop_counter
150                                 = sysctl_ip_vs_amemthresh /
151                                 (sysctl_ip_vs_amemthresh-availmem);
152                         sysctl_ip_vs_drop_packet = 2;
153                 } else {
154                         ip_vs_drop_rate = 0;
155                 }
156                 break;
157         case 2:
158                 if (nomem) {
159                         ip_vs_drop_rate = ip_vs_drop_counter
160                                 = sysctl_ip_vs_amemthresh /
161                                 (sysctl_ip_vs_amemthresh-availmem);
162                 } else {
163                         ip_vs_drop_rate = 0;
164                         sysctl_ip_vs_drop_packet = 1;
165                 }
166                 break;
167         case 3:
168                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
169                 break;
170         }
171         spin_unlock(&__ip_vs_droppacket_lock);
172
173         /* secure_tcp */
174         write_lock(&__ip_vs_securetcp_lock);
175         switch (sysctl_ip_vs_secure_tcp) {
176         case 0:
177                 if (old_secure_tcp >= 2)
178                         to_change = 0;
179                 break;
180         case 1:
181                 if (nomem) {
182                         if (old_secure_tcp < 2)
183                                 to_change = 1;
184                         sysctl_ip_vs_secure_tcp = 2;
185                 } else {
186                         if (old_secure_tcp >= 2)
187                                 to_change = 0;
188                 }
189                 break;
190         case 2:
191                 if (nomem) {
192                         if (old_secure_tcp < 2)
193                                 to_change = 1;
194                 } else {
195                         if (old_secure_tcp >= 2)
196                                 to_change = 0;
197                         sysctl_ip_vs_secure_tcp = 1;
198                 }
199                 break;
200         case 3:
201                 if (old_secure_tcp < 2)
202                         to_change = 1;
203                 break;
204         }
205         old_secure_tcp = sysctl_ip_vs_secure_tcp;
206         if (to_change >= 0)
207                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208         write_unlock(&__ip_vs_securetcp_lock);
209 }
210
211
212 /*
213  *      Timer for checking the defense
214  */
215 #define DEFENSE_TIMER_PERIOD    1*HZ
216 static void defense_work_handler(void *data);
217 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
218
219 static void defense_work_handler(void *data)
220 {
221         update_defense_level();
222         if (atomic_read(&ip_vs_dropentry))
223                 ip_vs_random_dropentry();
224
225         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
226 }
227
228 int
229 ip_vs_use_count_inc(void)
230 {
231         return try_module_get(THIS_MODULE);
232 }
233
234 void
235 ip_vs_use_count_dec(void)
236 {
237         module_put(THIS_MODULE);
238 }
239
240
241 /*
242  *      Hash table: for virtual service lookups
243  */
244 #define IP_VS_SVC_TAB_BITS 8
245 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
246 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247
248 /* the service table hashed by <protocol, addr, port> */
249 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250 /* the service table hashed by fwmark */
251 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
252
253 /*
254  *      Hash table: for real service lookups
255  */
256 #define IP_VS_RTAB_BITS 4
257 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
258 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259
260 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
261
262 /*
263  *      Trash for destinations
264  */
265 static LIST_HEAD(ip_vs_dest_trash);
266
267 /*
268  *      FTP & NULL virtual service counters
269  */
270 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
272
273
274 /*
275  *      Returns hash value for virtual service
276  */
277 static __inline__ unsigned
278 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279 {
280         register unsigned porth = ntohs(port);
281
282         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283                 & IP_VS_SVC_TAB_MASK;
284 }
285
286 /*
287  *      Returns hash value of fwmark for virtual service lookup
288  */
289 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290 {
291         return fwmark & IP_VS_SVC_TAB_MASK;
292 }
293
294 /*
295  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296  *      or in the ip_vs_svc_fwm_table by fwmark.
297  *      Should be called with locked tables.
298  */
299 static int ip_vs_svc_hash(struct ip_vs_service *svc)
300 {
301         unsigned hash;
302
303         if (svc->flags & IP_VS_SVC_F_HASHED) {
304                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305                           "called from %p\n", __builtin_return_address(0));
306                 return 0;
307         }
308
309         if (svc->fwmark == 0) {
310                 /*
311                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
312                  */
313                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
315         } else {
316                 /*
317                  *  Hash it by fwmark in ip_vs_svc_fwm_table
318                  */
319                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
321         }
322
323         svc->flags |= IP_VS_SVC_F_HASHED;
324         /* increase its refcnt because it is referenced by the svc table */
325         atomic_inc(&svc->refcnt);
326         return 1;
327 }
328
329
330 /*
331  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332  *      Should be called with locked tables.
333  */
334 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335 {
336         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338                           "called from %p\n", __builtin_return_address(0));
339                 return 0;
340         }
341
342         if (svc->fwmark == 0) {
343                 /* Remove it from the ip_vs_svc_table table */
344                 list_del(&svc->s_list);
345         } else {
346                 /* Remove it from the ip_vs_svc_fwm_table table */
347                 list_del(&svc->f_list);
348         }
349
350         svc->flags &= ~IP_VS_SVC_F_HASHED;
351         atomic_dec(&svc->refcnt);
352         return 1;
353 }
354
355
356 /*
357  *      Get service by {proto,addr,port} in the service table.
358  */
359 static __inline__ struct ip_vs_service *
360 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
361 {
362         unsigned hash;
363         struct ip_vs_service *svc;
364
365         /* Check for "full" addressed entries */
366         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367
368         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369                 if ((svc->addr == vaddr)
370                     && (svc->port == vport)
371                     && (svc->protocol == protocol)) {
372                         /* HIT */
373                         atomic_inc(&svc->usecnt);
374                         return svc;
375                 }
376         }
377
378         return NULL;
379 }
380
381
382 /*
383  *      Get service by {fwmark} in the service table.
384  */
385 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
386 {
387         unsigned hash;
388         struct ip_vs_service *svc;
389
390         /* Check for fwmark addressed entries */
391         hash = ip_vs_svc_fwm_hashkey(fwmark);
392
393         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394                 if (svc->fwmark == fwmark) {
395                         /* HIT */
396                         atomic_inc(&svc->usecnt);
397                         return svc;
398                 }
399         }
400
401         return NULL;
402 }
403
404 struct ip_vs_service *
405 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406 {
407         struct ip_vs_service *svc;
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
415                 goto out;
416
417         /*
418          *      Check the table hashed by <protocol,addr,port>
419          *      for "full" addressed entries
420          */
421         svc = __ip_vs_service_get(protocol, vaddr, vport);
422
423         if (svc == NULL
424             && protocol == IPPROTO_TCP
425             && atomic_read(&ip_vs_ftpsvc_counter)
426             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427                 /*
428                  * Check if ftp service entry exists, the packet
429                  * might belong to FTP data connections.
430                  */
431                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
432         }
433
434         if (svc == NULL
435             && atomic_read(&ip_vs_nullsvc_counter)) {
436                 /*
437                  * Check if the catch-all port (port zero) exists
438                  */
439                 svc = __ip_vs_service_get(protocol, vaddr, 0);
440         }
441
442   out:
443         read_unlock(&__ip_vs_svc_lock);
444
445         IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446                   fwmark, ip_vs_proto_name(protocol),
447                   NIPQUAD(vaddr), ntohs(vport),
448                   svc?"hit":"not hit");
449
450         return svc;
451 }
452
453
454 static inline void
455 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456 {
457         atomic_inc(&svc->refcnt);
458         dest->svc = svc;
459 }
460
461 static inline void
462 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
463 {
464         struct ip_vs_service *svc = dest->svc;
465
466         dest->svc = NULL;
467         if (atomic_dec_and_test(&svc->refcnt))
468                 kfree(svc);
469 }
470
471
472 /*
473  *      Returns hash value for real service
474  */
475 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476 {
477         register unsigned porth = ntohs(port);
478
479         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
480                 & IP_VS_RTAB_MASK;
481 }
482
483 /*
484  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
485  *      should be called with locked tables.
486  */
487 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
488 {
489         unsigned hash;
490
491         if (!list_empty(&dest->d_list)) {
492                 return 0;
493         }
494
495         /*
496          *      Hash by proto,addr,port,
497          *      which are the parameters of the real service.
498          */
499         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500         list_add(&dest->d_list, &ip_vs_rtable[hash]);
501
502         return 1;
503 }
504
505 /*
506  *      UNhashes ip_vs_dest from ip_vs_rtable.
507  *      should be called with locked tables.
508  */
509 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
510 {
511         /*
512          * Remove it from the ip_vs_rtable table.
513          */
514         if (!list_empty(&dest->d_list)) {
515                 list_del(&dest->d_list);
516                 INIT_LIST_HEAD(&dest->d_list);
517         }
518
519         return 1;
520 }
521
522 /*
523  *      Lookup real service by <proto,addr,port> in the real service table.
524  */
525 struct ip_vs_dest *
526 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
527 {
528         unsigned hash;
529         struct ip_vs_dest *dest;
530
531         /*
532          *      Check for "full" addressed entries
533          *      Return the first found entry
534          */
535         hash = ip_vs_rs_hashkey(daddr, dport);
536
537         read_lock(&__ip_vs_rs_lock);
538         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539                 if ((dest->addr == daddr)
540                     && (dest->port == dport)
541                     && ((dest->protocol == protocol) ||
542                         dest->vfwmark)) {
543                         /* HIT */
544                         read_unlock(&__ip_vs_rs_lock);
545                         return dest;
546                 }
547         }
548         read_unlock(&__ip_vs_rs_lock);
549
550         return NULL;
551 }
552
553 /*
554  *      Lookup destination by {addr,port} in the given service
555  */
556 static struct ip_vs_dest *
557 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558 {
559         struct ip_vs_dest *dest;
560
561         /*
562          * Find the destination for the given service
563          */
564         list_for_each_entry(dest, &svc->destinations, n_list) {
565                 if ((dest->addr == daddr) && (dest->port == dport)) {
566                         /* HIT */
567                         return dest;
568                 }
569         }
570
571         return NULL;
572 }
573
574
575 /*
576  *  Lookup dest by {svc,addr,port} in the destination trash.
577  *  The destination trash is used to hold the destinations that are removed
578  *  from the service table but are still referenced by some conn entries.
579  *  The reason to add the destination trash is when the dest is temporary
580  *  down (either by administrator or by monitor program), the dest can be
581  *  picked back from the trash, the remaining connections to the dest can
582  *  continue, and the counting information of the dest is also useful for
583  *  scheduling.
584  */
585 static struct ip_vs_dest *
586 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587 {
588         struct ip_vs_dest *dest, *nxt;
589
590         /*
591          * Find the destination in trash
592          */
593         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
595                           "refcnt=%d\n",
596                           dest->vfwmark,
597                           NIPQUAD(dest->addr), ntohs(dest->port),
598                           atomic_read(&dest->refcnt));
599                 if (dest->addr == daddr &&
600                     dest->port == dport &&
601                     dest->vfwmark == svc->fwmark &&
602                     dest->protocol == svc->protocol &&
603                     (svc->fwmark ||
604                      (dest->vaddr == svc->addr &&
605                       dest->vport == svc->port))) {
606                         /* HIT */
607                         return dest;
608                 }
609
610                 /*
611                  * Try to purge the destination from trash if not referenced
612                  */
613                 if (atomic_read(&dest->refcnt) == 1) {
614                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
615                                   "from trash\n",
616                                   dest->vfwmark,
617                                   NIPQUAD(dest->addr), ntohs(dest->port));
618                         list_del(&dest->n_list);
619                         ip_vs_dst_reset(dest);
620                         __ip_vs_unbind_svc(dest);
621                         kfree(dest);
622                 }
623         }
624
625         return NULL;
626 }
627
628
629 /*
630  *  Clean up all the destinations in the trash
631  *  Called by the ip_vs_control_cleanup()
632  *
633  *  When the ip_vs_control_clearup is activated by ipvs module exit,
634  *  the service tables must have been flushed and all the connections
635  *  are expired, and the refcnt of each destination in the trash must
636  *  be 1, so we simply release them here.
637  */
638 static void ip_vs_trash_cleanup(void)
639 {
640         struct ip_vs_dest *dest, *nxt;
641
642         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643                 list_del(&dest->n_list);
644                 ip_vs_dst_reset(dest);
645                 __ip_vs_unbind_svc(dest);
646                 kfree(dest);
647         }
648 }
649
650
651 static void
652 ip_vs_zero_stats(struct ip_vs_stats *stats)
653 {
654         spin_lock_bh(&stats->lock);
655         memset(stats, 0, (char *)&stats->lock - (char *)stats);
656         spin_unlock_bh(&stats->lock);
657         ip_vs_zero_estimator(stats);
658 }
659
660 /*
661  *      Update a destination in the given service
662  */
663 static void
664 __ip_vs_update_dest(struct ip_vs_service *svc,
665                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
666 {
667         int conn_flags;
668
669         /* set the weight and the flags */
670         atomic_set(&dest->weight, udest->weight);
671         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672
673         /* check if local node and update the flags */
674         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676                         | IP_VS_CONN_F_LOCALNODE;
677         }
678
679         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
682         } else {
683                 /*
684                  *    Put the real service in ip_vs_rtable if not present.
685                  *    For now only for NAT!
686                  */
687                 write_lock_bh(&__ip_vs_rs_lock);
688                 ip_vs_rs_hash(dest);
689                 write_unlock_bh(&__ip_vs_rs_lock);
690         }
691         atomic_set(&dest->conn_flags, conn_flags);
692
693         /* bind the service */
694         if (!dest->svc) {
695                 __ip_vs_bind_svc(dest, svc);
696         } else {
697                 if (dest->svc != svc) {
698                         __ip_vs_unbind_svc(dest);
699                         ip_vs_zero_stats(&dest->stats);
700                         __ip_vs_bind_svc(dest, svc);
701                 }
702         }
703
704         /* set the dest status flags */
705         dest->flags |= IP_VS_DEST_F_AVAILABLE;
706
707         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709         dest->u_threshold = udest->u_threshold;
710         dest->l_threshold = udest->l_threshold;
711 }
712
713
714 /*
715  *      Create a destination for the given service
716  */
717 static int
718 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719                struct ip_vs_dest **dest_p)
720 {
721         struct ip_vs_dest *dest;
722         unsigned atype;
723
724         EnterFunction(2);
725
726         atype = inet_addr_type(udest->addr);
727         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
728                 return -EINVAL;
729
730         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731         if (dest == NULL) {
732                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
733                 return -ENOMEM;
734         }
735         memset(dest, 0, sizeof(struct ip_vs_dest));
736
737         dest->protocol = svc->protocol;
738         dest->vaddr = svc->addr;
739         dest->vport = svc->port;
740         dest->vfwmark = svc->fwmark;
741         dest->addr = udest->addr;
742         dest->port = udest->port;
743
744         atomic_set(&dest->activeconns, 0);
745         atomic_set(&dest->inactconns, 0);
746         atomic_set(&dest->persistconns, 0);
747         atomic_set(&dest->refcnt, 0);
748
749         INIT_LIST_HEAD(&dest->d_list);
750         spin_lock_init(&dest->dst_lock);
751         spin_lock_init(&dest->stats.lock);
752         __ip_vs_update_dest(svc, dest, udest);
753         ip_vs_new_estimator(&dest->stats);
754
755         *dest_p = dest;
756
757         LeaveFunction(2);
758         return 0;
759 }
760
761
762 /*
763  *      Add a destination into an existing service
764  */
765 static int
766 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767 {
768         struct ip_vs_dest *dest;
769         __u32 daddr = udest->addr;
770         __u16 dport = udest->port;
771         int ret;
772
773         EnterFunction(2);
774
775         if (udest->weight < 0) {
776                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
777                 return -ERANGE;
778         }
779
780         if (udest->l_threshold > udest->u_threshold) {
781                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782                           "upper threshold\n");
783                 return -ERANGE;
784         }
785
786         /*
787          * Check if the dest already exists in the list
788          */
789         dest = ip_vs_lookup_dest(svc, daddr, dport);
790         if (dest != NULL) {
791                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
792                 return -EEXIST;
793         }
794
795         /*
796          * Check if the dest already exists in the trash and
797          * is from the same service
798          */
799         dest = ip_vs_trash_get_dest(svc, daddr, dport);
800         if (dest != NULL) {
801                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802                           "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803                           NIPQUAD(daddr), ntohs(dport),
804                           atomic_read(&dest->refcnt),
805                           dest->vfwmark,
806                           NIPQUAD(dest->vaddr),
807                           ntohs(dest->vport));
808                 __ip_vs_update_dest(svc, dest, udest);
809
810                 /*
811                  * Get the destination from the trash
812                  */
813                 list_del(&dest->n_list);
814
815                 ip_vs_new_estimator(&dest->stats);
816
817                 write_lock_bh(&__ip_vs_svc_lock);
818
819                 /*
820                  * Wait until all other svc users go away.
821                  */
822                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823
824                 list_add(&dest->n_list, &svc->destinations);
825                 svc->num_dests++;
826
827                 /* call the update_service function of its scheduler */
828                 svc->scheduler->update_service(svc);
829
830                 write_unlock_bh(&__ip_vs_svc_lock);
831                 return 0;
832         }
833
834         /*
835          * Allocate and initialize the dest structure
836          */
837         ret = ip_vs_new_dest(svc, udest, &dest);
838         if (ret) {
839                 return ret;
840         }
841
842         /*
843          * Add the dest entry into the list
844          */
845         atomic_inc(&dest->refcnt);
846
847         write_lock_bh(&__ip_vs_svc_lock);
848
849         /*
850          * Wait until all other svc users go away.
851          */
852         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853
854         list_add(&dest->n_list, &svc->destinations);
855         svc->num_dests++;
856
857         /* call the update_service function of its scheduler */
858         svc->scheduler->update_service(svc);
859
860         write_unlock_bh(&__ip_vs_svc_lock);
861
862         LeaveFunction(2);
863
864         return 0;
865 }
866
867
868 /*
869  *      Edit a destination in the given service
870  */
871 static int
872 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873 {
874         struct ip_vs_dest *dest;
875         __u32 daddr = udest->addr;
876         __u16 dport = udest->port;
877
878         EnterFunction(2);
879
880         if (udest->weight < 0) {
881                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
882                 return -ERANGE;
883         }
884
885         if (udest->l_threshold > udest->u_threshold) {
886                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887                           "upper threshold\n");
888                 return -ERANGE;
889         }
890
891         /*
892          *  Lookup the destination list
893          */
894         dest = ip_vs_lookup_dest(svc, daddr, dport);
895         if (dest == NULL) {
896                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
897                 return -ENOENT;
898         }
899
900         __ip_vs_update_dest(svc, dest, udest);
901
902         write_lock_bh(&__ip_vs_svc_lock);
903
904         /* Wait until all other svc users go away */
905         while (atomic_read(&svc->usecnt) > 1) {};
906
907         /* call the update_service, because server weight may be changed */
908         svc->scheduler->update_service(svc);
909
910         write_unlock_bh(&__ip_vs_svc_lock);
911
912         LeaveFunction(2);
913
914         return 0;
915 }
916
917
918 /*
919  *      Delete a destination (must be already unlinked from the service)
920  */
921 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922 {
923         ip_vs_kill_estimator(&dest->stats);
924
925         /*
926          *  Remove it from the d-linked list with the real services.
927          */
928         write_lock_bh(&__ip_vs_rs_lock);
929         ip_vs_rs_unhash(dest);
930         write_unlock_bh(&__ip_vs_rs_lock);
931
932         /*
933          *  Decrease the refcnt of the dest, and free the dest
934          *  if nobody refers to it (refcnt=0). Otherwise, throw
935          *  the destination into the trash.
936          */
937         if (atomic_dec_and_test(&dest->refcnt)) {
938                 ip_vs_dst_reset(dest);
939                 /* simply decrease svc->refcnt here, let the caller check
940                    and release the service if nobody refers to it.
941                    Only user context can release destination and service,
942                    and only one user context can update virtual service at a
943                    time, so the operation here is OK */
944                 atomic_dec(&dest->svc->refcnt);
945                 kfree(dest);
946         } else {
947                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948                           NIPQUAD(dest->addr), ntohs(dest->port),
949                           atomic_read(&dest->refcnt));
950                 list_add(&dest->n_list, &ip_vs_dest_trash);
951                 atomic_inc(&dest->refcnt);
952         }
953 }
954
955
956 /*
957  *      Unlink a destination from the given service
958  */
959 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960                                 struct ip_vs_dest *dest,
961                                 int svcupd)
962 {
963         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
964
965         /*
966          *  Remove it from the d-linked destination list.
967          */
968         list_del(&dest->n_list);
969         svc->num_dests--;
970         if (svcupd) {
971                 /*
972                  *  Call the update_service function of its scheduler
973                  */
974                 svc->scheduler->update_service(svc);
975         }
976 }
977
978
979 /*
980  *      Delete a destination server in the given service
981  */
982 static int
983 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984 {
985         struct ip_vs_dest *dest;
986         __u32 daddr = udest->addr;
987         __u16 dport = udest->port;
988
989         EnterFunction(2);
990
991         dest = ip_vs_lookup_dest(svc, daddr, dport);
992         if (dest == NULL) {
993                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
994                 return -ENOENT;
995         }
996
997         write_lock_bh(&__ip_vs_svc_lock);
998
999         /*
1000          *      Wait until all other svc users go away.
1001          */
1002         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1003
1004         /*
1005          *      Unlink dest from the service
1006          */
1007         __ip_vs_unlink_dest(svc, dest, 1);
1008
1009         write_unlock_bh(&__ip_vs_svc_lock);
1010
1011         /*
1012          *      Delete the destination
1013          */
1014         __ip_vs_del_dest(dest);
1015
1016         LeaveFunction(2);
1017
1018         return 0;
1019 }
1020
1021
1022 /*
1023  *      Add a service into the service hash table
1024  */
1025 static int
1026 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1027 {
1028         int ret = 0;
1029         struct ip_vs_scheduler *sched = NULL;
1030         struct ip_vs_service *svc = NULL;
1031
1032         /* increase the module use count */
1033         ip_vs_use_count_inc();
1034
1035         /* Lookup the scheduler by 'u->sched_name' */
1036         sched = ip_vs_scheduler_get(u->sched_name);
1037         if (sched == NULL) {
1038                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1039                            u->sched_name);
1040                 ret = -ENOENT;
1041                 goto out_mod_dec;
1042         }
1043
1044         svc = (struct ip_vs_service *)
1045                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046         if (svc == NULL) {
1047                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1048                 ret = -ENOMEM;
1049                 goto out_err;
1050         }
1051         memset(svc, 0, sizeof(struct ip_vs_service));
1052
1053         /* I'm the first user of the service */
1054         atomic_set(&svc->usecnt, 1);
1055         atomic_set(&svc->refcnt, 0);
1056
1057         svc->protocol = u->protocol;
1058         svc->addr = u->addr;
1059         svc->port = u->port;
1060         svc->fwmark = u->fwmark;
1061         svc->flags = u->flags;
1062         svc->timeout = u->timeout * HZ;
1063         svc->netmask = u->netmask;
1064
1065         INIT_LIST_HEAD(&svc->destinations);
1066         rwlock_init(&svc->sched_lock);
1067         spin_lock_init(&svc->stats.lock);
1068
1069         /* Bind the scheduler */
1070         ret = ip_vs_bind_scheduler(svc, sched);
1071         if (ret)
1072                 goto out_err;
1073         sched = NULL;
1074
1075         /* Update the virtual service counters */
1076         if (svc->port == FTPPORT)
1077                 atomic_inc(&ip_vs_ftpsvc_counter);
1078         else if (svc->port == 0)
1079                 atomic_inc(&ip_vs_nullsvc_counter);
1080
1081         ip_vs_new_estimator(&svc->stats);
1082         ip_vs_num_services++;
1083
1084         /* Hash the service into the service table */
1085         write_lock_bh(&__ip_vs_svc_lock);
1086         ip_vs_svc_hash(svc);
1087         write_unlock_bh(&__ip_vs_svc_lock);
1088
1089         *svc_p = svc;
1090         return 0;
1091
1092   out_err:
1093         if (svc != NULL) {
1094                 if (svc->scheduler)
1095                         ip_vs_unbind_scheduler(svc);
1096                 if (svc->inc) {
1097                         local_bh_disable();
1098                         ip_vs_app_inc_put(svc->inc);
1099                         local_bh_enable();
1100                 }
1101                 kfree(svc);
1102         }
1103         ip_vs_scheduler_put(sched);
1104
1105   out_mod_dec:
1106         /* decrease the module use count */
1107         ip_vs_use_count_dec();
1108
1109         return ret;
1110 }
1111
1112
1113 /*
1114  *      Edit a service and bind it with a new scheduler
1115  */
1116 static int
1117 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118 {
1119         struct ip_vs_scheduler *sched, *old_sched;
1120         int ret = 0;
1121
1122         /*
1123          * Lookup the scheduler, by 'u->sched_name'
1124          */
1125         sched = ip_vs_scheduler_get(u->sched_name);
1126         if (sched == NULL) {
1127                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1128                            u->sched_name);
1129                 return -ENOENT;
1130         }
1131         old_sched = sched;
1132
1133         write_lock_bh(&__ip_vs_svc_lock);
1134
1135         /*
1136          * Wait until all other svc users go away.
1137          */
1138         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1139
1140         /*
1141          * Set the flags and timeout value
1142          */
1143         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144         svc->timeout = u->timeout * HZ;
1145         svc->netmask = u->netmask;
1146
1147         old_sched = svc->scheduler;
1148         if (sched != old_sched) {
1149                 /*
1150                  * Unbind the old scheduler
1151                  */
1152                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1153                         old_sched = sched;
1154                         goto out;
1155                 }
1156
1157                 /*
1158                  * Bind the new scheduler
1159                  */
1160                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161                         /*
1162                          * If ip_vs_bind_scheduler fails, restore the old
1163                          * scheduler.
1164                          * The main reason of failure is out of memory.
1165                          *
1166                          * The question is if the old scheduler can be
1167                          * restored all the time. TODO: if it cannot be
1168                          * restored some time, we must delete the service,
1169                          * otherwise the system may crash.
1170                          */
1171                         ip_vs_bind_scheduler(svc, old_sched);
1172                         old_sched = sched;
1173                         goto out;
1174                 }
1175         }
1176
1177   out:
1178         write_unlock_bh(&__ip_vs_svc_lock);
1179
1180         if (old_sched)
1181                 ip_vs_scheduler_put(old_sched);
1182
1183         return ret;
1184 }
1185
1186
1187 /*
1188  *      Delete a service from the service list
1189  *      - The service must be unlinked, unlocked and not referenced!
1190  *      - We are called under _bh lock
1191  */
1192 static void __ip_vs_del_service(struct ip_vs_service *svc)
1193 {
1194         struct ip_vs_dest *dest, *nxt;
1195         struct ip_vs_scheduler *old_sched;
1196
1197         ip_vs_num_services--;
1198         ip_vs_kill_estimator(&svc->stats);
1199
1200         /* Unbind scheduler */
1201         old_sched = svc->scheduler;
1202         ip_vs_unbind_scheduler(svc);
1203         if (old_sched)
1204                 ip_vs_scheduler_put(old_sched);
1205
1206         /* Unbind app inc */
1207         if (svc->inc) {
1208                 ip_vs_app_inc_put(svc->inc);
1209                 svc->inc = NULL;
1210         }
1211
1212         /*
1213          *    Unlink the whole destination list
1214          */
1215         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216                 __ip_vs_unlink_dest(svc, dest, 0);
1217                 __ip_vs_del_dest(dest);
1218         }
1219
1220         /*
1221          *    Update the virtual service counters
1222          */
1223         if (svc->port == FTPPORT)
1224                 atomic_dec(&ip_vs_ftpsvc_counter);
1225         else if (svc->port == 0)
1226                 atomic_dec(&ip_vs_nullsvc_counter);
1227
1228         /*
1229          *    Free the service if nobody refers to it
1230          */
1231         if (atomic_read(&svc->refcnt) == 0)
1232                 kfree(svc);
1233
1234         /* decrease the module use count */
1235         ip_vs_use_count_dec();
1236 }
1237
1238 /*
1239  *      Delete a service from the service list
1240  */
1241 static int ip_vs_del_service(struct ip_vs_service *svc)
1242 {
1243         if (svc == NULL)
1244                 return -EEXIST;
1245
1246         /*
1247          * Unhash it from the service table
1248          */
1249         write_lock_bh(&__ip_vs_svc_lock);
1250
1251         ip_vs_svc_unhash(svc);
1252
1253         /*
1254          * Wait until all the svc users go away.
1255          */
1256         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257
1258         __ip_vs_del_service(svc);
1259
1260         write_unlock_bh(&__ip_vs_svc_lock);
1261
1262         return 0;
1263 }
1264
1265
1266 /*
1267  *      Flush all the virtual services
1268  */
1269 static int ip_vs_flush(void)
1270 {
1271         int idx;
1272         struct ip_vs_service *svc, *nxt;
1273
1274         /*
1275          * Flush the service table hashed by <protocol,addr,port>
1276          */
1277         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279                         write_lock_bh(&__ip_vs_svc_lock);
1280                         ip_vs_svc_unhash(svc);
1281                         /*
1282                          * Wait until all the svc users go away.
1283                          */
1284                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285                         __ip_vs_del_service(svc);
1286                         write_unlock_bh(&__ip_vs_svc_lock);
1287                 }
1288         }
1289
1290         /*
1291          * Flush the service table hashed by fwmark
1292          */
1293         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294                 list_for_each_entry_safe(svc, nxt,
1295                                          &ip_vs_svc_fwm_table[idx], f_list) {
1296                         write_lock_bh(&__ip_vs_svc_lock);
1297                         ip_vs_svc_unhash(svc);
1298                         /*
1299                          * Wait until all the svc users go away.
1300                          */
1301                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302                         __ip_vs_del_service(svc);
1303                         write_unlock_bh(&__ip_vs_svc_lock);
1304                 }
1305         }
1306
1307         return 0;
1308 }
1309
1310
1311 /*
1312  *      Zero counters in a service or all services
1313  */
1314 static int ip_vs_zero_service(struct ip_vs_service *svc)
1315 {
1316         struct ip_vs_dest *dest;
1317
1318         write_lock_bh(&__ip_vs_svc_lock);
1319         list_for_each_entry(dest, &svc->destinations, n_list) {
1320                 ip_vs_zero_stats(&dest->stats);
1321         }
1322         ip_vs_zero_stats(&svc->stats);
1323         write_unlock_bh(&__ip_vs_svc_lock);
1324         return 0;
1325 }
1326
1327 static int ip_vs_zero_all(void)
1328 {
1329         int idx;
1330         struct ip_vs_service *svc;
1331
1332         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334                         ip_vs_zero_service(svc);
1335                 }
1336         }
1337
1338         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340                         ip_vs_zero_service(svc);
1341                 }
1342         }
1343
1344         ip_vs_zero_stats(&ip_vs_stats);
1345         return 0;
1346 }
1347
1348
1349 static int
1350 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351                      void __user *buffer, size_t *lenp, loff_t *ppos)
1352 {
1353         int *valp = table->data;
1354         int val = *valp;
1355         int rc;
1356
1357         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358         if (write && (*valp != val)) {
1359                 if ((*valp < 0) || (*valp > 3)) {
1360                         /* Restore the correct value */
1361                         *valp = val;
1362                 } else {
1363                         local_bh_disable();
1364                         update_defense_level();
1365                         local_bh_enable();
1366                 }
1367         }
1368         return rc;
1369 }
1370
1371
1372 static int
1373 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374                        void __user *buffer, size_t *lenp, loff_t *ppos)
1375 {
1376         int *valp = table->data;
1377         int val[2];
1378         int rc;
1379
1380         /* backup the value first */
1381         memcpy(val, valp, sizeof(val));
1382
1383         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385                 /* Restore the correct value */
1386                 memcpy(valp, val, sizeof(val));
1387         }
1388         return rc;
1389 }
1390
1391
1392 /*
1393  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1394  */
1395
1396 static struct ctl_table vs_vars[] = {
1397         {
1398                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1399                 .procname       = "amemthresh",
1400                 .data           = &sysctl_ip_vs_amemthresh,
1401                 .maxlen         = sizeof(int),
1402                 .mode           = 0644,
1403                 .proc_handler   = &proc_dointvec,
1404         },
1405 #ifdef CONFIG_IP_VS_DEBUG
1406         {
1407                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1408                 .procname       = "debug_level",
1409                 .data           = &sysctl_ip_vs_debug_level,
1410                 .maxlen         = sizeof(int),
1411                 .mode           = 0644,
1412                 .proc_handler   = &proc_dointvec,
1413         },
1414 #endif
1415         {
1416                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1417                 .procname       = "am_droprate",
1418                 .data           = &sysctl_ip_vs_am_droprate,
1419                 .maxlen         = sizeof(int),
1420                 .mode           = 0644,
1421                 .proc_handler   = &proc_dointvec,
1422         },
1423         {
1424                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1425                 .procname       = "drop_entry",
1426                 .data           = &sysctl_ip_vs_drop_entry,
1427                 .maxlen         = sizeof(int),
1428                 .mode           = 0644,
1429                 .proc_handler   = &proc_do_defense_mode,
1430         },
1431         {
1432                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1433                 .procname       = "drop_packet",
1434                 .data           = &sysctl_ip_vs_drop_packet,
1435                 .maxlen         = sizeof(int),
1436                 .mode           = 0644,
1437                 .proc_handler   = &proc_do_defense_mode,
1438         },
1439         {
1440                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1441                 .procname       = "secure_tcp",
1442                 .data           = &sysctl_ip_vs_secure_tcp,
1443                 .maxlen         = sizeof(int),
1444                 .mode           = 0644,
1445                 .proc_handler   = &proc_do_defense_mode,
1446         },
1447 #if 0
1448         {
1449                 .ctl_name       = NET_IPV4_VS_TO_ES,
1450                 .procname       = "timeout_established",
1451                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452                 .maxlen         = sizeof(int),
1453                 .mode           = 0644,
1454                 .proc_handler   = &proc_dointvec_jiffies,
1455         },
1456         {
1457                 .ctl_name       = NET_IPV4_VS_TO_SS,
1458                 .procname       = "timeout_synsent",
1459                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460                 .maxlen         = sizeof(int),
1461                 .mode           = 0644,
1462                 .proc_handler   = &proc_dointvec_jiffies,
1463         },
1464         {
1465                 .ctl_name       = NET_IPV4_VS_TO_SR,
1466                 .procname       = "timeout_synrecv",
1467                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468                 .maxlen         = sizeof(int),
1469                 .mode           = 0644,
1470                 .proc_handler   = &proc_dointvec_jiffies,
1471         },
1472         {
1473                 .ctl_name       = NET_IPV4_VS_TO_FW,
1474                 .procname       = "timeout_finwait",
1475                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476                 .maxlen         = sizeof(int),
1477                 .mode           = 0644,
1478                 .proc_handler   = &proc_dointvec_jiffies,
1479         },
1480         {
1481                 .ctl_name       = NET_IPV4_VS_TO_TW,
1482                 .procname       = "timeout_timewait",
1483                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484                 .maxlen         = sizeof(int),
1485                 .mode           = 0644,
1486                 .proc_handler   = &proc_dointvec_jiffies,
1487         },
1488         {
1489                 .ctl_name       = NET_IPV4_VS_TO_CL,
1490                 .procname       = "timeout_close",
1491                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492                 .maxlen         = sizeof(int),
1493                 .mode           = 0644,
1494                 .proc_handler   = &proc_dointvec_jiffies,
1495         },
1496         {
1497                 .ctl_name       = NET_IPV4_VS_TO_CW,
1498                 .procname       = "timeout_closewait",
1499                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500                 .maxlen         = sizeof(int),
1501                 .mode           = 0644,
1502                 .proc_handler   = &proc_dointvec_jiffies,
1503         },
1504         {
1505                 .ctl_name       = NET_IPV4_VS_TO_LA,
1506                 .procname       = "timeout_lastack",
1507                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508                 .maxlen         = sizeof(int),
1509                 .mode           = 0644,
1510                 .proc_handler   = &proc_dointvec_jiffies,
1511         },
1512         {
1513                 .ctl_name       = NET_IPV4_VS_TO_LI,
1514                 .procname       = "timeout_listen",
1515                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516                 .maxlen         = sizeof(int),
1517                 .mode           = 0644,
1518                 .proc_handler   = &proc_dointvec_jiffies,
1519         },
1520         {
1521                 .ctl_name       = NET_IPV4_VS_TO_SA,
1522                 .procname       = "timeout_synack",
1523                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524                 .maxlen         = sizeof(int),
1525                 .mode           = 0644,
1526                 .proc_handler   = &proc_dointvec_jiffies,
1527         },
1528         {
1529                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1530                 .procname       = "timeout_udp",
1531                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532                 .maxlen         = sizeof(int),
1533                 .mode           = 0644,
1534                 .proc_handler   = &proc_dointvec_jiffies,
1535         },
1536         {
1537                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1538                 .procname       = "timeout_icmp",
1539                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540                 .maxlen         = sizeof(int),
1541                 .mode           = 0644,
1542                 .proc_handler   = &proc_dointvec_jiffies,
1543         },
1544 #endif
1545         {
1546                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1547                 .procname       = "cache_bypass",
1548                 .data           = &sysctl_ip_vs_cache_bypass,
1549                 .maxlen         = sizeof(int),
1550                 .mode           = 0644,
1551                 .proc_handler   = &proc_dointvec,
1552         },
1553         {
1554                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555                 .procname       = "expire_nodest_conn",
1556                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1557                 .maxlen         = sizeof(int),
1558                 .mode           = 0644,
1559                 .proc_handler   = &proc_dointvec,
1560         },
1561         {
1562                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563                 .procname       = "expire_quiescent_template",
1564                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1565                 .maxlen         = sizeof(int),
1566                 .mode           = 0644,
1567                 .proc_handler   = &proc_dointvec,
1568         },
1569         {
1570                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1571                 .procname       = "sync_threshold",
1572                 .data           = &sysctl_ip_vs_sync_threshold,
1573                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1574                 .mode           = 0644,
1575                 .proc_handler   = &proc_do_sync_threshold,
1576         },
1577         {
1578                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1579                 .procname       = "nat_icmp_send",
1580                 .data           = &sysctl_ip_vs_nat_icmp_send,
1581                 .maxlen         = sizeof(int),
1582                 .mode           = 0644,
1583                 .proc_handler   = &proc_dointvec,
1584         },
1585         { .ctl_name = 0 }
1586 };
1587
1588 static ctl_table vs_table[] = {
1589         {
1590                 .ctl_name       = NET_IPV4_VS,
1591                 .procname       = "vs",
1592                 .mode           = 0555,
1593                 .child          = vs_vars
1594         },
1595         { .ctl_name = 0 }
1596 };
1597
1598 static ctl_table ipv4_table[] = {
1599         {
1600                 .ctl_name       = NET_IPV4,
1601                 .procname       = "ipv4",
1602                 .mode           = 0555,
1603                 .child          = vs_table,
1604         },
1605         { .ctl_name = 0 }
1606 };
1607
1608 static ctl_table vs_root_table[] = {
1609         {
1610                 .ctl_name       = CTL_NET,
1611                 .procname       = "net",
1612                 .mode           = 0555,
1613                 .child          = ipv4_table,
1614         },
1615         { .ctl_name = 0 }
1616 };
1617
/* Sysctl table header pointer; presumably holds the handle returned when vs_root_table is registered (registration is not in this part of the file -- confirm). */
1618 static struct ctl_table_header * sysctl_header;
1619
1620 #ifdef CONFIG_PROC_FS
1621
/* Cursor kept in seq->private while the service tables are walked by the ip_vs_info_seq_* callbacks. */
1622 struct ip_vs_iter {
1623         struct list_head *table;        /* table being walked: ip_vs_svc_table or ip_vs_svc_fwm_table */
1624         int bucket;                     /* index of the current hash bucket in that table */
1625 };
1626
1627 /*
1628  *      Write the contents of the VS rule table to a PROCfs file.
1629  *      (It is kept just for backward compatibility)
1630  */
1631 static inline const char *ip_vs_fwd_name(unsigned flags)
1632 {
1633         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634         case IP_VS_CONN_F_LOCALNODE:
1635                 return "Local";
1636         case IP_VS_CONN_F_TUNNEL:
1637                 return "Tunnel";
1638         case IP_VS_CONN_F_DROUTE:
1639                 return "Route";
1640         default:
1641                 return "Masq";
1642         }
1643 }
1644
1645
1646 /* Get the Nth entry in the two lists */
1647 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1648 {
1649         struct ip_vs_iter *iter = seq->private;
1650         int idx;
1651         struct ip_vs_service *svc;
1652
1653         /* look in hash by protocol */
1654         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1656                         if (pos-- == 0){
1657                                 iter->table = ip_vs_svc_table;
1658                                 iter->bucket = idx;
1659                                 return svc;
1660                         }
1661                 }
1662         }
1663
1664         /* keep looking in fwmark */
1665         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1667                         if (pos-- == 0) {
1668                                 iter->table = ip_vs_svc_fwm_table;
1669                                 iter->bucket = idx;
1670                                 return svc;
1671                         }
1672                 }
1673         }
1674
1675         return NULL;
1676 }
1677
1678 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1679 {
1680
1681         read_lock_bh(&__ip_vs_svc_lock);
1682         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1683 }
1684
1685
1686 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1687 {
1688         struct list_head *e;
1689         struct ip_vs_iter *iter;
1690         struct ip_vs_service *svc;
1691
1692         ++*pos;
1693         if (v == SEQ_START_TOKEN)
1694                 return ip_vs_info_array(seq,0);
1695
1696         svc = v;
1697         iter = seq->private;
1698
1699         if (iter->table == ip_vs_svc_table) {
1700                 /* next service in table hashed by protocol */
1701                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702                         return list_entry(e, struct ip_vs_service, s_list);
1703
1704
1705                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1707                                             s_list) {
1708                                 return svc;
1709                         }
1710                 }
1711
1712                 iter->table = ip_vs_svc_fwm_table;
1713                 iter->bucket = -1;
1714                 goto scan_fwmark;
1715         }
1716
1717         /* next service in hashed by fwmark */
1718         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719                 return list_entry(e, struct ip_vs_service, f_list);
1720
1721  scan_fwmark:
1722         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1724                                     f_list)
1725                         return svc;
1726         }
1727
1728         return NULL;
1729 }
1730
/* seq_file stop callback: drops the read lock on __ip_vs_svc_lock that ip_vs_info_seq_start took. Neither argument is used. */
1731 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1732 {
1733         read_unlock_bh(&__ip_vs_svc_lock);
1734 }
1735
1736
1737 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1738 {
1739         if (v == SEQ_START_TOKEN) {
1740                 seq_printf(seq,
1741                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1743                 seq_puts(seq,
1744                          "Prot LocalAddress:Port Scheduler Flags\n");
1745                 seq_puts(seq,
1746                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1747         } else {
1748                 const struct ip_vs_service *svc = v;
1749                 const struct ip_vs_iter *iter = seq->private;
1750                 const struct ip_vs_dest *dest;
1751
1752                 if (iter->table == ip_vs_svc_table)
1753                         seq_printf(seq, "%s  %08X:%04X %s ",
1754                                    ip_vs_proto_name(svc->protocol),
1755                                    ntohl(svc->addr),
1756                                    ntohs(svc->port),
1757                                    svc->scheduler->name);
1758                 else
1759                         seq_printf(seq, "FWM  %08X %s ",
1760                                    svc->fwmark, svc->scheduler->name);
1761
1762                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763                         seq_printf(seq, "persistent %d %08X\n",
1764                                 svc->timeout,
1765                                 ntohl(svc->netmask));
1766                 else
1767                         seq_putc(seq, '\n');
1768
1769                 list_for_each_entry(dest, &svc->destinations, n_list) {
1770                         seq_printf(seq,
1771                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1772                                    ntohl(dest->addr), ntohs(dest->port),
1773                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774                                    atomic_read(&dest->weight),
1775                                    atomic_read(&dest->activeconns),
1776                                    atomic_read(&dest->inactconns));
1777                 }
1778         }
1779         return 0;
1780 }
1781
/* seq_file callbacks for the service listing; passed to seq_open() by ip_vs_info_open(). */
1782 static struct seq_operations ip_vs_info_seq_ops = {
1783         .start = ip_vs_info_seq_start,  /* takes the read lock on __ip_vs_svc_lock */
1784         .next  = ip_vs_info_seq_next,
1785         .stop  = ip_vs_info_seq_stop,   /* releases that read lock */
1786         .show  = ip_vs_info_seq_show,
1787 };
1788
1789 static int ip_vs_info_open(struct inode *inode, struct file *file)
1790 {
1791         struct seq_file *seq;
1792         int rc = -ENOMEM;
1793         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1794
1795         if (!s)
1796                 goto out;
1797
1798         rc = seq_open(file, &ip_vs_info_seq_ops);
1799         if (rc)
1800                 goto out_kfree;
1801
1802         seq          = file->private_data;
1803         seq->private = s;
1804         memset(s, 0, sizeof(*s));
1805 out:
1806         return rc;
1807 out_kfree:
1808         kfree(s);
1809         goto out;
1810 }
1811
/* File operations for the seq_file-based service listing opened by ip_vs_info_open(). */
1812 static struct file_operations ip_vs_info_fops = {
1813         .owner   = THIS_MODULE,
1814         .open    = ip_vs_info_open,
1815         .read    = seq_read,
1816         .llseek  = seq_lseek,
1817         .release = seq_release_private, /* presumably also frees the ip_vs_iter stored in seq->private -- confirm */
1818 };
1819
1820 #endif
1821
/* Global IPVS statistics: zeroed by ip_vs_zero_all() and printed by ip_vs_stats_show() under ip_vs_stats.lock. */
1822 struct ip_vs_stats ip_vs_stats;
1823
1824 #ifdef CONFIG_PROC_FS
1825 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1826 {
1827
1828 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1829         seq_puts(seq,
1830                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1831         seq_printf(seq,
1832                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1833
1834         spin_lock_bh(&ip_vs_stats.lock);
1835         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837                    (unsigned long long) ip_vs_stats.inbytes,
1838                    (unsigned long long) ip_vs_stats.outbytes);
1839
1840 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841         seq_puts(seq,
1842                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1843         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1844                         ip_vs_stats.cps,
1845                         ip_vs_stats.inpps,
1846                         ip_vs_stats.outpps,
1847                         ip_vs_stats.inbps,
1848                         ip_vs_stats.outbps);
1849         spin_unlock_bh(&ip_vs_stats.lock);
1850
1851         return 0;
1852 }
1853
1854 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1855 {
1856         return single_open(file, ip_vs_stats_show, NULL);
1857 }
1858
/*
 * File operations for the "ip_vs_stats" proc entry (registered in
 * ip_vs_control_init()).  Opened with single_open(), hence released
 * with single_release.
 */
1859 static struct file_operations ip_vs_stats_fops = {
1860         .owner = THIS_MODULE,
1861         .open = ip_vs_stats_seq_open,
1862         .read = seq_read,
1863         .llseek = seq_lseek,
1864         .release = single_release,
1865 };
1866
1867 #endif
1868
1869 /*
1870  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1871  */
1872 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1873 {
1874         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1875                   u->tcp_timeout,
1876                   u->tcp_fin_timeout,
1877                   u->udp_timeout);
1878
1879 #ifdef CONFIG_IP_VS_PROTO_TCP
1880         if (u->tcp_timeout) {
1881                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882                         = u->tcp_timeout * HZ;
1883         }
1884
1885         if (u->tcp_fin_timeout) {
1886                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887                         = u->tcp_fin_timeout * HZ;
1888         }
1889 #endif
1890
1891 #ifdef CONFIG_IP_VS_PROTO_UDP
1892         if (u->udp_timeout) {
1893                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894                         = u->udp_timeout * HZ;
1895         }
1896 #endif
1897         return 0;
1898 }
1899
1900
1901 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1902 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1903 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1904                                  sizeof(struct ip_vs_dest_user))
1905 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1906 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1907 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1908
1909 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1911         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1912         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1913         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1914         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1915         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1916         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1917         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1921 };
1922
1923 static int
1924 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1925 {
1926         int ret;
1927         unsigned char arg[MAX_ARG_LEN];
1928         struct ip_vs_service_user *usvc;
1929         struct ip_vs_service *svc;
1930         struct ip_vs_dest_user *udest;
1931
1932         if (!capable(CAP_NET_ADMIN))
1933                 return -EPERM;
1934
1935         if (len != set_arglen[SET_CMDID(cmd)]) {
1936                 IP_VS_ERR("set_ctl: len %u != %u\n",
1937                           len, set_arglen[SET_CMDID(cmd)]);
1938                 return -EINVAL;
1939         }
1940
1941         if (copy_from_user(arg, user, len) != 0)
1942                 return -EFAULT;
1943
1944         /* increase the module use count */
1945         ip_vs_use_count_inc();
1946
1947         if (down_interruptible(&__ip_vs_mutex)) {
1948                 ret = -ERESTARTSYS;
1949                 goto out_dec;
1950         }
1951
1952         if (cmd == IP_VS_SO_SET_FLUSH) {
1953                 /* Flush the virtual service */
1954                 ret = ip_vs_flush();
1955                 goto out_unlock;
1956         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957                 /* Set timeout values for (tcp tcpfin udp) */
1958                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1959                 goto out_unlock;
1960         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1963                 goto out_unlock;
1964         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966                 ret = stop_sync_thread(dm->state);
1967                 goto out_unlock;
1968         }
1969
1970         usvc = (struct ip_vs_service_user *)arg;
1971         udest = (struct ip_vs_dest_user *)(usvc + 1);
1972
1973         if (cmd == IP_VS_SO_SET_ZERO) {
1974                 /* if no service address is set, zero counters in all */
1975                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976                         ret = ip_vs_zero_all();
1977                         goto out_unlock;
1978                 }
1979         }
1980
1981         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1983                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984                           usvc->protocol, NIPQUAD(usvc->addr),
1985                           ntohs(usvc->port), usvc->sched_name);
1986                 ret = -EFAULT;
1987                 goto out_unlock;
1988         }
1989
1990         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991         if (usvc->fwmark == 0)
1992                 svc = __ip_vs_service_get(usvc->protocol,
1993                                           usvc->addr, usvc->port);
1994         else
1995                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1996
1997         if (cmd != IP_VS_SO_SET_ADD
1998             && (svc == NULL || svc->protocol != usvc->protocol)) {
1999                 ret = -ESRCH;
2000                 goto out_unlock;
2001         }
2002
2003         switch (cmd) {
2004         case IP_VS_SO_SET_ADD:
2005                 if (svc != NULL)
2006                         ret = -EEXIST;
2007                 else
2008                         ret = ip_vs_add_service(usvc, &svc);
2009                 break;
2010         case IP_VS_SO_SET_EDIT:
2011                 ret = ip_vs_edit_service(svc, usvc);
2012                 break;
2013         case IP_VS_SO_SET_DEL:
2014                 ret = ip_vs_del_service(svc);
2015                 if (!ret)
2016                         goto out_unlock;
2017                 break;
2018         case IP_VS_SO_SET_ZERO:
2019                 ret = ip_vs_zero_service(svc);
2020                 break;
2021         case IP_VS_SO_SET_ADDDEST:
2022                 ret = ip_vs_add_dest(svc, udest);
2023                 break;
2024         case IP_VS_SO_SET_EDITDEST:
2025                 ret = ip_vs_edit_dest(svc, udest);
2026                 break;
2027         case IP_VS_SO_SET_DELDEST:
2028                 ret = ip_vs_del_dest(svc, udest);
2029                 break;
2030         default:
2031                 ret = -EINVAL;
2032         }
2033
2034         if (svc)
2035                 ip_vs_service_put(svc);
2036
2037   out_unlock:
2038         up(&__ip_vs_mutex);
2039   out_dec:
2040         /* decrease the module use count */
2041         ip_vs_use_count_dec();
2042
2043         return ret;
2044 }
2045
2046
2047 static void
2048 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2049 {
2050         spin_lock_bh(&src->lock);
2051         memcpy(dst, src, (char*)&src->lock - (char*)src);
2052         spin_unlock_bh(&src->lock);
2053 }
2054
2055 static void
2056 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2057 {
2058         dst->protocol = src->protocol;
2059         dst->addr = src->addr;
2060         dst->port = src->port;
2061         dst->fwmark = src->fwmark;
2062         strcpy(dst->sched_name, src->scheduler->name);
2063         dst->flags = src->flags;
2064         dst->timeout = src->timeout / HZ;
2065         dst->netmask = src->netmask;
2066         dst->num_dests = src->num_dests;
2067         ip_vs_copy_stats(&dst->stats, &src->stats);
2068 }
2069
2070 static inline int
2071 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072                             struct ip_vs_get_services __user *uptr)
2073 {
2074         int idx, count=0;
2075         struct ip_vs_service *svc;
2076         struct ip_vs_service_entry entry;
2077         int ret = 0;
2078
2079         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081                         if (count >= get->num_services)
2082                                 goto out;
2083                         ip_vs_copy_service(&entry, svc);
2084                         if (copy_to_user(&uptr->entrytable[count],
2085                                          &entry, sizeof(entry))) {
2086                                 ret = -EFAULT;
2087                                 goto out;
2088                         }
2089                         count++;
2090                 }
2091         }
2092
2093         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2094                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095                         if (count >= get->num_services)
2096                                 goto out;
2097                         ip_vs_copy_service(&entry, svc);
2098                         if (copy_to_user(&uptr->entrytable[count],
2099                                          &entry, sizeof(entry))) {
2100                                 ret = -EFAULT;
2101                                 goto out;
2102                         }
2103                         count++;
2104                 }
2105         }
2106   out:
2107         return ret;
2108 }
2109
2110 static inline int
2111 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2112                          struct ip_vs_get_dests __user *uptr)
2113 {
2114         struct ip_vs_service *svc;
2115         int ret = 0;
2116
2117         if (get->fwmark)
2118                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2119         else
2120                 svc = __ip_vs_service_get(get->protocol,
2121                                           get->addr, get->port);
2122         if (svc) {
2123                 int count = 0;
2124                 struct ip_vs_dest *dest;
2125                 struct ip_vs_dest_entry entry;
2126
2127                 list_for_each_entry(dest, &svc->destinations, n_list) {
2128                         if (count >= get->num_dests)
2129                                 break;
2130
2131                         entry.addr = dest->addr;
2132                         entry.port = dest->port;
2133                         entry.conn_flags = atomic_read(&dest->conn_flags);
2134                         entry.weight = atomic_read(&dest->weight);
2135                         entry.u_threshold = dest->u_threshold;
2136                         entry.l_threshold = dest->l_threshold;
2137                         entry.activeconns = atomic_read(&dest->activeconns);
2138                         entry.inactconns = atomic_read(&dest->inactconns);
2139                         entry.persistconns = atomic_read(&dest->persistconns);
2140                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2141                         if (copy_to_user(&uptr->entrytable[count],
2142                                          &entry, sizeof(entry))) {
2143                                 ret = -EFAULT;
2144                                 break;
2145                         }
2146                         count++;
2147                 }
2148                 ip_vs_service_put(svc);
2149         } else
2150                 ret = -ESRCH;
2151         return ret;
2152 }
2153
2154 static inline void
2155 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2156 {
2157 #ifdef CONFIG_IP_VS_PROTO_TCP
2158         u->tcp_timeout =
2159                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2160         u->tcp_fin_timeout =
2161                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2162 #endif
2163 #ifdef CONFIG_IP_VS_PROTO_UDP
2164         u->udp_timeout =
2165                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2166 #endif
2167 }
2168
2169
2170 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2171 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2172 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2173 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2174 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2175 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2176 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2177
2178 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2179         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2180         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2181         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2182         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2183         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2184         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2185         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2186 };
2187
2188 static int
2189 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2190 {
2191         unsigned char arg[128];
2192         int ret = 0;
2193
2194         if (!capable(CAP_NET_ADMIN))
2195                 return -EPERM;
2196
2197         if (*len < get_arglen[GET_CMDID(cmd)]) {
2198                 IP_VS_ERR("get_ctl: len %u < %u\n",
2199                           *len, get_arglen[GET_CMDID(cmd)]);
2200                 return -EINVAL;
2201         }
2202
2203         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2204                 return -EFAULT;
2205
2206         if (down_interruptible(&__ip_vs_mutex))
2207                 return -ERESTARTSYS;
2208
2209         switch (cmd) {
2210         case IP_VS_SO_GET_VERSION:
2211         {
2212                 char buf[64];
2213
2214                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2215                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2216                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2217                         ret = -EFAULT;
2218                         goto out;
2219                 }
2220                 *len = strlen(buf)+1;
2221         }
2222         break;
2223
2224         case IP_VS_SO_GET_INFO:
2225         {
2226                 struct ip_vs_getinfo info;
2227                 info.version = IP_VS_VERSION_CODE;
2228                 info.size = IP_VS_CONN_TAB_SIZE;
2229                 info.num_services = ip_vs_num_services;
2230                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2231                         ret = -EFAULT;
2232         }
2233         break;
2234
2235         case IP_VS_SO_GET_SERVICES:
2236         {
2237                 struct ip_vs_get_services *get;
2238                 int size;
2239
2240                 get = (struct ip_vs_get_services *)arg;
2241                 size = sizeof(*get) +
2242                         sizeof(struct ip_vs_service_entry) * get->num_services;
2243                 if (*len != size) {
2244                         IP_VS_ERR("length: %u != %u\n", *len, size);
2245                         ret = -EINVAL;
2246                         goto out;
2247                 }
2248                 ret = __ip_vs_get_service_entries(get, user);
2249         }
2250         break;
2251
2252         case IP_VS_SO_GET_SERVICE:
2253         {
2254                 struct ip_vs_service_entry *entry;
2255                 struct ip_vs_service *svc;
2256
2257                 entry = (struct ip_vs_service_entry *)arg;
2258                 if (entry->fwmark)
2259                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2260                 else
2261                         svc = __ip_vs_service_get(entry->protocol,
2262                                                   entry->addr, entry->port);
2263                 if (svc) {
2264                         ip_vs_copy_service(entry, svc);
2265                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2266                                 ret = -EFAULT;
2267                         ip_vs_service_put(svc);
2268                 } else
2269                         ret = -ESRCH;
2270         }
2271         break;
2272
2273         case IP_VS_SO_GET_DESTS:
2274         {
2275                 struct ip_vs_get_dests *get;
2276                 int size;
2277
2278                 get = (struct ip_vs_get_dests *)arg;
2279                 size = sizeof(*get) +
2280                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2281                 if (*len != size) {
2282                         IP_VS_ERR("length: %u != %u\n", *len, size);
2283                         ret = -EINVAL;
2284                         goto out;
2285                 }
2286                 ret = __ip_vs_get_dest_entries(get, user);
2287         }
2288         break;
2289
2290         case IP_VS_SO_GET_TIMEOUT:
2291         {
2292                 struct ip_vs_timeout_user t;
2293
2294                 __ip_vs_get_timeouts(&t);
2295                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2296                         ret = -EFAULT;
2297         }
2298         break;
2299
2300         case IP_VS_SO_GET_DAEMON:
2301         {
2302                 struct ip_vs_daemon_user d[2];
2303
2304                 memset(&d, 0, sizeof(d));
2305                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306                         d[0].state = IP_VS_STATE_MASTER;
2307                         strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2308                         d[0].syncid = ip_vs_master_syncid;
2309                 }
2310                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311                         d[1].state = IP_VS_STATE_BACKUP;
2312                         strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2313                         d[1].syncid = ip_vs_backup_syncid;
2314                 }
2315                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2316                         ret = -EFAULT;
2317         }
2318         break;
2319
2320         default:
2321                 ret = -EINVAL;
2322         }
2323
2324   out:
2325         up(&__ip_vs_mutex);
2326         return ret;
2327 }
2328
2329
/*
 * Netfilter socket-option registration for PF_INET sockets.  Both the
 * set and the get range start at IP_VS_BASE_CTL; the upper bounds are
 * given as the last command number plus one.
 */
2330 static struct nf_sockopt_ops ip_vs_sockopts = {
2331         .pf             = PF_INET,
2332         .set_optmin     = IP_VS_BASE_CTL,
2333         .set_optmax     = IP_VS_SO_SET_MAX+1,
2334         .set            = do_ip_vs_set_ctl,
2335         .get_optmin     = IP_VS_BASE_CTL,
2336         .get_optmax     = IP_VS_SO_GET_MAX+1,
2337         .get            = do_ip_vs_get_ctl,
2338 };
2339
2340
2341 int ip_vs_control_init(void)
2342 {
2343         int ret;
2344         int idx;
2345
2346         EnterFunction(2);
2347
2348         ret = nf_register_sockopt(&ip_vs_sockopts);
2349         if (ret) {
2350                 IP_VS_ERR("cannot register sockopt.\n");
2351                 return ret;
2352         }
2353
2354         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2355         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2356
2357         sysctl_header = register_sysctl_table(vs_root_table, 0);
2358
2359         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2360         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2361                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2362                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2363         }
2364         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2365                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2366         }
2367
2368         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2369         spin_lock_init(&ip_vs_stats.lock);
2370         ip_vs_new_estimator(&ip_vs_stats);
2371
2372         /* Hook the defense timer */
2373         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2374
2375         LeaveFunction(2);
2376         return 0;
2377 }
2378
2379
/*
 * Tear down what ip_vs_control_init() set up, in the reverse order of
 * its registration steps.
 */
2380 void ip_vs_control_cleanup(void)
2381 {
2382         EnterFunction(2);
2383         ip_vs_trash_cleanup();
         /* stop the defense work scheduled by ip_vs_control_init() */
2384         cancel_rearming_delayed_work(&defense_work);
2385         ip_vs_kill_estimator(&ip_vs_stats);
2386         unregister_sysctl_table(sysctl_header);
         /* remove the proc entries in reverse order of creation */
2387         proc_net_remove("ip_vs_stats");
2388         proc_net_remove("ip_vs");
         /* the sockopt interface was registered first, so it goes last */
2389         nf_unregister_sockopt(&ip_vs_sockopts);
2390         LeaveFunction(2);
2391 }