Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/proc_fs.h>
33 #include <linux/seq_file.h>
34
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37 #include <linux/mutex.h>
38
39 #include <net/ip.h>
40 #include <net/route.h>
41 #include <net/sock.h>
42
43 #include <asm/uaccess.h>
44
45 #include <net/ip_vs.h>
46
47 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
48 static DEFINE_MUTEX(__ip_vs_mutex);
49
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
72
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_expire_quiescent_template = 0;
82 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83 int sysctl_ip_vs_nat_icmp_send = 0;
84
85
#ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity; only present when CONFIG_IP_VS_DEBUG is set. */
static int sysctl_ip_vs_debug_level;

/* Return the current value of the file-local debug level. */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif /* CONFIG_IP_VS_DEBUG */
94
95 /*
96  *      update_defense_level is called from keventd and from sysctl,
97  *      so it needs to protect itself from softirqs
98  */
99 static void update_defense_level(void)
100 {
101         struct sysinfo i;
102         static int old_secure_tcp = 0;
103         int availmem;
104         int nomem;
105         int to_change = -1;
106
107         /* we only count free and buffered memory (in pages) */
108         si_meminfo(&i);
109         availmem = i.freeram + i.bufferram;
110         /* however in linux 2.5 the i.bufferram is total page cache size,
111            we need adjust it */
112         /* si_swapinfo(&i); */
113         /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115         nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117         local_bh_disable();
118
119         /* drop_entry */
120         spin_lock(&__ip_vs_dropentry_lock);
121         switch (sysctl_ip_vs_drop_entry) {
122         case 0:
123                 atomic_set(&ip_vs_dropentry, 0);
124                 break;
125         case 1:
126                 if (nomem) {
127                         atomic_set(&ip_vs_dropentry, 1);
128                         sysctl_ip_vs_drop_entry = 2;
129                 } else {
130                         atomic_set(&ip_vs_dropentry, 0);
131                 }
132                 break;
133         case 2:
134                 if (nomem) {
135                         atomic_set(&ip_vs_dropentry, 1);
136                 } else {
137                         atomic_set(&ip_vs_dropentry, 0);
138                         sysctl_ip_vs_drop_entry = 1;
139                 };
140                 break;
141         case 3:
142                 atomic_set(&ip_vs_dropentry, 1);
143                 break;
144         }
145         spin_unlock(&__ip_vs_dropentry_lock);
146
147         /* drop_packet */
148         spin_lock(&__ip_vs_droppacket_lock);
149         switch (sysctl_ip_vs_drop_packet) {
150         case 0:
151                 ip_vs_drop_rate = 0;
152                 break;
153         case 1:
154                 if (nomem) {
155                         ip_vs_drop_rate = ip_vs_drop_counter
156                                 = sysctl_ip_vs_amemthresh /
157                                 (sysctl_ip_vs_amemthresh-availmem);
158                         sysctl_ip_vs_drop_packet = 2;
159                 } else {
160                         ip_vs_drop_rate = 0;
161                 }
162                 break;
163         case 2:
164                 if (nomem) {
165                         ip_vs_drop_rate = ip_vs_drop_counter
166                                 = sysctl_ip_vs_amemthresh /
167                                 (sysctl_ip_vs_amemthresh-availmem);
168                 } else {
169                         ip_vs_drop_rate = 0;
170                         sysctl_ip_vs_drop_packet = 1;
171                 }
172                 break;
173         case 3:
174                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175                 break;
176         }
177         spin_unlock(&__ip_vs_droppacket_lock);
178
179         /* secure_tcp */
180         write_lock(&__ip_vs_securetcp_lock);
181         switch (sysctl_ip_vs_secure_tcp) {
182         case 0:
183                 if (old_secure_tcp >= 2)
184                         to_change = 0;
185                 break;
186         case 1:
187                 if (nomem) {
188                         if (old_secure_tcp < 2)
189                                 to_change = 1;
190                         sysctl_ip_vs_secure_tcp = 2;
191                 } else {
192                         if (old_secure_tcp >= 2)
193                                 to_change = 0;
194                 }
195                 break;
196         case 2:
197                 if (nomem) {
198                         if (old_secure_tcp < 2)
199                                 to_change = 1;
200                 } else {
201                         if (old_secure_tcp >= 2)
202                                 to_change = 0;
203                         sysctl_ip_vs_secure_tcp = 1;
204                 }
205                 break;
206         case 3:
207                 if (old_secure_tcp < 2)
208                         to_change = 1;
209                 break;
210         }
211         old_secure_tcp = sysctl_ip_vs_secure_tcp;
212         if (to_change >= 0)
213                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214         write_unlock(&__ip_vs_securetcp_lock);
215
216         local_bh_enable();
217 }
218
219
220 /*
221  *      Timer for checking the defense
222  */
223 #define DEFENSE_TIMER_PERIOD    1*HZ
224 static void defense_work_handler(void *data);
225 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
226
227 static void defense_work_handler(void *data)
228 {
229         update_defense_level();
230         if (atomic_read(&ip_vs_dropentry))
231                 ip_vs_random_dropentry();
232
233         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234 }
235
236 int
237 ip_vs_use_count_inc(void)
238 {
239         return try_module_get(THIS_MODULE);
240 }
241
242 void
243 ip_vs_use_count_dec(void)
244 {
245         module_put(THIS_MODULE);
246 }
247
248
249 /*
250  *      Hash table: for virtual service lookups
251  */
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261 /*
262  *      Hash table: for real service lookups
263  */
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270 /*
271  *      Trash for destinations
272  */
273 static LIST_HEAD(ip_vs_dest_trash);
274
275 /*
276  *      FTP & NULL virtual service counters
277  */
278 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282 /*
283  *      Returns hash value for virtual service
284  */
285 static __inline__ unsigned
286 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
287 {
288         register unsigned porth = ntohs(port);
289
290         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291                 & IP_VS_SVC_TAB_MASK;
292 }
293
294 /*
295  *      Returns hash value of fwmark for virtual service lookup
296  */
297 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298 {
299         return fwmark & IP_VS_SVC_TAB_MASK;
300 }
301
302 /*
303  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304  *      or in the ip_vs_svc_fwm_table by fwmark.
305  *      Should be called with locked tables.
306  */
307 static int ip_vs_svc_hash(struct ip_vs_service *svc)
308 {
309         unsigned hash;
310
311         if (svc->flags & IP_VS_SVC_F_HASHED) {
312                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313                           "called from %p\n", __builtin_return_address(0));
314                 return 0;
315         }
316
317         if (svc->fwmark == 0) {
318                 /*
319                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
320                  */
321                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323         } else {
324                 /*
325                  *  Hash it by fwmark in ip_vs_svc_fwm_table
326                  */
327                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329         }
330
331         svc->flags |= IP_VS_SVC_F_HASHED;
332         /* increase its refcnt because it is referenced by the svc table */
333         atomic_inc(&svc->refcnt);
334         return 1;
335 }
336
337
338 /*
339  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340  *      Should be called with locked tables.
341  */
342 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343 {
344         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346                           "called from %p\n", __builtin_return_address(0));
347                 return 0;
348         }
349
350         if (svc->fwmark == 0) {
351                 /* Remove it from the ip_vs_svc_table table */
352                 list_del(&svc->s_list);
353         } else {
354                 /* Remove it from the ip_vs_svc_fwm_table table */
355                 list_del(&svc->f_list);
356         }
357
358         svc->flags &= ~IP_VS_SVC_F_HASHED;
359         atomic_dec(&svc->refcnt);
360         return 1;
361 }
362
363
364 /*
365  *      Get service by {proto,addr,port} in the service table.
366  */
367 static __inline__ struct ip_vs_service *
368 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
369 {
370         unsigned hash;
371         struct ip_vs_service *svc;
372
373         /* Check for "full" addressed entries */
374         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377                 if ((svc->addr == vaddr)
378                     && (svc->port == vport)
379                     && (svc->protocol == protocol)) {
380                         /* HIT */
381                         atomic_inc(&svc->usecnt);
382                         return svc;
383                 }
384         }
385
386         return NULL;
387 }
388
389
390 /*
391  *      Get service by {fwmark} in the service table.
392  */
393 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394 {
395         unsigned hash;
396         struct ip_vs_service *svc;
397
398         /* Check for fwmark addressed entries */
399         hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402                 if (svc->fwmark == fwmark) {
403                         /* HIT */
404                         atomic_inc(&svc->usecnt);
405                         return svc;
406                 }
407         }
408
409         return NULL;
410 }
411
412 struct ip_vs_service *
413 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
414 {
415         struct ip_vs_service *svc;
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423                 goto out;
424
425         /*
426          *      Check the table hashed by <protocol,addr,port>
427          *      for "full" addressed entries
428          */
429         svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431         if (svc == NULL
432             && protocol == IPPROTO_TCP
433             && atomic_read(&ip_vs_ftpsvc_counter)
434             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435                 /*
436                  * Check if ftp service entry exists, the packet
437                  * might belong to FTP data connections.
438                  */
439                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440         }
441
442         if (svc == NULL
443             && atomic_read(&ip_vs_nullsvc_counter)) {
444                 /*
445                  * Check if the catch-all port (port zero) exists
446                  */
447                 svc = __ip_vs_service_get(protocol, vaddr, 0);
448         }
449
450   out:
451         read_unlock(&__ip_vs_svc_lock);
452
453         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454                   fwmark, ip_vs_proto_name(protocol),
455                   NIPQUAD(vaddr), ntohs(vport),
456                   svc?"hit":"not hit");
457
458         return svc;
459 }
460
461
462 static inline void
463 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464 {
465         atomic_inc(&svc->refcnt);
466         dest->svc = svc;
467 }
468
469 static inline void
470 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
471 {
472         struct ip_vs_service *svc = dest->svc;
473
474         dest->svc = NULL;
475         if (atomic_dec_and_test(&svc->refcnt))
476                 kfree(svc);
477 }
478
479
480 /*
481  *      Returns hash value for real service
482  */
483 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
484 {
485         register unsigned porth = ntohs(port);
486
487         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488                 & IP_VS_RTAB_MASK;
489 }
490
491 /*
492  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493  *      should be called with locked tables.
494  */
495 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496 {
497         unsigned hash;
498
499         if (!list_empty(&dest->d_list)) {
500                 return 0;
501         }
502
503         /*
504          *      Hash by proto,addr,port,
505          *      which are the parameters of the real service.
506          */
507         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508         list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510         return 1;
511 }
512
513 /*
514  *      UNhashes ip_vs_dest from ip_vs_rtable.
515  *      should be called with locked tables.
516  */
517 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518 {
519         /*
520          * Remove it from the ip_vs_rtable table.
521          */
522         if (!list_empty(&dest->d_list)) {
523                 list_del(&dest->d_list);
524                 INIT_LIST_HEAD(&dest->d_list);
525         }
526
527         return 1;
528 }
529
530 /*
531  *      Lookup real service by <proto,addr,port> in the real service table.
532  */
533 struct ip_vs_dest *
534 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
535 {
536         unsigned hash;
537         struct ip_vs_dest *dest;
538
539         /*
540          *      Check for "full" addressed entries
541          *      Return the first found entry
542          */
543         hash = ip_vs_rs_hashkey(daddr, dport);
544
545         read_lock(&__ip_vs_rs_lock);
546         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547                 if ((dest->addr == daddr)
548                     && (dest->port == dport)
549                     && ((dest->protocol == protocol) ||
550                         dest->vfwmark)) {
551                         /* HIT */
552                         read_unlock(&__ip_vs_rs_lock);
553                         return dest;
554                 }
555         }
556         read_unlock(&__ip_vs_rs_lock);
557
558         return NULL;
559 }
560
561 /*
562  *      Lookup destination by {addr,port} in the given service
563  */
564 static struct ip_vs_dest *
565 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
566 {
567         struct ip_vs_dest *dest;
568
569         /*
570          * Find the destination for the given service
571          */
572         list_for_each_entry(dest, &svc->destinations, n_list) {
573                 if ((dest->addr == daddr) && (dest->port == dport)) {
574                         /* HIT */
575                         return dest;
576                 }
577         }
578
579         return NULL;
580 }
581
582
583 /*
584  *  Lookup dest by {svc,addr,port} in the destination trash.
585  *  The destination trash is used to hold the destinations that are removed
586  *  from the service table but are still referenced by some conn entries.
587  *  The reason to add the destination trash is when the dest is temporary
588  *  down (either by administrator or by monitor program), the dest can be
589  *  picked back from the trash, the remaining connections to the dest can
590  *  continue, and the counting information of the dest is also useful for
591  *  scheduling.
592  */
593 static struct ip_vs_dest *
594 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
595 {
596         struct ip_vs_dest *dest, *nxt;
597
598         /*
599          * Find the destination in trash
600          */
601         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
602                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
603                           "dest->refcnt=%d\n",
604                           dest->vfwmark,
605                           NIPQUAD(dest->addr), ntohs(dest->port),
606                           atomic_read(&dest->refcnt));
607                 if (dest->addr == daddr &&
608                     dest->port == dport &&
609                     dest->vfwmark == svc->fwmark &&
610                     dest->protocol == svc->protocol &&
611                     (svc->fwmark ||
612                      (dest->vaddr == svc->addr &&
613                       dest->vport == svc->port))) {
614                         /* HIT */
615                         return dest;
616                 }
617
618                 /*
619                  * Try to purge the destination from trash if not referenced
620                  */
621                 if (atomic_read(&dest->refcnt) == 1) {
622                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
623                                   "from trash\n",
624                                   dest->vfwmark,
625                                   NIPQUAD(dest->addr), ntohs(dest->port));
626                         list_del(&dest->n_list);
627                         ip_vs_dst_reset(dest);
628                         __ip_vs_unbind_svc(dest);
629                         kfree(dest);
630                 }
631         }
632
633         return NULL;
634 }
635
636
637 /*
638  *  Clean up all the destinations in the trash
639  *  Called by the ip_vs_control_cleanup()
640  *
641  *  When the ip_vs_control_clearup is activated by ipvs module exit,
642  *  the service tables must have been flushed and all the connections
643  *  are expired, and the refcnt of each destination in the trash must
644  *  be 1, so we simply release them here.
645  */
646 static void ip_vs_trash_cleanup(void)
647 {
648         struct ip_vs_dest *dest, *nxt;
649
650         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
651                 list_del(&dest->n_list);
652                 ip_vs_dst_reset(dest);
653                 __ip_vs_unbind_svc(dest);
654                 kfree(dest);
655         }
656 }
657
658
659 static void
660 ip_vs_zero_stats(struct ip_vs_stats *stats)
661 {
662         spin_lock_bh(&stats->lock);
663         memset(stats, 0, (char *)&stats->lock - (char *)stats);
664         spin_unlock_bh(&stats->lock);
665         ip_vs_zero_estimator(stats);
666 }
667
668 /*
669  *      Update a destination in the given service
670  */
671 static void
672 __ip_vs_update_dest(struct ip_vs_service *svc,
673                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
674 {
675         int conn_flags;
676
677         /* set the weight and the flags */
678         atomic_set(&dest->weight, udest->weight);
679         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
680
681         /* check if local node and update the flags */
682         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
683                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
684                         | IP_VS_CONN_F_LOCALNODE;
685         }
686
687         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
688         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
689                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
690         } else {
691                 /*
692                  *    Put the real service in ip_vs_rtable if not present.
693                  *    For now only for NAT!
694                  */
695                 write_lock_bh(&__ip_vs_rs_lock);
696                 ip_vs_rs_hash(dest);
697                 write_unlock_bh(&__ip_vs_rs_lock);
698         }
699         atomic_set(&dest->conn_flags, conn_flags);
700
701         /* bind the service */
702         if (!dest->svc) {
703                 __ip_vs_bind_svc(dest, svc);
704         } else {
705                 if (dest->svc != svc) {
706                         __ip_vs_unbind_svc(dest);
707                         ip_vs_zero_stats(&dest->stats);
708                         __ip_vs_bind_svc(dest, svc);
709                 }
710         }
711
712         /* set the dest status flags */
713         dest->flags |= IP_VS_DEST_F_AVAILABLE;
714
715         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
716                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
717         dest->u_threshold = udest->u_threshold;
718         dest->l_threshold = udest->l_threshold;
719 }
720
721
722 /*
723  *      Create a destination for the given service
724  */
725 static int
726 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
727                struct ip_vs_dest **dest_p)
728 {
729         struct ip_vs_dest *dest;
730         unsigned atype;
731
732         EnterFunction(2);
733
734         atype = inet_addr_type(udest->addr);
735         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
736                 return -EINVAL;
737
738         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
739         if (dest == NULL) {
740                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
741                 return -ENOMEM;
742         }
743         memset(dest, 0, sizeof(struct ip_vs_dest));
744
745         dest->protocol = svc->protocol;
746         dest->vaddr = svc->addr;
747         dest->vport = svc->port;
748         dest->vfwmark = svc->fwmark;
749         dest->addr = udest->addr;
750         dest->port = udest->port;
751
752         atomic_set(&dest->activeconns, 0);
753         atomic_set(&dest->inactconns, 0);
754         atomic_set(&dest->persistconns, 0);
755         atomic_set(&dest->refcnt, 0);
756
757         INIT_LIST_HEAD(&dest->d_list);
758         spin_lock_init(&dest->dst_lock);
759         spin_lock_init(&dest->stats.lock);
760         __ip_vs_update_dest(svc, dest, udest);
761         ip_vs_new_estimator(&dest->stats);
762
763         *dest_p = dest;
764
765         LeaveFunction(2);
766         return 0;
767 }
768
769
770 /*
771  *      Add a destination into an existing service
772  */
773 static int
774 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
775 {
776         struct ip_vs_dest *dest;
777         __u32 daddr = udest->addr;
778         __u16 dport = udest->port;
779         int ret;
780
781         EnterFunction(2);
782
783         if (udest->weight < 0) {
784                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
785                 return -ERANGE;
786         }
787
788         if (udest->l_threshold > udest->u_threshold) {
789                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
790                           "upper threshold\n");
791                 return -ERANGE;
792         }
793
794         /*
795          * Check if the dest already exists in the list
796          */
797         dest = ip_vs_lookup_dest(svc, daddr, dport);
798         if (dest != NULL) {
799                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
800                 return -EEXIST;
801         }
802
803         /*
804          * Check if the dest already exists in the trash and
805          * is from the same service
806          */
807         dest = ip_vs_trash_get_dest(svc, daddr, dport);
808         if (dest != NULL) {
809                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
810                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
811                           NIPQUAD(daddr), ntohs(dport),
812                           atomic_read(&dest->refcnt),
813                           dest->vfwmark,
814                           NIPQUAD(dest->vaddr),
815                           ntohs(dest->vport));
816                 __ip_vs_update_dest(svc, dest, udest);
817
818                 /*
819                  * Get the destination from the trash
820                  */
821                 list_del(&dest->n_list);
822
823                 ip_vs_new_estimator(&dest->stats);
824
825                 write_lock_bh(&__ip_vs_svc_lock);
826
827                 /*
828                  * Wait until all other svc users go away.
829                  */
830                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
831
832                 list_add(&dest->n_list, &svc->destinations);
833                 svc->num_dests++;
834
835                 /* call the update_service function of its scheduler */
836                 svc->scheduler->update_service(svc);
837
838                 write_unlock_bh(&__ip_vs_svc_lock);
839                 return 0;
840         }
841
842         /*
843          * Allocate and initialize the dest structure
844          */
845         ret = ip_vs_new_dest(svc, udest, &dest);
846         if (ret) {
847                 return ret;
848         }
849
850         /*
851          * Add the dest entry into the list
852          */
853         atomic_inc(&dest->refcnt);
854
855         write_lock_bh(&__ip_vs_svc_lock);
856
857         /*
858          * Wait until all other svc users go away.
859          */
860         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
861
862         list_add(&dest->n_list, &svc->destinations);
863         svc->num_dests++;
864
865         /* call the update_service function of its scheduler */
866         svc->scheduler->update_service(svc);
867
868         write_unlock_bh(&__ip_vs_svc_lock);
869
870         LeaveFunction(2);
871
872         return 0;
873 }
874
875
876 /*
877  *      Edit a destination in the given service
878  */
879 static int
880 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
881 {
882         struct ip_vs_dest *dest;
883         __u32 daddr = udest->addr;
884         __u16 dport = udest->port;
885
886         EnterFunction(2);
887
888         if (udest->weight < 0) {
889                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
890                 return -ERANGE;
891         }
892
893         if (udest->l_threshold > udest->u_threshold) {
894                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
895                           "upper threshold\n");
896                 return -ERANGE;
897         }
898
899         /*
900          *  Lookup the destination list
901          */
902         dest = ip_vs_lookup_dest(svc, daddr, dport);
903         if (dest == NULL) {
904                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
905                 return -ENOENT;
906         }
907
908         __ip_vs_update_dest(svc, dest, udest);
909
910         write_lock_bh(&__ip_vs_svc_lock);
911
912         /* Wait until all other svc users go away */
913         while (atomic_read(&svc->usecnt) > 1) {};
914
915         /* call the update_service, because server weight may be changed */
916         svc->scheduler->update_service(svc);
917
918         write_unlock_bh(&__ip_vs_svc_lock);
919
920         LeaveFunction(2);
921
922         return 0;
923 }
924
925
926 /*
927  *      Delete a destination (must be already unlinked from the service)
928  */
929 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
930 {
931         ip_vs_kill_estimator(&dest->stats);
932
933         /*
934          *  Remove it from the d-linked list with the real services.
935          */
936         write_lock_bh(&__ip_vs_rs_lock);
937         ip_vs_rs_unhash(dest);
938         write_unlock_bh(&__ip_vs_rs_lock);
939
940         /*
941          *  Decrease the refcnt of the dest, and free the dest
942          *  if nobody refers to it (refcnt=0). Otherwise, throw
943          *  the destination into the trash.
944          */
945         if (atomic_dec_and_test(&dest->refcnt)) {
946                 ip_vs_dst_reset(dest);
947                 /* simply decrease svc->refcnt here, let the caller check
948                    and release the service if nobody refers to it.
949                    Only user context can release destination and service,
950                    and only one user context can update virtual service at a
951                    time, so the operation here is OK */
952                 atomic_dec(&dest->svc->refcnt);
953                 kfree(dest);
954         } else {
955                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
956                           "dest->refcnt=%d\n",
957                           NIPQUAD(dest->addr), ntohs(dest->port),
958                           atomic_read(&dest->refcnt));
959                 list_add(&dest->n_list, &ip_vs_dest_trash);
960                 atomic_inc(&dest->refcnt);
961         }
962 }
963
964
965 /*
966  *      Unlink a destination from the given service
967  */
968 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
969                                 struct ip_vs_dest *dest,
970                                 int svcupd)
971 {
972         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
973
974         /*
975          *  Remove it from the d-linked destination list.
976          */
977         list_del(&dest->n_list);
978         svc->num_dests--;
979         if (svcupd) {
980                 /*
981                  *  Call the update_service function of its scheduler
982                  */
983                 svc->scheduler->update_service(svc);
984         }
985 }
986
987
988 /*
989  *      Delete a destination server in the given service
990  */
991 static int
992 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
993 {
994         struct ip_vs_dest *dest;
995         __u32 daddr = udest->addr;
996         __u16 dport = udest->port;
997
998         EnterFunction(2);
999
1000         dest = ip_vs_lookup_dest(svc, daddr, dport);
1001         if (dest == NULL) {
1002                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1003                 return -ENOENT;
1004         }
1005
1006         write_lock_bh(&__ip_vs_svc_lock);
1007
1008         /*
1009          *      Wait until all other svc users go away.
1010          */
1011         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1012
1013         /*
1014          *      Unlink dest from the service
1015          */
1016         __ip_vs_unlink_dest(svc, dest, 1);
1017
1018         write_unlock_bh(&__ip_vs_svc_lock);
1019
1020         /*
1021          *      Delete the destination
1022          */
1023         __ip_vs_del_dest(dest);
1024
1025         LeaveFunction(2);
1026
1027         return 0;
1028 }
1029
1030
1031 /*
1032  *      Add a service into the service hash table
1033  */
1034 static int
1035 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1036 {
1037         int ret = 0;
1038         struct ip_vs_scheduler *sched = NULL;
1039         struct ip_vs_service *svc = NULL;
1040
1041         /* increase the module use count */
1042         ip_vs_use_count_inc();
1043
1044         /* Lookup the scheduler by 'u->sched_name' */
1045         sched = ip_vs_scheduler_get(u->sched_name);
1046         if (sched == NULL) {
1047                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1048                            u->sched_name);
1049                 ret = -ENOENT;
1050                 goto out_mod_dec;
1051         }
1052
1053         svc = (struct ip_vs_service *)
1054                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1055         if (svc == NULL) {
1056                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1057                 ret = -ENOMEM;
1058                 goto out_err;
1059         }
1060         memset(svc, 0, sizeof(struct ip_vs_service));
1061
1062         /* I'm the first user of the service */
1063         atomic_set(&svc->usecnt, 1);
1064         atomic_set(&svc->refcnt, 0);
1065
1066         svc->protocol = u->protocol;
1067         svc->addr = u->addr;
1068         svc->port = u->port;
1069         svc->fwmark = u->fwmark;
1070         svc->flags = u->flags;
1071         svc->timeout = u->timeout * HZ;
1072         svc->netmask = u->netmask;
1073
1074         INIT_LIST_HEAD(&svc->destinations);
1075         rwlock_init(&svc->sched_lock);
1076         spin_lock_init(&svc->stats.lock);
1077
1078         /* Bind the scheduler */
1079         ret = ip_vs_bind_scheduler(svc, sched);
1080         if (ret)
1081                 goto out_err;
1082         sched = NULL;
1083
1084         /* Update the virtual service counters */
1085         if (svc->port == FTPPORT)
1086                 atomic_inc(&ip_vs_ftpsvc_counter);
1087         else if (svc->port == 0)
1088                 atomic_inc(&ip_vs_nullsvc_counter);
1089
1090         ip_vs_new_estimator(&svc->stats);
1091         ip_vs_num_services++;
1092
1093         /* Hash the service into the service table */
1094         write_lock_bh(&__ip_vs_svc_lock);
1095         ip_vs_svc_hash(svc);
1096         write_unlock_bh(&__ip_vs_svc_lock);
1097
1098         *svc_p = svc;
1099         return 0;
1100
1101   out_err:
1102         if (svc != NULL) {
1103                 if (svc->scheduler)
1104                         ip_vs_unbind_scheduler(svc);
1105                 if (svc->inc) {
1106                         local_bh_disable();
1107                         ip_vs_app_inc_put(svc->inc);
1108                         local_bh_enable();
1109                 }
1110                 kfree(svc);
1111         }
1112         ip_vs_scheduler_put(sched);
1113
1114   out_mod_dec:
1115         /* decrease the module use count */
1116         ip_vs_use_count_dec();
1117
1118         return ret;
1119 }
1120
1121
1122 /*
1123  *      Edit a service and bind it with a new scheduler
1124  */
1125 static int
1126 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1127 {
1128         struct ip_vs_scheduler *sched, *old_sched;
1129         int ret = 0;
1130
1131         /*
1132          * Lookup the scheduler, by 'u->sched_name'
1133          */
1134         sched = ip_vs_scheduler_get(u->sched_name);
1135         if (sched == NULL) {
1136                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1137                            u->sched_name);
1138                 return -ENOENT;
1139         }
1140         old_sched = sched;
1141
1142         write_lock_bh(&__ip_vs_svc_lock);
1143
1144         /*
1145          * Wait until all other svc users go away.
1146          */
1147         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1148
1149         /*
1150          * Set the flags and timeout value
1151          */
1152         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1153         svc->timeout = u->timeout * HZ;
1154         svc->netmask = u->netmask;
1155
1156         old_sched = svc->scheduler;
1157         if (sched != old_sched) {
1158                 /*
1159                  * Unbind the old scheduler
1160                  */
1161                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1162                         old_sched = sched;
1163                         goto out;
1164                 }
1165
1166                 /*
1167                  * Bind the new scheduler
1168                  */
1169                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1170                         /*
1171                          * If ip_vs_bind_scheduler fails, restore the old
1172                          * scheduler.
1173                          * The main reason of failure is out of memory.
1174                          *
1175                          * The question is if the old scheduler can be
1176                          * restored all the time. TODO: if it cannot be
1177                          * restored some time, we must delete the service,
1178                          * otherwise the system may crash.
1179                          */
1180                         ip_vs_bind_scheduler(svc, old_sched);
1181                         old_sched = sched;
1182                         goto out;
1183                 }
1184         }
1185
1186   out:
1187         write_unlock_bh(&__ip_vs_svc_lock);
1188
1189         if (old_sched)
1190                 ip_vs_scheduler_put(old_sched);
1191
1192         return ret;
1193 }
1194
1195
1196 /*
1197  *      Delete a service from the service list
1198  *      - The service must be unlinked, unlocked and not referenced!
1199  *      - We are called under _bh lock
1200  */
1201 static void __ip_vs_del_service(struct ip_vs_service *svc)
1202 {
1203         struct ip_vs_dest *dest, *nxt;
1204         struct ip_vs_scheduler *old_sched;
1205
1206         ip_vs_num_services--;
1207         ip_vs_kill_estimator(&svc->stats);
1208
1209         /* Unbind scheduler */
1210         old_sched = svc->scheduler;
1211         ip_vs_unbind_scheduler(svc);
1212         if (old_sched)
1213                 ip_vs_scheduler_put(old_sched);
1214
1215         /* Unbind app inc */
1216         if (svc->inc) {
1217                 ip_vs_app_inc_put(svc->inc);
1218                 svc->inc = NULL;
1219         }
1220
1221         /*
1222          *    Unlink the whole destination list
1223          */
1224         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1225                 __ip_vs_unlink_dest(svc, dest, 0);
1226                 __ip_vs_del_dest(dest);
1227         }
1228
1229         /*
1230          *    Update the virtual service counters
1231          */
1232         if (svc->port == FTPPORT)
1233                 atomic_dec(&ip_vs_ftpsvc_counter);
1234         else if (svc->port == 0)
1235                 atomic_dec(&ip_vs_nullsvc_counter);
1236
1237         /*
1238          *    Free the service if nobody refers to it
1239          */
1240         if (atomic_read(&svc->refcnt) == 0)
1241                 kfree(svc);
1242
1243         /* decrease the module use count */
1244         ip_vs_use_count_dec();
1245 }
1246
1247 /*
1248  *      Delete a service from the service list
1249  */
1250 static int ip_vs_del_service(struct ip_vs_service *svc)
1251 {
1252         if (svc == NULL)
1253                 return -EEXIST;
1254
1255         /*
1256          * Unhash it from the service table
1257          */
1258         write_lock_bh(&__ip_vs_svc_lock);
1259
1260         ip_vs_svc_unhash(svc);
1261
1262         /*
1263          * Wait until all the svc users go away.
1264          */
1265         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1266
1267         __ip_vs_del_service(svc);
1268
1269         write_unlock_bh(&__ip_vs_svc_lock);
1270
1271         return 0;
1272 }
1273
1274
1275 /*
1276  *      Flush all the virtual services
1277  */
1278 static int ip_vs_flush(void)
1279 {
1280         int idx;
1281         struct ip_vs_service *svc, *nxt;
1282
1283         /*
1284          * Flush the service table hashed by <protocol,addr,port>
1285          */
1286         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1287                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1288                         write_lock_bh(&__ip_vs_svc_lock);
1289                         ip_vs_svc_unhash(svc);
1290                         /*
1291                          * Wait until all the svc users go away.
1292                          */
1293                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1294                         __ip_vs_del_service(svc);
1295                         write_unlock_bh(&__ip_vs_svc_lock);
1296                 }
1297         }
1298
1299         /*
1300          * Flush the service table hashed by fwmark
1301          */
1302         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1303                 list_for_each_entry_safe(svc, nxt,
1304                                          &ip_vs_svc_fwm_table[idx], f_list) {
1305                         write_lock_bh(&__ip_vs_svc_lock);
1306                         ip_vs_svc_unhash(svc);
1307                         /*
1308                          * Wait until all the svc users go away.
1309                          */
1310                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1311                         __ip_vs_del_service(svc);
1312                         write_unlock_bh(&__ip_vs_svc_lock);
1313                 }
1314         }
1315
1316         return 0;
1317 }
1318
1319
1320 /*
1321  *      Zero counters in a service or all services
1322  */
1323 static int ip_vs_zero_service(struct ip_vs_service *svc)
1324 {
1325         struct ip_vs_dest *dest;
1326
1327         write_lock_bh(&__ip_vs_svc_lock);
1328         list_for_each_entry(dest, &svc->destinations, n_list) {
1329                 ip_vs_zero_stats(&dest->stats);
1330         }
1331         ip_vs_zero_stats(&svc->stats);
1332         write_unlock_bh(&__ip_vs_svc_lock);
1333         return 0;
1334 }
1335
1336 static int ip_vs_zero_all(void)
1337 {
1338         int idx;
1339         struct ip_vs_service *svc;
1340
1341         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1342                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1343                         ip_vs_zero_service(svc);
1344                 }
1345         }
1346
1347         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1348                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1349                         ip_vs_zero_service(svc);
1350                 }
1351         }
1352
1353         ip_vs_zero_stats(&ip_vs_stats);
1354         return 0;
1355 }
1356
1357
1358 static int
1359 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360                      void __user *buffer, size_t *lenp, loff_t *ppos)
1361 {
1362         int *valp = table->data;
1363         int val = *valp;
1364         int rc;
1365
1366         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1367         if (write && (*valp != val)) {
1368                 if ((*valp < 0) || (*valp > 3)) {
1369                         /* Restore the correct value */
1370                         *valp = val;
1371                 } else {
1372                         update_defense_level();
1373                 }
1374         }
1375         return rc;
1376 }
1377
1378
1379 static int
1380 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1381                        void __user *buffer, size_t *lenp, loff_t *ppos)
1382 {
1383         int *valp = table->data;
1384         int val[2];
1385         int rc;
1386
1387         /* backup the value first */
1388         memcpy(val, valp, sizeof(val));
1389
1390         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1391         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1392                 /* Restore the correct value */
1393                 memcpy(valp, val, sizeof(val));
1394         }
1395         return rc;
1396 }
1397
1398
1399 /*
1400  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1401  */
1402
1403 static struct ctl_table vs_vars[] = {
1404         {
1405                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1406                 .procname       = "amemthresh",
1407                 .data           = &sysctl_ip_vs_amemthresh,
1408                 .maxlen         = sizeof(int),
1409                 .mode           = 0644,
1410                 .proc_handler   = &proc_dointvec,
1411         },
1412 #ifdef CONFIG_IP_VS_DEBUG
1413         {
1414                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1415                 .procname       = "debug_level",
1416                 .data           = &sysctl_ip_vs_debug_level,
1417                 .maxlen         = sizeof(int),
1418                 .mode           = 0644,
1419                 .proc_handler   = &proc_dointvec,
1420         },
1421 #endif
1422         {
1423                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1424                 .procname       = "am_droprate",
1425                 .data           = &sysctl_ip_vs_am_droprate,
1426                 .maxlen         = sizeof(int),
1427                 .mode           = 0644,
1428                 .proc_handler   = &proc_dointvec,
1429         },
1430         {
1431                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1432                 .procname       = "drop_entry",
1433                 .data           = &sysctl_ip_vs_drop_entry,
1434                 .maxlen         = sizeof(int),
1435                 .mode           = 0644,
1436                 .proc_handler   = &proc_do_defense_mode,
1437         },
1438         {
1439                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1440                 .procname       = "drop_packet",
1441                 .data           = &sysctl_ip_vs_drop_packet,
1442                 .maxlen         = sizeof(int),
1443                 .mode           = 0644,
1444                 .proc_handler   = &proc_do_defense_mode,
1445         },
1446         {
1447                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1448                 .procname       = "secure_tcp",
1449                 .data           = &sysctl_ip_vs_secure_tcp,
1450                 .maxlen         = sizeof(int),
1451                 .mode           = 0644,
1452                 .proc_handler   = &proc_do_defense_mode,
1453         },
1454 #if 0
1455         {
1456                 .ctl_name       = NET_IPV4_VS_TO_ES,
1457                 .procname       = "timeout_established",
1458                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1459                 .maxlen         = sizeof(int),
1460                 .mode           = 0644,
1461                 .proc_handler   = &proc_dointvec_jiffies,
1462         },
1463         {
1464                 .ctl_name       = NET_IPV4_VS_TO_SS,
1465                 .procname       = "timeout_synsent",
1466                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1467                 .maxlen         = sizeof(int),
1468                 .mode           = 0644,
1469                 .proc_handler   = &proc_dointvec_jiffies,
1470         },
1471         {
1472                 .ctl_name       = NET_IPV4_VS_TO_SR,
1473                 .procname       = "timeout_synrecv",
1474                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1475                 .maxlen         = sizeof(int),
1476                 .mode           = 0644,
1477                 .proc_handler   = &proc_dointvec_jiffies,
1478         },
1479         {
1480                 .ctl_name       = NET_IPV4_VS_TO_FW,
1481                 .procname       = "timeout_finwait",
1482                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1483                 .maxlen         = sizeof(int),
1484                 .mode           = 0644,
1485                 .proc_handler   = &proc_dointvec_jiffies,
1486         },
1487         {
1488                 .ctl_name       = NET_IPV4_VS_TO_TW,
1489                 .procname       = "timeout_timewait",
1490                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1491                 .maxlen         = sizeof(int),
1492                 .mode           = 0644,
1493                 .proc_handler   = &proc_dointvec_jiffies,
1494         },
1495         {
1496                 .ctl_name       = NET_IPV4_VS_TO_CL,
1497                 .procname       = "timeout_close",
1498                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1499                 .maxlen         = sizeof(int),
1500                 .mode           = 0644,
1501                 .proc_handler   = &proc_dointvec_jiffies,
1502         },
1503         {
1504                 .ctl_name       = NET_IPV4_VS_TO_CW,
1505                 .procname       = "timeout_closewait",
1506                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1507                 .maxlen         = sizeof(int),
1508                 .mode           = 0644,
1509                 .proc_handler   = &proc_dointvec_jiffies,
1510         },
1511         {
1512                 .ctl_name       = NET_IPV4_VS_TO_LA,
1513                 .procname       = "timeout_lastack",
1514                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1515                 .maxlen         = sizeof(int),
1516                 .mode           = 0644,
1517                 .proc_handler   = &proc_dointvec_jiffies,
1518         },
1519         {
1520                 .ctl_name       = NET_IPV4_VS_TO_LI,
1521                 .procname       = "timeout_listen",
1522                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1523                 .maxlen         = sizeof(int),
1524                 .mode           = 0644,
1525                 .proc_handler   = &proc_dointvec_jiffies,
1526         },
1527         {
1528                 .ctl_name       = NET_IPV4_VS_TO_SA,
1529                 .procname       = "timeout_synack",
1530                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1531                 .maxlen         = sizeof(int),
1532                 .mode           = 0644,
1533                 .proc_handler   = &proc_dointvec_jiffies,
1534         },
1535         {
1536                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1537                 .procname       = "timeout_udp",
1538                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1539                 .maxlen         = sizeof(int),
1540                 .mode           = 0644,
1541                 .proc_handler   = &proc_dointvec_jiffies,
1542         },
1543         {
1544                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1545                 .procname       = "timeout_icmp",
1546                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1547                 .maxlen         = sizeof(int),
1548                 .mode           = 0644,
1549                 .proc_handler   = &proc_dointvec_jiffies,
1550         },
1551 #endif
1552         {
1553                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1554                 .procname       = "cache_bypass",
1555                 .data           = &sysctl_ip_vs_cache_bypass,
1556                 .maxlen         = sizeof(int),
1557                 .mode           = 0644,
1558                 .proc_handler   = &proc_dointvec,
1559         },
1560         {
1561                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1562                 .procname       = "expire_nodest_conn",
1563                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1564                 .maxlen         = sizeof(int),
1565                 .mode           = 0644,
1566                 .proc_handler   = &proc_dointvec,
1567         },
1568         {
1569                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1570                 .procname       = "expire_quiescent_template",
1571                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1572                 .maxlen         = sizeof(int),
1573                 .mode           = 0644,
1574                 .proc_handler   = &proc_dointvec,
1575         },
1576         {
1577                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1578                 .procname       = "sync_threshold",
1579                 .data           = &sysctl_ip_vs_sync_threshold,
1580                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1581                 .mode           = 0644,
1582                 .proc_handler   = &proc_do_sync_threshold,
1583         },
1584         {
1585                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1586                 .procname       = "nat_icmp_send",
1587                 .data           = &sysctl_ip_vs_nat_icmp_send,
1588                 .maxlen         = sizeof(int),
1589                 .mode           = 0644,
1590                 .proc_handler   = &proc_dointvec,
1591         },
1592         { .ctl_name = 0 }
1593 };
1594
1595 static ctl_table vs_table[] = {
1596         {
1597                 .ctl_name       = NET_IPV4_VS,
1598                 .procname       = "vs",
1599                 .mode           = 0555,
1600                 .child          = vs_vars
1601         },
1602         { .ctl_name = 0 }
1603 };
1604
1605 static ctl_table ipvs_ipv4_table[] = {
1606         {
1607                 .ctl_name       = NET_IPV4,
1608                 .procname       = "ipv4",
1609                 .mode           = 0555,
1610                 .child          = vs_table,
1611         },
1612         { .ctl_name = 0 }
1613 };
1614
1615 static ctl_table vs_root_table[] = {
1616         {
1617                 .ctl_name       = CTL_NET,
1618                 .procname       = "net",
1619                 .mode           = 0555,
1620                 .child          = ipvs_ipv4_table,
1621         },
1622         { .ctl_name = 0 }
1623 };
1624
/*
 * Header of the IPVS sysctl tables.
 * NOTE(review): presumably filled in when vs_root_table is registered
 * elsewhere in this file -- confirm.
 */
static struct ctl_table_header *sysctl_header;
1626
1627 #ifdef CONFIG_PROC_FS
1628
/*
 *	Iteration state kept in seq->private while the service tables are
 *	being listed.
 */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the current hash bucket */
};
1633
1634 /*
1635  *      Write the contents of the VS rule table to a PROCfs file.
1636  *      (It is kept just for backward compatibility)
1637  */
1638 static inline const char *ip_vs_fwd_name(unsigned flags)
1639 {
1640         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1641         case IP_VS_CONN_F_LOCALNODE:
1642                 return "Local";
1643         case IP_VS_CONN_F_TUNNEL:
1644                 return "Tunnel";
1645         case IP_VS_CONN_F_DROUTE:
1646                 return "Route";
1647         default:
1648                 return "Masq";
1649         }
1650 }
1651
1652
1653 /* Get the Nth entry in the two lists */
1654 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1655 {
1656         struct ip_vs_iter *iter = seq->private;
1657         int idx;
1658         struct ip_vs_service *svc;
1659
1660         /* look in hash by protocol */
1661         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1662                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1663                         if (pos-- == 0){
1664                                 iter->table = ip_vs_svc_table;
1665                                 iter->bucket = idx;
1666                                 return svc;
1667                         }
1668                 }
1669         }
1670
1671         /* keep looking in fwmark */
1672         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1673                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1674                         if (pos-- == 0) {
1675                                 iter->table = ip_vs_svc_fwm_table;
1676                                 iter->bucket = idx;
1677                                 return svc;
1678                         }
1679                 }
1680         }
1681
1682         return NULL;
1683 }
1684
1685 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1686 {
1687
1688         read_lock_bh(&__ip_vs_svc_lock);
1689         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1690 }
1691
1692
1693 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1694 {
1695         struct list_head *e;
1696         struct ip_vs_iter *iter;
1697         struct ip_vs_service *svc;
1698
1699         ++*pos;
1700         if (v == SEQ_START_TOKEN)
1701                 return ip_vs_info_array(seq,0);
1702
1703         svc = v;
1704         iter = seq->private;
1705
1706         if (iter->table == ip_vs_svc_table) {
1707                 /* next service in table hashed by protocol */
1708                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1709                         return list_entry(e, struct ip_vs_service, s_list);
1710
1711
1712                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1713                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1714                                             s_list) {
1715                                 return svc;
1716                         }
1717                 }
1718
1719                 iter->table = ip_vs_svc_fwm_table;
1720                 iter->bucket = -1;
1721                 goto scan_fwmark;
1722         }
1723
1724         /* next service in hashed by fwmark */
1725         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1726                 return list_entry(e, struct ip_vs_service, f_list);
1727
1728  scan_fwmark:
1729         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1730                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1731                                     f_list)
1732                         return svc;
1733         }
1734
1735         return NULL;
1736 }
1737
1738 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1739 {
1740         read_unlock_bh(&__ip_vs_svc_lock);
1741 }
1742
1743
1744 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1745 {
1746         if (v == SEQ_START_TOKEN) {
1747                 seq_printf(seq,
1748                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1749                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1750                 seq_puts(seq,
1751                          "Prot LocalAddress:Port Scheduler Flags\n");
1752                 seq_puts(seq,
1753                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1754         } else {
1755                 const struct ip_vs_service *svc = v;
1756                 const struct ip_vs_iter *iter = seq->private;
1757                 const struct ip_vs_dest *dest;
1758
1759                 if (iter->table == ip_vs_svc_table)
1760                         seq_printf(seq, "%s  %08X:%04X %s ",
1761                                    ip_vs_proto_name(svc->protocol),
1762                                    ntohl(svc->addr),
1763                                    ntohs(svc->port),
1764                                    svc->scheduler->name);
1765                 else
1766                         seq_printf(seq, "FWM  %08X %s ",
1767                                    svc->fwmark, svc->scheduler->name);
1768
1769                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1770                         seq_printf(seq, "persistent %d %08X\n",
1771                                 svc->timeout,
1772                                 ntohl(svc->netmask));
1773                 else
1774                         seq_putc(seq, '\n');
1775
1776                 list_for_each_entry(dest, &svc->destinations, n_list) {
1777                         seq_printf(seq,
1778                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1779                                    ntohl(dest->addr), ntohs(dest->port),
1780                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1781                                    atomic_read(&dest->weight),
1782                                    atomic_read(&dest->activeconns),
1783                                    atomic_read(&dest->inactconns));
1784                 }
1785         }
1786         return 0;
1787 }
1788
1789 static struct seq_operations ip_vs_info_seq_ops = {
1790         .start = ip_vs_info_seq_start,
1791         .next  = ip_vs_info_seq_next,
1792         .stop  = ip_vs_info_seq_stop,
1793         .show  = ip_vs_info_seq_show,
1794 };
1795
1796 static int ip_vs_info_open(struct inode *inode, struct file *file)
1797 {
1798         struct seq_file *seq;
1799         int rc = -ENOMEM;
1800         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1801
1802         if (!s)
1803                 goto out;
1804
1805         rc = seq_open(file, &ip_vs_info_seq_ops);
1806         if (rc)
1807                 goto out_kfree;
1808
1809         seq          = file->private_data;
1810         seq->private = s;
1811         memset(s, 0, sizeof(*s));
1812 out:
1813         return rc;
1814 out_kfree:
1815         kfree(s);
1816         goto out;
1817 }
1818
1819 static struct file_operations ip_vs_info_fops = {
1820         .owner   = THIS_MODULE,
1821         .open    = ip_vs_info_open,
1822         .read    = seq_read,
1823         .llseek  = seq_lseek,
1824         .release = seq_release_private,
1825 };
1826
1827 #endif
1828
1829 struct ip_vs_stats ip_vs_stats;
1830
1831 #ifdef CONFIG_PROC_FS
1832 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1833 {
1834
1835 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1836         seq_puts(seq,
1837                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1838         seq_printf(seq,
1839                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1840
1841         spin_lock_bh(&ip_vs_stats.lock);
1842         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1843                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1844                    (unsigned long long) ip_vs_stats.inbytes,
1845                    (unsigned long long) ip_vs_stats.outbytes);
1846
1847 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1848         seq_puts(seq,
1849                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1850         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1851                         ip_vs_stats.cps,
1852                         ip_vs_stats.inpps,
1853                         ip_vs_stats.outpps,
1854                         ip_vs_stats.inbps,
1855                         ip_vs_stats.outbps);
1856         spin_unlock_bh(&ip_vs_stats.lock);
1857
1858         return 0;
1859 }
1860
1861 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1862 {
1863         return single_open(file, ip_vs_stats_show, NULL);
1864 }
1865
1866 static struct file_operations ip_vs_stats_fops = {
1867         .owner = THIS_MODULE,
1868         .open = ip_vs_stats_seq_open,
1869         .read = seq_read,
1870         .llseek = seq_lseek,
1871         .release = single_release,
1872 };
1873
1874 #endif
1875
1876 /*
1877  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1878  */
1879 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1880 {
1881         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1882                   u->tcp_timeout,
1883                   u->tcp_fin_timeout,
1884                   u->udp_timeout);
1885
1886 #ifdef CONFIG_IP_VS_PROTO_TCP
1887         if (u->tcp_timeout) {
1888                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1889                         = u->tcp_timeout * HZ;
1890         }
1891
1892         if (u->tcp_fin_timeout) {
1893                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1894                         = u->tcp_fin_timeout * HZ;
1895         }
1896 #endif
1897
1898 #ifdef CONFIG_IP_VS_PROTO_UDP
1899         if (u->udp_timeout) {
1900                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1901                         = u->udp_timeout * HZ;
1902         }
1903 #endif
1904         return 0;
1905 }
1906
1907
1908 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1909 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1910 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1911                                  sizeof(struct ip_vs_dest_user))
1912 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1913 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1914 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1915
1916 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1917         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1921         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1925         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1926         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1927         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1928 };
1929
1930 static int
1931 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1932 {
1933         int ret;
1934         unsigned char arg[MAX_ARG_LEN];
1935         struct ip_vs_service_user *usvc;
1936         struct ip_vs_service *svc;
1937         struct ip_vs_dest_user *udest;
1938
1939         if (!capable(CAP_NET_ADMIN))
1940                 return -EPERM;
1941
1942         if (len != set_arglen[SET_CMDID(cmd)]) {
1943                 IP_VS_ERR("set_ctl: len %u != %u\n",
1944                           len, set_arglen[SET_CMDID(cmd)]);
1945                 return -EINVAL;
1946         }
1947
1948         if (copy_from_user(arg, user, len) != 0)
1949                 return -EFAULT;
1950
1951         /* increase the module use count */
1952         ip_vs_use_count_inc();
1953
1954         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1955                 ret = -ERESTARTSYS;
1956                 goto out_dec;
1957         }
1958
1959         if (cmd == IP_VS_SO_SET_FLUSH) {
1960                 /* Flush the virtual service */
1961                 ret = ip_vs_flush();
1962                 goto out_unlock;
1963         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1964                 /* Set timeout values for (tcp tcpfin udp) */
1965                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1966                 goto out_unlock;
1967         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1968                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1970                 goto out_unlock;
1971         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1972                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1973                 ret = stop_sync_thread(dm->state);
1974                 goto out_unlock;
1975         }
1976
1977         usvc = (struct ip_vs_service_user *)arg;
1978         udest = (struct ip_vs_dest_user *)(usvc + 1);
1979
1980         if (cmd == IP_VS_SO_SET_ZERO) {
1981                 /* if no service address is set, zero counters in all */
1982                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1983                         ret = ip_vs_zero_all();
1984                         goto out_unlock;
1985                 }
1986         }
1987
1988         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1989         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1990                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1991                           usvc->protocol, NIPQUAD(usvc->addr),
1992                           ntohs(usvc->port), usvc->sched_name);
1993                 ret = -EFAULT;
1994                 goto out_unlock;
1995         }
1996
1997         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1998         if (usvc->fwmark == 0)
1999                 svc = __ip_vs_service_get(usvc->protocol,
2000                                           usvc->addr, usvc->port);
2001         else
2002                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2003
2004         if (cmd != IP_VS_SO_SET_ADD
2005             && (svc == NULL || svc->protocol != usvc->protocol)) {
2006                 ret = -ESRCH;
2007                 goto out_unlock;
2008         }
2009
2010         switch (cmd) {
2011         case IP_VS_SO_SET_ADD:
2012                 if (svc != NULL)
2013                         ret = -EEXIST;
2014                 else
2015                         ret = ip_vs_add_service(usvc, &svc);
2016                 break;
2017         case IP_VS_SO_SET_EDIT:
2018                 ret = ip_vs_edit_service(svc, usvc);
2019                 break;
2020         case IP_VS_SO_SET_DEL:
2021                 ret = ip_vs_del_service(svc);
2022                 if (!ret)
2023                         goto out_unlock;
2024                 break;
2025         case IP_VS_SO_SET_ZERO:
2026                 ret = ip_vs_zero_service(svc);
2027                 break;
2028         case IP_VS_SO_SET_ADDDEST:
2029                 ret = ip_vs_add_dest(svc, udest);
2030                 break;
2031         case IP_VS_SO_SET_EDITDEST:
2032                 ret = ip_vs_edit_dest(svc, udest);
2033                 break;
2034         case IP_VS_SO_SET_DELDEST:
2035                 ret = ip_vs_del_dest(svc, udest);
2036                 break;
2037         default:
2038                 ret = -EINVAL;
2039         }
2040
2041         if (svc)
2042                 ip_vs_service_put(svc);
2043
2044   out_unlock:
2045         mutex_unlock(&__ip_vs_mutex);
2046   out_dec:
2047         /* decrease the module use count */
2048         ip_vs_use_count_dec();
2049
2050         return ret;
2051 }
2052
2053
2054 static void
2055 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2056 {
2057         spin_lock_bh(&src->lock);
2058         memcpy(dst, src, (char*)&src->lock - (char*)src);
2059         spin_unlock_bh(&src->lock);
2060 }
2061
2062 static void
2063 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2064 {
2065         dst->protocol = src->protocol;
2066         dst->addr = src->addr;
2067         dst->port = src->port;
2068         dst->fwmark = src->fwmark;
2069         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2070         dst->flags = src->flags;
2071         dst->timeout = src->timeout / HZ;
2072         dst->netmask = src->netmask;
2073         dst->num_dests = src->num_dests;
2074         ip_vs_copy_stats(&dst->stats, &src->stats);
2075 }
2076
2077 static inline int
2078 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2079                             struct ip_vs_get_services __user *uptr)
2080 {
2081         int idx, count=0;
2082         struct ip_vs_service *svc;
2083         struct ip_vs_service_entry entry;
2084         int ret = 0;
2085
2086         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2087                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2088                         if (count >= get->num_services)
2089                                 goto out;
2090                         memset(&entry, 0, sizeof(entry));
2091                         ip_vs_copy_service(&entry, svc);
2092                         if (copy_to_user(&uptr->entrytable[count],
2093                                          &entry, sizeof(entry))) {
2094                                 ret = -EFAULT;
2095                                 goto out;
2096                         }
2097                         count++;
2098                 }
2099         }
2100
2101         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2102                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2103                         if (count >= get->num_services)
2104                                 goto out;
2105                         memset(&entry, 0, sizeof(entry));
2106                         ip_vs_copy_service(&entry, svc);
2107                         if (copy_to_user(&uptr->entrytable[count],
2108                                          &entry, sizeof(entry))) {
2109                                 ret = -EFAULT;
2110                                 goto out;
2111                         }
2112                         count++;
2113                 }
2114         }
2115   out:
2116         return ret;
2117 }
2118
2119 static inline int
2120 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2121                          struct ip_vs_get_dests __user *uptr)
2122 {
2123         struct ip_vs_service *svc;
2124         int ret = 0;
2125
2126         if (get->fwmark)
2127                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2128         else
2129                 svc = __ip_vs_service_get(get->protocol,
2130                                           get->addr, get->port);
2131         if (svc) {
2132                 int count = 0;
2133                 struct ip_vs_dest *dest;
2134                 struct ip_vs_dest_entry entry;
2135
2136                 list_for_each_entry(dest, &svc->destinations, n_list) {
2137                         if (count >= get->num_dests)
2138                                 break;
2139
2140                         entry.addr = dest->addr;
2141                         entry.port = dest->port;
2142                         entry.conn_flags = atomic_read(&dest->conn_flags);
2143                         entry.weight = atomic_read(&dest->weight);
2144                         entry.u_threshold = dest->u_threshold;
2145                         entry.l_threshold = dest->l_threshold;
2146                         entry.activeconns = atomic_read(&dest->activeconns);
2147                         entry.inactconns = atomic_read(&dest->inactconns);
2148                         entry.persistconns = atomic_read(&dest->persistconns);
2149                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2150                         if (copy_to_user(&uptr->entrytable[count],
2151                                          &entry, sizeof(entry))) {
2152                                 ret = -EFAULT;
2153                                 break;
2154                         }
2155                         count++;
2156                 }
2157                 ip_vs_service_put(svc);
2158         } else
2159                 ret = -ESRCH;
2160         return ret;
2161 }
2162
2163 static inline void
2164 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2165 {
2166 #ifdef CONFIG_IP_VS_PROTO_TCP
2167         u->tcp_timeout =
2168                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2169         u->tcp_fin_timeout =
2170                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2171 #endif
2172 #ifdef CONFIG_IP_VS_PROTO_UDP
2173         u->udp_timeout =
2174                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2175 #endif
2176 }
2177
2178
2179 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2180 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2181 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2182 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2183 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2184 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2185 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2186
2187 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2188         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2189         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2191         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2192         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2193         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2194         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2195 };
2196
2197 static int
2198 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2199 {
2200         unsigned char arg[128];
2201         int ret = 0;
2202
2203         if (!capable(CAP_NET_ADMIN))
2204                 return -EPERM;
2205
2206         if (*len < get_arglen[GET_CMDID(cmd)]) {
2207                 IP_VS_ERR("get_ctl: len %u < %u\n",
2208                           *len, get_arglen[GET_CMDID(cmd)]);
2209                 return -EINVAL;
2210         }
2211
2212         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2213                 return -EFAULT;
2214
2215         if (mutex_lock_interruptible(&__ip_vs_mutex))
2216                 return -ERESTARTSYS;
2217
2218         switch (cmd) {
2219         case IP_VS_SO_GET_VERSION:
2220         {
2221                 char buf[64];
2222
2223                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2224                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2225                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2226                         ret = -EFAULT;
2227                         goto out;
2228                 }
2229                 *len = strlen(buf)+1;
2230         }
2231         break;
2232
2233         case IP_VS_SO_GET_INFO:
2234         {
2235                 struct ip_vs_getinfo info;
2236                 info.version = IP_VS_VERSION_CODE;
2237                 info.size = IP_VS_CONN_TAB_SIZE;
2238                 info.num_services = ip_vs_num_services;
2239                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2240                         ret = -EFAULT;
2241         }
2242         break;
2243
2244         case IP_VS_SO_GET_SERVICES:
2245         {
2246                 struct ip_vs_get_services *get;
2247                 int size;
2248
2249                 get = (struct ip_vs_get_services *)arg;
2250                 size = sizeof(*get) +
2251                         sizeof(struct ip_vs_service_entry) * get->num_services;
2252                 if (*len != size) {
2253                         IP_VS_ERR("length: %u != %u\n", *len, size);
2254                         ret = -EINVAL;
2255                         goto out;
2256                 }
2257                 ret = __ip_vs_get_service_entries(get, user);
2258         }
2259         break;
2260
2261         case IP_VS_SO_GET_SERVICE:
2262         {
2263                 struct ip_vs_service_entry *entry;
2264                 struct ip_vs_service *svc;
2265
2266                 entry = (struct ip_vs_service_entry *)arg;
2267                 if (entry->fwmark)
2268                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2269                 else
2270                         svc = __ip_vs_service_get(entry->protocol,
2271                                                   entry->addr, entry->port);
2272                 if (svc) {
2273                         ip_vs_copy_service(entry, svc);
2274                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2275                                 ret = -EFAULT;
2276                         ip_vs_service_put(svc);
2277                 } else
2278                         ret = -ESRCH;
2279         }
2280         break;
2281
2282         case IP_VS_SO_GET_DESTS:
2283         {
2284                 struct ip_vs_get_dests *get;
2285                 int size;
2286
2287                 get = (struct ip_vs_get_dests *)arg;
2288                 size = sizeof(*get) +
2289                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2290                 if (*len != size) {
2291                         IP_VS_ERR("length: %u != %u\n", *len, size);
2292                         ret = -EINVAL;
2293                         goto out;
2294                 }
2295                 ret = __ip_vs_get_dest_entries(get, user);
2296         }
2297         break;
2298
2299         case IP_VS_SO_GET_TIMEOUT:
2300         {
2301                 struct ip_vs_timeout_user t;
2302
2303                 __ip_vs_get_timeouts(&t);
2304                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2305                         ret = -EFAULT;
2306         }
2307         break;
2308
2309         case IP_VS_SO_GET_DAEMON:
2310         {
2311                 struct ip_vs_daemon_user d[2];
2312
2313                 memset(&d, 0, sizeof(d));
2314                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2315                         d[0].state = IP_VS_STATE_MASTER;
2316                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2317                         d[0].syncid = ip_vs_master_syncid;
2318                 }
2319                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2320                         d[1].state = IP_VS_STATE_BACKUP;
2321                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2322                         d[1].syncid = ip_vs_backup_syncid;
2323                 }
2324                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2325                         ret = -EFAULT;
2326         }
2327         break;
2328
2329         default:
2330                 ret = -EINVAL;
2331         }
2332
2333   out:
2334         mutex_unlock(&__ip_vs_mutex);
2335         return ret;
2336 }
2337
2338
2339 static struct nf_sockopt_ops ip_vs_sockopts = {
2340         .pf             = PF_INET,
2341         .set_optmin     = IP_VS_BASE_CTL,
2342         .set_optmax     = IP_VS_SO_SET_MAX+1,
2343         .set            = do_ip_vs_set_ctl,
2344         .get_optmin     = IP_VS_BASE_CTL,
2345         .get_optmax     = IP_VS_SO_GET_MAX+1,
2346         .get            = do_ip_vs_get_ctl,
2347 };
2348
2349
2350 int ip_vs_control_init(void)
2351 {
2352         int ret;
2353         int idx;
2354
2355         EnterFunction(2);
2356
2357         ret = nf_register_sockopt(&ip_vs_sockopts);
2358         if (ret) {
2359                 IP_VS_ERR("cannot register sockopt.\n");
2360                 return ret;
2361         }
2362
2363         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2364         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2365
2366         sysctl_header = register_sysctl_table(vs_root_table, 0);
2367
2368         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2369         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2370                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2371                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2372         }
2373         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2374                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2375         }
2376
2377         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2378         spin_lock_init(&ip_vs_stats.lock);
2379         ip_vs_new_estimator(&ip_vs_stats);
2380
2381         /* Hook the defense timer */
2382         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2383
2384         LeaveFunction(2);
2385         return 0;
2386 }
2387
2388
2389 void ip_vs_control_cleanup(void)
2390 {
2391         EnterFunction(2);
2392         ip_vs_trash_cleanup();
2393         cancel_rearming_delayed_work(&defense_work);
2394         ip_vs_kill_estimator(&ip_vs_stats);
2395         unregister_sysctl_table(sysctl_header);
2396         proc_net_remove("ip_vs_stats");
2397         proc_net_remove("ip_vs");
2398         nf_unregister_sockopt(&ip_vs_sockopts);
2399         LeaveFunction(2);
2400 }