2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/timer.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
40 #include <asm/uaccess.h>
42 #include <net/ip_vs.h>
/*
 * File-scope locks, defense-mode state and sysctl-backed tunables used by
 * the functions in this file.
 */
44 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45 static DECLARE_MUTEX(__ip_vs_mutex);
47 /* lock for service table */
48 static rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;
50 /* lock for table with the real services */
51 static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;
53 /* lock for state and timeout tables */
54 static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;
56 /* lock for drop entry handling */
57 static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;
59 /* lock for drop packet handling */
60 static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;
62 /* 1/rate drop and drop-entry variables */
63 int ip_vs_drop_rate = 0;
64 int ip_vs_drop_counter = 0;
65 atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67 /* number of virtual services */
68 static int ip_vs_num_services = 0;
70 /* sysctl variables */
/* drop_entry, drop_packet and secure_tcp are also rewritten (to 1 or 2) by update_defense_level() */
71 static int sysctl_ip_vs_drop_entry = 0;
72 static int sysctl_ip_vs_drop_packet = 0;
73 static int sysctl_ip_vs_secure_tcp = 0;
/* threshold compared against free + buffered memory (in pages) in update_defense_level() */
74 static int sysctl_ip_vs_amemthresh = 1024;
/* copied into ip_vs_drop_rate by update_defense_level() */
75 static int sysctl_ip_vs_am_droprate = 10;
76 int sysctl_ip_vs_cache_bypass = 0;
77 int sysctl_ip_vs_expire_nodest_conn = 0;
/* proc_do_sync_threshold() rejects writes with a negative element or with [0] >= [1] */
78 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
79 int sysctl_ip_vs_nat_icmp_send = 0;
/*
 * Debug-only section (compiled under CONFIG_IP_VS_DEBUG): the debug level
 * variable and an accessor that returns its current value.
 * NOTE(review): the function braces and the closing #endif are not present
 * in this listing.
 */
82 #ifdef CONFIG_IP_VS_DEBUG
83 static int sysctl_ip_vs_debug_level = 0;
85 int ip_vs_get_debug_level(void)
87 return sysctl_ip_vs_debug_level;
92 * update_defense_level is called from timer bh and from sysctl.
/*
 * Recomputes the three defense strategies from the amount of available
 * memory (free + buffered pages, compared against sysctl_ip_vs_amemthresh):
 *  - under __ip_vs_dropentry_lock, sets the ip_vs_dropentry flag and may
 *    rewrite sysctl_ip_vs_drop_entry to 1 or 2;
 *  - under __ip_vs_droppacket_lock, sets ip_vs_drop_rate (and in two
 *    branches ip_vs_drop_counter) and may rewrite sysctl_ip_vs_drop_packet
 *    to 1 or 2;
 *  - under the write side of __ip_vs_securetcp_lock, may rewrite
 *    sysctl_ip_vs_secure_tcp to 1 or 2 and records it in old_secure_tcp.
 * NOTE(review): the local declarations, the call that fills 'i', the case
 * labels, the tests of 'nomem' and all braces are not present in this
 * listing, so the exact state transitions cannot be confirmed from here.
 */
94 static void update_defense_level(void)
97 static int old_secure_tcp = 0;
102 /* we only count free and buffered memory (in pages) */
104 availmem = i.freeram + i.bufferram;
105 /* however in linux 2.5 the i.bufferram is total page cache size,
107 /* si_swapinfo(&i); */
108 /* availmem = availmem - (i.totalswap - i.freeswap); */
110 nomem = (availmem < sysctl_ip_vs_amemthresh);
113 spin_lock(&__ip_vs_dropentry_lock);
114 switch (sysctl_ip_vs_drop_entry) {
116 atomic_set(&ip_vs_dropentry, 0);
120 atomic_set(&ip_vs_dropentry, 1);
121 sysctl_ip_vs_drop_entry = 2;
123 atomic_set(&ip_vs_dropentry, 0);
128 atomic_set(&ip_vs_dropentry, 1);
130 atomic_set(&ip_vs_dropentry, 0);
131 sysctl_ip_vs_drop_entry = 1;
135 atomic_set(&ip_vs_dropentry, 1);
138 spin_unlock(&__ip_vs_dropentry_lock);
141 spin_lock(&__ip_vs_droppacket_lock);
142 switch (sysctl_ip_vs_drop_packet) {
/* NOTE(review): the divisor below is positive only while availmem < sysctl_ip_vs_amemthresh; confirm the (missing) guard tests nomem */
148 ip_vs_drop_rate = ip_vs_drop_counter
149 = sysctl_ip_vs_amemthresh /
150 (sysctl_ip_vs_amemthresh-availmem);
151 sysctl_ip_vs_drop_packet = 2;
158 ip_vs_drop_rate = ip_vs_drop_counter
159 = sysctl_ip_vs_amemthresh /
160 (sysctl_ip_vs_amemthresh-availmem);
163 sysctl_ip_vs_drop_packet = 1;
167 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
170 spin_unlock(&__ip_vs_droppacket_lock);
173 write_lock(&__ip_vs_securetcp_lock);
174 switch (sysctl_ip_vs_secure_tcp) {
176 if (old_secure_tcp >= 2)
181 if (old_secure_tcp < 2)
183 sysctl_ip_vs_secure_tcp = 2;
185 if (old_secure_tcp >= 2)
191 if (old_secure_tcp < 2)
194 if (old_secure_tcp >= 2)
196 sysctl_ip_vs_secure_tcp = 1;
200 if (old_secure_tcp < 2)
204 old_secure_tcp = sysctl_ip_vs_secure_tcp;
/* argument is non-zero when secure_tcp is greater than 1; NOTE(review): the line just before this call is missing, so the call may be conditional */
206 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
207 write_unlock(&__ip_vs_securetcp_lock);
212 * Timer for checking the defense
/* Timer object re-armed by defense_timer_handler() on every run. */
214 static struct timer_list defense_timer;
/*
 * Interval, in jiffies, between two runs of the defense timer (1*HZ).
 * The expansion is parenthesized so the macro behaves as a single operand
 * when used inside a larger expression such as a division.
 */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
/*
 * Timer callback for defense_timer: refreshes the defense level, calls
 * ip_vs_random_dropentry() when the ip_vs_dropentry flag is non-zero, and
 * re-arms the timer DEFENSE_TIMER_PERIOD jiffies from now.  The 'data'
 * argument is not used in the visible lines.
 * NOTE(review): the braces are not present in this listing.
 */
217 static void defense_timer_handler(unsigned long data)
219 update_defense_level();
220 if (atomic_read(&ip_vs_dropentry))
221 ip_vs_random_dropentry();
223 mod_timer(&defense_timer, jiffies + DEFENSE_TIMER_PERIOD);
/*
 * Tries to take a reference on this module and returns the result of
 * try_module_get(THIS_MODULE).
 * NOTE(review): the return-type line and braces are not in this listing.
 */
228 ip_vs_use_count_inc(void)
230 return try_module_get(THIS_MODULE);
/*
 * Drops a reference on this module via module_put(THIS_MODULE).
 * NOTE(review): the return-type line and braces are not in this listing.
 */
234 ip_vs_use_count_dec(void)
236 module_put(THIS_MODULE);
241 * Hash table: for virtual service lookups
/*
 * Sizes and storage of the lookup structures used in this file: two
 * virtual-service hash tables (2^8 buckets each), the real-service hash
 * table (2^4 buckets), the destination trash list, and counters of
 * services registered on FTPPORT and on port 0.
 */
243 #define IP_VS_SVC_TAB_BITS 8
244 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
245 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
247 /* the service table hashed by <protocol, addr, port> */
248 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
249 /* the service table hashed by fwmark */
250 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
253 * Hash table: for real service lookups
255 #define IP_VS_RTAB_BITS 4
256 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
257 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
259 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
262 * Trash for destinations
264 static LIST_HEAD(ip_vs_dest_trash);
267 * FTP & NULL virtual service counters
/* incremented/decremented in ip_vs_add_service() and __ip_vs_del_service(); read in ip_vs_service_get() */
269 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
270 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
274 * Returns hash value for virtual service
/*
 * Bucket index for a <protocol, address, port> triple: XORs the protocol,
 * the address and the port (both converted to host byte order) together
 * with the port shifted right by IP_VS_SVC_TAB_BITS, then masks the result
 * with IP_VS_SVC_TAB_MASK.
 * NOTE(review): the braces are not present in this listing.
 */
276 static __inline__ unsigned
277 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
279 register unsigned porth = ntohs(port);
281 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
282 & IP_VS_SVC_TAB_MASK;
286 * Returns hash value of fwmark for virtual service lookup
/*
 * Bucket index for a firewall mark: the bits of fwmark selected by
 * IP_VS_SVC_TAB_MASK.
 * NOTE(review): the braces are not present in this listing.
 */
288 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
290 return fwmark & IP_VS_SVC_TAB_MASK;
294 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
295 * or in the ip_vs_svc_fwm_table by fwmark.
296 * Should be called with locked tables.
/*
 * Links a service into ip_vs_svc_table (when svc->fwmark is 0) or into
 * ip_vs_svc_fwm_table (otherwise), sets IP_VS_SVC_F_HASHED and takes a
 * reference on svc->refcnt for the table.  A service that already carries
 * IP_VS_SVC_F_HASHED triggers an error log instead.
 * NOTE(review): the declaration of 'hash', the else keyword, the return
 * statements and the braces are not present in this listing.
 */
298 static int ip_vs_svc_hash(struct ip_vs_service *svc)
302 if (svc->flags & IP_VS_SVC_F_HASHED) {
303 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
304 "called from %p\n", __builtin_return_address(0));
308 if (svc->fwmark == 0) {
310 * Hash it by <protocol,addr,port> in ip_vs_svc_table
312 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
313 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
316 * Hash it by fwmark in ip_vs_svc_fwm_table
318 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
319 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
322 svc->flags |= IP_VS_SVC_F_HASHED;
323 /* increase its refcnt because it is referenced by the svc table */
324 atomic_inc(&svc->refcnt);
330 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
331 * Should be called with locked tables.
/*
 * Reverse of ip_vs_svc_hash(): unlinks the service from the table chosen
 * by svc->fwmark (s_list when it is 0, f_list otherwise), clears
 * IP_VS_SVC_F_HASHED and drops the table's reference on svc->refcnt.
 * A service without IP_VS_SVC_F_HASHED triggers an error log instead.
 * NOTE(review): the else keyword, the return statements and the braces
 * are not present in this listing.
 */
333 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
335 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
336 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
337 "called from %p\n", __builtin_return_address(0));
341 if (svc->fwmark == 0) {
342 /* Remove it from the ip_vs_svc_table table */
343 list_del(&svc->s_list);
345 /* Remove it from the ip_vs_svc_fwm_table table */
346 list_del(&svc->f_list);
349 svc->flags &= ~IP_VS_SVC_F_HASHED;
350 atomic_dec(&svc->refcnt);
356 * Get service by {proto,addr,port} in the service table.
/*
 * Walks the ip_vs_svc_table bucket selected by <protocol, vaddr, vport>
 * and, for an entry whose addr, port and protocol all match, increments
 * its usecnt.  No locking is done in the visible lines.
 * NOTE(review): the declaration of 'hash', the return statements and the
 * braces are not present in this listing.
 */
358 static __inline__ struct ip_vs_service *
359 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
362 struct ip_vs_service *svc;
364 /* Check for "full" addressed entries */
365 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
367 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
368 if ((svc->addr == vaddr)
369 && (svc->port == vport)
370 && (svc->protocol == protocol)) {
372 atomic_inc(&svc->usecnt);
382 * Get service by {fwmark} in the service table.
/*
 * Walks the ip_vs_svc_fwm_table bucket selected by fwmark and, for an
 * entry with the same fwmark, increments its usecnt.  No locking is done
 * in the visible lines.
 * NOTE(review): the declaration of 'hash', the return statements and the
 * braces are not present in this listing.
 */
384 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
387 struct ip_vs_service *svc;
389 /* Check for fwmark addressed entries */
390 hash = ip_vs_svc_fwm_hashkey(fwmark);
392 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
393 if (svc->fwmark == fwmark) {
395 atomic_inc(&svc->usecnt);
/*
 * Looks up a virtual service while holding the read side of
 * __ip_vs_svc_lock.  The visible lookups, in order:
 *  1. the fwmark table, when fwmark is non-zero;
 *  2. the <protocol, vaddr, vport> table;
 *  3. the FTPPORT entry for vaddr, for TCP when ip_vs_ftpsvc_counter is
 *     non-zero and vport is FTPDATA or >= PROT_SOCK;
 *  4. the port-zero entry for vaddr, when ip_vs_nullsvc_counter is
 *     non-zero.
 * The helpers increment usecnt of the entry they match.  The outcome is
 * reported through IP_VS_DBG after the lock is released.
 * NOTE(review): the first line of the conditions for steps 3 and 4, any
 * goto/label, the return statement and the braces are not present in this
 * listing.
 */
403 struct ip_vs_service *
404 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
406 struct ip_vs_service *svc;
408 read_lock(&__ip_vs_svc_lock);
411 * Check the table hashed by fwmark first
413 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
417 * Check the table hashed by <protocol,addr,port>
418 * for "full" addressed entries
420 svc = __ip_vs_service_get(protocol, vaddr, vport);
423 && protocol == IPPROTO_TCP
424 && atomic_read(&ip_vs_ftpsvc_counter)
425 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
427 * Check if ftp service entry exists, the packet
428 * might belong to FTP data connections.
430 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
434 && atomic_read(&ip_vs_nullsvc_counter)) {
436 * Check if the catch-all port (port zero) exists
438 svc = __ip_vs_service_get(protocol, vaddr, 0);
442 read_unlock(&__ip_vs_svc_lock);
444 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
445 fwmark, ip_vs_proto_name(protocol),
446 NIPQUAD(vaddr), ntohs(vport),
447 svc?"hit":"not hit");
/*
 * Takes a reference on svc->refcnt on behalf of dest.
 * NOTE(review): a line after the increment is missing from this listing —
 * presumably it stores svc in dest->svc, which __ip_vs_unbind_svc() reads;
 * confirm.  The return-type line and braces are missing as well.
 */
454 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
456 atomic_inc(&svc->refcnt);
/*
 * Drops the reference that dest holds on its service (dest->svc).
 * NOTE(review): the statement executed when the count reaches zero, and
 * two lines between the declaration and the test, are not present in this
 * listing; neither are the return-type line and the braces.
 */
461 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
463 struct ip_vs_service *svc = dest->svc;
466 if (atomic_dec_and_test(&svc->refcnt))
472 * Returns hash value for real service
/*
 * Hash of a real-server <address, port> pair: XORs the address and port
 * (both in host byte order) with the port shifted right by IP_VS_RTAB_BITS.
 * NOTE(review): the return expression continues on a line that is missing
 * from this listing (presumably a mask with IP_VS_RTAB_MASK — confirm);
 * the braces are missing as well.
 */
474 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
476 register unsigned porth = ntohs(port);
478 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
483 * Hashes ip_vs_dest in ip_vs_rtable by <addr,port>; the protocol is not part of the hash key.
484 * should be called with locked tables.
/*
 * Adds dest to the ip_vs_rtable bucket computed from dest->addr and
 * dest->port.  A dest whose d_list is already non-empty takes a separate
 * path whose statements are not shown here.
 * NOTE(review): the declaration of 'hash', the return statements and the
 * braces are not present in this listing.
 */
486 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
490 if (!list_empty(&dest->d_list)) {
495 * Hash by proto,addr,port,
496 * which are the parameters of the real service.
498 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
499 list_add(&dest->d_list, &ip_vs_rtable[hash]);
505 * UNhashes ip_vs_dest from ip_vs_rtable.
506 * should be called with locked tables.
/*
 * Removes dest from ip_vs_rtable when it is linked (d_list non-empty) and
 * re-initializes d_list so that a later list_empty() test sees it as
 * unlinked.
 * NOTE(review): the return statement and braces are not in this listing.
 */
508 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
511 * Remove it from the ip_vs_rtable table.
513 if (!list_empty(&dest->d_list)) {
514 list_del(&dest->d_list);
515 INIT_LIST_HEAD(&dest->d_list);
522 * Lookup real service by <proto,addr,port> in the real service table.
/*
 * Searches the ip_vs_rtable bucket for <daddr, dport> under the read side
 * of __ip_vs_rs_lock for a dest with matching addr and port and either a
 * matching protocol or a second alternative that is not visible here.
 * The lock is released at two points, which suggests one exit on a match
 * and one after the loop.
 * NOTE(review): the return-type line, the declaration of 'hash', the end
 * of the match condition, the return statements and the braces are not
 * present in this listing.
 */
525 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
528 struct ip_vs_dest *dest;
531 * Check for "full" addressed entries
532 * Return the first found entry
534 hash = ip_vs_rs_hashkey(daddr, dport);
536 read_lock(&__ip_vs_rs_lock);
537 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
538 if ((dest->addr == daddr)
539 && (dest->port == dport)
540 && ((dest->protocol == protocol) ||
543 read_unlock(&__ip_vs_rs_lock);
547 read_unlock(&__ip_vs_rs_lock);
553 * Lookup destination by {addr,port} in the given service
/*
 * Scans svc->destinations for a dest whose addr and port equal daddr and
 * dport.  No locking is done in the visible lines.
 * NOTE(review): the return statements and braces are not in this listing.
 */
555 static struct ip_vs_dest *
556 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
558 struct ip_vs_dest *dest;
561 * Find the destination for the given service
563 list_for_each_entry(dest, &svc->destinations, n_list) {
564 if ((dest->addr == daddr) && (dest->port == dport)) {
575 * Lookup dest by {svc,addr,port} in the destination trash.
576 * The destination trash is used to hold the destinations that are removed
577 * from the service table but are still referenced by some conn entries.
578 * The reason for having the destination trash is that when a dest is temporarily
579 * down (either by administrator or by monitor program), the dest can be
580 * picked back from the trash, the remaining connections to the dest can
581 * continue, and the counting information of the dest is also useful for
/*
 * Walks ip_vs_dest_trash looking for a dest that matches daddr/dport and
 * the service's fwmark and protocol (plus a virtual address/port test that
 * is only partly visible).  While walking, an entry whose refcnt is 1 is
 * unlinked from the trash, has ip_vs_dst_reset() called on it and is
 * unbound from its service.
 * NOTE(review): part of the match condition, the debug-message arguments,
 * whatever follows __ip_vs_unbind_svc(), the return statements and the
 * braces are not present in this listing.
 */
584 static struct ip_vs_dest *
585 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
587 struct ip_vs_dest *dest, *nxt;
590 * Find the destination in trash
592 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
593 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
596 NIPQUAD(dest->addr), ntohs(dest->port),
597 atomic_read(&dest->refcnt));
598 if (dest->addr == daddr &&
599 dest->port == dport &&
600 dest->vfwmark == svc->fwmark &&
601 dest->protocol == svc->protocol &&
603 (dest->vaddr == svc->addr &&
604 dest->vport == svc->port))) {
610 * Try to purge the destination from trash if not referenced
612 if (atomic_read(&dest->refcnt) == 1) {
613 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
616 NIPQUAD(dest->addr), ntohs(dest->port));
617 list_del(&dest->n_list);
618 ip_vs_dst_reset(dest);
619 __ip_vs_unbind_svc(dest);
629 * Clean up all the destinations in the trash
630 * Called by the ip_vs_control_cleanup()
632 * When ip_vs_control_cleanup() is activated by ipvs module exit,
633 * the service tables must have been flushed and all the connections
634 * are expired, and the refcnt of each destination in the trash must
635 * be 1, so we simply release them here.
/*
 * Empties ip_vs_dest_trash: every entry is unlinked, has
 * ip_vs_dst_reset() called on it and is unbound from its service.
 * NOTE(review): whatever follows __ip_vs_unbind_svc() inside the loop and
 * the braces are not present in this listing.
 */
637 static void ip_vs_trash_cleanup(void)
639 struct ip_vs_dest *dest, *nxt;
641 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
642 list_del(&dest->n_list);
643 ip_vs_dst_reset(dest);
644 __ip_vs_unbind_svc(dest);
/*
 * Zeroes the leading part of *stats — every byte that precedes the 'lock'
 * member — while holding stats->lock with bottom halves disabled, then
 * calls ip_vs_zero_estimator() on it.
 * NOTE(review): the return-type line and braces are not in this listing.
 */
651 ip_vs_zero_stats(struct ip_vs_stats *stats)
653 spin_lock_bh(&stats->lock);
654 memset(stats, 0, (char *)&stats->lock - (char *)stats);
655 spin_unlock_bh(&stats->lock);
656 ip_vs_zero_estimator(stats);
660 * Update a destination in the given service
/*
 * Applies the user-supplied settings in udest to dest:
 *  - stores the weight;
 *  - builds conn_flags from udest->conn_flags with IP_VS_CONN_F_INACTIVE
 *    set, switches the forwarding method to IP_VS_CONN_F_LOCALNODE when
 *    udest->addr is a local address, and adds IP_VS_CONN_F_NOOUTPUT when
 *    the forwarding bits are non-zero;
 *  - takes and releases __ip_vs_rs_lock around a statement that is not
 *    shown here (per the in-body comment, hashing the dest into
 *    ip_vs_rtable);
 *  - binds dest to svc, zeroing the dest statistics when it was bound to
 *    a different service;
 *  - marks the dest available, clears IP_VS_DEST_F_OVERLOAD when the new
 *    upper threshold is 0 or larger than the old one, and stores both
 *    thresholds.
 * NOTE(review): the return-type line, the declaration of conn_flags, the
 * else keywords, the test guarding the first bind, the statement inside
 * the rs lock and the braces are not present in this listing.
 */
663 __ip_vs_update_dest(struct ip_vs_service *svc,
664 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
668 /* set the weight and the flags */
669 atomic_set(&dest->weight, udest->weight);
670 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
672 /* check if local node and update the flags */
673 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
674 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
675 | IP_VS_CONN_F_LOCALNODE;
678 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
679 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
680 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
683 * Put the real service in ip_vs_rtable if not present.
684 * For now only for NAT!
686 write_lock_bh(&__ip_vs_rs_lock);
688 write_unlock_bh(&__ip_vs_rs_lock);
690 atomic_set(&dest->conn_flags, conn_flags);
692 /* bind the service */
694 __ip_vs_bind_svc(dest, svc);
696 if (dest->svc != svc) {
697 __ip_vs_unbind_svc(dest);
698 ip_vs_zero_stats(&dest->stats);
699 __ip_vs_bind_svc(dest, svc);
703 /* set the dest status flags */
704 dest->flags |= IP_VS_DEST_F_AVAILABLE;
706 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
707 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
708 dest->u_threshold = udest->u_threshold;
709 dest->l_threshold = udest->l_threshold;
714 * Create a destination for the given service
/*
 * Allocates and initializes a destination for svc from udest: checks that
 * udest->addr is a local or unicast address, allocates the structure with
 * GFP_ATOMIC and zeroes it, copies the service and destination identity
 * fields, resets the connection counters and refcnt to 0, initializes the
 * list head and locks, applies udest via __ip_vs_update_dest() and starts
 * a rate estimator for the dest statistics.
 * NOTE(review): the return-type line, the local declarations, the error
 * returns, the NULL test on the allocation, the store through dest_p and
 * the braces are not present in this listing.
 */
717 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
718 struct ip_vs_dest **dest_p)
720 struct ip_vs_dest *dest;
725 atype = inet_addr_type(udest->addr);
726 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
729 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
731 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
734 memset(dest, 0, sizeof(struct ip_vs_dest));
736 dest->protocol = svc->protocol;
737 dest->vaddr = svc->addr;
738 dest->vport = svc->port;
739 dest->vfwmark = svc->fwmark;
740 dest->addr = udest->addr;
741 dest->port = udest->port;
743 atomic_set(&dest->activeconns, 0);
744 atomic_set(&dest->inactconns, 0);
745 atomic_set(&dest->persistconns, 0);
746 atomic_set(&dest->refcnt, 0);
748 INIT_LIST_HEAD(&dest->d_list);
749 dest->dst_lock = SPIN_LOCK_UNLOCKED;
750 dest->stats.lock = SPIN_LOCK_UNLOCKED;
751 __ip_vs_update_dest(svc, dest, udest);
752 ip_vs_new_estimator(&dest->stats);
762 * Add a destination into an existing service
/*
 * Adds the destination described by udest to svc.  Visible steps:
 *  - logs an error for a negative weight or for l_threshold > u_threshold;
 *  - looks the dest up in svc->destinations (an existing one is reported
 *    through IP_VS_DBG);
 *  - looks the dest up in the trash; a hit is updated from udest, removed
 *    from the trash, given a new estimator and linked into
 *    svc->destinations;
 *  - otherwise allocates a new dest with ip_vs_new_dest(), takes a
 *    reference on it and links it into svc->destinations.
 * Both link paths hold the write side of __ip_vs_svc_lock, wait until
 * svc->usecnt is no longer above 1, and call the scheduler's
 * update_service() before unlocking.
 * NOTE(review): the return-type line, the declaration of 'ret', the tests
 * on the lookup results, all return statements, one line after each
 * list_add() and the braces are not present in this listing.
 */
765 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
767 struct ip_vs_dest *dest;
768 __u32 daddr = udest->addr;
769 __u16 dport = udest->port;
774 if (udest->weight < 0) {
775 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
779 if (udest->l_threshold > udest->u_threshold) {
780 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
781 "upper threshold\n");
786 * Check if the dest already exists in the list
788 dest = ip_vs_lookup_dest(svc, daddr, dport);
790 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
795 * Check if the dest already exists in the trash and
796 * is from the same service
798 dest = ip_vs_trash_get_dest(svc, daddr, dport);
800 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
801 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
802 NIPQUAD(daddr), ntohs(dport),
803 atomic_read(&dest->refcnt),
805 NIPQUAD(dest->vaddr),
807 __ip_vs_update_dest(svc, dest, udest);
810 * Get the destination from the trash
812 list_del(&dest->n_list);
814 ip_vs_new_estimator(&dest->stats);
816 write_lock_bh(&__ip_vs_svc_lock);
819 * Wait until all other svc users go away.
821 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
823 list_add(&dest->n_list, &svc->destinations);
826 /* call the update_service function of its scheduler */
827 svc->scheduler->update_service(svc);
829 write_unlock_bh(&__ip_vs_svc_lock);
834 * Allocate and initialize the dest structure
836 ret = ip_vs_new_dest(svc, udest, &dest);
842 * Add the dest entry into the list
844 atomic_inc(&dest->refcnt);
846 write_lock_bh(&__ip_vs_svc_lock);
849 * Wait until all other svc users go away.
851 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
853 list_add(&dest->n_list, &svc->destinations);
856 /* call the update_service function of its scheduler */
857 svc->scheduler->update_service(svc);
859 write_unlock_bh(&__ip_vs_svc_lock);
868 * Edit a destination in the given service
/*
 * Updates an existing destination of svc from udest: logs an error for a
 * negative weight or for l_threshold > u_threshold, looks the dest up in
 * svc->destinations (a miss is reported through IP_VS_DBG), applies udest
 * with __ip_vs_update_dest(), then — holding the write side of
 * __ip_vs_svc_lock and after svc->usecnt is no longer above 1 — calls the
 * scheduler's update_service().
 * NOTE(review): the return-type line, the test on the lookup result, the
 * return statements and the braces are not present in this listing.
 */
871 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
873 struct ip_vs_dest *dest;
874 __u32 daddr = udest->addr;
875 __u16 dport = udest->port;
879 if (udest->weight < 0) {
880 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
884 if (udest->l_threshold > udest->u_threshold) {
885 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
886 "upper threshold\n");
891 * Lookup the destination list
893 dest = ip_vs_lookup_dest(svc, daddr, dport);
895 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
899 __ip_vs_update_dest(svc, dest, udest);
901 write_lock_bh(&__ip_vs_svc_lock);
903 /* Wait until all other svc users go away */
/* NOTE(review): open-coded busy-wait; the other functions in this file use IP_VS_WAIT_WHILE() for the same condition — consider using it here for consistency */
904 while (atomic_read(&svc->usecnt) > 1) {};
906 /* call the update_service, because server weight may be changed */
907 svc->scheduler->update_service(svc);
909 write_unlock_bh(&__ip_vs_svc_lock);
918 * Delete a destination (must be already unlinked from the service)
/*
 * Final teardown of a destination that is already unlinked from its
 * service: stops its estimator, unhashes it from ip_vs_rtable under the
 * write side of __ip_vs_rs_lock, then drops one reference.  When that was
 * the last reference, ip_vs_dst_reset() is called and the owning
 * service's refcnt is decremented; the remaining visible statements log
 * the dest, add it to ip_vs_dest_trash and take a reference for the trash.
 * NOTE(review): the statement that frees the dest, the else keyword
 * separating the two outcomes and the braces are not present in this
 * listing.
 */
920 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
922 ip_vs_kill_estimator(&dest->stats);
925 * Remove it from the d-linked list with the real services.
927 write_lock_bh(&__ip_vs_rs_lock);
928 ip_vs_rs_unhash(dest);
929 write_unlock_bh(&__ip_vs_rs_lock);
932 * Decrease the refcnt of the dest, and free the dest
933 * if nobody refers to it (refcnt=0). Otherwise, throw
934 * the destination into the trash.
936 if (atomic_dec_and_test(&dest->refcnt)) {
937 ip_vs_dst_reset(dest);
938 /* simply decrease svc->refcnt here, let the caller check
939 and release the service if nobody refers to it.
940 Only user context can release destination and service,
941 and only one user context can update virtual service at a
942 time, so the operation here is OK */
943 atomic_dec(&dest->svc->refcnt);
946 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
947 NIPQUAD(dest->addr), ntohs(dest->port),
948 atomic_read(&dest->refcnt));
949 list_add(&dest->n_list, &ip_vs_dest_trash);
950 atomic_inc(&dest->refcnt);
956 * Unlink a destination from the given service
/*
 * Detaches dest from svc: clears IP_VS_DEST_F_AVAILABLE, removes the dest
 * from the service's destination list and calls the scheduler's
 * update_service().
 * NOTE(review): callers pass a third argument (1 from ip_vs_del_dest(),
 * 0 from __ip_vs_del_service()); its declaration, any test of it, two
 * lines after list_del() and the braces are not present in this listing,
 * so the update_service() call may be conditional.
 */
958 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
959 struct ip_vs_dest *dest,
962 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
965 * Remove it from the d-linked destination list.
967 list_del(&dest->n_list);
971 * Call the update_service function of its scheduler
973 svc->scheduler->update_service(svc);
979 * Delete a destination server in the given service
/*
 * Removes the destination <udest->addr, udest->port> from svc: looks it
 * up (a miss is reported through IP_VS_DBG), unlinks it while holding the
 * write side of __ip_vs_svc_lock once svc->usecnt is no longer above 1,
 * and then deletes it with __ip_vs_del_dest() after the lock is released.
 * NOTE(review): the return-type line, the test on the lookup result, the
 * return statements and the braces are not present in this listing.
 */
982 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
984 struct ip_vs_dest *dest;
985 __u32 daddr = udest->addr;
986 __u16 dport = udest->port;
990 dest = ip_vs_lookup_dest(svc, daddr, dport);
992 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
996 write_lock_bh(&__ip_vs_svc_lock);
999 * Wait until all other svc users go away.
1001 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1004 * Unlink dest from the service
1006 __ip_vs_unlink_dest(svc, dest, 1);
1008 write_unlock_bh(&__ip_vs_svc_lock);
1011 * Delete the destination
1013 __ip_vs_del_dest(dest);
1022 * Add a service into the service hash table
/*
 * Creates a virtual service from the user description u.  Visible steps:
 * takes a module reference, resolves the scheduler named u->sched_name,
 * allocates the service with GFP_ATOMIC and zeroes it, sets usecnt to 1
 * and refcnt to 0, copies the identity, flags, timeout (u->timeout * HZ)
 * and netmask, initializes the destination list and locks, binds the
 * scheduler, bumps the FTP or port-zero counter when svc->port is FTPPORT
 * or 0, starts an estimator, increments ip_vs_num_services and hashes the
 * service under the write side of __ip_vs_svc_lock.  The trailing
 * statements (unbind scheduler, ip_vs_app_inc_put, scheduler put, module
 * put) look like the error-unwind path.
 * NOTE(review): the return-type line, the declaration of 'ret', all error
 * tests, gotos/labels, the store through svc_p, the return statements and
 * the braces are not present in this listing.  The result of
 * ip_vs_use_count_inc() is not checked in the visible lines.
 */
1025 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1028 struct ip_vs_scheduler *sched = NULL;
1029 struct ip_vs_service *svc = NULL;
1031 /* increase the module use count */
1032 ip_vs_use_count_inc();
1034 /* Lookup the scheduler by 'u->sched_name' */
1035 sched = ip_vs_scheduler_get(u->sched_name);
1036 if (sched == NULL) {
1037 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1043 svc = (struct ip_vs_service *)
1044 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1046 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1050 memset(svc, 0, sizeof(struct ip_vs_service));
1052 /* I'm the first user of the service */
1053 atomic_set(&svc->usecnt, 1);
1054 atomic_set(&svc->refcnt, 0);
1056 svc->protocol = u->protocol;
1057 svc->addr = u->addr;
1058 svc->port = u->port;
1059 svc->fwmark = u->fwmark;
1060 svc->flags = u->flags;
1061 svc->timeout = u->timeout * HZ;
1062 svc->netmask = u->netmask;
1064 INIT_LIST_HEAD(&svc->destinations);
1065 svc->sched_lock = RW_LOCK_UNLOCKED;
1066 svc->stats.lock = SPIN_LOCK_UNLOCKED;
1068 /* Bind the scheduler */
1069 ret = ip_vs_bind_scheduler(svc, sched);
1074 /* Update the virtual service counters */
1075 if (svc->port == FTPPORT)
1076 atomic_inc(&ip_vs_ftpsvc_counter);
1077 else if (svc->port == 0)
1078 atomic_inc(&ip_vs_nullsvc_counter);
1080 ip_vs_new_estimator(&svc->stats);
1081 ip_vs_num_services++;
1083 /* Hash the service into the service table */
1084 write_lock_bh(&__ip_vs_svc_lock);
1085 ip_vs_svc_hash(svc);
1086 write_unlock_bh(&__ip_vs_svc_lock);
1094 ip_vs_unbind_scheduler(svc);
1097 ip_vs_app_inc_put(svc->inc);
1102 ip_vs_scheduler_put(sched);
1105 /* decrease the module use count */
1106 ip_vs_use_count_dec();
1113 * Edit a service and bind it with a new scheduler
/*
 * Updates an existing service from u: resolves the scheduler named
 * u->sched_name, then — holding the write side of __ip_vs_svc_lock and
 * after svc->usecnt is no longer above 1 — replaces the flags (keeping
 * IP_VS_SVC_F_HASHED set), timeout (u->timeout * HZ) and netmask, and,
 * when the scheduler differs from the current one, unbinds the old
 * scheduler and binds the new one.  If binding the new scheduler fails,
 * the old one is bound again.  After unlocking, ip_vs_scheduler_put() is
 * called on the old scheduler.
 * NOTE(review): the return-type line, the declaration of 'ret', the error
 * exits, any condition guarding the final scheduler put and the braces
 * are not present in this listing.
 */
1116 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1118 struct ip_vs_scheduler *sched, *old_sched;
1122 * Lookup the scheduler, by 'u->sched_name'
1124 sched = ip_vs_scheduler_get(u->sched_name);
1125 if (sched == NULL) {
1126 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1132 write_lock_bh(&__ip_vs_svc_lock);
1135 * Wait until all other svc users go away.
1137 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1140 * Set the flags and timeout value
1142 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1143 svc->timeout = u->timeout * HZ;
1144 svc->netmask = u->netmask;
1146 old_sched = svc->scheduler;
1147 if (sched != old_sched) {
1149 * Unbind the old scheduler
1151 if ((ret = ip_vs_unbind_scheduler(svc))) {
1157 * Bind the new scheduler
1159 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1161 * If ip_vs_bind_scheduler fails, restore the old
1163 * The main reason of failure is out of memory.
1165 * The question is if the old scheduler can be
1166 * restored all the time. TODO: if it cannot be
1167 * restored some time, we must delete the service,
1168 * otherwise the system may crash.
1170 ip_vs_bind_scheduler(svc, old_sched);
1177 write_unlock_bh(&__ip_vs_svc_lock);
1180 ip_vs_scheduler_put(old_sched);
1187 * Delete a service from the service list
1188 * - The service must be unlinked, unlocked and not referenced!
1189 * - We are called under _bh lock
/*
 * Tears down a service that has already been unhashed: decrements
 * ip_vs_num_services, stops the estimator, unbinds and releases the
 * scheduler, releases the application incarnation (svc->inc), unlinks and
 * deletes every destination, decrements the FTP or port-zero counter when
 * svc->port is FTPPORT or 0, tests whether svc->refcnt is 0 and finally
 * drops the module reference.
 * NOTE(review): the conditions guarding the scheduler and app puts, the
 * statement executed when refcnt is 0 and the braces are not present in
 * this listing.
 */
1191 static void __ip_vs_del_service(struct ip_vs_service *svc)
1193 struct ip_vs_dest *dest, *nxt;
1194 struct ip_vs_scheduler *old_sched;
1196 ip_vs_num_services--;
1197 ip_vs_kill_estimator(&svc->stats);
1199 /* Unbind scheduler */
1200 old_sched = svc->scheduler;
1201 ip_vs_unbind_scheduler(svc);
1203 ip_vs_scheduler_put(old_sched);
1205 /* Unbind app inc */
1207 ip_vs_app_inc_put(svc->inc);
1212 * Unlink the whole destination list
1214 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1215 __ip_vs_unlink_dest(svc, dest, 0);
1216 __ip_vs_del_dest(dest);
1220 * Update the virtual service counters
1222 if (svc->port == FTPPORT)
1223 atomic_dec(&ip_vs_ftpsvc_counter);
1224 else if (svc->port == 0)
1225 atomic_dec(&ip_vs_nullsvc_counter);
1228 * Free the service if nobody refers to it
1230 if (atomic_read(&svc->refcnt) == 0)
1233 /* decrease the module use count */
1234 ip_vs_use_count_dec();
1238 * Delete a service from the service list
/*
 * Deletes one service: under the write side of __ip_vs_svc_lock it
 * unhashes the service, waits until svc->usecnt is no longer above 1 and
 * calls __ip_vs_del_service().
 * NOTE(review): several lines before the lock is taken, the return
 * statements and the braces are not present in this listing.
 */
1240 static int ip_vs_del_service(struct ip_vs_service *svc)
1246 * Unhash it from the service table
1248 write_lock_bh(&__ip_vs_svc_lock);
1250 ip_vs_svc_unhash(svc);
1253 * Wait until all the svc users go away.
1255 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1257 __ip_vs_del_service(svc);
1259 write_unlock_bh(&__ip_vs_svc_lock);
1266 * Flush all the virtual services
/*
 * Deletes every virtual service: iterates all buckets of ip_vs_svc_table
 * and then of ip_vs_svc_fwm_table and, for each service, takes the write
 * side of __ip_vs_svc_lock, unhashes it, waits until svc->usecnt is no
 * longer above 0 and calls __ip_vs_del_service().  Note that the wait
 * condition here is "> 0", whereas the single-service paths in this file
 * wait on "> 1".
 * NOTE(review): the declaration of 'idx', the return statement and the
 * braces are not present in this listing.
 */
1268 static int ip_vs_flush(void)
1271 struct ip_vs_service *svc, *nxt;
1274 * Flush the service table hashed by <protocol,addr,port>
1276 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1277 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1278 write_lock_bh(&__ip_vs_svc_lock);
1279 ip_vs_svc_unhash(svc);
1281 * Wait until all the svc users go away.
1283 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1284 __ip_vs_del_service(svc);
1285 write_unlock_bh(&__ip_vs_svc_lock);
1290 * Flush the service table hashed by fwmark
1292 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1293 list_for_each_entry_safe(svc, nxt,
1294 &ip_vs_svc_fwm_table[idx], f_list) {
1295 write_lock_bh(&__ip_vs_svc_lock);
1296 ip_vs_svc_unhash(svc);
1298 * Wait until all the svc users go away.
1300 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1301 __ip_vs_del_service(svc);
1302 write_unlock_bh(&__ip_vs_svc_lock);
1311 * Zero counters in a service or all services
/*
 * Zeroes the statistics of every destination of svc and of svc itself
 * while holding the write side of __ip_vs_svc_lock.
 * NOTE(review): the return statement and braces are not in this listing.
 */
1313 static int ip_vs_zero_service(struct ip_vs_service *svc)
1315 struct ip_vs_dest *dest;
1317 write_lock_bh(&__ip_vs_svc_lock);
1318 list_for_each_entry(dest, &svc->destinations, n_list) {
1319 ip_vs_zero_stats(&dest->stats);
1321 ip_vs_zero_stats(&svc->stats);
1322 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Zeroes the statistics of every service found in ip_vs_svc_table and in
 * ip_vs_svc_fwm_table via ip_vs_zero_service(), then zeroes the global
 * ip_vs_stats.  The table walks themselves take no lock in the visible
 * lines.
 * NOTE(review): the declaration of 'idx', the return statement and the
 * braces are not present in this listing.
 */
1326 static int ip_vs_zero_all(void)
1329 struct ip_vs_service *svc;
1331 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1332 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1333 ip_vs_zero_service(svc);
1337 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1338 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1339 ip_vs_zero_service(svc);
1343 ip_vs_zero_stats(&ip_vs_stats);
/*
 * sysctl handler for the defense-mode knobs (used by the drop_entry,
 * drop_packet and secure_tcp entries of vs_vars): lets proc_dointvec()
 * process the request and, on a write that changed the value, either
 * handles an out-of-range value (below 0 or above 3) or calls
 * update_defense_level().
 * NOTE(review): the return-type line, the declarations of 'val' and 'rc',
 * the statement that saves the old value, the restore statement, the else
 * keyword, the return statement and the braces are not present in this
 * listing.
 */
1349 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1350 void *buffer, size_t *lenp)
1352 int *valp = table->data;
1356 rc = proc_dointvec(table, write, filp, buffer, lenp);
1357 if (write && (*valp != val)) {
1358 if ((*valp < 0) || (*valp > 3)) {
1359 /* Restore the correct value */
1363 update_defense_level();
/*
 * sysctl handler for sync_threshold: saves the current values, lets
 * proc_dointvec() process the request and, on a write, restores the saved
 * values when either element is negative or the first is not smaller than
 * the second.
 * NOTE(review): the return-type line, the declarations of 'val' and 'rc',
 * the return statement and the braces are not present in this listing.
 */
1372 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1373 void *buffer, size_t *lenp)
1375 int *valp = table->data;
1379 /* backup the value first */
1380 memcpy(val, valp, sizeof(val));
1382 rc = proc_dointvec(table, write, filp, buffer, lenp);
1383 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1384 /* Restore the correct value */
1385 memcpy(valp, val, sizeof(val));
1392 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
/*
 * sysctl entries of IPVS.  Each entry binds a procname to a variable and a
 * handler:
 *  - plain integers (amemthresh, debug_level, am_droprate, cache_bypass,
 *    expire_nodest_conn, nat_icmp_send) use proc_dointvec;
 *  - drop_entry, drop_packet and secure_tcp use proc_do_defense_mode;
 *  - the timeout_* entries expose slots of vs_timeout_table_dos.timeout[]
 *    through proc_dointvec_jiffies;
 *  - sync_threshold covers the whole two-element array and uses
 *    proc_do_sync_threshold.
 * NOTE(review): the per-entry braces, the .mode lines, the terminating
 * entry, any preprocessor lines between entries and the #endif closing
 * CONFIG_IP_VS_DEBUG are not present in this listing.
 */
1395 static struct ctl_table vs_vars[] = {
1397 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1398 .procname = "amemthresh",
1399 .data = &sysctl_ip_vs_amemthresh,
1400 .maxlen = sizeof(int),
1402 .proc_handler = &proc_dointvec,
1404 #ifdef CONFIG_IP_VS_DEBUG
1406 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1407 .procname = "debug_level",
1408 .data = &sysctl_ip_vs_debug_level,
1409 .maxlen = sizeof(int),
1411 .proc_handler = &proc_dointvec,
1415 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1416 .procname = "am_droprate",
1417 .data = &sysctl_ip_vs_am_droprate,
1418 .maxlen = sizeof(int),
1420 .proc_handler = &proc_dointvec,
1423 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1424 .procname = "drop_entry",
1425 .data = &sysctl_ip_vs_drop_entry,
1426 .maxlen = sizeof(int),
1428 .proc_handler = &proc_do_defense_mode,
1431 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1432 .procname = "drop_packet",
1433 .data = &sysctl_ip_vs_drop_packet,
1434 .maxlen = sizeof(int),
1436 .proc_handler = &proc_do_defense_mode,
1439 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1440 .procname = "secure_tcp",
1441 .data = &sysctl_ip_vs_secure_tcp,
1442 .maxlen = sizeof(int),
1444 .proc_handler = &proc_do_defense_mode,
1448 .ctl_name = NET_IPV4_VS_TO_ES,
1449 .procname = "timeout_established",
1450 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1451 .maxlen = sizeof(int),
1453 .proc_handler = &proc_dointvec_jiffies,
1456 .ctl_name = NET_IPV4_VS_TO_SS,
1457 .procname = "timeout_synsent",
1458 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1459 .maxlen = sizeof(int),
1461 .proc_handler = &proc_dointvec_jiffies,
1464 .ctl_name = NET_IPV4_VS_TO_SR,
1465 .procname = "timeout_synrecv",
1466 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1467 .maxlen = sizeof(int),
1469 .proc_handler = &proc_dointvec_jiffies,
1472 .ctl_name = NET_IPV4_VS_TO_FW,
1473 .procname = "timeout_finwait",
1474 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1475 .maxlen = sizeof(int),
1477 .proc_handler = &proc_dointvec_jiffies,
1480 .ctl_name = NET_IPV4_VS_TO_TW,
1481 .procname = "timeout_timewait",
1482 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1483 .maxlen = sizeof(int),
1485 .proc_handler = &proc_dointvec_jiffies,
1488 .ctl_name = NET_IPV4_VS_TO_CL,
1489 .procname = "timeout_close",
1490 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1491 .maxlen = sizeof(int),
1493 .proc_handler = &proc_dointvec_jiffies,
1496 .ctl_name = NET_IPV4_VS_TO_CW,
1497 .procname = "timeout_closewait",
1498 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1499 .maxlen = sizeof(int),
1501 .proc_handler = &proc_dointvec_jiffies,
1504 .ctl_name = NET_IPV4_VS_TO_LA,
1505 .procname = "timeout_lastack",
1506 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1507 .maxlen = sizeof(int),
1509 .proc_handler = &proc_dointvec_jiffies,
1512 .ctl_name = NET_IPV4_VS_TO_LI,
1513 .procname = "timeout_listen",
1514 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1515 .maxlen = sizeof(int),
1517 .proc_handler = &proc_dointvec_jiffies,
1520 .ctl_name = NET_IPV4_VS_TO_SA,
1521 .procname = "timeout_synack",
1522 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1523 .maxlen = sizeof(int),
1525 .proc_handler = &proc_dointvec_jiffies,
1528 .ctl_name = NET_IPV4_VS_TO_UDP,
1529 .procname = "timeout_udp",
1530 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1531 .maxlen = sizeof(int),
1533 .proc_handler = &proc_dointvec_jiffies,
1536 .ctl_name = NET_IPV4_VS_TO_ICMP,
1537 .procname = "timeout_icmp",
1538 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1539 .maxlen = sizeof(int),
1541 .proc_handler = &proc_dointvec_jiffies,
1545 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1546 .procname = "cache_bypass",
1547 .data = &sysctl_ip_vs_cache_bypass,
1548 .maxlen = sizeof(int),
1550 .proc_handler = &proc_dointvec,
1553 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1554 .procname = "expire_nodest_conn",
1555 .data = &sysctl_ip_vs_expire_nodest_conn,
1556 .maxlen = sizeof(int),
1558 .proc_handler = &proc_dointvec,
1561 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1562 .procname = "sync_threshold",
1563 .data = &sysctl_ip_vs_sync_threshold,
1564 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1566 .proc_handler = &proc_do_sync_threshold,
1569 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1570 .procname = "nat_icmp_send",
1571 .data = &sysctl_ip_vs_nat_icmp_send,
1572 .maxlen = sizeof(int),
1574 .proc_handler = &proc_dointvec,
/*
 * sysctl table holding the entry identified by NET_IPV4_VS.
 * NOTE(review): the remaining fields of this entry (name, mode, child)
 * are not visible in this listing -- confirm against the full source.
 */
1579 static ctl_table vs_table[] = {
1581 .ctl_name = NET_IPV4_VS,
/*
 * sysctl table holding the entry identified by NET_IPV4.  It is the
 * child table referenced by vs_root_table.
 * NOTE(review): the remaining fields of this entry are not visible in
 * this listing -- confirm against the full source.
 */
1589 static ctl_table ipv4_table[] = {
1591 .ctl_name = NET_IPV4,
/*
 * Root sysctl table: a CTL_NET entry whose child table is ipv4_table.
 * This is the table handed to register_sysctl_table() in
 * ip_vs_control_init().
 */
1599 static ctl_table vs_root_table[] = {
1601 .ctl_name = CTL_NET,
1604 .child = ipv4_table,
/*
 * Handle returned by register_sysctl_table() in ip_vs_control_init();
 * ip_vs_control_cleanup() passes it to unregister_sysctl_table().
 */
1609 static struct ctl_table_header * sysctl_header;
1611 #ifdef CONFIG_PROC_FS
/*
 * Hash table the seq_file iterator is currently positioned in.  The
 * seq_file callbacks in this file set and compare it against
 * ip_vs_svc_table and ip_vs_svc_fwm_table.
 * NOTE(review): presumably a member of struct ip_vs_iter -- the enclosing
 * struct header is not visible here; confirm.
 */
1614 struct list_head *table;
1619 * Write the contents of the VS rule table to a PROCfs file.
1620 * (It is kept just for backward compatibility)
/*
 * Map the forwarding-method bits of a connection flag word to a printable
 * name.
 *
 * @flags: connection flags; only the bits selected by
 *         IP_VS_CONN_F_FWD_MASK are examined.
 *
 * Returns a constant string.  ip_vs_info_seq_show() prints it in the
 * "Forward" column of each destination line.
 * NOTE(review): the string returned for each case is not visible in this
 * listing.
 */
1622 static inline const char *ip_vs_fwd_name(unsigned flags)
1624 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1625 case IP_VS_CONN_F_LOCALNODE:
1627 case IP_VS_CONN_F_TUNNEL:
1629 case IP_VS_CONN_F_DROUTE:
/*
 * Positional lookup used by the seq_file callbacks.
 *
 * Walks every bucket of ip_vs_svc_table (entries linked through s_list)
 * and then every bucket of ip_vs_svc_fwm_table (entries linked through
 * f_list), recording in the iterator kept in seq->private which of the
 * two tables the walk is in.
 *
 * @seq: seq_file whose ->private points to the struct ip_vs_iter
 * @pos: index requested by the caller (ip_vs_info_seq_start() passes
 *       *pos - 1, ip_vs_info_seq_next() passes 0)
 *
 * Callers run between ip_vs_info_seq_start() and ip_vs_info_seq_stop(),
 * i.e. with __ip_vs_svc_lock read-held.
 * NOTE(review): the return statements are not visible here; presumably
 * NULL is returned when pos is past the last service -- confirm.
 */
1637 /* Get the Nth entry in the two lists */
1638 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1640 struct ip_vs_iter *iter = seq->private;
1642 struct ip_vs_service *svc;
1644 /* look in hash by protocol */
1645 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1646 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1648 iter->table = ip_vs_svc_table;
1655 /* keep looking in fwmark */
1656 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1657 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1659 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file ->start callback.
 *
 * Takes __ip_vs_svc_lock for reading with bottom halves disabled; the
 * matching unlock is in ip_vs_info_seq_stop().  Position 0 yields
 * SEQ_START_TOKEN (which makes ->show print the header); any other
 * position yields the service at index *pos - 1.
 */
1669 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1672 read_lock_bh(&__ip_vs_svc_lock);
1673 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file ->next callback: step from element v to the following service.
 *
 * @seq: seq_file whose ->private holds the struct ip_vs_iter
 * @v:   element returned by the previous ->start/->next call
 * @pos: position cursor owned by the seq_file core
 *
 * Uses iter->table and iter->bucket to continue where the previous call
 * stopped instead of rescanning from the beginning.
 * NOTE(review): the statements that load svc from v and that advance *pos
 * are not visible in this listing.
 */
1677 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1679 struct list_head *e;
1680 struct ip_vs_iter *iter;
1681 struct ip_vs_service *svc;
/* the header token is followed by the very first service */
1684 if (v == SEQ_START_TOKEN)
1685 return ip_vs_info_array(seq,0);
1688 iter = seq->private;
1690 if (iter->table == ip_vs_svc_table) {
1691 /* next service in table hashed by protocol */
1692 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1693 return list_entry(e, struct ip_vs_service, s_list);
/* current bucket exhausted: scan the following buckets of this table */
1696 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1697 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
/* from here on the iterator walks the fwmark table */
1703 iter->table = ip_vs_svc_fwm_table;
1708 /* next service in hashed by fwmark */
1709 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1710 return list_entry(e, struct ip_vs_service, f_list);
/* current fwmark bucket exhausted: scan the following buckets */
1713 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1714 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/*
 * seq_file ->stop callback: releases the read lock on __ip_vs_svc_lock
 * that ip_vs_info_seq_start() acquired.
 */
1722 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1724 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file ->show callback for the service listing.
 *
 * @seq: output seq_file; ->private holds the struct ip_vs_iter
 * @v:   SEQ_START_TOKEN, or a struct ip_vs_service to print
 *
 * For SEQ_START_TOKEN the version banner and the two column-title lines
 * are emitted.  For a service, one service line is printed followed by
 * one line per entry on svc->destinations.
 */
1728 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1730 if (v == SEQ_START_TOKEN) {
1732 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1733 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1735 "Prot LocalAddress:Port Scheduler Flags\n");
1737 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1739 const struct ip_vs_service *svc = v;
1740 const struct ip_vs_iter *iter = seq->private;
1741 const struct ip_vs_dest *dest;
/*
 * iter->table tells which hash the service came from: entries from
 * ip_vs_svc_table get the protocol/address form, all others the
 * "FWM" (firewall mark) form.
 */
1743 if (iter->table == ip_vs_svc_table)
1744 seq_printf(seq, "%s %08X:%04X %s ",
1745 ip_vs_proto_name(svc->protocol),
1748 svc->scheduler->name);
1750 seq_printf(seq, "FWM %08X %s ",
1751 svc->fwmark, svc->scheduler->name);
/* persistent services get extra fields; others just end the line */
1753 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1754 seq_printf(seq, "persistent %d %08X\n",
1756 ntohl(svc->netmask));
1758 seq_putc(seq, '\n');
/* one line per destination: address, port, forward method and counters */
1760 list_for_each_entry(dest, &svc->destinations, n_list) {
1762 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1763 ntohl(dest->addr), ntohs(dest->port),
1764 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1765 atomic_read(&dest->weight),
1766 atomic_read(&dest->activeconns),
1767 atomic_read(&dest->inactconns));
/*
 * seq_file iterator callbacks for the service listing; installed by
 * ip_vs_info_open() through seq_open().
 */
1773 static struct seq_operations ip_vs_info_seq_ops = {
1774 .start = ip_vs_info_seq_start,
1775 .next = ip_vs_info_seq_next,
1776 .stop = ip_vs_info_seq_stop,
1777 .show = ip_vs_info_seq_show,
/*
 * open() handler for the service listing file.
 *
 * Allocates a struct ip_vs_iter with GFP_KERNEL, opens the seq_file with
 * ip_vs_info_seq_ops and zeroes the iterator.
 * NOTE(review): the allocation-failure and seq_open()-failure paths, and
 * the statement attaching the iterator to the seq_file, are not visible
 * here.  Presumably it is stored in seq->private (the seq callbacks read
 * it from there) and freed by seq_release_private(), which
 * ip_vs_info_fops uses as ->release -- confirm.
 */
1780 static int ip_vs_info_open(struct inode *inode, struct file *file)
1782 struct seq_file *seq;
1784 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1789 rc = seq_open(file, &ip_vs_info_seq_ops);
1793 seq = file->private_data;
1795 memset(s, 0, sizeof(*s));
/*
 * File operations for the service listing; ip_vs_control_init() registers
 * them under the name "ip_vs" with proc_net_fops_create().
 */
1803 static struct file_operations ip_vs_info_fops = {
1804 .owner = THIS_MODULE,
1805 .open = ip_vs_info_open,
1807 .llseek = seq_lseek,
1808 .release = seq_release_private,
/*
 * File-scope statistics instance (not static).  ip_vs_control_init()
 * zeroes it, initialises its lock and passes it to
 * ip_vs_new_estimator(); ip_vs_stats_show() prints it under its lock.
 */
1813 struct ip_vs_stats ip_vs_stats;
1815 #ifdef CONFIG_PROC_FS
/*
 * show callback handed to single_open() by ip_vs_stats_seq_open().
 *
 * Prints the column titles, then the totals and the per-second rates of
 * the global ip_vs_stats in hexadecimal.  All counter reads happen with
 * ip_vs_stats.lock held (bottom halves disabled), so the printed values
 * are read under one critical section.
 */
1816 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1819 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1821 " Total Incoming Outgoing Incoming Outgoing\n");
1823 " Conns Packets Packets Bytes Bytes\n");
1825 spin_lock_bh(&ip_vs_stats.lock);
1826 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1827 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1828 (unsigned long long) ip_vs_stats.inbytes,
1829 (unsigned long long) ip_vs_stats.outbytes);
1831 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1833 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1834 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1839 ip_vs_stats.outbps);
1840 spin_unlock_bh(&ip_vs_stats.lock);
/*
 * open() handler for the statistics file: a single-record seq_file whose
 * content is produced by ip_vs_stats_show(); no private data is passed
 * (NULL).
 */
1845 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1847 return single_open(file, ip_vs_stats_show, NULL);
/*
 * File operations for the statistics file; ip_vs_control_init() registers
 * them under the name "ip_vs_stats" with proc_net_fops_create().
 * ->release is single_release, pairing with the single_open() call in
 * ip_vs_stats_seq_open().
 */
1850 static struct file_operations ip_vs_stats_fops = {
1851 .owner = THIS_MODULE,
1852 .open = ip_vs_stats_seq_open,
1854 .llseek = seq_lseek,
1855 .release = single_release,
1861 * Set timeout values for tcp tcpfin udp in the timeout_table.
/*
 * Apply user-supplied timeouts to the per-protocol timeout tables.
 *
 * @u: request carrying tcp_timeout, tcp_fin_timeout and udp_timeout.
 *
 * A field is applied only when it is non-zero, so 0 means "leave this
 * timeout unchanged".  Each value is multiplied by HZ before it is
 * stored (presumably seconds in, jiffies stored -- confirm).  The TCP
 * entries are touched only when CONFIG_IP_VS_PROTO_TCP is defined and
 * the UDP entry only when CONFIG_IP_VS_PROTO_UDP is defined.
 * NOTE(review): the return statement is not visible in this listing.
 */
1863 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1865 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1870 #ifdef CONFIG_IP_VS_PROTO_TCP
1871 if (u->tcp_timeout) {
1872 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1873 = u->tcp_timeout * HZ;
1876 if (u->tcp_fin_timeout) {
1877 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1878 = u->tcp_fin_timeout * HZ;
1882 #ifdef CONFIG_IP_VS_PROTO_UDP
1883 if (u->udp_timeout) {
1884 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1885 = u->udp_timeout * HZ;
/*
 * Payload-size bookkeeping for the set-sockopt commands.
 *
 * SET_CMDID() turns a command number into an index relative to
 * IP_VS_BASE_CTL (used to index set_arglen[]).  The *_ARG_LEN macros are
 * the payload sizes of the individual request layouts; SVCDEST_ARG_LEN is
 * a service description immediately followed by a destination
 * description.  MAX_ARG_LEN sizes the on-stack copy buffer in
 * do_ip_vs_set_ctl().
 * NOTE(review): MAX_ARG_LEN is defined as SVCDEST_ARG_LEN; this assumes
 * the timeout and daemon structures are not larger -- confirm.
 */
1892 #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1893 #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1894 #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1895 sizeof(struct ip_vs_dest_user))
1896 #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1897 #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1898 #define MAX_ARG_LEN SVCDEST_ARG_LEN
/*
 * Required payload length for every set command, indexed by
 * SET_CMDID(cmd).  do_ip_vs_set_ctl() rejects a request whose length
 * differs from the entry for its command.  The element type is unsigned
 * char, so every length stored here is assumed to fit in one byte.
 */
1900 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1901 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1902 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1903 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1904 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1905 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1906 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1907 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1908 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1909 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1910 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
/*
 * setsockopt() handler for the IPVS control commands; installed as the
 * .set callback of ip_vs_sockopts.
 *
 * @sk:   socket the option was set on (not referenced in the visible code)
 * @cmd:  IP_VS_SO_SET_* command number
 * @user: user-space pointer to the command payload
 * @len:  payload length; must equal set_arglen[SET_CMDID(cmd)] exactly
 *
 * The caller needs CAP_NET_ADMIN.  The payload is copied into an on-stack
 * buffer of MAX_ARG_LEN bytes, the module use count is raised, and the
 * command is dispatched after __ip_vs_mutex has been acquired with
 * down_interruptible().
 * NOTE(review): cmd indexes set_arglen[] without a range check in the
 * visible code; presumably the sockopt layer enforces the
 * set_optmin/set_optmax window of ip_vs_sockopts first -- confirm.
 * NOTE(review): error-return statements and the mutex release are not
 * visible in this listing.
 */
1915 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1918 unsigned char arg[MAX_ARG_LEN];
1919 struct ip_vs_service_user *usvc;
1920 struct ip_vs_service *svc;
1921 struct ip_vs_dest_user *udest;
1923 if (!capable(CAP_NET_ADMIN))
/*
 * Exact-length check.  It also bounds the copy into arg[] below,
 * assuming no set_arglen[] entry exceeds MAX_ARG_LEN.
 */
1926 if (len != set_arglen[SET_CMDID(cmd)]) {
1927 IP_VS_ERR("set_ctl: len %u != %u\n",
1928 len, set_arglen[SET_CMDID(cmd)]);
1932 if (copy_from_user(arg, user, len) != 0)
1935 /* increase the module use count */
1936 ip_vs_use_count_inc();
1938 if (down_interruptible(&__ip_vs_mutex)) {
/* commands that do not operate on one particular service come first */
1943 if (cmd == IP_VS_SO_SET_FLUSH) {
1944 /* Flush the virtual service */
1945 ret = ip_vs_flush();
1947 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1948 /* Set timeout values for (tcp tcpfin udp) */
1949 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1951 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1952 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1953 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1955 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1956 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1957 ret = stop_sync_thread(dm->state);
/*
 * Remaining commands: the payload starts with the service description;
 * the destination description, when the command carries one, follows
 * it immediately.
 */
1961 usvc = (struct ip_vs_service_user *)arg;
1962 udest = (struct ip_vs_dest_user *)(usvc + 1);
1964 if (cmd == IP_VS_SO_SET_ZERO) {
1965 /* if no service address is set, zero counters in all */
1966 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1967 ret = ip_vs_zero_all();
1972 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1973 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1974 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1975 usvc->protocol, NIPQUAD(usvc->addr),
1976 ntohs(usvc->port), usvc->sched_name);
1981 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1982 if (usvc->fwmark == 0)
1983 svc = __ip_vs_service_get(usvc->protocol,
1984 usvc->addr, usvc->port);
1986 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
/*
 * Every command except ADD needs an existing service whose protocol
 * matches the one in the request.
 */
1988 if (cmd != IP_VS_SO_SET_ADD
1989 && (svc == NULL || svc->protocol != usvc->protocol)) {
1995 case IP_VS_SO_SET_ADD:
1999 ret = ip_vs_add_service(usvc, &svc);
2001 case IP_VS_SO_SET_EDIT:
2002 ret = ip_vs_edit_service(svc, usvc);
2004 case IP_VS_SO_SET_DEL:
2005 ret = ip_vs_del_service(svc);
2009 case IP_VS_SO_SET_ZERO:
2010 ret = ip_vs_zero_service(svc);
2012 case IP_VS_SO_SET_ADDDEST:
2013 ret = ip_vs_add_dest(svc, udest);
2015 case IP_VS_SO_SET_EDITDEST:
2016 ret = ip_vs_edit_dest(svc, udest);
2018 case IP_VS_SO_SET_DELDEST:
2019 ret = ip_vs_del_dest(svc, udest);
2026 ip_vs_service_put(svc);
2031 /* decrease the module use count */
2032 ip_vs_use_count_dec();
/*
 * Snapshot kernel statistics into the user-visible layout.
 *
 * @dst: destination record
 * @src: kernel statistics; src->lock is held (bottom halves disabled)
 *       for the duration of the copy
 *
 * The number of bytes copied is the offset of the lock member inside
 * *src, i.e. everything that precedes the lock.
 * NOTE(review): this assumes struct ip_vs_stats_user has exactly the
 * layout of that leading part of struct ip_vs_stats -- confirm against
 * the header.
 */
2039 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2041 spin_lock_bh(&src->lock);
2042 memcpy(dst, src, (char*)&src->lock - (char*)src);
2043 spin_unlock_bh(&src->lock);
/*
 * Fill a user-visible service entry from a kernel service.
 *
 * @dst: entry to fill
 * @src: service to describe
 *
 * Fields are copied one by one; the timeout is divided by HZ on the way
 * out (the inverse of the "* HZ" scaling used when timeouts are set in
 * ip_vs_set_timeout()), and the statistics are snapshotted with
 * ip_vs_copy_stats().
 * NOTE(review): the scheduler name is copied with an unbounded strcpy()
 * and src->scheduler is dereferenced without a NULL check; this assumes
 * a scheduler is always bound and that its name fits dst->sched_name --
 * confirm.
 * NOTE(review): only the listed fields are written; any padding inside
 * *dst keeps whatever the caller left there.
 */
2047 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2049 dst->protocol = src->protocol;
2050 dst->addr = src->addr;
2051 dst->port = src->port;
2052 dst->fwmark = src->fwmark;
2053 strcpy(dst->sched_name, src->scheduler->name);
2054 dst->flags = src->flags;
2055 dst->timeout = src->timeout / HZ;
2056 dst->netmask = src->netmask;
2057 dst->num_dests = src->num_dests;
2058 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy service entries to user space for IP_VS_SO_GET_SERVICES.
 *
 * @get:  kernel copy of the request header; get->num_services is the
 *        capacity of the user's entry table
 * @uptr: user-space request; entries are written to uptr->entrytable[]
 *
 * Services are taken first from ip_vs_svc_table and then from
 * ip_vs_svc_fwm_table.  Each one is converted with ip_vs_copy_service()
 * into a stack temporary and copied out with copy_to_user(); the same
 * count variable indexes entrytable[] in both loops.
 * NOTE(review): the statements executed when the capacity is reached or
 * when copy_to_user() fails are not visible in this listing.
 */
2062 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2063 struct ip_vs_get_services __user *uptr)
2066 struct ip_vs_service *svc;
2067 struct ip_vs_service_entry entry;
/* services hashed by <protocol, addr, port> */
2070 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2071 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2072 if (count >= get->num_services)
2074 ip_vs_copy_service(&entry, svc);
2075 if (copy_to_user(&uptr->entrytable[count],
2076 &entry, sizeof(entry))) {
/* services hashed by fwmark */
2084 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2085 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2086 if (count >= get->num_services)
2088 ip_vs_copy_service(&entry, svc);
2089 if (copy_to_user(&uptr->entrytable[count],
2090 &entry, sizeof(entry))) {
/*
 * Copy the destinations of one service to user space for
 * IP_VS_SO_GET_DESTS.
 *
 * @get:  kernel copy of the request header; selects the service (by
 *        fwmark, or by protocol/addr/port) and gives the capacity of the
 *        user's table in get->num_dests
 * @uptr: user-space request; entries are written to uptr->entrytable[]
 *
 * The service reference obtained by the lookup is dropped again with
 * ip_vs_service_put().
 * NOTE(review): entry is a stack variable filled field by field with no
 * zeroing visible before copy_to_user(); any padding or unlisted field
 * would be copied out uninitialized -- confirm.
 */
2102 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2103 struct ip_vs_get_dests __user *uptr)
2105 struct ip_vs_service *svc;
2109 svc = __ip_vs_svc_fwm_get(get->fwmark);
2111 svc = __ip_vs_service_get(get->protocol,
2112 get->addr, get->port);
2115 struct ip_vs_dest *dest;
2116 struct ip_vs_dest_entry entry;
2118 list_for_each_entry(dest, &svc->destinations, n_list) {
2119 if (count >= get->num_dests)
2122 entry.addr = dest->addr;
2123 entry.port = dest->port;
2124 entry.conn_flags = atomic_read(&dest->conn_flags);
2125 entry.weight = atomic_read(&dest->weight);
2126 entry.u_threshold = dest->u_threshold;
2127 entry.l_threshold = dest->l_threshold;
2128 entry.activeconns = atomic_read(&dest->activeconns);
2129 entry.inactconns = atomic_read(&dest->inactconns);
2130 entry.persistconns = atomic_read(&dest->persistconns);
2131 ip_vs_copy_stats(&entry.stats, &dest->stats);
2132 if (copy_to_user(&uptr->entrytable[count],
2133 &entry, sizeof(entry))) {
2139 ip_vs_service_put(svc);
/*
 * Read the current timeouts into a user-layout record.
 *
 * @u: record to fill; table values are divided by HZ on the way out
 *
 * The TCP fields are written only when CONFIG_IP_VS_PROTO_TCP is defined
 * and the UDP field only when CONFIG_IP_VS_PROTO_UDP is defined.  With a
 * protocol compiled out, its field(s) in *u are left untouched, so the
 * caller has to pre-initialise *u before exposing it.
 * NOTE(review): the IP_VS_SO_GET_TIMEOUT caller declares its record with
 * no initialisation visible in this listing -- confirm it is zeroed
 * before copy_to_user().
 */
2146 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2148 #ifdef CONFIG_IP_VS_PROTO_TCP
2150 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2151 u->tcp_fin_timeout =
2152 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2154 #ifdef CONFIG_IP_VS_PROTO_UDP
2156 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/*
 * Header-size bookkeeping for the get-sockopt commands.
 *
 * GET_CMDID() turns a command number into an index relative to
 * IP_VS_BASE_CTL (used to index get_arglen[]).  The GET_*_ARG_LEN macros
 * are the sizes of the fixed request headers; GET_DAEMON_ARG_LEN covers
 * two struct ip_vs_daemon_user records, matching the two-element array
 * that do_ip_vs_get_ctl() returns for IP_VS_SO_GET_DAEMON.
 */
2161 #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2162 #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2163 #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2164 #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2165 #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2166 #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2167 #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
/*
 * Minimum user buffer length for every get command, indexed by
 * GET_CMDID(cmd).  do_ip_vs_get_ctl() rejects a request whose *len is
 * smaller than the entry, and copies exactly this many bytes of the
 * request into its scratch buffer.  The element type is unsigned char,
 * so every length stored here is assumed to fit in one byte.
 */
2169 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2170 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2171 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2172 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2173 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2174 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2175 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2176 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
/*
 * getsockopt() handler for the IPVS control commands; installed as the
 * .get callback of ip_vs_sockopts.
 *
 * @sk:   socket the option was read on (not referenced in the visible code)
 * @cmd:  IP_VS_SO_GET_* command number
 * @user: user-space buffer; holds the request on entry and receives the
 *        reply
 * @len:  in: size of the user buffer, must be at least
 *        get_arglen[GET_CMDID(cmd)]; IP_VS_SO_GET_VERSION updates it to
 *        the length of the returned string including the terminator
 *
 * The caller needs CAP_NET_ADMIN.  The fixed request header is copied in
 * before __ip_vs_mutex is acquired; -ERESTARTSYS is returned when the
 * wait for the mutex is interrupted.
 * NOTE(review): cmd indexes get_arglen[] without a range check in the
 * visible code; presumably the sockopt layer enforces the
 * get_optmin/get_optmax window of ip_vs_sockopts first -- confirm.
 * NOTE(review): error-return statements and the mutex release are not
 * visible in this listing.
 */
2180 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
/*
 * Scratch buffer for the fixed request header.
 * NOTE(review): assumes every get_arglen[] entry is <= 128 -- confirm.
 */
2182 unsigned char arg[128];
2185 if (!capable(CAP_NET_ADMIN))
2188 if (*len < get_arglen[GET_CMDID(cmd)]) {
2189 IP_VS_ERR("get_ctl: len %u < %u\n",
2190 *len, get_arglen[GET_CMDID(cmd)]);
/* only the fixed header is copied in, not the whole *len bytes */
2194 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2197 if (down_interruptible(&__ip_vs_mutex))
2198 return -ERESTARTSYS;
2201 case IP_VS_SO_GET_VERSION:
2205 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2206 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2207 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2211 *len = strlen(buf)+1;
2215 case IP_VS_SO_GET_INFO:
2217 struct ip_vs_getinfo info;
2218 info.version = IP_VS_VERSION_CODE;
2219 info.size = IP_VS_CONN_TAB_SIZE;
2220 info.num_services = ip_vs_num_services;
2221 if (copy_to_user(user, &info, sizeof(info)) != 0)
2226 case IP_VS_SO_GET_SERVICES:
2228 struct ip_vs_get_services *get;
/*
 * Expected total size: request header plus num_services entries.
 * NOTE(review): num_services comes from user space and the
 * multiplication has no overflow check visible here; the comparison of
 * *len against size is not visible in this listing either.
 */
2231 get = (struct ip_vs_get_services *)arg;
2232 size = sizeof(*get) +
2233 sizeof(struct ip_vs_service_entry) * get->num_services;
2235 IP_VS_ERR("length: %u != %u\n", *len, size);
2239 ret = __ip_vs_get_service_entries(get, user);
2243 case IP_VS_SO_GET_SERVICE:
2245 struct ip_vs_service_entry *entry;
2246 struct ip_vs_service *svc;
/* the request buffer doubles as the reply: it is refilled in place */
2248 entry = (struct ip_vs_service_entry *)arg;
2250 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2252 svc = __ip_vs_service_get(entry->protocol,
2253 entry->addr, entry->port);
2255 ip_vs_copy_service(entry, svc);
2256 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2258 ip_vs_service_put(svc);
2264 case IP_VS_SO_GET_DESTS:
2266 struct ip_vs_get_dests *get;
/* expected total size: request header plus num_dests entries */
2269 get = (struct ip_vs_get_dests *)arg;
2270 size = sizeof(*get) +
2271 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2273 IP_VS_ERR("length: %u != %u\n", *len, size);
2277 ret = __ip_vs_get_dest_entries(get, user);
2281 case IP_VS_SO_GET_TIMEOUT:
/*
 * NOTE(review): no initialisation of t is visible before it is filled
 * by __ip_vs_get_timeouts(), which skips fields of protocols that are
 * compiled out -- confirm t is zeroed before copy_to_user().
 */
2283 struct ip_vs_timeout_user t;
2285 __ip_vs_get_timeouts(&t);
2286 if (copy_to_user(user, &t, sizeof(t)) != 0)
2291 case IP_VS_SO_GET_DAEMON:
/*
 * d[0] describes the master sync daemon and d[1] the backup one; a slot
 * stays all-zero when that role is not active.
 * NOTE(review): the interface names are copied with unbounded strcpy();
 * assumes the source strings fit mcast_ifn -- confirm.
 */
2293 struct ip_vs_daemon_user d[2];
2295 memset(&d, 0, sizeof(d));
2296 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2297 d[0].state = IP_VS_STATE_MASTER;
2298 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
2299 d[0].syncid = ip_vs_master_syncid;
2301 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2302 d[1].state = IP_VS_STATE_BACKUP;
2303 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
2304 d[1].syncid = ip_vs_backup_syncid;
2306 if (copy_to_user(user, &d, sizeof(d)) != 0)
/*
 * Socket-option registration record: routes the option numbers from
 * IP_VS_BASE_CTL up to the IP_VS_SO_SET_MAX+1 / IP_VS_SO_GET_MAX+1 bounds
 * to do_ip_vs_set_ctl() and do_ip_vs_get_ctl().  Registered in
 * ip_vs_control_init() and unregistered in ip_vs_control_cleanup().
 * NOTE(review): whether the *_optmax bounds are exclusive is defined by
 * the sockopt layer, not by this file -- confirm.
 */
2321 static struct nf_sockopt_ops ip_vs_sockopts = {
2323 .set_optmin = IP_VS_BASE_CTL,
2324 .set_optmax = IP_VS_SO_SET_MAX+1,
2325 .set = do_ip_vs_set_ctl,
2326 .get_optmin = IP_VS_BASE_CTL,
2327 .get_optmax = IP_VS_SO_GET_MAX+1,
2328 .get = do_ip_vs_get_ctl,
/*
 * Set up the IPVS control plane.
 *
 * In order: registers the sockopt interface (logging an error when that
 * fails), creates the "ip_vs" and "ip_vs_stats" proc entries, registers
 * the sysctl tree, initialises the service and real-server hash buckets,
 * resets the global statistics and hands them to the estimator, and arms
 * the defense timer to fire DEFENSE_TIMER_PERIOD jiffies from now.
 *
 * NOTE(review): the results of proc_net_fops_create() and
 * register_sysctl_table() are not checked in the visible code, and the
 * return statements are not visible in this listing.
 */
2332 int ip_vs_control_init(void)
2339 ret = nf_register_sockopt(&ip_vs_sockopts);
2341 IP_VS_ERR("cannot register sockopt.\n");
2345 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2346 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2348 sysctl_header = register_sysctl_table(vs_root_table, 0);
2350 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2351 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2352 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2353 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2355 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2356 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
/* start from zeroed global statistics before registering the estimator */
2359 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2360 ip_vs_stats.lock = SPIN_LOCK_UNLOCKED;
2361 ip_vs_new_estimator(&ip_vs_stats);
2363 /* Hook the defense timer */
2364 init_timer(&defense_timer);
2365 defense_timer.function = defense_timer_handler;
2366 defense_timer.expires = jiffies + DEFENSE_TIMER_PERIOD;
2367 add_timer(&defense_timer);
/*
 * Tear down what ip_vs_control_init() set up.
 *
 * Cleans up the trash first, then stops the defense timer
 * (del_timer_sync), removes the global statistics from the estimator,
 * unregisters the sysctl tree, removes both proc entries and finally
 * unregisters the sockopt interface -- the reverse of the registration
 * order used at init time.
 */
2374 void ip_vs_control_cleanup(void)
2377 ip_vs_trash_cleanup();
2378 del_timer_sync(&defense_timer);
2379 ip_vs_kill_estimator(&ip_vs_stats);
2380 unregister_sysctl_table(sysctl_header);
2381 proc_net_remove("ip_vs_stats");
2382 proc_net_remove("ip_vs");
2383 nf_unregister_sockopt(&ip_vs_sockopts);