/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>
#include <linux/vs_socket.h>
#include <linux/vs_limit.h>
#include <linux/vs_context.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
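
/*
 * Editor's illustration (not kernel code): on a hypothetical platform
 * where sizeof(struct sk_buff) happened to be 168 bytes, the defaults
 * above would work out to
 *
 *	_SK_MEM_OVERHEAD = 168 + 256 = 424 bytes per packet
 *	SK_WMEM_MAX = SK_RMEM_MAX = 424 * 256 = 108544 bytes
 *
 * i.e. accounting room for 256 packets per socket, regardless of how
 * large struct sk_buff is on the platform.
 */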

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
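
/*
 * Userspace view (editor's illustrative sketch): the timeout parsed
 * above is the one a program supplies via SO_RCVTIMEO/SO_SNDTIMEO:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt");
 *
 * A zero timeval selects MAX_SCHEDULE_TIMEOUT, i.e. "block forever";
 * anything else is rounded up to whole jiffies.
 */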

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	switch (optname) {
		case SO_DONTLINGER:
			sock_reset_flag(sk, SOCK_LINGER);
			return 0;
	}
#endif

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
		case SO_DEBUG:
			if (val && !capable(CAP_NET_ADMIN))
				ret = -EACCES;
			else if (valbool)
				sock_set_flag(sk, SOCK_DBG);
			else
				sock_reset_flag(sk, SOCK_DBG);
			break;
		case SO_REUSEADDR:
			sk->sk_reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			if (valbool)
				sock_set_flag(sk, SOCK_LOCALROUTE);
			else
				sock_reset_flag(sk, SOCK_LOCALROUTE);
			break;
		case SO_BROADCAST:
			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
			break;
		case SO_SNDBUF:
			/* Don't error on this. BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints. */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;

			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sk_sndbuf = val * 2;

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->sk_write_space(sk);
			break;

		case SO_RCVBUF:
			/* Don't error on this. BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints. */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;

			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->sk_rcvbuf = val * 2;
			break;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->sk_protocol == IPPROTO_TCP)
				tcp_set_keepalive(sk, valbool);
#endif
			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
			break;

		case SO_OOBINLINE:
			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
			break;

		case SO_NO_CHECK:
			sk->sk_no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->sk_priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if (optlen < sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling, optval, sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if (!ling.l_onoff)
				sock_reset_flag(sk, SOCK_LINGER);
			else {
#if (BITS_PER_LONG == 32)
				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->sk_lingertime = ling.l_linger * HZ;
				sock_set_flag(sk, SOCK_LINGER);
			}
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("setsockopt");
			break;

		case SO_PASSCRED:
			if (valbool)
				set_bit(SOCK_PASSCRED, &sock->flags);
			else
				clear_bit(SOCK_PASSCRED, &sock->flags);
			break;

		case SO_TIMESTAMP:
			if (valbool) {
				sock_set_flag(sk, SOCK_RCVTSTAMP);
				sock_enable_timestamp(sk);
			} else
				sock_reset_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->sk_rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->sk_bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ)
					optlen = IFNAMSIZ;
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->sk_bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->sk_bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif


		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->sk_lock.slock);
			filter = sk->sk_filter;
			if (filter) {
				sk->sk_filter = NULL;
				spin_unlock_bh(&sk->sk_lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->sk_lock.slock);
			ret = -ENONET;
			break;

		/* We implement SO_SNDLOWAT etc. to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
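
/*
 * Userspace view (editor's illustrative sketch): SO_SNDBUF/SO_RCVBUF
 * store twice the requested value (the headroom accounts for struct
 * sk_buff overhead), and getsockopt() reports the doubled figure back:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &out, &len);
 *
 * out is now 131072, assuming val did not exceed sysctl_wmem_max.
 */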


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
		case SO_DEBUG:
			v.val = sock_flag(sk, SOCK_DBG);
			break;

		case SO_DONTROUTE:
			v.val = sock_flag(sk, SOCK_LOCALROUTE);
			break;

		case SO_BROADCAST:
			v.val = !!sock_flag(sk, SOCK_BROADCAST);
			break;

		case SO_SNDBUF:
			v.val = sk->sk_sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->sk_rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->sk_reuse;
			break;

		case SO_KEEPALIVE:
			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
			break;

		case SO_TYPE:
			v.val = sk->sk_type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if (v.val == 0)
				v.val = xchg(&sk->sk_err_soft, 0);
			break;

		case SO_OOBINLINE:
			v.val = !!sock_flag(sk, SOCK_URGINLINE);
			break;

		case SO_NO_CHECK:
			v.val = sk->sk_no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->sk_priority;
			break;

		case SO_LINGER:
			lv		= sizeof(v.ling);
			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
			v.ling.l_linger	= sk->sk_lingertime / HZ;
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("getsockopt");
			break;

		case SO_TIMESTAMP:
			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->sk_rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val = 1;
			break;

		case SO_PASSCRED:
			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->sk_peercred))
				len = sizeof(sk->sk_peercred);
			if (copy_to_user(optval, &sk->sk_peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if (copy_to_user(optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = sk->sk_state == TCP_LISTEN;
			break;

		case SO_PEERSEC:
			return security_socket_getpeersec(sock, optval, optlen, len);

		default:
			return -ENOPROTOOPT;
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
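
/*
 * Illustrative sketch (editor's example): SO_ERROR is the classic
 * consumer of this path. After a non-blocking connect() completes,
 * userspace fetches (and thereby clears) the pending error:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "connect: %s\n", strerror(err));
 */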

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	kmem_cache_t *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}
		sock_vx_init(sk);
		sock_nx_init(sk);

		if (security_sk_alloc(sk, family, priority)) {
			if (slab != NULL)
				kmem_cache_free(slab, sk);
			else
				kfree(sk);
			sk = NULL;
		} else
			__module_get(prot->owner);
	}
	return sk;
}
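
/*
 * In-kernel usage sketch (editor's illustration, assuming an INET-style
 * protocol with a registered struct proto such as tcp_prot):
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &tcp_prot, 1);
 *
 *	if (!sk)
 *		return -ENOBUFS;
 *
 * Passing zero_it == 1 zeroes the object, sets sk_family/sk_prot and
 * initialises the socket lock; callers that clone an existing sock
 * pass 0 and copy the fields themselves.
 */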

void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = sk->sk_filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->sk_filter = NULL;
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	vx_sock_dec(sk);
	clr_vx_info(&sk->sk_vx_info);
	sk->sk_xid = -1;
	clr_nx_info(&sk->sk_nx_info);
	sk->sk_nid = -1;
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
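
/*
 * Pairing sketch (editor's illustration): option memory must be freed
 * with the same size it was charged with, so sk_omem_alloc balances:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...use opt...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * Charging before the kmalloc() (as above) means a sleeping allocation
 * cannot let a second allocator race past the sysctl_optmem_max limit.
 */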

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	unsigned int gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			/* Use the mask computed above so that sleeping
			 * allocations actually get __GFP_REPEAT.
			 */
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(gfp_mask, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
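
/*
 * Caller-side sketch (editor's illustration): datagram protocols
 * typically do something like
 *
 *	skb = sock_alloc_send_skb(sk, len + hh_len + 15,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *
 * blocking (subject to sk_sndtimeo) until sk_wmem_alloc drops below
 * sk_sndbuf, and returning the reason for failure through err.
 */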

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);
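
/*
 * Typical caller (editor's illustrative sketch of a protocol's recvmsg
 * path, with the socket lock held):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * sk_wait_event() drops and retakes the socket lock around the actual
 * schedule_timeout(), which is why release_sock() can feed new data in.
 */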

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	if (sk->sk_protinfo)
		kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec	= -1L;
	sk->sk_stamp.tv_usec	= -1L;

	set_vx_info(&sk->sk_vx_info, current->vx_info);
	sk->sk_xid = vx_current_xid();
	vx_sock_inc(sk);
	set_nx_info(&sk->sk_nx_info, current->nx_info);
	sk->sk_nid = nx_current_nid();
	atomic_set(&sk->sk_refcnt, 1);
}
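
/*
 * Creation-path sketch (editor's illustration, with hypothetical names):
 * an address family's create() routine typically pairs sk_alloc() with
 * sock_init_data(), e.g.
 *
 *	sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL, &example_proto, 1);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *
 * leaving the sock with the default callbacks, TCP_CLOSE state and the
 * sysctl-derived buffer limits set above.
 */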

void fastcall lock_sock(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock_bh(&sk->sk_lock.slock);
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
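
/*
 * Locking pattern (editor's illustration): process-context code brackets
 * socket state changes with this pair, while softirq receive paths that
 * find the lock owned queue packets onto sk_backlog instead:
 *
 *	lock_sock(sk);
 *	err = do_something_with(sk);	// hypothetical helper
 *	release_sock(sk);		// also runs the queued backlog
 */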

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	if (sk->sk_stamp.tv_sec == -1)
		return -ENOENT;
	if (sk->sk_stamp.tv_sec == 0)
		do_gettimeofday(&sk->sk_stamp);
	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
		-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
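
/*
 * Userspace view (editor's illustration): this is the backend of the
 * SIOCGSTAMP ioctl, which fetches the receive timestamp of the last
 * packet delivered to the socket:
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("rx at %ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */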

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one: detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * We also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

#ifdef INET_REFCNT_DEBUG
	if (atomic_read(&sk->sk_refcnt) != 1)
		printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
		       sk, atomic_read(&sk->sk_refcnt));
#endif
	sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
}

EXPORT_SYMBOL(proto_register);
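
/*
 * Registration sketch (editor's illustration; example_proto and
 * struct example_sock are hypothetical):
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);   // 1 => allocate a slab
 *
 * proto_unregister() below undoes both the slab and the list entry.
 */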

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	list_del(&prot->node);
	write_unlock(&proto_list_lock);
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_optmem_max);
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif