net/core/sock.c (linux 2.6.16.38 w/ vs2.0.3-rc1)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>
#include <linux/vs_socket.h>
#include <linux/vs_limit.h>
#include <linux/vs_context.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

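/*
 * Editorial worked example (illustrative; the real figure depends on the
 * platform's sizeof(struct sk_buff)): if struct sk_buff were 256 bytes,
 * SK_WMEM_MAX would default to (256 + 256) * 256 = 131072 bytes, i.e.
 * enough charge for 256 queued packets including their metadata overhead.
 */
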
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
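
/*
 * Editorial worked example (illustrative): with HZ == 100, a user-supplied
 * timeval of { .tv_sec = 2, .tv_usec = 5000 } becomes
 * 2*100 + (5000 + 9999)/10000 = 201 jiffies, i.e. the microseconds are
 * rounded up to the next tick. A zero timeval leaves MAX_SCHEDULE_TIMEOUT
 * in place, meaning "block forever".
 */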

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk=sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this; BSD doesn't, and if you
                           think about it, this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints. */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this; BSD doesn't, and if you
                           think about it, this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints. */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /* FIXME: is this lower bound the right one? */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
                case SO_SETXID:
                        if (current->xid) {
                                ret = -EPERM;
                                break;
                        }
                        if (val < 0 || val > MAX_S_CONTEXT) {
                                ret = -EINVAL;
                                break;
                        }
                        sk->sk_xid = val;
                        break;
#endif

                case SO_TIMESTAMP:
                        if (valbool)  {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ - 1)
                                        optlen = IFNAMSIZ - 1;
                                memset(devname, 0, sizeof(devname));
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                /* We implement SO_SNDLOWAT etc. as not settable
                   (1003.1g 5.3). */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
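
/*
 * Editorial usage sketch (userspace side, not part of this file): SO_SNDBUF
 * stores twice the requested value, clamped to sysctl_wmem_max, so reading
 * the option back returns the doubled figure:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
 *	// val is now 131072, assuming 65536 <= sysctl_wmem_max
 */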


int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec(sock, optval, optlen, len);

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
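
/*
 * Editorial note with a sketched caller (userspace, assumed): SO_ERROR is
 * read-and-clear, since sock_error() uses xchg() on sk->sk_err. A typical
 * check after a non-blocking connect():
 *
 *	int err = 0;
 *	socklen_t elen = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen);
 *	// err holds the pending error (0 if none); the socket's error is reset
 */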

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }
                sock_vx_init(sk);
                sock_nx_init(sk);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}
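
/*
 * Editorial sketch of a typical caller (names are illustrative, not taken
 * from this file): a protocol family's create handler pairs sk_alloc()
 * with sock_init_data(), defined further below, roughly:
 *
 *	sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);  // my_proto: hypothetical struct proto
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */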

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        vx_sock_dec(sk);
        clr_vx_info(&sk->sk_vx_info);
        sk->sk_xid = -1;
        clr_nx_info(&sk->sk_nx_info);
        sk->sk_nid = -1;
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sock_vx_init(newsk);
                sock_nx_init(newsk);
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);
                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so
                         * invalidate the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info);
                newsk->sk_xid = sk->sk_xid;
                vx_sock_inc(newsk);
                set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info);
                newsk->sk_nid = sk->sk_nid;

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
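
/*
 * Editorial usage sketch (sizes are illustrative): memory obtained from
 * sock_kmalloc() is charged to sk->sk_omem_alloc, so it must be returned
 * with sock_kfree_s() passing the same size for the counter to balance:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */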

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, gfp_mask);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
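
/*
 * Editorial illustrative call (parameter names assumed): a datagram sender
 * typically asks for header plus payload in the linear area,
 *
 *	skb = sock_alloc_send_skb(sk, hlen + dlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *
 * which blocks until sk_wmem_alloc drops below sk_sndbuf, the send timeout
 * expires (*errcode set to -EAGAIN), or a signal arrives.
 */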

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for(;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if(!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Socket state, including sk->sk_err, is now changed only under the lock,
 * hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
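
/*
 * Editorial sketch of a typical receive-side caller (simplified, not
 * verbatim from any one protocol): loop until data shows up, honouring the
 * timeout and pending signals:
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */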

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}
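
/*
 * Editorial sketch of how the sock_no_* stubs are meant to be used (the
 * family and ops names are hypothetical): a protocol fills the struct
 * proto_ops slots it does not support, e.g. for a datagram-only family:
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family   = PF_PACKET,		// placeholder family
 *		.accept   = sock_no_accept,
 *		.listen   = sock_no_listen,
 *		.mmap     = sock_no_mmap,
 *		.sendpage = sock_no_sendpage,
 *		// remaining slots point at real handlers
 *	};
 */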

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
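
/*
 * Editorial sketch of the intended pairing (handler name is hypothetical):
 * sk_reset_timer() takes a reference when it newly arms the timer, and the
 * expiry handler (or sk_stop_timer()) drops it:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	static void my_timer_handler(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *		...
 *		sock_put(sk);	// balance the hold taken by sk_reset_timer()
 *	}
 */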

void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        if(sock)
        {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);

        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp.tv_sec     = -1L;
        sk->sk_stamp.tv_usec    = -1L;

        set_vx_info(&sk->sk_vx_info, current->vx_info);
        sk->sk_xid = vx_current_xid();
        vx_sock_inc(sk);
        set_nx_info(&sk->sk_nx_info, current->nx_info);
        sk->sk_nid = nx_current_nid();
        atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_lock.owner)
                __lock_sock(sk);
        sk->sk_lock.owner = (void *)1;
        spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_backlog.tail)
                __release_sock(sk);
        sk->sk_lock.owner = NULL;
        if (waitqueue_active(&(sk->sk_lock.wq)))
                wake_up(&(sk->sk_lock.wq));
        spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);
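
/*
 * Editorial note on canonical usage (sketch): process context brackets
 * socket state changes with this pair; packets that arrive while the lock
 * is owned are queued on sk->sk_backlog and replayed by __release_sock():
 *
 *	lock_sock(sk);
 *	// modify socket state safely here
 *	release_sock(sk);
 */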

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk);
        if (sk->sk_stamp.tv_sec == -1)
                return -ENOENT;
        if (sk->sk_stamp.tv_sec == 0)
                do_gettimeofday(&sk->sk_stamp);
        return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
                -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        }
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 */
1415 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1416                            char __user *optval, int __user *optlen)
1417 {
1418         struct sock *sk = sock->sk;
1419
1420         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1421 }
1422
1423 EXPORT_SYMBOL(sock_common_getsockopt);
1424
1425 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1426                         struct msghdr *msg, size_t size, int flags)
1427 {
1428         struct sock *sk = sock->sk;
1429         int addr_len = 0;
1430         int err;
1431
1432         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1433                                    flags & ~MSG_DONTWAIT, &addr_len);
1434         if (err >= 0)
1435                 msg->msg_namelen = addr_len;
1436         return err;
1437 }
1438
1439 EXPORT_SYMBOL(sock_common_recvmsg);
1440
1441 /*
1442  *      Set socket options on an inet socket.
1443  */
1444 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1445                            char __user *optval, int optlen)
1446 {
1447         struct sock *sk = sock->sk;
1448
1449         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1450 }
1451
1452 EXPORT_SYMBOL(sock_common_setsockopt);
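
/*
 * Usage sketch (illustrative): these sock_common_* helpers let an
 * address family forward struct proto_ops calls directly to the
 * underlying struct proto. A hypothetical protocol might wire them up
 * like this (only the sock_common_* names are real):
 *
 *	static struct proto_ops example_stream_ops = {
 *		...
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 */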
1453
1454 void sk_common_release(struct sock *sk)
1455 {
1456         if (sk->sk_prot->destroy)
1457                 sk->sk_prot->destroy(sk);
1458
1459         /*
1460          * Observation: when sk_common_release is called, processes have
1461          * no access to the socket, but the network stack still does.
1462          * Step one, detach it from networking:
1463          *
1464          * A. Remove from hash tables.
1465          */
1466
1467         sk->sk_prot->unhash(sk);
1468
1469         /*
1470          * At this point the socket cannot receive new packets, but some may
1471          * still be in flight because another CPU ran the receiver and did its
1472          * hash table lookup before we unhashed the socket. Those packets will
1473          * reach the receive queue and be purged by the socket destructor.
1474          *
1475          * We also still have packets pending on the receive queue and,
1476          * probably, our own packets waiting in device queues. sock_destroy
1477          * will drain the receive queue, but transmitted packets will delay
1478          * socket destruction until the last reference is released.
1479          */
1480
1481         sock_orphan(sk);
1482
1483         xfrm_sk_free_policy(sk);
1484
1485         sk_refcnt_debug_release(sk);
1486         sock_put(sk);
1487 }
1488
1489 EXPORT_SYMBOL(sk_common_release);
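
/*
 * Usage sketch (illustrative): a protocol's close handler typically
 * finishes by handing the socket to sk_common_release() once any
 * protocol-private teardown is done. The function below is hypothetical:
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		... protocol-specific shutdown work ...
 *		sk_common_release(sk);
 *	}
 */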
1490
1491 static DEFINE_RWLOCK(proto_list_lock);
1492 static LIST_HEAD(proto_list);
1493
1494 int proto_register(struct proto *prot, int alloc_slab)
1495 {
1496         char *request_sock_slab_name = NULL;
1497         char *timewait_sock_slab_name;
1498         int rc = -ENOBUFS;
1499
1500         if (alloc_slab) {
1501                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1502                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1503
1504                 if (prot->slab == NULL) {
1505                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1506                                prot->name);
1507                         goto out;
1508                 }
1509
1510                 if (prot->rsk_prot != NULL) {
1511                         static const char mask[] = "request_sock_%s";
1512
1513                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1514                         if (request_sock_slab_name == NULL)
1515                                 goto out_free_sock_slab;
1516
1517                         sprintf(request_sock_slab_name, mask, prot->name);
1518                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1519                                                                  prot->rsk_prot->obj_size, 0,
1520                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1521
1522                         if (prot->rsk_prot->slab == NULL) {
1523                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1524                                        prot->name);
1525                                 goto out_free_request_sock_slab_name;
1526                         }
1527                 }
1528
1529                 if (prot->twsk_prot != NULL) {
1530                         static const char mask[] = "tw_sock_%s";
1531
1532                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1533
1534                         if (timewait_sock_slab_name == NULL)
1535                                 goto out_free_request_sock_slab;
1536
1537                         sprintf(timewait_sock_slab_name, mask, prot->name);
1538                         prot->twsk_prot->twsk_slab =
1539                                 kmem_cache_create(timewait_sock_slab_name,
1540                                                   prot->twsk_prot->twsk_obj_size,
1541                                                   0, SLAB_HWCACHE_ALIGN,
1542                                                   NULL, NULL);
1543                         if (prot->twsk_prot->twsk_slab == NULL)
1544                                 goto out_free_timewait_sock_slab_name;
1545                 }
1546         }
1547
1548         write_lock(&proto_list_lock);
1549         list_add(&prot->node, &proto_list);
1550         write_unlock(&proto_list_lock);
1551         rc = 0;
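        /*
         * Error unwind: the labels below intentionally fall through, so
         * jumping to any one of them releases that allocation and every
         * earlier one before rejoining at out above.
         */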
1552 out:
1553         return rc;
1554 out_free_timewait_sock_slab_name:
1555         kfree(timewait_sock_slab_name);
1556 out_free_request_sock_slab:
1557         if (prot->rsk_prot && prot->rsk_prot->slab) {
1558                 kmem_cache_destroy(prot->rsk_prot->slab);
1559                 prot->rsk_prot->slab = NULL;
1560         }
1561 out_free_request_sock_slab_name:
1562         kfree(request_sock_slab_name);
1563 out_free_sock_slab:
1564         kmem_cache_destroy(prot->slab);
1565         prot->slab = NULL;
1566         goto out;
1567 }
1568
1569 EXPORT_SYMBOL(proto_register);
1570
1571 void proto_unregister(struct proto *prot)
1572 {
1573         write_lock(&proto_list_lock);
1574         list_del(&prot->node);
1575         write_unlock(&proto_list_lock);
1576
1577         if (prot->slab != NULL) {
1578                 kmem_cache_destroy(prot->slab);
1579                 prot->slab = NULL;
1580         }
1581
1582         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1583                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1584
1585                 kmem_cache_destroy(prot->rsk_prot->slab);
1586                 kfree(name);
1587                 prot->rsk_prot->slab = NULL;
1588         }
1589
1590         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1591                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1592
1593                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1594                 kfree(name);
1595                 prot->twsk_prot->twsk_slab = NULL;
1596         }
1597 }
1598
1599 EXPORT_SYMBOL(proto_unregister);
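
/*
 * Usage sketch (illustrative): a protocol module pairs these calls in
 * its init/exit paths; everything below except proto_register() and
 * proto_unregister() is hypothetical.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 *
 * Passing 1 as the second argument asks proto_register() to create the
 * backing slab cache from .name and .obj_size, as seen above.
 */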
1600
1601 #ifdef CONFIG_PROC_FS
1602 static inline struct proto *__proto_head(void)
1603 {
1604         return list_entry(proto_list.next, struct proto, node);
1605 }
1606
1607 static inline struct proto *proto_head(void)
1608 {
1609         return list_empty(&proto_list) ? NULL : __proto_head();
1610 }
1611
1612 static inline struct proto *proto_next(struct proto *proto)
1613 {
1614         return proto->node.next == &proto_list ? NULL :
1615                 list_entry(proto->node.next, struct proto, node);
1616 }
1617
1618 static inline struct proto *proto_get_idx(loff_t pos)
1619 {
1620         struct proto *proto;
1621         loff_t i = 0;
1622
1623         list_for_each_entry(proto, &proto_list, node)
1624                 if (i++ == pos)
1625                         goto out;
1626
1627         proto = NULL;
1628 out:
1629         return proto;
1630 }
1631
1632 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1633 {
1634         read_lock(&proto_list_lock);
1635         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1636 }
1637
1638 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1639 {
1640         ++*pos;
1641         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1642 }
1643
1644 static void proto_seq_stop(struct seq_file *seq, void *v)
1645 {
1646         read_unlock(&proto_list_lock);
1647 }
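
/*
 * Note: proto_seq_start() takes proto_list_lock and proto_seq_stop()
 * drops it, so the read lock is held across the whole traversal, and
 * SEQ_START_TOKEN makes proto_seq_show() emit the header exactly once.
 */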
1648
1649 static char proto_method_implemented(const void *method)
1650 {
1651         return method == NULL ? 'n' : 'y';
1652 }
1653
1654 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1655 {
1656         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1657                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1658                    proto->name,
1659                    proto->obj_size,
1660                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1661                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1662                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1663                    proto->max_header,
1664                    proto->slab == NULL ? "no" : "yes",
1665                    module_name(proto->owner),
1666                    proto_method_implemented(proto->close),
1667                    proto_method_implemented(proto->connect),
1668                    proto_method_implemented(proto->disconnect),
1669                    proto_method_implemented(proto->accept),
1670                    proto_method_implemented(proto->ioctl),
1671                    proto_method_implemented(proto->init),
1672                    proto_method_implemented(proto->destroy),
1673                    proto_method_implemented(proto->shutdown),
1674                    proto_method_implemented(proto->setsockopt),
1675                    proto_method_implemented(proto->getsockopt),
1676                    proto_method_implemented(proto->sendmsg),
1677                    proto_method_implemented(proto->recvmsg),
1678                    proto_method_implemented(proto->sendpage),
1679                    proto_method_implemented(proto->bind),
1680                    proto_method_implemented(proto->backlog_rcv),
1681                    proto_method_implemented(proto->hash),
1682                    proto_method_implemented(proto->unhash),
1683                    proto_method_implemented(proto->get_port),
1684                    proto_method_implemented(proto->enter_memory_pressure));
1685 }
1686
1687 static int proto_seq_show(struct seq_file *seq, void *v)
1688 {
1689         if (v == SEQ_START_TOKEN)
1690                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1691                            "protocol",
1692                            "size",
1693                            "sockets",
1694                            "memory",
1695                            "press",
1696                            "maxhdr",
1697                            "slab",
1698                            "module",
1699                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1700         else
1701                 proto_seq_printf(seq, v);
1702         return 0;
1703 }
1704
1705 static struct seq_operations proto_seq_ops = {
1706         .start  = proto_seq_start,
1707         .next   = proto_seq_next,
1708         .stop   = proto_seq_stop,
1709         .show   = proto_seq_show,
1710 };
1711
1712 static int proto_seq_open(struct inode *inode, struct file *file)
1713 {
1714         return seq_open(file, &proto_seq_ops);
1715 }
1716
1717 static struct file_operations proto_seq_fops = {
1718         .owner          = THIS_MODULE,
1719         .open           = proto_seq_open,
1720         .read           = seq_read,
1721         .llseek         = seq_lseek,
1722         .release        = seq_release,
1723 };
1724
1725 static int __init proto_init(void)
1726 {
1727         /* register /proc/net/protocols */
1728         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1729 }
1730
1731 subsys_initcall(proto_init);
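
/*
 * From userspace the table can be read with e.g. "cat
 * /proc/net/protocols"; each row reports the per-protocol sizes and
 * counters plus a y/n flag for every method, in the column order
 * printed by proto_seq_printf() above.
 */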
1732
1733 #endif /* PROC_FS */
1734
1735 EXPORT_SYMBOL(sk_alloc);
1736 EXPORT_SYMBOL(sk_free);
1737 EXPORT_SYMBOL(sk_send_sigurg);
1738 EXPORT_SYMBOL(sock_alloc_send_skb);
1739 EXPORT_SYMBOL(sock_init_data);
1740 EXPORT_SYMBOL(sock_kfree_s);
1741 EXPORT_SYMBOL(sock_kmalloc);
1742 EXPORT_SYMBOL(sock_no_accept);
1743 EXPORT_SYMBOL(sock_no_bind);
1744 EXPORT_SYMBOL(sock_no_connect);
1745 EXPORT_SYMBOL(sock_no_getname);
1746 EXPORT_SYMBOL(sock_no_getsockopt);
1747 EXPORT_SYMBOL(sock_no_ioctl);
1748 EXPORT_SYMBOL(sock_no_listen);
1749 EXPORT_SYMBOL(sock_no_mmap);
1750 EXPORT_SYMBOL(sock_no_poll);
1751 EXPORT_SYMBOL(sock_no_recvmsg);
1752 EXPORT_SYMBOL(sock_no_sendmsg);
1753 EXPORT_SYMBOL(sock_no_sendpage);
1754 EXPORT_SYMBOL(sock_no_setsockopt);
1755 EXPORT_SYMBOL(sock_no_shutdown);
1756 EXPORT_SYMBOL(sock_no_socketpair);
1757 EXPORT_SYMBOL(sock_rfree);
1758 EXPORT_SYMBOL(sock_setsockopt);
1759 EXPORT_SYMBOL(sock_wfree);
1760 EXPORT_SYMBOL(sock_wmalloc);
1761 EXPORT_SYMBOL(sock_i_uid);
1762 EXPORT_SYMBOL(sock_i_ino);
1763 EXPORT_SYMBOL(sysctl_optmem_max);
1764 #ifdef CONFIG_SYSCTL
1765 EXPORT_SYMBOL(sysctl_rmem_max);
1766 EXPORT_SYMBOL(sysctl_wmem_max);
1767 #endif