net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>
#include <linux/vs_socket.h>
#include <linux/vs_limit.h>
#include <linux/vs_context.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

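/* Worked example (illustrative only; the struct size is an assumption, it
 * varies by platform and config): if sizeof(struct sk_buff) were 168 bytes,
 * _SK_MEM_OVERHEAD would be 168 + 256 = 424 bytes, and SK_WMEM_MAX and
 * SK_RMEM_MAX would default to 424 * 256 = 108544 bytes, i.e. room for
 * roughly 256 small packets including their bookkeeping overhead.
 */
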
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
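
/* Conversion sketch (arithmetic only, assuming HZ == 1000 for the sake of
 * the example): a timeval of { .tv_sec = 2, .tv_usec = 500000 } yields
 * 2*1000 + (500000 + 999)/1000 = 2000 + 500 = 2500 jiffies; microseconds
 * are rounded up to the next tick by the (1000000/HZ - 1) bias.
 */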

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
        /* Silently drop if VNET is active (if INET bind() has been
         * overridden) and the context is not entitled to read the
         * packet.
         */
        if (vnet_active &&
            (int) sk->sk_xid > 0 && sk->sk_xid != skb->xid) {
                err = -EPERM;
                goto out;
        }
#endif

        /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        /* It would deadlock if sock_queue_rcv_skb were used with the
           socket lock held! We assume that users of this function are
           lock-free.
        */
        err = sk_filter(sk, skb, 1);
        if (err)
                goto out;

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
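
/* Usage sketch (illustrative, not part of this file): a datagram protocol's
 * rcv handler might hand a packet to the generic queueing code like so,
 * freeing the skb itself on failure since ownership was not transferred:
 *
 *      if (sock_queue_rcv_skb(sk, skb) < 0) {
 *              kfree_skb(skb);
 *              return 0;
 *      }
 */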

int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk->sk_backlog_rcv(sk, skb);
        else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);
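
/* Typical (illustrative) use in a transmit path, under the assumption that
 * the caller re-routes when the cached entry has gone stale:
 *
 *      struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *      if (dst == NULL) {
 *              ... perform a fresh route lookup and cache the
 *                  result with sk_dst_set(sk, dst) ...
 *      }
 */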

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk=sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this; BSD doesn't, and if you think
                           about it this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints. */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this; BSD doesn't, and if you think
                           about it this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints. */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /*
                         * We double it on the way in to account for
                         * "struct sk_buff" etc. overhead.   Applications
                         * assume that the SO_RCVBUF setting they make will
                         * allow that much actual data to be received on that
                         * socket.
                         *
                         * Applications are unaware that "struct sk_buff" and
                         * other overheads allocate from the receive buffer
                         * during socket buffer allocation.
                         *
                         * And after considering the possible alternatives,
                         * returning the value we actually used in getsockopt
                         * is the most desirable behavior.
                         */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
                case SO_SETXID:
                        if (current->xid) {
                                ret = -EPERM;
                                break;
                        }
                        if (val < 0 || val > MAX_S_CONTEXT) {
                                ret = -EINVAL;
                                break;
                        }
                        sk->sk_xid = val;
                        break;
#endif

                case SO_TIMESTAMP:
                        if (valbool)  {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ - 1)
                                        optlen = IFNAMSIZ - 1;
                                memset(devname, 0, sizeof(devname));
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                /* We implement the SO_SNDLOWAT etc. to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
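
/* Userspace-side sketch (illustrative): the doubling above means a program
 * reads back twice the buffer size it asked for. On a socket fd, assuming
 * the requested value does not exceed rmem_max:
 *
 *      int val = 65536, out; socklen_t len = sizeof(out);
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *      out is now 131072, the value actually in effect
 */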


int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec_stream(sock, optval, optlen, len);

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
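
/* Illustrative use of the SO_ERROR branch above from userspace: after a
 * non-blocking connect() completes (the socket reports writable), the
 * pending asynchronous error can be collected and cleared in one call:
 *
 *      int err; socklen_t len = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *      err is 0 on success, else the positive errno of the failure
 */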

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }
                sock_vx_init(sk);
                sock_nx_init(sk);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        vx_sock_dec(sk);
        clr_vx_info(&sk->sk_vx_info);
        sk->sk_xid = -1;
        clr_nx_info(&sk->sk_nx_info);
        sk->sk_nid = -1;
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sock_vx_init(newsk);
                sock_nx_init(newsk);
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (sk->sk_create_child)
                        sk->sk_create_child(sk, newsk);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info);
                newsk->sk_xid = sk->sk_xid;
                vx_sock_inc(newsk);
                set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info);
                newsk->sk_nid = sk->sk_nid;

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child was always incrementing the
                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
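
/* Pairing sketch (illustrative; "struct foo" is a made-up type): the caller
 * must pass sock_kfree_s() the same size it gave sock_kmalloc(), since the
 * option-memory accounting is by size, not by pointer:
 *
 *      struct foo *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *      if (opt) {
 *              ... use opt ...
 *              sock_kfree_s(sk, opt, sizeof(*opt));
 *      }
 */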

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, sk->sk_allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
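
/* Usage sketch (illustrative; payload_len and hh_len are placeholder names,
 * not from this file): a datagram sender would typically reserve header room
 * and copy the payload in afterwards:
 *
 *      skb = sock_alloc_send_skb(sk, payload_len + hh_len, noblock, &err);
 *      if (skb == NULL)
 *              goto out;       err then holds -EAGAIN, -EPIPE or similar
 */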

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for(;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if(!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
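
/* Receive-loop sketch (illustrative; caller is assumed to hold the socket
 * lock, with timeo obtained from sock_rcvtimeo(sk, noblock)): a blocking
 * recvmsg implementation might spin on the queue like so:
 *
 *      while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
 *              if (!timeo || signal_pending(current))
 *                      break;
 *              sk_wait_data(sk, &timeo);
 *      }
 */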

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}
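
/* Sketch of how a protocol might plug these defaults into its proto_ops
 * (illustrative; "foo" is a made-up family, not from this file):
 *
 *      static struct proto_ops foo_ops = {
 *              .family     = PF_FOO,
 *              .bind       = foo_bind,
 *              .socketpair = sock_no_socketpair,
 *              .mmap       = sock_no_mmap,
 *              .sendpage   = sock_no_sendpage,
 *              ...
 *      };
 */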

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
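
/* Refcount sketch (illustrative; foo_timer_handler is a made-up name):
 * sk_reset_timer() takes a reference on the sock only when the timer was
 * not already pending, so a handler that does not rearm must drop it:
 *
 *      static void foo_timer_handler(unsigned long data)
 *      {
 *              struct sock *sk = (struct sock *)data;
 *              ... do work ...
 *              sock_put(sk);   matches the sock_hold() in sk_reset_timer()
 *      }
 */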
1427
void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);

        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp.tv_sec     = -1L;
        sk->sk_stamp.tv_usec    = -1L;

        set_vx_info(&sk->sk_vx_info, current->vx_info);
        sk->sk_xid = vx_current_xid();
        vx_sock_inc(sk);
        set_nx_info(&sk->sk_nx_info, current->nx_info);
        sk->sk_nid = nx_current_nid();
        atomic_set(&sk->sk_refcnt, 1);
}

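/*
 *      Take the socket lock from process context. May sleep while a
 *      previous owner finishes; bottom halves are kept out only while
 *      ownership is being transferred.
 */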
void fastcall lock_sock(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_lock.owner)
                __lock_sock(sk);
        sk->sk_lock.owner = (void *)1;
        spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

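/*
 *      Release the socket lock: first process any packets that were
 *      queued on the backlog while the lock was held, then wake up the
 *      next waiter.
 */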
void fastcall release_sock(struct sock *sk)
{
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_backlog.tail)
                __release_sock(sk);
        sk->sk_lock.owner = NULL;
        if (waitqueue_active(&(sk->sk_lock.wq)))
                wake_up(&(sk->sk_lock.wq));
        spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);

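/*
 *      Copy the timestamp of the last packet received on this socket to
 *      user space, enabling timestamping on first use. Returns -ENOENT
 *      if no packet has been timestamped yet.
 */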
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk);
        if (sk->sk_stamp.tv_sec == -1)
                return -ENOENT;
        if (sk->sk_stamp.tv_sec == 0)
                do_gettimeofday(&sk->sk_stamp);
        return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
                -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

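/*
 *      Turn on receive timestamping for this socket, bumping the global
 *      timestamp reference count the first time only.
 */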
void sock_enable_timestamp(struct sock *sk)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        }
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means that they are reported only if you request SO_ERROR
 *      (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt != NULL)
                return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

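/*
 *      Generic recvmsg wrapper: call the protocol's recvmsg and, on
 *      success, report the true length of the source address back in
 *      msg_namelen.
 */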
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
                        struct msghdr *msg, size_t size, int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err;

        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &addr_len);
        if (err >= 0)
                msg->msg_namelen = addr_len;
        return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *      Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt != NULL)
                return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

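/*
 *      Common tail of a protocol's close path: run the protocol
 *      destructor, unhash the socket, detach it from its struct socket
 *      and drop the caller's reference.
 */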
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the network stack still does.
         * Step one, detach it from networking:
         *
         * A. Remove it from the hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs
         * the receiver and did the hash table lookup before we unhashed
         * the socket. They will reach the receive queue and will be
         * purged by the socket destructor.
         *
         * We also still have packets pending on the receive queue and,
         * probably, our own packets waiting in device queues. The socket
         * destructor will drain the receive queue, but transmitted
         * packets will delay socket destruction until the last reference
         * is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);
        sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

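/*
 *      Register a protocol with the socket layer. With alloc_slab set,
 *      also create the slab caches for the protocol's sock, request_sock
 *      and timewait_sock objects, unwinding all of them on failure.
 */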
int proto_register(struct proto *prot, int alloc_slab)
{
        char *request_sock_slab_name = NULL;
        char *timewait_sock_slab_name;
        int rc = -ENOBUFS;

        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                               SLAB_HWCACHE_ALIGN, NULL, NULL);

                if (prot->slab == NULL) {
                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                               prot->name);
                        goto out;
                }

                if (prot->rsk_prot != NULL) {
                        static const char mask[] = "request_sock_%s";

                        request_sock_slab_name = kmalloc(strlen(prot->name) +
                                                         sizeof(mask) - 1,
                                                         GFP_KERNEL);
                        if (request_sock_slab_name == NULL)
                                goto out_free_sock_slab;

                        sprintf(request_sock_slab_name, mask, prot->name);
                        prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
                                                                 prot->rsk_prot->obj_size, 0,
                                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);

                        if (prot->rsk_prot->slab == NULL) {
                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
                                       prot->name);
                                goto out_free_request_sock_slab_name;
                        }
                }

                if (prot->twsk_prot != NULL) {
                        static const char mask[] = "tw_sock_%s";

                        timewait_sock_slab_name = kmalloc(strlen(prot->name) +
                                                          sizeof(mask) - 1,
                                                          GFP_KERNEL);
                        if (timewait_sock_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        sprintf(timewait_sock_slab_name, mask, prot->name);
                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(timewait_sock_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0, SLAB_HWCACHE_ALIGN,
                                                  NULL, NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
                }
        }

        write_lock(&proto_list_lock);
        list_add(&prot->node, &proto_list);
        write_unlock(&proto_list_lock);
        rc = 0;
out:
        return rc;
out_free_timewait_sock_slab_name:
        kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
        if (prot->rsk_prot && prot->rsk_prot->slab) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                prot->rsk_prot->slab = NULL;
        }
out_free_request_sock_slab_name:
        kfree(request_sock_slab_name);
out_free_sock_slab:
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
        goto out;
}

EXPORT_SYMBOL(proto_register);

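/*
 *      Remove a protocol from the global list and destroy its slab
 *      caches, including the dynamically allocated cache names.
 */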
void proto_unregister(struct proto *prot)
{
        write_lock(&proto_list_lock);
        list_del(&prot->node);
        write_unlock(&proto_list_lock);

        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }

        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
                const char *name = kmem_cache_name(prot->rsk_prot->slab);

                kmem_cache_destroy(prot->rsk_prot->slab);
                kfree(name);
                prot->rsk_prot->slab = NULL;
        }

        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
        return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
        return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
        return proto->node.next == &proto_list ? NULL :
                list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
        struct proto *proto;
        loff_t i = 0;

        list_for_each_entry(proto, &proto_list, node)
                if (i++ == pos)
                        goto out;

        proto = NULL;
out:
        return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&proto_list_lock);
        return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, v);
        return 0;
}

static struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int __init proto_init(void)
{
        /* register /proc/net/protocols */
        return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif