Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>
#include <linux/vs_socket.h>
#include <linux/vs_limit.h>
#include <linux/vs_context.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

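/*
 * Editor's note (illustrative, not in the original source): on a
 * platform where sizeof(struct sk_buff) happens to be 256 bytes,
 * _SK_MEM_OVERHEAD is 512 bytes and SK_WMEM_MAX/SK_RMEM_MAX work out
 * to 512 * 256 = 131072 bytes, i.e. 128 KiB per socket direction.
 */
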
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}

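/*
 * Editor's note (worked example, assuming HZ == 1000): a timeval of
 * { .tv_sec = 2, .tv_usec = 500000 } is converted above to
 * 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies, rounding partial
 * ticks up, while an all-zero timeval leaves *timeo_p at
 * MAX_SCHEDULE_TIMEOUT ("block forever").
 */
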
static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

        /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        /* It would deadlock if sock_queue_rcv_skb were used with the
           socket lock held! We assume that callers of this function
           are lock free.
        */
        err = sk_filter(sk, skb, 1);
        if (err)
                goto out;

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

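/*
 * Editor's sketch (not from the original source): a protocol input
 * routine typically hands a parsed skb to the socket roughly like
 *
 *      if (sock_queue_rcv_skb(sk, skb) < 0) {
 *              kfree_skb(skb);         -- queue full, or dropped by filter
 *              ...account the drop...
 *      }
 *
 * On failure the caller still owns the skb and must free it itself.
 */
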
int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk->sk_backlog_rcv(sk, skb);
        else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk=sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this. BSD doesn't, and if you think
                           about it, this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this. BSD doesn't, and if you think
                           about it, this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /*
                         * We double it on the way in to account for
                         * "struct sk_buff" etc. overhead.   Applications
                         * assume that the SO_RCVBUF setting they make will
                         * allow that much actual data to be received on that
                         * socket.
                         *
                         * Applications are unaware that "struct sk_buff" and
                         * other overheads allocate from the receive buffer
                         * during socket buffer allocation.
                         *
                         * And after considering the possible alternatives,
                         * returning the value we actually used in getsockopt
                         * is the most desirable behavior.
                         */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

                case SO_TIMESTAMP:
                        if (valbool)  {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ - 1)
                                        optlen = IFNAMSIZ - 1;
                                memset(devname, 0, sizeof(devname));
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}


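/*
 * Editor's illustrative userspace sketch (not part of the original
 * file) of the SO_RCVBUF doubling described above, assuming the
 * requested value is below sysctl_rmem_max:
 *
 *      int val = 65536, out;
 *      socklen_t len = sizeof(out);
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *      -- out is now 131072, because the kernel stored val * 2
 */
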
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec_stream(sock, optval, optlen, len);

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }
                sock_vx_init(sk);
                sock_nx_init(sk);

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        vx_sock_dec(sk);
        clr_vx_info(&sk->sk_vx_info);
        sk->sk_xid = -1;
        clr_nx_info(&sk->sk_nx_info);
        sk->sk_nid = -1;
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sock_vx_init(newsk);
                sock_nx_init(newsk);
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (sk->sk_create_child)
                        sk->sk_create_child(sk, newsk);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so invalidate
                         * the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info);
                newsk->sk_xid = sk->sk_xid;
                vx_sock_inc(newsk);
                set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info);
                newsk->sk_nid = sk->sk_nid;

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

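/*
 * Editor's note (assumption, not stated in the original source): the
 * reference count of 2 set in sk_clone() is commonly explained as one
 * reference for the protocol hash table the caller will insert newsk
 * into, plus one for the caller itself.
 */
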
void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

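/*
 * Editor's note (worked example, assuming 4 KiB pages): the thresholds
 * in sk_init() correspond to roughly 16 MiB and 512 MiB of RAM, so
 * small machines fall back to ~32 KiB socket buffer limits while large
 * ones raise the maxima to 131071 bytes.
 */
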
/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


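/*
 * Editor's note (assumption based on skb_set_owner_w()/_r() in this
 * era of the tree): the write-side destructor also drops the socket
 * reference the skb took via sock_hold(), hence the sock_put() above;
 * receive-side skbs hold no such reference, so sock_rfree() only
 * uncharges the memory.
 */
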
int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}

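/*
 * Editor's sketch (illustrative, hypothetical caller): the two helpers
 * above must be called with matching sizes so sk_omem_alloc stays
 * balanced, e.g.
 *
 *      struct ip_options *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *      if (opt) {
 *              ... use opt ...
 *              sock_kfree_s(sk, opt, optlen);
 *      }
 */
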
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, sk->sk_allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for(;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if(!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);

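/*
 * Editor's sketch (illustrative caller pattern, assumed from the
 * datagram and TCP receive paths): sk_wait_data() is called with the
 * socket locked, roughly as
 *
 *      long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *      while (skb_queue_empty(&sk->sk_receive_queue)) {
 *              if (!timeo)
 *                      return -EAGAIN;
 *              if (signal_pending(current))
 *                      return sock_intr_errno(timeo);
 *              sk_wait_data(sk, &timeo);
 *      }
 */
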
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

1402 void sock_init_data(struct socket *sock, struct sock *sk)
1403 {
1404         skb_queue_head_init(&sk->sk_receive_queue);
1405         skb_queue_head_init(&sk->sk_write_queue);
1406         skb_queue_head_init(&sk->sk_error_queue);
1407
1408         sk->sk_send_head        =       NULL;
1409
1410         init_timer(&sk->sk_timer);
1411         
1412         sk->sk_allocation       =       GFP_KERNEL;
1413         sk->sk_rcvbuf           =       sysctl_rmem_default;
1414         sk->sk_sndbuf           =       sysctl_wmem_default;
1415         sk->sk_state            =       TCP_CLOSE;
1416         sk->sk_socket           =       sock;
1417
1418         sock_set_flag(sk, SOCK_ZAPPED);
1419
1420         if(sock)
1421         {
1422                 sk->sk_type     =       sock->type;
1423                 sk->sk_sleep    =       &sock->wait;
1424                 sock->sk        =       sk;
1425         } else
1426                 sk->sk_sleep    =       NULL;
1427
1428         rwlock_init(&sk->sk_dst_lock);
1429         rwlock_init(&sk->sk_callback_lock);
1430
1431         sk->sk_state_change     =       sock_def_wakeup;
1432         sk->sk_data_ready       =       sock_def_readable;
1433         sk->sk_write_space      =       sock_def_write_space;
1434         sk->sk_error_report     =       sock_def_error_report;
1435         sk->sk_destruct         =       sock_def_destruct;
1436
1437         sk->sk_sndmsg_page      =       NULL;
1438         sk->sk_sndmsg_off       =       0;
1439
1440         sk->sk_peercred.pid     =       0;
1441         sk->sk_peercred.uid     =       -1;
1442         sk->sk_peercred.gid     =       -1;
1443         sk->sk_write_pending    =       0;
1444         sk->sk_rcvlowat         =       1;
1445         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1446         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1447
1448         sk->sk_stamp.tv_sec     = -1L;
1449         sk->sk_stamp.tv_usec    = -1L;
1450
1451         set_vx_info(&sk->sk_vx_info, current->vx_info);
1452         sk->sk_xid = vx_current_xid();
1453         vx_sock_inc(sk);
1454         set_nx_info(&sk->sk_nx_info, current->nx_info);
1455         sk->sk_nid = nx_current_nid();
1456         atomic_set(&sk->sk_refcnt, 1);
1457 }
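/*
 * [Editor's note] A condensed sketch of how an address family's create
 * routine typically uses sock_init_data(); compare inet_create() in
 * net/ipv4/af_inet.c.  PF_EXAMPLE and example_proto are hypothetical:
 */
static int example_create(struct socket *sock, int protocol)
{
	struct sock *sk;

	sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL, &example_proto, 1);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* queues, buffer sizes, callbacks */
	sk->sk_protocol = protocol;
	/* ... family-specific setup, then per-protocol init ... */
	return 0;
}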
1458
1459 void fastcall lock_sock(struct sock *sk)
1460 {
1461         might_sleep();
1462         spin_lock_bh(&(sk->sk_lock.slock));
1463         if (sk->sk_lock.owner)
1464                 __lock_sock(sk);
1465         sk->sk_lock.owner = (void *)1;
1466         spin_unlock_bh(&(sk->sk_lock.slock));
1467 }
1468
1469 EXPORT_SYMBOL(lock_sock);
1470
1471 void fastcall release_sock(struct sock *sk)
1472 {
1473         spin_lock_bh(&(sk->sk_lock.slock));
1474         if (sk->sk_backlog.tail)
1475                 __release_sock(sk);
1476         sk->sk_lock.owner = NULL;
1477         if (waitqueue_active(&(sk->sk_lock.wq)))
1478                 wake_up(&(sk->sk_lock.wq));
1479         spin_unlock_bh(&(sk->sk_lock.slock));
1480 }
1481 EXPORT_SYMBOL(release_sock);
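/*
 * [Editor's note] lock_sock()/release_sock() are the process-context half
 * of the socket lock.  While sk_lock.owner is set, softirq input must not
 * touch the socket; the receive path (compare tcp_v4_rcv() in this tree)
 * parks such packets on the backlog, which __release_sock() replays
 * before the lock is handed over.  Roughly:
 *
 *	bh_lock_sock(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = sk->sk_backlog_rcv(sk, skb);
 *	else
 *		sk_add_backlog(sk, skb);
 *	bh_unlock_sock(sk);
 */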
1482
1483 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1484 {
1485         if (!sock_flag(sk, SOCK_TIMESTAMP))
1486                 sock_enable_timestamp(sk);
1487         if (sk->sk_stamp.tv_sec == -1) 
1488                 return -ENOENT;
1489         if (sk->sk_stamp.tv_sec == 0)
1490                 do_gettimeofday(&sk->sk_stamp);
1491         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1492                 -EFAULT : 0; 
1493 }
1494 EXPORT_SYMBOL(sock_get_timestamp);
1495
1496 void sock_enable_timestamp(struct sock *sk)
1497 {       
1498         if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
1499                 sock_set_flag(sk, SOCK_TIMESTAMP);
1500                 net_enable_timestamp();
1501         }
1502 }
1503 EXPORT_SYMBOL(sock_enable_timestamp); 
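/*
 * [Editor's note] sock_get_timestamp() is the kernel side of the
 * SIOCGSTAMP ioctl.  A minimal user-space consumer (assumes a packet has
 * already arrived on fd; before that, the call fails with ENOENT):
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet: %ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */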
1504
1505 /*
1506  *      Get a socket option on a socket.
1507  *
1508  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1509  *      asynchronous errors should be reported by getsockopt. We assume
1510  *      this means if you specify SO_ERROR (otherwise what's the point of it).
1511  */
1512 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1513                            char __user *optval, int __user *optlen)
1514 {
1515         struct sock *sk = sock->sk;
1516
1517         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1518 }
1519
1520 EXPORT_SYMBOL(sock_common_getsockopt);
1521
1522 #ifdef CONFIG_COMPAT
1523 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1524                                   char __user *optval, int __user *optlen)
1525 {
1526         struct sock *sk = sock->sk;
1527
1528         if (sk->sk_prot->compat_getsockopt != NULL)
1529                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1530                                                       optval, optlen);
1531         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1532 }
1533 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1534 #endif
1535
1536 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1537                         struct msghdr *msg, size_t size, int flags)
1538 {
1539         struct sock *sk = sock->sk;
1540         int addr_len = 0;
1541         int err;
1542
1543         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1544                                    flags & ~MSG_DONTWAIT, &addr_len);
1545         if (err >= 0)
1546                 msg->msg_namelen = addr_len;
1547         return err;
1548 }
1549
1550 EXPORT_SYMBOL(sock_common_recvmsg);
1551
1552 /*
1553  *      Set socket options on an inet socket.
1554  */
1555 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1556                            char __user *optval, int optlen)
1557 {
1558         struct sock *sk = sock->sk;
1559
1560         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1561 }
1562
1563 EXPORT_SYMBOL(sock_common_setsockopt);
1564
1565 #ifdef CONFIG_COMPAT
1566 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1567                                   char __user *optval, int optlen)
1568 {
1569         struct sock *sk = sock->sk;
1570
1571         if (sk->sk_prot->compat_setsockopt != NULL)
1572                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1573                                                       optval, optlen);
1574         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1575 }
1576 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1577 #endif
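/*
 * [Editor's note] Address families plug these common wrappers straight
 * into their struct proto_ops so that the per-protocol handlers hanging
 * off sk->sk_prot are used.  A hypothetical excerpt (compare
 * inet_stream_ops in net/ipv4/af_inet.c):
 */
static struct proto_ops example_ops = {
	.family		= PF_EXAMPLE,	/* hypothetical family */
	.owner		= THIS_MODULE,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.recvmsg	= sock_common_recvmsg,
	/* ... remaining ops ... */
};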
1578
1579 void sk_common_release(struct sock *sk)
1580 {
1581         if (sk->sk_prot->destroy)
1582                 sk->sk_prot->destroy(sk);
1583
1584         /*
1585          * Observation: when sk_common_release() is called, processes no
1586          * longer have access to the socket, but the network stack still does.
1587          * Step one, detach it from networking:
1588          *
1589          * A. Remove from hash tables.
1590          */
1591
1592         sk->sk_prot->unhash(sk);
1593
1594         /*
1595          * At this point the socket cannot receive new packets, but some may
1596          * still be in flight: a CPU running the receive path may have done its
1597          * hash table lookup before we unhashed the socket. Those packets will
1598          * reach the receive queue and be purged by the socket destructor.
1599          *
1600          * We may also still have packets pending on the receive queue and,
1601          * probably, our own packets waiting in device queues. sock_destroy
1602          * will drain the receive queue, but transmitted packets will delay
1603          * socket destruction until the last reference is released.
1604          */
1605
1606         sock_orphan(sk);
1607
1608         xfrm_sk_free_policy(sk);
1609
1610         sk_refcnt_debug_release(sk);
1611         sock_put(sk);
1612 }
1613
1614 EXPORT_SYMBOL(sk_common_release);
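/*
 * [Editor's note] Protocols with no extra teardown of their own can call
 * this directly from their close hook; compare udp_close() in
 * net/ipv4/udp.c in this tree, whose body is just such a call.  Sketch
 * (example_close is hypothetical):
 */
static void example_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);	/* destroy, unhash, orphan, put */
}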
1615
1616 static DEFINE_RWLOCK(proto_list_lock);
1617 static LIST_HEAD(proto_list);
1618
1619 int proto_register(struct proto *prot, int alloc_slab)
1620 {
1621         char *request_sock_slab_name = NULL;
1622         char *timewait_sock_slab_name;
1623         int rc = -ENOBUFS;
1624
1625         if (alloc_slab) {
1626                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1627                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1628
1629                 if (prot->slab == NULL) {
1630                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1631                                prot->name);
1632                         goto out;
1633                 }
1634
1635                 if (prot->rsk_prot != NULL) {
1636                         static const char mask[] = "request_sock_%s";
1637
1638                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1639                         if (request_sock_slab_name == NULL)
1640                                 goto out_free_sock_slab;
1641
1642                         sprintf(request_sock_slab_name, mask, prot->name);
1643                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1644                                                                  prot->rsk_prot->obj_size, 0,
1645                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1646
1647                         if (prot->rsk_prot->slab == NULL) {
1648                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1649                                        prot->name);
1650                                 goto out_free_request_sock_slab_name;
1651                         }
1652                 }
1653
1654                 if (prot->twsk_prot != NULL) {
1655                         static const char mask[] = "tw_sock_%s";
1656
1657                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1658
1659                         if (timewait_sock_slab_name == NULL)
1660                                 goto out_free_request_sock_slab;
1661
1662                         sprintf(timewait_sock_slab_name, mask, prot->name);
1663                         prot->twsk_prot->twsk_slab =
1664                                 kmem_cache_create(timewait_sock_slab_name,
1665                                                   prot->twsk_prot->twsk_obj_size,
1666                                                   0, SLAB_HWCACHE_ALIGN,
1667                                                   NULL, NULL);
1668                         if (prot->twsk_prot->twsk_slab == NULL)
1669                                 goto out_free_timewait_sock_slab_name;
1670                 }
1671         }
1672
1673         write_lock(&proto_list_lock);
1674         list_add(&prot->node, &proto_list);
1675         write_unlock(&proto_list_lock);
1676         rc = 0;
1677 out:
1678         return rc;
1679 out_free_timewait_sock_slab_name:
1680         kfree(timewait_sock_slab_name);
1681 out_free_request_sock_slab:
1682         if (prot->rsk_prot && prot->rsk_prot->slab) {
1683                 kmem_cache_destroy(prot->rsk_prot->slab);
1684                 prot->rsk_prot->slab = NULL;
1685         }
1686 out_free_request_sock_slab_name:
1687         kfree(request_sock_slab_name);
1688 out_free_sock_slab:
1689         kmem_cache_destroy(prot->slab);
1690         prot->slab = NULL;
1691         goto out;
1692 }
1693
1694 EXPORT_SYMBOL(proto_register);
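/*
 * [Editor's note] Minimal registration sketch (hypothetical protocol;
 * compare inet_init() in net/ipv4/af_inet.c, which registers tcp_prot,
 * udp_prot and raw_prot this way).  Passing alloc_slab = 1 makes
 * sk_alloc() carve sockets of obj_size bytes out of a dedicated slab:
 */
struct example_sock {
	struct sock sk;		/* struct sock must be the first member */
	int example_private;	/* protocol-private state follows */
};

static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct example_sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);
}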
1695
1696 void proto_unregister(struct proto *prot)
1697 {
1698         write_lock(&proto_list_lock);
1699         list_del(&prot->node);
1700         write_unlock(&proto_list_lock);
1701
1702         if (prot->slab != NULL) {
1703                 kmem_cache_destroy(prot->slab);
1704                 prot->slab = NULL;
1705         }
1706
1707         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1708                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1709
1710                 kmem_cache_destroy(prot->rsk_prot->slab);
1711                 kfree(name);
1712                 prot->rsk_prot->slab = NULL;
1713         }
1714
1715         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1716                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1717
1718                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1719                 kfree(name);
1720                 prot->twsk_prot->twsk_slab = NULL;
1721         }
1722 }
1723
1724 EXPORT_SYMBOL(proto_unregister);
1725
1726 #ifdef CONFIG_PROC_FS
1727 static inline struct proto *__proto_head(void)
1728 {
1729         return list_entry(proto_list.next, struct proto, node);
1730 }
1731
1732 static inline struct proto *proto_head(void)
1733 {
1734         return list_empty(&proto_list) ? NULL : __proto_head();
1735 }
1736
1737 static inline struct proto *proto_next(struct proto *proto)
1738 {
1739         return proto->node.next == &proto_list ? NULL :
1740                 list_entry(proto->node.next, struct proto, node);
1741 }
1742
1743 static inline struct proto *proto_get_idx(loff_t pos)
1744 {
1745         struct proto *proto;
1746         loff_t i = 0;
1747
1748         list_for_each_entry(proto, &proto_list, node)
1749                 if (i++ == pos)
1750                         goto out;
1751
1752         proto = NULL;
1753 out:
1754         return proto;
1755 }
1756
1757 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1758 {
1759         read_lock(&proto_list_lock);
1760         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1761 }
1762
1763 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1764 {
1765         ++*pos;
1766         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1767 }
1768
1769 static void proto_seq_stop(struct seq_file *seq, void *v)
1770 {
1771         read_unlock(&proto_list_lock);
1772 }
1773
1774 static char proto_method_implemented(const void *method)
1775 {
1776         return method == NULL ? 'n' : 'y';
1777 }
1778
1779 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1780 {
1781         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1782                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1783                    proto->name,
1784                    proto->obj_size,
1785                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1786                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1787                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1788                    proto->max_header,
1789                    proto->slab == NULL ? "no" : "yes",
1790                    module_name(proto->owner),
1791                    proto_method_implemented(proto->close),
1792                    proto_method_implemented(proto->connect),
1793                    proto_method_implemented(proto->disconnect),
1794                    proto_method_implemented(proto->accept),
1795                    proto_method_implemented(proto->ioctl),
1796                    proto_method_implemented(proto->init),
1797                    proto_method_implemented(proto->destroy),
1798                    proto_method_implemented(proto->shutdown),
1799                    proto_method_implemented(proto->setsockopt),
1800                    proto_method_implemented(proto->getsockopt),
1801                    proto_method_implemented(proto->sendmsg),
1802                    proto_method_implemented(proto->recvmsg),
1803                    proto_method_implemented(proto->sendpage),
1804                    proto_method_implemented(proto->bind),
1805                    proto_method_implemented(proto->backlog_rcv),
1806                    proto_method_implemented(proto->hash),
1807                    proto_method_implemented(proto->unhash),
1808                    proto_method_implemented(proto->get_port),
1809                    proto_method_implemented(proto->enter_memory_pressure));
1810 }
1811
1812 static int proto_seq_show(struct seq_file *seq, void *v)
1813 {
1814         if (v == SEQ_START_TOKEN)
1815                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1816                            "protocol",
1817                            "size",
1818                            "sockets",
1819                            "memory",
1820                            "press",
1821                            "maxhdr",
1822                            "slab",
1823                            "module",
1824                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1825         else
1826                 proto_seq_printf(seq, v);
1827         return 0;
1828 }
1829
1830 static struct seq_operations proto_seq_ops = {
1831         .start  = proto_seq_start,
1832         .next   = proto_seq_next,
1833         .stop   = proto_seq_stop,
1834         .show   = proto_seq_show,
1835 };
1836
1837 static int proto_seq_open(struct inode *inode, struct file *file)
1838 {
1839         return seq_open(file, &proto_seq_ops);
1840 }
1841
1842 static struct file_operations proto_seq_fops = {
1843         .owner          = THIS_MODULE,
1844         .open           = proto_seq_open,
1845         .read           = seq_read,
1846         .llseek         = seq_lseek,
1847         .release        = seq_release,
1848 };
1849
1850 static int __init proto_init(void)
1851 {
1852         /* register /proc/net/protocols */
1853         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1854 }
1855
1856 subsys_initcall(proto_init);
1857
1858 #endif /* PROC_FS */
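/*
 * [Editor's note] With CONFIG_PROC_FS enabled the table built above is
 * readable as /proc/net/protocols; the two-letter columns (cl co di ...)
 * are the y/n flags from proto_method_implemented(), printed in the same
 * order as the header emitted by proto_seq_show().
 */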
1859
1860 EXPORT_SYMBOL(sk_alloc);
1861 EXPORT_SYMBOL(sk_free);
1862 EXPORT_SYMBOL(sk_send_sigurg);
1863 EXPORT_SYMBOL(sock_alloc_send_skb);
1864 EXPORT_SYMBOL(sock_init_data);
1865 EXPORT_SYMBOL(sock_kfree_s);
1866 EXPORT_SYMBOL(sock_kmalloc);
1867 EXPORT_SYMBOL(sock_no_accept);
1868 EXPORT_SYMBOL(sock_no_bind);
1869 EXPORT_SYMBOL(sock_no_connect);
1870 EXPORT_SYMBOL(sock_no_getname);
1871 EXPORT_SYMBOL(sock_no_getsockopt);
1872 EXPORT_SYMBOL(sock_no_ioctl);
1873 EXPORT_SYMBOL(sock_no_listen);
1874 EXPORT_SYMBOL(sock_no_mmap);
1875 EXPORT_SYMBOL(sock_no_poll);
1876 EXPORT_SYMBOL(sock_no_recvmsg);
1877 EXPORT_SYMBOL(sock_no_sendmsg);
1878 EXPORT_SYMBOL(sock_no_sendpage);
1879 EXPORT_SYMBOL(sock_no_setsockopt);
1880 EXPORT_SYMBOL(sock_no_shutdown);
1881 EXPORT_SYMBOL(sock_no_socketpair);
1882 EXPORT_SYMBOL(sock_rfree);
1883 EXPORT_SYMBOL(sock_setsockopt);
1884 EXPORT_SYMBOL(sock_wfree);
1885 EXPORT_SYMBOL(sock_wmalloc);
1886 EXPORT_SYMBOL(sock_i_uid);
1887 EXPORT_SYMBOL(sock_i_ino);
1888 EXPORT_SYMBOL(sysctl_optmem_max);
1889 #ifdef CONFIG_SYSCTL
1890 EXPORT_SYMBOL(sysctl_rmem_max);
1891 EXPORT_SYMBOL(sysctl_wmem_max);
1892 #endif