vserver 1.9.5.x5
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:       
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and 
38  *                                      packet_set_ring memory leak.
39  *
40  *              This program is free software; you can redistribute it and/or
41  *              modify it under the terms of the GNU General Public License
42  *              as published by the Free Software Foundation; either version
43  *              2 of the License, or (at your option) any later version.
44  *
45  */
46  
47 #include <linux/config.h>
48 #include <linux/types.h>
49 #include <linux/sched.h>
50 #include <linux/mm.h>
51 #include <linux/fcntl.h>
52 #include <linux/socket.h>
53 #include <linux/in.h>
54 #include <linux/inet.h>
55 #include <linux/netdevice.h>
56 #include <linux/if_packet.h>
57 #include <linux/wireless.h>
58 #include <linux/kmod.h>
59 #include <net/ip.h>
60 #include <net/protocol.h>
61 #include <linux/skbuff.h>
62 #include <net/sock.h>
63 #include <linux/errno.h>
64 #include <linux/timer.h>
65 #include <asm/system.h>
66 #include <asm/uaccess.h>
67 #include <asm/ioctls.h>
68 #include <asm/page.h>
69 #include <asm/io.h>
70 #include <linux/proc_fs.h>
71 #include <linux/seq_file.h>
72 #include <linux/poll.h>
73 #include <linux/module.h>
74 #include <linux/init.h>
75
76 #ifdef CONFIG_INET
77 #include <net/inet_common.h>
78 #endif
79
80 #define CONFIG_SOCK_PACKET      1
81
82 /*
83    Proposed replacement for SIOC{ADD,DEL}MULTI and
84    IFF_PROMISC, IFF_ALLMULTI flags.
85
86    It is more expensive, but I believe,
87    it is a really correct solution: reentrant, safe and fault tolerant.
88
89    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
90    reference count and global flag, so that real status is
91    (gflag|(count != 0)), so that we can use obsolete faulty interface
92    not harming clever users.
93  */
94 #define CONFIG_PACKET_MULTICAST 1
95
96 /*
97    Assumptions:
98    - if device has no dev->hard_header routine, it adds and removes ll header
99      inside itself. In this case ll header is invisible outside of device,
100      but higher levels still should reserve dev->hard_header_len.
101      Some devices are clever enough to reallocate the skb, when the header
102      will not fit to reserved space (tunnel), another ones are silly
103      (PPP).
104    - packet socket receives packets with pulled ll header,
105      so that SOCK_RAW should push it back.
106
107 On receive:
108 -----------
109
110 Incoming, dev->hard_header!=NULL
111    mac.raw -> ll header
112    data    -> data
113
114 Outgoing, dev->hard_header!=NULL
115    mac.raw -> ll header
116    data    -> ll header
117
118 Incoming, dev->hard_header==NULL
119    mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
120               PPP does this, which is wrong, because it introduces asymmetry
121               between rx and tx paths.
122    data    -> data
123
124 Outgoing, dev->hard_header==NULL
125    mac.raw -> data. ll header is still not built!
126    data    -> data
127
128 Resume
129   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
130
131
132 On transmit:
133 ------------
134
135 dev->hard_header != NULL
136    mac.raw -> ll header
137    data    -> ll header
138
139 dev->hard_header == NULL (ll header is added by device, we cannot control it)
140    mac.raw -> data
141    data -> data
142
143    We should set nh.raw on output to the correct position,
144    packet classifier depends on it.
145  */
146
147 /* List of all packet sockets. */
148 static HLIST_HEAD(packet_sklist);
149 static DEFINE_RWLOCK(packet_sklist_lock);
150
151 static atomic_t packet_socks_nr;
152
153
154 /* Private packet socket structures. */
155
156 #ifdef CONFIG_PACKET_MULTICAST
/* One multicast/membership request made on a packet socket for a given
 * device.  Requests are chained per socket (see packet_opt.mclist) so
 * they can be flushed on close via packet_flush_mclist(). */
struct packet_mclist
{
	struct packet_mclist	*next;		/* next request on this socket */
	int			ifindex;	/* device the request applies to */
	int			count;		/* times this same request was made */
	unsigned short		type;		/* request type (PACKET_MR_*-style; handler not visible here) */
	unsigned short		alen;		/* valid length of addr[] */
	unsigned char		addr[8];	/* hardware address of the group */
};
166 #endif
167 #ifdef CONFIG_PACKET_MMAP
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170
171 static void packet_flush_mclist(struct sock *sk);
172
/* Private per-socket state for AF_PACKET sockets, hung off
 * sk->sk_protinfo and reached through the pkt_sk() accessor below. */
struct packet_opt
{
	struct tpacket_stats	stats;		/* tp_packets/tp_drops; guarded by sk_receive_queue.lock */
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;	/* vector of page blocks backing the rx ring */
	unsigned int		head;		/* index of the next ring frame to fill */
	unsigned int		frames_per_block; /* ring frames held in one pg_vec block */
	unsigned int		frame_size;	/* bytes per ring frame */
	unsigned int		frame_max;	/* highest valid frame index (head wraps past it) */
	int			copy_thresh;	/* if set, oversize frames also get a full queued copy */
#endif
	struct packet_type	prot_hook;	/* our hook into the device RX path */
	spinlock_t		bind_lock;	/* serializes attach/detach of prot_hook */
	char			running;	/* prot_hook is attached */
	int			ifindex;	/* bound device         */
	unsigned short		num;		/* protocol number the socket is bound to */
#ifdef CONFIG_PACKET_MULTICAST
	struct packet_mclist	*mclist;	/* membership requests, flushed on release */
#endif
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;		/* active userspace mappings of the ring */
	unsigned int		pg_vec_order;	/* page allocation order of each block */
	unsigned int		pg_vec_pages;	/* pages per block */
	unsigned int		pg_vec_len;	/* number of blocks in pg_vec */
#endif
};
199
200 #ifdef CONFIG_PACKET_MMAP
201
202 static inline char *packet_lookup_frame(struct packet_opt *po, unsigned int position)
203 {
204         unsigned int pg_vec_pos, frame_offset;
205         char *frame;
206
207         pg_vec_pos = position / po->frames_per_block;
208         frame_offset = position % po->frames_per_block;
209
210         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
211         
212         return frame;
213 }
214 #endif
215
216 #define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)
217
218 static void packet_sock_destruct(struct sock *sk)
219 {
220         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
221         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
222
223         if (!sock_flag(sk, SOCK_DEAD)) {
224                 printk("Attempt to release alive packet socket: %p\n", sk);
225                 return;
226         }
227
228         if (pkt_sk(sk))
229                 kfree(pkt_sk(sk));
230         atomic_dec(&packet_socks_nr);
231 #ifdef PACKET_REFCNT_DEBUG
232         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
233 #endif
234 }
235
236
237 static struct proto_ops packet_ops;
238
239 #ifdef CONFIG_SOCK_PACKET
240 static struct proto_ops packet_ops_spkt;
241
/*
 * SOCK_PACKET receive handler, called from the device RX path for every
 * frame matched by this socket's protocol hook.  Queues a private copy
 * of the frame on the owning socket, stashing the originating device's
 * identity in skb->cb for recvmsg to hand back as the address.  Always
 * returns 0; a failed delivery simply drops the frame.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;
	
	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* We may be handed a shared skb; get a private one before mangling it */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* The source address rides in the skb control buffer until recvmsg */
	spkt = (struct sockaddr_pkt*)skb->cb;

	skb_push(skb, skb->data-skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk,skb) == 0)
		return 0;

out:
	/* queue failure falls through here too: free the frame, report success */
	kfree_skb(skb);
oom:
	return 0;
}
300
301
302 /*
303  *      Output a raw packet to a device layer. This bypasses all the other
304  *      protocol layers and you must therefore supply it with a complete frame
305  */
306  
static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned short proto=0;
	int err;
	
	/*
	 *	Get and verify the address.  SOCK_PACKET has no notion of a
	 *	connected peer, so the caller must always supply one.
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return(-EINVAL);
		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
			proto=saddr->spkt_protocol;
	}
	else
		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it 
	 */

	/* Force NUL termination of the user-supplied device name.  Index 13
	 * is presumably the last byte of spkt_device — TODO(review): confirm
	 * against the struct and prefer sizeof()-1. */
	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;
	
	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */
	 
	err = -EMSGSIZE;
	if(len>dev->mtu+dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */
	 
	if (skb == NULL) 
		goto out_unlock;

	/*
	 *	Fill it in 
	 */
	 
	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	/* Try to align data part correctly */
	if (dev->hard_header) {
		/* The user data already contains the ll header: rewind
		 * data/tail so the frame starts at the header itself. */
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb->nh.raw = skb->data;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	/* dev may be NULL when we arrive here from the lookup failure */
	if (dev)
		dev_put(dev);
	return err;
}
408 #endif
409
410 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
411 {
412         struct sk_filter *filter;
413
414         bh_lock_sock(sk);
415         filter = sk->sk_filter;
416         /*
417          * Our caller already checked that filter != NULL but we need to
418          * verify that under bh_lock_sock() to be safe
419          */
420         if (likely(filter != NULL))
421                 res = sk_run_filter(skb, filter->insns, filter->len);
422         bh_unlock_sock(sk);
423
424         return res;
425 }
426
427 /*
428    This function makes lazy skb cloning in hope that most of packets
429    are discarded by BPF.
430
431    Note tricky part: we DO mangle shared skb! skb->data, skb->len
432    and skb->cb are mangled. It works because (and until) packets
433    falling here are owned by current CPU. Output packets are cloned
434    by dev_queue_xmit_nit(), input packets are processed by net_bh
435    sequentially, so that if we return the skb to its original state on exit,
436    we will not harm anyone.
437  */
438
/*
 * Common AF_PACKET receive handler (non-mmap path).  Note it may be
 * handed a SHARED skb whose data/len/cb it mangles in place; the saved
 * skb_head/skb_len are used to restore a still-shared skb before
 * returning (see the block comment above).  Always returns 0.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_opt *po;
	u8 * skb_head = skb->data;	/* saved for restore-on-exit of a shared skb */
	int skb_len = skb->len;
	unsigned snaplen;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides the details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	snaplen = skb->len;

	/* Let an attached BPF filter drop the frame or shrink the snapshot */
	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;
	}

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	/* From here on we need a private skb: clone a shared one and put
	 * the original back the way we found it before releasing it. */
	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	/* Build the link-level address in the control buffer; recvmsg
	 * later copies it out as msg_name. */
	sll = (struct sockaddr_ll*)skb->cb;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	/* Truncate to the (possibly filter-reduced) snapshot length */
	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* Stats and the receive queue share sk_receive_queue.lock */
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	/* Undo our mangling if someone else still holds the skb */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}
539
540 #ifdef CONFIG_PACKET_MMAP
/*
 * mmap-ring receive handler (CONFIG_PACKET_MMAP).  Copies the possibly
 * filtered/truncated frame straight into the next free slot of the
 * shared ring and publishes it to userspace by setting tp_status.
 * Frames too large for a slot may additionally be queued in full on
 * the receive queue when copy_thresh allows.  Always returns 0.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct packet_opt *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 * skb_head = skb->data;	/* saved for restore-on-exit of a shared skb */
	int skb_len = skb->len;
	unsigned snaplen;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;
	struct sk_buff *copy_skb = NULL;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
			/* hw-offloaded csum is not final yet; tell userspace */
			if (skb->ip_summed == CHECKSUM_HW)
				status |= TP_STATUS_CSUMNOTREADY;
		}
	}

	snaplen = skb->len;

	/* Let an attached BPF filter drop the frame or shrink the snapshot */
	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;
	}

	/* Work out where the mac and net headers land inside the ring slot */
	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb->nh.raw - skb->data;
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		/* Frame overflows the slot: optionally keep a full copy on
		 * the receive queue, then truncate the ring copy to fit. */
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}
	/* Only linear skb data can be memcpy'd into the ring below */
	if (snaplen > skb->len-skb->data_len)
		snaplen = skb->len-skb->data_len;

	spin_lock(&sk->sk_receive_queue.lock);
	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
	
	/* Slot still owned by userspace -> the ring is full */
	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	/* LOSING stays set only while there are unreported drops */
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	memcpy((u8*)h + macoff, skb->data, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->stamp.tv_sec == 0) { 
		/* No timestamp yet: take one now and enable timestamping
		 * for subsequent packets */
		do_gettimeofday(&skb->stamp);
		sock_enable_timestamp(sk);
	}
	h->tp_sec = skb->stamp.tv_sec;
	h->tp_usec = skb->stamp.tv_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;

	/* Publish the slot: frame contents must be globally visible before
	 * userspace can observe the status change, hence the barrier. */
	h->tp_status = status;
	mb();

	{
		/* Flush every page the frame touches so an incoherent-cache
		 * architecture shows userspace the fresh data */
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	/* Undo our mangling if someone else still holds the skb */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the ring */
	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}
682
683 #endif
684
685
/*
 * Transmit path for PF_PACKET SOCK_RAW/SOCK_DGRAM sockets.  With no
 * msg_name the socket's bound device and protocol are used; otherwise a
 * sockaddr_ll selects them per call.  SOCK_RAW callers supply the full
 * link-level frame; for SOCK_DGRAM the ll header is built here via
 * dev->hard_header().  Returns bytes sent or a negative errno.
 */
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned short proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address. 
	 */
	 
	if (saddr == NULL) {
		struct packet_opt *po = pkt_sk(sk);

		ifindex = po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		ifindex = saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	/* SOCK_RAW frames include the ll header themselves, so allow for
	 * it in the size check below */
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			/* SOCK_RAW: discard the header just built — the
			 * user data copied below is the complete frame */
			skb->tail = skb->data;
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_free;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	/* positive return = congestion-style soft error; map it */
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
780
781 /*
782  *      Close a PACKET socket. This is fairly simple. We immediately go
783  *      to 'closed' state and remove our protocol entry in the device list.
784  */
785
static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_opt *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	/* Take ourselves off the global packet socket list */
	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook and drop the socket
		 *	reference the hook was holding.
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);
	}

#ifdef CONFIG_PACKET_MULTICAST
	packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
	/* Tear down the rx ring: a zeroed request releases everything */
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	/* Drop the final reference; packet_sock_destruct does the rest */
	sock_put(sk);
	return 0;
}
840
841 /*
842  *      Attach a packet hook.
843  */
844
static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
	struct packet_opt *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		/* Drop the hook's socket reference, mark it stopped, then
		 * detach.  NOTE(review): bind_lock is deliberately released
		 * around dev_remove_pack(), presumably because it may sleep
		 * while synchronizing with receivers — confirm before
		 * restructuring this sequence. */
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	/* protocol 0 means "unbound": leave the hook detached */
	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);
			po->running = 1;
		} else {
			/* Device exists but is down: report ENETDOWN
			 * asynchronously instead of attaching */
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		/* No device given: receive from all interfaces */
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
894
895 /*
896  *      Bind a packet socket to a device
897  */
898
899 #ifdef CONFIG_SOCK_PACKET
900
901 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
902 {
903         struct sock *sk=sock->sk;
904         char name[15];
905         struct net_device *dev;
906         int err = -ENODEV;
907         
908         /*
909          *      Check legality
910          */
911          
912         if(addr_len!=sizeof(struct sockaddr))
913                 return -EINVAL;
914         strlcpy(name,uaddr->sa_data,sizeof(name));
915
916         dev = dev_get_by_name(name);
917         if (dev) {
918                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
919                 dev_put(dev);
920         }
921         return err;
922 }
923 #endif
924
925 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
926 {
927         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
928         struct sock *sk=sock->sk;
929         struct net_device *dev = NULL;
930         int err;
931
932
933         /*
934          *      Check legality
935          */
936          
937         if (addr_len < sizeof(struct sockaddr_ll))
938                 return -EINVAL;
939         if (sll->sll_family != AF_PACKET)
940                 return -EINVAL;
941
942         if (sll->sll_ifindex) {
943                 err = -ENODEV;
944                 dev = dev_get_by_index(sll->sll_ifindex);
945                 if (dev == NULL)
946                         goto out;
947         }
948         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
949         if (dev)
950                 dev_put(dev);
951
952 out:
953         return err;
954 }
955
956
957 /*
958  *      Create a packet of type SOCK_PACKET. 
959  */
960
/*
 *	Create a packet socket: allocate the sock, attach the packet_opt
 *	private area and, if a protocol was requested, register the protocol
 *	hook immediately.  Requires CAP_NET_RAW.
 */
static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_opt *po;
	int err;

	/* Raw packet access is privileged. */
	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, 1, NULL);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	/* Legacy SOCK_PACKET sockets get the restricted ops table. */
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock,sk);
	sk_set_owner(sk, THIS_MODULE);

	po = sk->sk_protinfo = kmalloc(sizeof(*po), GFP_KERNEL);
	if (!po)
		goto out_free;
	memset(po, 0, sizeof(*po));
	sk->sk_family = PF_PACKET;
	po->num = protocol;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	if (protocol) {
		po->prot_hook.type = protocol;
		/* The registered hook holds its own reference on the socket
		 * (dropped again when the hook is removed). */
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);

out_free:
	sk_free(sk);
out:
	return err;
}
1030
1031 /*
1032  *      Pull a packet from our receive queue and hand it to the user.
1033  *      If necessary we block.
1034  */
1035
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;

	/* Reject any flag this handler does not implement. */
	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sizeof(struct sockaddr_ll);

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram() 
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if(skb==NULL)
		goto out;

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		/* The receive path stashed the source address in skb->cb.
		 * NOTE(review): msg_namelen is the fixed sockaddr size set
		 * above, not the actual address length — confirm the receive
		 * path always fills that many valid bytes of skb->cb. */
		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	/* With MSG_TRUNC the caller gets the full packet length, not the
	 * (possibly truncated) number of bytes copied. */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1115
1116 #ifdef CONFIG_SOCK_PACKET
1117 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1118                                int *uaddr_len, int peer)
1119 {
1120         struct net_device *dev;
1121         struct sock *sk = sock->sk;
1122
1123         if (peer)
1124                 return -EOPNOTSUPP;
1125
1126         uaddr->sa_family = AF_PACKET;
1127         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1128         if (dev) {
1129                 strlcpy(uaddr->sa_data, dev->name, 15);
1130                 dev_put(dev);
1131         } else
1132                 memset(uaddr->sa_data, 0, 14);
1133         *uaddr_len = sizeof(*uaddr);
1134
1135         return 0;
1136 }
1137 #endif
1138
1139 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1140                           int *uaddr_len, int peer)
1141 {
1142         struct net_device *dev;
1143         struct sock *sk = sock->sk;
1144         struct packet_opt *po = pkt_sk(sk);
1145         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1146
1147         if (peer)
1148                 return -EOPNOTSUPP;
1149
1150         sll->sll_family = AF_PACKET;
1151         sll->sll_ifindex = po->ifindex;
1152         sll->sll_protocol = po->num;
1153         dev = dev_get_by_index(po->ifindex);
1154         if (dev) {
1155                 sll->sll_hatype = dev->type;
1156                 sll->sll_halen = dev->addr_len;
1157                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1158                 dev_put(dev);
1159         } else {
1160                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1161                 sll->sll_halen = 0;
1162         }
1163         *uaddr_len = sizeof(*sll);
1164
1165         return 0;
1166 }
1167
1168 #ifdef CONFIG_PACKET_MULTICAST
1169 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1170 {
1171         switch (i->type) {
1172         case PACKET_MR_MULTICAST:
1173                 if (what > 0)
1174                         dev_mc_add(dev, i->addr, i->alen, 0);
1175                 else
1176                         dev_mc_delete(dev, i->addr, i->alen, 0);
1177                 break;
1178         case PACKET_MR_PROMISC:
1179                 dev_set_promiscuity(dev, what);
1180                 break;
1181         case PACKET_MR_ALLMULTI:
1182                 dev_set_allmulti(dev, what);
1183                 break;
1184         default:;
1185         }
1186 }
1187
1188 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1189 {
1190         for ( ; i; i=i->next) {
1191                 if (i->ifindex == dev->ifindex)
1192                         packet_dev_mc(dev, i, what);
1193         }
1194 }
1195
1196 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
1197 {
1198         struct packet_opt *po = pkt_sk(sk);
1199         struct packet_mclist *ml, *i;
1200         struct net_device *dev;
1201         int err;
1202
1203         rtnl_lock();
1204
1205         err = -ENODEV;
1206         dev = __dev_get_by_index(mreq->mr_ifindex);
1207         if (!dev)
1208                 goto done;
1209
1210         err = -EINVAL;
1211         if (mreq->mr_alen > dev->addr_len)
1212                 goto done;
1213
1214         err = -ENOBUFS;
1215         i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
1216         if (i == NULL)
1217                 goto done;
1218
1219         err = 0;
1220         for (ml = po->mclist; ml; ml = ml->next) {
1221                 if (ml->ifindex == mreq->mr_ifindex &&
1222                     ml->type == mreq->mr_type &&
1223                     ml->alen == mreq->mr_alen &&
1224                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1225                         ml->count++;
1226                         /* Free the new element ... */
1227                         kfree(i);
1228                         goto done;
1229                 }
1230         }
1231
1232         i->type = mreq->mr_type;
1233         i->ifindex = mreq->mr_ifindex;
1234         i->alen = mreq->mr_alen;
1235         memcpy(i->addr, mreq->mr_address, i->alen);
1236         i->count = 1;
1237         i->next = po->mclist;
1238         po->mclist = i;
1239         packet_dev_mc(dev, i, +1);
1240
1241 done:
1242         rtnl_unlock();
1243         return err;
1244 }
1245
/*
 *	Drop one reference on a matching membership; when the count reaches
 *	zero the entry is unlinked, the device state reverted and the entry
 *	freed.  Returns -EADDRNOTAVAIL if no matching entry exists.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				/* The device may already be gone; only undo
				 * the device-level state if it still exists. */
				dev = dev_get_by_index(ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
1274
1275 static void packet_flush_mclist(struct sock *sk)
1276 {
1277         struct packet_opt *po = pkt_sk(sk);
1278         struct packet_mclist *ml;
1279
1280         if (!po->mclist)
1281                 return;
1282
1283         rtnl_lock();
1284         while ((ml = po->mclist) != NULL) {
1285                 struct net_device *dev;
1286
1287                 po->mclist = ml->next;
1288                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1289                         packet_dev_mc(dev, ml, -1);
1290                         dev_put(dev);
1291                 }
1292                 kfree(ml);
1293         }
1294         rtnl_unlock();
1295 }
1296 #endif
1297
/*
 *	SOL_PACKET setsockopt(): multicast membership management and, with
 *	CONFIG_PACKET_MMAP, RX ring configuration and the copy threshold.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname) {
#ifdef CONFIG_PACKET_MULTICAST
	case PACKET_ADD_MEMBERSHIP:	
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq mreq;
		if (optlen<sizeof(mreq))
			return -EINVAL;
		if (copy_from_user(&mreq,optval,sizeof(mreq)))
			return -EFAULT;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
#endif
#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		/* Configure (or tear down) the mmap()ed receive ring. */
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}
}
1352
/*
 *	SOL_PACKET getsockopt(): only PACKET_STATISTICS is supported.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	struct sock *sk = sock->sk;
	struct packet_opt *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len,optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;
		
	switch(optname) {
	case PACKET_STATISTICS:
	{
		struct tpacket_stats st;

		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		/* Snapshot and reset: reading the statistics clears them.
		 * The receive-queue lock serializes against the RX path. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		/* tp_packets reported to user space includes the drops. */
		st.tp_packets += st.tp_drops;

		if (copy_to_user(optval, &st, len))
			return -EFAULT;
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1394
1395
/*
 *	Netdevice event handler: keep every packet socket consistent with
 *	the device it is bound to.  On NETDEV_DOWN/NETDEV_UNREGISTER the
 *	protocol hook is removed (and the socket gets ENETDOWN); on
 *	NETDEV_UP the hook is re-registered for sockets still bound to a
 *	protocol on that device.
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = (struct net_device*)data;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_opt *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
			/* Revert all device-level multicast state before
			 * the device disappears for good. */
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			// fallthrough
#endif
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* Drop the reference the hook held. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					/* The device is gone: forget it. */
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
1446
1447
/*
 *	ioctl() handler: answers queue-size queries locally, forwards the
 *	legacy INET address/route/ARP ioctls to inet_dgram_ops, and passes
 *	everything else to the generic device ioctl code.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			/* Bytes currently allocated to the send queue. */
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			/* Length of the next packet waiting to be read
			 * (0 when the receive queue is empty). */
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
			
#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return dev_ioctl(cmd, (void __user *)arg);
	}
	return 0;
}
1497
1498 #ifndef CONFIG_PACKET_MMAP
1499 #define packet_mmap sock_no_mmap
1500 #define packet_poll datagram_poll
1501 #else
1502
/*
 *	poll() with an RX ring attached: in addition to normal datagram
 *	readiness, report POLLIN when the ring slot just before the head
 *	still holds a frame handed to user space (tp_status != 0).
 */
static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_opt *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		/* Index of the most recently filled slot, wrapping. */
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
1523
1524
1525 /* Dirty? Well, I still did not learn better way to account
1526  * for user mmaps.
1527  */
1528
1529 static void packet_mm_open(struct vm_area_struct *vma)
1530 {
1531         struct file *file = vma->vm_file;
1532         struct inode *inode = file->f_dentry->d_inode;
1533         struct socket * sock = SOCKET_I(inode);
1534         struct sock *sk = sock->sk;
1535         
1536         if (sk)
1537                 atomic_inc(&pkt_sk(sk)->mapped);
1538 }
1539
1540 static void packet_mm_close(struct vm_area_struct *vma)
1541 {
1542         struct file *file = vma->vm_file;
1543         struct inode *inode = file->f_dentry->d_inode;
1544         struct socket * sock = SOCKET_I(inode);
1545         struct sock *sk = sock->sk;
1546         
1547         if (sk)
1548                 atomic_dec(&pkt_sk(sk)->mapped);
1549 }
1550
/* Track map/unmap of the packet ring so its lifetime is refcounted. */
static struct vm_operations_struct packet_mmap_ops = {
	.open = packet_mm_open,
	.close =packet_mm_close,
};
1555
1556 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1557 {
1558         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1559 }
1560
/*
 *	Free a ring page vector: un-reserve every page of each block, free
 *	the block's pages, then the vector itself.  NULL slots (left by a
 *	partially failed allocation) are skipped.
 */
static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
{
	int i;

	for (i=0; i<len; i++) {
		if (pg_vec[i]) {
			struct page *page, *pend;

			pend = pg_vec_endpage(pg_vec[i], order);
			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
				ClearPageReserved(page);
			free_pages((unsigned long)pg_vec[i], order);
		}
	}
	kfree(pg_vec);
}
1577
1578
/*
 *	Install (or, when closing, tear down) the mmap()ed RX ring described
 *	by *req.  The socket is detached from the network while the ring is
 *	swapped so no packet is delivered into a half-built ring.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_opt *po = pkt_sk(sk);
	int was_running, num, order = 0;
	int err = 0;
	
	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (po->pg_vec)
			return -EBUSY;

		if ((int)req->tp_block_size <= 0)
			return -EINVAL;
		if (req->tp_block_size&(PAGE_SIZE-1))
			return -EINVAL;
		if (req->tp_frame_size < TPACKET_HDRLEN)
			return -EINVAL;
		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (po->frames_per_block <= 0)
			return -EINVAL;
		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
			return -EINVAL;
		/* OK! */

		/* Allocate page vector */
		while ((PAGE_SIZE<<order) < req->tp_block_size)
			order++;

		err = -ENOMEM;

		/* NOTE(review): tp_block_nr comes straight from user space;
		 * the size multiplications here and below are unchecked —
		 * confirm they cannot overflow on this kernel. */
		pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
		if (pg_vec == NULL)
			goto out;
		memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));

		for (i=0; i<req->tp_block_nr; i++) {
			struct page *page, *pend;
			pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
			if (!pg_vec[i])
				goto out_free_pgvec;

			/* Reserve the pages: they will be mapped into user
			 * space and must not be reclaimed. */
			pend = pg_vec_endpage(pg_vec[i], order);
			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
				SetPageReserved(page);
		}
		/* Page vector is allocated */

		/* Mark every frame slot as owned by the kernel. */
		l = 0;
		for (i=0; i<req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k=0; k<po->frames_per_block; k++) {
				
				header = (struct tpacket_hdr*)ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		/* Zero blocks is only valid as a full teardown request. */
		if (req->tp_frame_nr)
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);
		
	/* Wait until no CPU is still running the old protocol hook. */
	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		/* Swap the new ring in under the receive-queue lock; the
		 * old vector lands in pg_vec and is freed below. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = req->tp_frame_nr-1;
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		/* Ring sockets receive via tpacket_rcv; plain via packet_rcv. */
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	/* Re-attach the protocol hook if we detached it above. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

out_free_pgvec:
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
1708
1709 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1710 {
1711         struct sock *sk = sock->sk;
1712         struct packet_opt *po = pkt_sk(sk);
1713         unsigned long size;
1714         unsigned long start;
1715         int err = -EINVAL;
1716         int i;
1717
1718         if (vma->vm_pgoff)
1719                 return -EINVAL;
1720
1721         size = vma->vm_end - vma->vm_start;
1722
1723         lock_sock(sk);
1724         if (po->pg_vec == NULL)
1725                 goto out;
1726         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1727                 goto out;
1728
1729         atomic_inc(&po->mapped);
1730         start = vma->vm_start;
1731         err = -EAGAIN;
1732         for (i=0; i<po->pg_vec_len; i++) {
1733                 if (remap_pfn_range(vma, start,
1734                                      __pa(po->pg_vec[i]) >> PAGE_SHIFT,
1735                                      po->pg_vec_pages*PAGE_SIZE,
1736                                      vma->vm_page_prot))
1737                         goto out;
1738                 start += po->pg_vec_pages*PAGE_SIZE;
1739         }
1740         vma->vm_ops = &packet_mmap_ops;
1741         err = 0;
1742
1743 out:
1744         release_sock(sk);
1745         return err;
1746 }
1747 #endif
1748
1749
1750 #ifdef CONFIG_SOCK_PACKET
/* proto_ops for legacy SOCK_PACKET sockets: no socket options and no
 * ring mmap; otherwise shares the receive path with packet_ops. */
static struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
1771 #endif
1772
/* proto_ops for SOCK_DGRAM/SOCK_RAW packet sockets: full option and
 * (when configured) mmap ring support. */
static struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname, 
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
1793
/* PF_PACKET socket-creation entry point. */
static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

/* Keeps bound sockets in sync with device up/down/unregister events. */
static struct notifier_block packet_netdev_notifier = {
	.notifier_call =packet_notifier,
};
1803
1804 #ifdef CONFIG_PROC_FS
1805 static inline struct sock *packet_seq_idx(loff_t off)
1806 {
1807         struct sock *s;
1808         struct hlist_node *node;
1809
1810         sk_for_each(s, node, &packet_sklist) {
1811                 if (!off--)
1812                         return s;
1813         }
1814         return NULL;
1815 }
1816
1817 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1818 {
1819         read_lock(&packet_sklist_lock);
1820         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1821 }
1822
1823 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1824 {
1825         ++*pos;
1826         return  (v == SEQ_START_TOKEN) 
1827                 ? sk_head(&packet_sklist) 
1828                 : sk_next((struct sock*)v) ;
1829 }
1830
/* End of a /proc traversal: drop the lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);		
}
1835
/* Emit one /proc/net/packet row, or the column header on the first call. */
static int packet_seq_show(struct seq_file *seq, void *v) 
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_opt *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s) );
	}

	return 0;
}
1859
/* seq_file iterator for /proc/net/packet. */
static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
1866
/* open() handler for /proc/net/packet. */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}
1871
/* file_operations backing the /proc/net/packet entry. */
static struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1879
1880 #endif
1881
1882 static void __exit packet_exit(void)
1883 {
1884         proc_net_remove("packet");
1885         unregister_netdevice_notifier(&packet_netdev_notifier);
1886         sock_unregister(PF_PACKET);
1887         return;
1888 }
1889
/*
 *	Module init: register the PF_PACKET family, the netdevice notifier
 *	and the /proc/net/packet listing.
 */
static int __init packet_init(void)
{
	/* NOTE(review): the return values of sock_register() and
	 * register_netdevice_notifier() are ignored here — confirm that
	 * failure is impossible or harmless at this point. */
	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create("packet", 0, &packet_seq_fops);

	return 0;
}
1898
1899 module_init(packet_init);
1900 module_exit(packet_exit);
1901 MODULE_LICENSE("GPL");
1902 MODULE_ALIAS_NETPROTO(PF_PACKET);