/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#define CONFIG_SOCK_PACKET      1

/*
   Proposed replacement for SIOC{ADD,DEL}MULTI and
   IFF_PROMISC, IFF_ALLMULTI flags.

   It is more expensive, but, I believe,
   it is a really correct solution: reentrant, safe and fault tolerant.

   IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD,DEL}MULTI are faked by keeping
   a reference count and a global flag, so that the real status is
   (gflag|(count != 0)); this lets us use the obsolete, faulty interface
   without harming clever users.
 */
#define CONFIG_PACKET_MULTICAST 1

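/*
 * Example (editor's sketch, user space; not part of this file or its
 * build): the reference-counted scheme above is driven through
 * setsockopt() with a struct packet_mreq. "eth0" is a placeholder
 * device name and error handling is omitted.
 *
 *      #include <sys/socket.h>
 *      #include <string.h>
 *      #include <net/if.h>             // if_nametoindex()
 *      #include <netinet/in.h>         // htons()
 *      #include <linux/if_ether.h>     // ETH_P_ALL
 *      #include <linux/if_packet.h>    // struct packet_mreq, PACKET_MR_*
 *
 *      int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      struct packet_mreq mr;
 *
 *      memset(&mr, 0, sizeof(mr));
 *      mr.mr_ifindex = if_nametoindex("eth0");
 *      mr.mr_type = PACKET_MR_PROMISC;
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr));
 *      // ... receive traffic ...
 *      setsockopt(fd, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mr, sizeof(mr));
 */
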
/*
   Assumptions:
   - if a device has no dev->hard_header routine, it adds and removes
     the ll header itself. In this case the ll header is invisible
     outside the device, but higher levels should still reserve
     dev->hard_header_len. Some devices are clever enough to reallocate
     the skb when the header does not fit in the reserved space
     (tunnels); others are silly (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> data

Outgoing, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> ll header

Incoming, dev->hard_header==NULL
   mac.raw -> UNKNOWN position. It very likely points to the ll header.
              PPP does this, which is wrong, because it introduces
              asymmetry between the rx and tx paths.
   data    -> data

Outgoing, dev->hard_header==NULL
   mac.raw -> data. ll header is still not built!
   data    -> data

Summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll
  header.


On transmit:
------------

dev->hard_header != NULL
   mac.raw -> ll header
   data    -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac.raw -> data
   data    -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

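/*
 * Example (editor's sketch, user space): how the rx rules above look
 * through recvfrom(). On a SOCK_RAW packet socket the buffer starts at
 * the ll header (mac.raw); on SOCK_DGRAM it starts at the network
 * header and the ll address is reported only via sockaddr_ll.
 *
 *      #include <sys/socket.h>
 *      #include <netinet/in.h>
 *      #include <linux/if_ether.h>
 *      #include <linux/if_packet.h>
 *
 *      unsigned char buf[2048];
 *      struct sockaddr_ll from;
 *      socklen_t fromlen = sizeof(from);
 *      int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      int n = recvfrom(fd, buf, sizeof(buf), 0,
 *                       (struct sockaddr *)&from, &fromlen);
 *      // On ARPHRD_ETHER devices buf[0..13] now holds the Ethernet header.
 */
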
/* List of all packet sockets. */
HLIST_HEAD(packet_sklist);
static rwlock_t packet_sklist_lock = RW_LOCK_UNLOCKED;

atomic_t packet_socks_nr;


/* Private packet socket structures. */

#ifdef CONFIG_PACKET_MULTICAST
struct packet_mclist
{
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[8];
};
#endif
#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_opt
{
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        char *                  *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        char                    running;        /* prot_hook is attached*/
        int                     ifindex;        /* bound device         */
        unsigned short          num;
#ifdef CONFIG_PACKET_MULTICAST
        struct packet_mclist    *mclist;
#endif
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;
#endif
};

#ifdef CONFIG_PACKET_MMAP

static inline char *packet_lookup_frame(struct packet_opt *po, unsigned int position)
{
        unsigned int pg_vec_pos, frame_offset;
        char *frame;

        pg_vec_pos = position / po->frames_per_block;
        frame_offset = position % po->frames_per_block;

        frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);

        return frame;
}
#endif
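
/*
 * Worked example for packet_lookup_frame() (editor's note): with
 * tp_block_size 8192 and tp_frame_size 2048, frames_per_block is 4, so
 * position 9 gives pg_vec_pos = 9 / 4 = 2 and frame_offset = 9 % 4 = 1,
 * i.e. the frame starts at pg_vec[2] + 1 * 2048.
 */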

#define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)

void packet_sock_destruct(struct sock *sk)
{
        BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
        BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                printk("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        if (pkt_sk(sk))
                kfree(pkt_sk(sk));
        atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
        printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}


extern struct proto_ops packet_ops;

#ifdef CONFIG_SOCK_PACKET
extern struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled;
         *      push it back.
         *
         *      For outgoing ones skb->data == skb->mac.raw,
         *      so this procedure is a noop.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                goto oom;

        /* drop any routing info */
        dst_release(skb->dst);
        skb->dst = NULL;

        spkt = (struct sockaddr_pkt*)skb->cb;

        skb_push(skb, skb->data-skb->mac.raw);

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk,skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        unsigned short proto=0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr)
        {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return(-EINVAL);
                if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
                        proto=saddr->spkt_protocol;
        }
        else
                return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        /*
         *      You may not queue a frame bigger than the mtu. This is the lowest level
         *      raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if(len>dev->mtu+dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         *      If the write buffer is full, then tough. At this level the user gets to
         *      deal with the problem - do your own algorithmic backoffs. That's far
         *      more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb->nh.raw = skb->data;

        /* Try to align data part correctly */
        if (dev->hard_header) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb->nh.raw = skb->data;
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_free;

        /*
         *      Now send it
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
#endif

static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
{
        struct sk_filter *filter;

        bh_lock_sock(sk);
        filter = sk->sk_filter;
        /*
         * Our caller already checked that filter != NULL but we need to
         * verify that under bh_lock_sock() to be safe
         */
        if (likely(filter != NULL))
                res = sk_run_filter(skb, filter->insns, filter->len);
        bh_unlock_sock(sk);

        return res;
}

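/*
 * Example (editor's sketch, user space): run_filter() executes a
 * classic BPF program attached with SO_ATTACH_FILTER; its return value
 * caps snaplen and 0 means drop. A one-instruction program that
 * accepts every packet but snaps it to 96 bytes:
 *
 *      #include <sys/socket.h>
 *      #include <linux/filter.h>
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 96 },  // return 96: keep 96 bytes
 *      };
 *      struct sock_fprog prog = { 1, code };
 *
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */
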
/*
   This function does lazy skb cloning, in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_opt *po;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned snaplen;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        skb->dev = dev;

        if (dev->hard_header) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb->mac.raw);
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb->nh.raw - skb->data);
                }
        }

        snaplen = skb->len;

        if (sk->sk_filter) {
                unsigned res = run_filter(skb, sk, snaplen);
                if (res == 0)
                        goto drop_n_restore;
                if (snaplen > res)
                        snaplen = res;
        }

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        sll = (struct sockaddr_ll*)skb->cb;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        sll->sll_ifindex = dev->ifindex;
        sll->sll_halen = 0;

        if (dev->hard_header_parse)
                sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        dst_release(skb->dst);
        skb->dst = NULL;

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct packet_opt *po;
        struct sockaddr_ll *sll;
        struct tpacket_hdr *h;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned snaplen;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff;
        struct sk_buff *copy_skb = NULL;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev->hard_header) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb->mac.raw);
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb->nh.raw - skb->data);
                        if (skb->ip_summed == CHECKSUM_HW)
                                status |= TP_STATUS_CSUMNOTREADY;
                }
        }

        snaplen = skb->len;

        if (sk->sk_filter) {
                unsigned res = run_filter(skb, sk, snaplen);
                if (res == 0)
                        goto drop_n_restore;
                if (snaplen > res)
                        snaplen = res;
        }

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
        } else {
                unsigned maclen = skb->nh.raw - skb->data;
                netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }
        if (snaplen > skb->len-skb->data_len)
                snaplen = skb->len-skb->data_len;

        spin_lock(&sk->sk_receive_queue.lock);
        h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);

        if (h->tp_status)
                goto ring_is_full;
        po->head = po->head != po->frame_max ? po->head+1 : 0;
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        memcpy((u8*)h + macoff, skb->data, snaplen);

        h->tp_len = skb->len;
        h->tp_snaplen = snaplen;
        h->tp_mac = macoff;
        h->tp_net = netoff;
        if (skb->stamp.tv_sec == 0) {
                do_gettimeofday(&skb->stamp);
                sock_enable_timestamp(sk);
        }
        h->tp_sec = skb->stamp.tv_sec;
        h->tp_usec = skb->stamp.tv_usec;

        sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
        sll->sll_halen = 0;
        if (dev->hard_header_parse)
                sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        sll->sll_ifindex = dev->ifindex;

        h->tp_status = status;
        mb();

        {
                struct page *p_start, *p_end;
                u8 *h_end = (u8 *)h + macoff + snaplen - 1;

                p_start = virt_to_page(h);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        if (copy_skb)
                kfree_skb(copy_skb);
        goto drop_n_restore;
}

#endif


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        unsigned short proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_opt *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb==NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb->nh.raw = skb->data;

        if (dev->hard_header) {
                int res;
                err = -EINVAL;
                res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
                if (sock->type != SOCK_DGRAM) {
                        skb->tail = skb->data;
                        skb->len = 0;
                } else if (res < 0)
                        goto out_free;
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_free;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}
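
/*
 * Example (editor's sketch, user space): on a SOCK_DGRAM packet socket
 * packet_sendmsg() above has the kernel build the ll header itself via
 * dev->hard_header(), from the destination in sockaddr_ll. "eth0" and
 * the broadcast MAC are placeholders.
 *
 *      #include <string.h>
 *      #include <sys/socket.h>
 *      #include <net/if.h>
 *      #include <netinet/in.h>
 *      #include <linux/if_ether.h>
 *      #include <linux/if_packet.h>
 *
 *      unsigned char dst[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 *      unsigned char payload[64];
 *      struct sockaddr_ll to;
 *      int fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 *      memset(&to, 0, sizeof(to));
 *      to.sll_family = AF_PACKET;
 *      to.sll_ifindex = if_nametoindex("eth0");
 *      to.sll_protocol = htons(ETH_P_IP);
 *      to.sll_halen = ETH_ALEN;
 *      memcpy(to.sll_addr, dst, ETH_ALEN);
 *      sendto(fd, payload, sizeof(payload), 0,
 *             (struct sockaddr *)&to, sizeof(to));
 */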

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_opt *po;

        if (!sk)
                return 0;

        po = pkt_sk(sk);

        write_lock_bh(&packet_sklist_lock);
        sk_del_node_init(sk);
        write_unlock_bh(&packet_sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

#ifdef CONFIG_PACKET_MULTICAST
        packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
        if (po->pg_vec) {
                struct tpacket_req req;
                memset(&req, 0, sizeof(req));
                packet_set_ring(sk, &req, 1);
        }
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
        struct packet_opt *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (dev) {
                if (dev->flags&IFF_UP) {
                        dev_add_pack(&po->prot_hook);
                        sock_hold(sk);
                        po->running = 1;
                } else {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk->sk_error_report(sk);
                }
        } else {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

#ifdef CONFIG_SOCK_PACKET

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk=sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if(addr_len!=sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name,uaddr->sa_data,sizeof(name));

        dev = dev_get_by_name(name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}
#endif

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
        struct sock *sk=sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}
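
/*
 * Example (editor's sketch, user space): packet_bind() above only
 * consults sll_protocol and sll_ifindex, so a minimal bind looks like
 * this ("eth0" is a placeholder):
 *
 *      struct sockaddr_ll sll;
 *
 *      memset(&sll, 0, sizeof(sll));
 *      sll.sll_family = AF_PACKET;
 *      sll.sll_protocol = htons(ETH_P_IP);
 *      sll.sll_ifindex = if_nametoindex("eth0");
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */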


/*
 *      Create a packet socket.
 */

static int packet_create(struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_opt *po;
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
            && sock->type != SOCK_PACKET
#endif
            )
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(PF_PACKET, GFP_KERNEL, 1, NULL);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;
#endif
        sock_init_data(sock,sk);
        sk_set_owner(sk, THIS_MODULE);

        po = sk->sk_protinfo = kmalloc(sizeof(*po), GFP_KERNEL);
        if (!po)
                goto out_free;
        memset(po, 0, sizeof(*po));
        sk->sk_family = PF_PACKET;
        po->num = protocol;

        sk->sk_destruct = packet_sock_destruct;
        atomic_inc(&packet_socks_nr);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;
#endif
        po->prot_hook.af_packet_priv = sk;

        if (protocol) {
                po->prot_hook.type = protocol;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&packet_sklist_lock);
        sk_add_node(sk, &packet_sklist);
        write_unlock_bh(&packet_sklist_lock);
        return(0);

out_free:
        sk_free(sk);
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sizeof(struct sockaddr_ll);

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred, so return it. Because skb_recv_datagram()
         *      handles the blocking, we don't need to see or worry about
         *      blocking retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

#ifdef CONFIG_SOCK_PACKET
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}
#endif

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = sizeof(*sll);

        return 0;
}

#ifdef CONFIG_PACKET_MULTICAST
static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
                dev_set_allmulti(dev, what);
                break;
        default:;
        }
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i=i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
{
        struct packet_opt *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        packet_dev_mc(dev, i, +1);

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = dev_get_by_index(ml->ifindex);
                                if (dev) {
                                        packet_dev_mc(dev, ml, -1);
                                        dev_put(dev);
                                }
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_opt *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
                        packet_dev_mc(dev, ml, -1);
                        dev_put(dev);
                }
                kfree(ml);
        }
        rtnl_unlock();
}
#endif

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch(optname) {
#ifdef CONFIG_PACKET_MULTICAST
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq mreq;
                if (optlen<sizeof(mreq))
                        return -EINVAL;
                if (copy_from_user(&mreq,optval,sizeof(mreq)))
                        return -EFAULT;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }
#endif
#ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        {
                struct tpacket_req req;

                if (optlen<sizeof(req))
                        return -EINVAL;
                if (copy_from_user(&req,optval,sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen!=sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val,optval,sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
#endif
        default:
                return -ENOPROTOOPT;
        }
}

int packet_getsockopt(struct socket *sock, int level, int optname,
                      char __user *optval, int __user *optlen)
{
        int len;
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len,optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch(optname) {
        case PACKET_STATISTICS:
        {
                struct tpacket_stats st;

                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                if (copy_to_user(optval, &st, len))
                        return -EFAULT;
                break;
        }
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}

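/*
 * Example (editor's sketch, user space): reading the counters exposed
 * by packet_getsockopt(). Per the code above, the counters reset on
 * every read and tp_packets includes tp_drops.
 *
 *      #include <stdio.h>
 *      #include <sys/socket.h>
 *      #include <linux/if_packet.h>
 *
 *      struct tpacket_stats st;
 *      socklen_t len = sizeof(st);
 *
 *      if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *              printf("seen %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */
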

static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
        struct sock *sk;
        struct hlist_node *node;
        struct net_device *dev = (struct net_device*)data;

        read_lock(&packet_sklist_lock);
        sk_for_each(sk, node, &packet_sklist) {
                struct packet_opt *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
                        // fallthrough
#endif
                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->running) {
                                        __dev_remove_pack(&po->prot_hook);
                                        __sock_put(sk);
                                        po->running = 0;
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        po->ifindex = -1;
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        spin_lock(&po->bind_lock);
                        if (dev->ifindex == po->ifindex && po->num &&
                            !po->running) {
                                dev_add_pack(&po->prot_hook);
                                sock_hold(sk);
                                po->running = 1;
                        }
                        spin_unlock(&po->bind_lock);
                        break;
                }
        }
        read_unlock(&packet_sklist_lock);
        return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch(cmd) {
                case SIOCOUTQ:
                {
                        int amount = atomic_read(&sk->sk_wmem_alloc);
                        return put_user(amount, (int __user *)arg);
                }
                case SIOCINQ:
                {
                        struct sk_buff *skb;
                        int amount = 0;

                        spin_lock_bh(&sk->sk_receive_queue.lock);
                        skb = skb_peek(&sk->sk_receive_queue);
                        if (skb)
                                amount = skb->len;
                        spin_unlock_bh(&sk->sk_receive_queue.lock);
                        return put_user(amount, (int __user *)arg);
                }
                case SIOCGSTAMP:
                        return sock_get_timestamp(sk, (struct timeval __user *)arg);

#ifdef CONFIG_INET
                case SIOCADDRT:
                case SIOCDELRT:
                case SIOCDARP:
                case SIOCGARP:
                case SIOCSARP:
                case SIOCGIFADDR:
                case SIOCSIFADDR:
                case SIOCGIFBRDADDR:
                case SIOCSIFBRDADDR:
                case SIOCGIFNETMASK:
                case SIOCSIFNETMASK:
                case SIOCGIFDSTADDR:
                case SIOCSIFDSTADDR:
                case SIOCSIFFLAGS:
                        return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

                default:
                        return dev_ioctl(cmd, (void __user *)arg);
        }
        return 0;
}

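/*
 * Example (editor's sketch, user space): the ioctls handled above are
 * plain socket ioctls. SIOCGSTAMP yields the timestamp of the last
 * packet handed to the user; SIOCINQ the length of the next queued
 * packet.
 *
 *      #include <sys/ioctl.h>
 *      #include <sys/time.h>
 *      #include <linux/sockios.h>
 *
 *      struct timeval tv;
 *      int next_len;
 *
 *      ioctl(fd, SIOCGSTAMP, &tv);
 *      ioctl(fd, SIOCINQ, &next_len);
 */
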
#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);
        unsigned int mask = datagram_poll(file, sock, wait);

        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->pg_vec) {
                unsigned last = po->head ? po->head-1 : po->frame_max;
                struct tpacket_hdr *h;

                h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

                if (h->tp_status)
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        return mask;
}


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct socket * sock = SOCKET_I(inode);
        struct sock *sk = sock->sk;

        if (sk)
                atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct socket * sock = SOCKET_I(inode);
        struct sock *sk = sock->sk;

        if (sk)
                atomic_dec(&pkt_sk(sk)->mapped);
}

static struct vm_operations_struct packet_mmap_ops = {
        .open =         packet_mm_open,
        .close =        packet_mm_close,
};
1554
1555 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1556 {
1557         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1558 }
1559
1560 static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
1561 {
1562         int i;
1563
1564         for (i=0; i<len; i++) {
1565                 if (pg_vec[i]) {
1566                         struct page *page, *pend;
1567
1568                         pend = pg_vec_endpage(pg_vec[i], order);
1569                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1570                                 ClearPageReserved(page);
1571                         free_pages((unsigned long)pg_vec[i], order);
1572                 }
1573         }
1574         kfree(pg_vec);
1575 }
1576
1577
1578 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1579 {
1580         char **pg_vec = NULL;
1581         struct packet_opt *po = pkt_sk(sk);
1582         int was_running, num, order = 0;
1583         int err = 0;
1584         
1585         if (req->tp_block_nr) {
1586                 int i, l;
1587
1588                 /* Sanity tests and some calculations */
1589
1590                 if (po->pg_vec)
1591                         return -EBUSY;
1592
1593                 if ((int)req->tp_block_size <= 0)
1594                         return -EINVAL;
1595                 if (req->tp_block_size&(PAGE_SIZE-1))
1596                         return -EINVAL;
1597                 if (req->tp_frame_size < TPACKET_HDRLEN)
1598                         return -EINVAL;
1599                 if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
1600                         return -EINVAL;
1601
1602                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1603                 if (po->frames_per_block <= 0)
1604                         return -EINVAL;
1605                 if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
1606                         return -EINVAL;
1607                 /* OK! */
1608
1609                 /* Allocate page vector */
1610                 while ((PAGE_SIZE<<order) < req->tp_block_size)
1611                         order++;
1612
1613                 err = -ENOMEM;
1614
1615                 pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
1616                 if (pg_vec == NULL)
1617                         goto out;
1618                 memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));
1619
1620                 for (i=0; i<req->tp_block_nr; i++) {
1621                         struct page *page, *pend;
1622                         pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
1623                         if (!pg_vec[i])
1624                                 goto out_free_pgvec;
1625
1626                         pend = pg_vec_endpage(pg_vec[i], order);
1627                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1628                                 SetPageReserved(page);
1629                 }
1630                 /* Page vector is allocated */
1631
1632                 l = 0;
1633                 for (i=0; i<req->tp_block_nr; i++) {
1634                         char *ptr = pg_vec[i];
1635                         struct tpacket_hdr *header;
1636                         int k;
1637
1638                         for (k=0; k<po->frames_per_block; k++) {
1639                                 
1640                                 header = (struct tpacket_hdr*)ptr;
1641                                 header->tp_status = TP_STATUS_KERNEL;
1642                                 ptr += req->tp_frame_size;
1643                         }
1644                 }
1645                 /* Done */
1646         } else {
1647                 if (req->tp_frame_nr)
1648                         return -EINVAL;
1649         }
	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		/* XC(a, b): store b in a, return the old value of a */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = req->tp_frame_nr - 1;
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

out_free_pgvec:
	/* After the XC swap, pg_vec holds whichever vector is being discarded (or NULL) */
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
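
/*
 * Usage sketch (user space, not part of this file's build): a minimal,
 * hypothetical example of driving the PACKET_RX_RING path above.
 * Requires CAP_NET_RAW; error handling is omitted, and the layout
 * values are illustrative, chosen only to satisfy the sanity checks in
 * packet_set_ring().
 *
 *	#include <sys/socket.h>
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,	(multiple of PAGE_SIZE)
 *		.tp_block_nr	= 8,
 *		.tp_frame_size	= 2048,	(multiple of TPACKET_ALIGNMENT)
 *		.tp_frame_nr	= 16,	(block_nr * frames_per_block)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(0, req.tp_block_nr * req.tp_block_size,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */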

static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_opt *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	err = -EAGAIN;
	for (i = 0; i < po->pg_vec_len; i++) {
		if (remap_page_range(vma, start, __pa(po->pg_vec[i]),
				     po->pg_vec_pages * PAGE_SIZE,
				     vma->vm_page_prot))
			goto out;
		start += po->pg_vec_pages * PAGE_SIZE;
	}

	/*
	 * Count the mapping only once every block has been remapped;
	 * bumping po->mapped before a failing remap would leave the
	 * ring marked busy with no vm_close() ever run to release it.
	 */
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
#endif
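
/*
 * Usage sketch (user space, hypothetical), continuing the setup above:
 * consuming frames from the mapped ring.  Each frame begins with a
 * struct tpacket_hdr; tp_status is the handshake word between kernel
 * and user.  Linear indexing works here because the illustrative
 * layout tiles each block exactly (frames_per_block * tp_frame_size ==
 * tp_block_size); process() is a hypothetical consumer.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	unsigned int idx = 0;
 *
 *	for (;;) {
 *		struct tpacket_hdr *hdr = (struct tpacket_hdr *)
 *			((char *)ring + idx * req.tp_frame_size);
 *		while (!(hdr->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);
 *		process((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;	(hand frame back)
 *		idx = (idx + 1) % req.tp_frame_nr;
 *	}
 */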

#ifdef CONFIG_SOCK_PACKET
struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
#endif

struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
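
/*
 * Usage sketch (user space, hypothetical): the entry points above are
 * reached through an AF_PACKET socket bound to one interface via
 * struct sockaddr_ll; if_nametoindex() and "eth0" are illustrative.
 *
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll ll = {
 *		.sll_family	= AF_PACKET,
 *		.sll_protocol	= htons(ETH_P_ALL),
 *		.sll_ifindex	= if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&ll, sizeof(ll));	(-> packet_bind)
 */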

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &packet_sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&packet_sklist_lock);
	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return (v == SEQ_START_TOKEN)
		? sk_head(&packet_sklist)
		: sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_opt *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}
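
/*
 * Example /proc/net/packet line produced by the format above (values
 * are illustrative, not captured output; type 3 is SOCK_RAW, proto
 * 0003 is ETH_P_ALL):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	c1a2b3c4 3      3    0003   2     1 0      0      1234
 */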

static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}

static struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#endif

static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
}

static int __init packet_init(void)
{
	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create("packet", 0, &packet_seq_fops);

	return 0;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);