/* net/packet/af_packet.c */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#define CONFIG_SOCK_PACKET      1

/*
   Proposed replacement for SIOC{ADD,DEL}MULTI and
   IFF_PROMISC, IFF_ALLMULTI flags.

   It is more expensive, but I believe it is the really correct
   solution: reentrant, safe and fault tolerant.

   IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping a
   reference count and a global flag, so that the real status is
   (gflag|(count != 0)); that way we can keep using the obsolete,
   faulty interface without harming clever users.
 */
#define CONFIG_PACKET_MULTICAST 1
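
/*
 * Illustrative sketch of the rule described above (the helper name and
 * arguments are assumptions made for this example; nothing in the real
 * interface looks like this): the effective status is simply the OR of
 * the legacy global flag and the per-device reference count.
 */
static inline int packet_effective_flag(unsigned int gflag, int count)
{
        /* real status is (gflag | (count != 0)) */
        return gflag | (count != 0);
}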

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes
     the ll header inside itself. In this case the ll header is invisible
     outside of the device, but higher levels still should reserve
     dev->hard_header_len. Some devices are clever enough to reallocate
     the skb when the header does not fit into the reserved space
     (tunnels); others are not (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> data

Outgoing, dev->hard_header!=NULL
   mac.raw -> ll header
   data    -> ll header

Incoming, dev->hard_header==NULL
   mac.raw -> UNKNOWN position. It is very likely that it points to the
              ll header. PPP does this, which is wrong, because it
              introduces asymmetry between the rx and tx paths.
   data    -> data

Outgoing, dev->hard_header==NULL
   mac.raw -> data. ll header is still not built!
   data    -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac.raw -> ll header
   data    -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac.raw -> data
   data    -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
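
/*
 * Minimal sketch of the receive rule above for the dev->hard_header !=
 * NULL case: SOCK_RAW must push the pulled ll header back before the
 * frame is queued to the user, exactly as packet_rcv() does later in
 * this file. The helper exists only as an illustration and is not used
 * anywhere below.
 */
static inline void packet_restore_ll_header(struct sk_buff *skb)
{
        /* mac.raw points at the ll header, data at the payload */
        skb_push(skb, skb->data - skb->mac.raw);
}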

/* List of all packet sockets. */
HLIST_HEAD(packet_sklist);
static rwlock_t packet_sklist_lock = RW_LOCK_UNLOCKED;

atomic_t packet_socks_nr;


/* Private packet socket structures. */

#ifdef CONFIG_PACKET_MULTICAST
struct packet_mclist
{
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[8];
};
#endif
#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_opt
{
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        unsigned long           *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        char                    running;        /* prot_hook is attached */
        int                     ifindex;        /* bound device         */
        unsigned short          num;
#ifdef CONFIG_PACKET_MULTICAST
        struct packet_mclist    *mclist;
#endif
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;
#endif
};

#ifdef CONFIG_PACKET_MMAP

static inline unsigned long packet_lookup_frame(struct packet_opt *po, unsigned int position)
{
        unsigned int pg_vec_pos, frame_offset;
        unsigned long frame;

        pg_vec_pos = position / po->frames_per_block;
        frame_offset = position % po->frames_per_block;

        frame = (unsigned long) (po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));

        return frame;
}
#endif
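
/*
 * Worked example of packet_lookup_frame() above, with assumed ring
 * parameters: tp_block_size = 8192 and tp_frame_size = 2048 give
 * frames_per_block = 4, so frame number 10 lives in block 10 / 4 = 2
 * at byte offset (10 % 4) * 2048 = 4096 within that block.
 */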

#define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)

void packet_sock_destruct(struct sock *sk)
{
        BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
        BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                printk("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        if (pkt_sk(sk))
                kfree(pkt_sk(sk));
        atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
        printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}


extern struct proto_ops packet_ops;

#ifdef CONFIG_SOCK_PACKET
extern struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb->mac.raw,
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                goto oom;

        /* drop any routing info */
        dst_release(skb->dst);
        skb->dst = NULL;

        spkt = (struct sockaddr_pkt*)skb->cb;

        skb_push(skb, skb->data-skb->mac.raw);

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk,skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        unsigned short proto=0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr)
        {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return(-EINVAL);
                if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
                        proto=saddr->spkt_protocol;
        }
        else
                return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        /*
         *      You may not queue a frame bigger than the mtu. This is the lowest level
         *      raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if(len>dev->mtu+dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         *      If the write buffer is full, then tough. At this level the user gets to
         *      deal with the problem - do your own algorithmic backoffs. That's far
         *      more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb->nh.raw = skb->data;

        /* Try to align the data part correctly */
        if (dev->hard_header) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb->nh.raw = skb->data;
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_free;

        /*
         *      Now send it
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
#endif
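
/*
 * User-space sketch (illustrative; fd, frame and frame_len are assumed
 * to exist) of driving packet_sendmsg_spkt() above: the caller names
 * the device through sockaddr_pkt and must supply a complete frame.
 *
 *      struct sockaddr_pkt spkt;
 *      int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *
 *      memset(&spkt, 0, sizeof(spkt));
 *      strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *      spkt.spkt_protocol = htons(ETH_P_ALL);
 *      sendto(fd, frame, frame_len, 0,
 *             (struct sockaddr *)&spkt, sizeof(spkt));
 */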

static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
{
        struct sk_filter *filter;

        bh_lock_sock(sk);
        filter = sk->sk_filter;
        /*
         * Our caller already checked that filter != NULL but we need to
         * verify that under bh_lock_sock() to be safe
         */
        if (likely(filter != NULL))
                res = sk_run_filter(skb, filter->insns, filter->len);
        bh_unlock_sock(sk);

        return res;
}
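
/*
 * User-space sketch (illustrative) of installing the filter consulted
 * above: a one-instruction classic BPF program whose return value
 * becomes the snap length, so 0 drops the packet and 96 keeps at most
 * the first 96 bytes.
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 96 },
 *      };
 *      struct sock_fprog prog = { 1, code };
 *
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */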

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_opt *po;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned snaplen;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        skb->dev = dev;

        if (dev->hard_header) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb->mac.raw);
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb->nh.raw - skb->data);
                }
        }

        snaplen = skb->len;

        if (sk->sk_filter) {
                unsigned res = run_filter(skb, sk, snaplen);
                if (res == 0)
                        goto drop_n_restore;
                if (snaplen > res)
                        snaplen = res;
        }

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        sll = (struct sockaddr_ll*)skb->cb;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        sll->sll_ifindex = dev->ifindex;
        sll->sll_halen = 0;

        if (dev->hard_header_parse)
                sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        dst_release(skb->dst);
        skb->dst = NULL;

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;
}
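
/*
 * User-space sketch (illustrative; fd is an already bound socket) of
 * the receive path above: the sockaddr_ll that packet_rcv() builds in
 * skb->cb comes back to the reader through recvfrom(), identifying
 * interface, protocol and packet type.
 *
 *      struct sockaddr_ll sll;
 *      socklen_t alen = sizeof(sll);
 *      char buf[2048];
 *
 *      int n = recvfrom(fd, buf, sizeof(buf), 0,
 *                       (struct sockaddr *)&sll, &alen);
 */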

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
        struct sock *sk;
        struct packet_opt *po;
        struct sockaddr_ll *sll;
        struct tpacket_hdr *h;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned snaplen;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff;
        struct sk_buff *copy_skb = NULL;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev->hard_header) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb->mac.raw);
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb->nh.raw - skb->data);
                        if (skb->ip_summed == CHECKSUM_HW)
                                status |= TP_STATUS_CSUMNOTREADY;
                }
        }

        snaplen = skb->len;

        if (sk->sk_filter) {
                unsigned res = run_filter(skb, sk, snaplen);
                if (res == 0)
                        goto drop_n_restore;
                if (snaplen > res)
                        snaplen = res;
        }

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
        } else {
                unsigned maclen = skb->nh.raw - skb->data;
                netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }
        if (snaplen > skb->len-skb->data_len)
                snaplen = skb->len-skb->data_len;

        spin_lock(&sk->sk_receive_queue.lock);
        h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);

        if (h->tp_status)
                goto ring_is_full;
        po->head = po->head != po->frame_max ? po->head+1 : 0;
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        memcpy((u8*)h + macoff, skb->data, snaplen);

        h->tp_len = skb->len;
        h->tp_snaplen = snaplen;
        h->tp_mac = macoff;
        h->tp_net = netoff;
        if (skb->stamp.tv_sec == 0) {
                do_gettimeofday(&skb->stamp);
                sock_enable_timestamp(sk);
        }
        h->tp_sec = skb->stamp.tv_sec;
        h->tp_usec = skb->stamp.tv_usec;

        sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
        sll->sll_halen = 0;
        if (dev->hard_header_parse)
                sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        sll->sll_ifindex = dev->ifindex;

        h->tp_status = status;
        mb();

        {
                struct page *p_start, *p_end;
                u8 *h_end = (u8 *)h + macoff + snaplen - 1;

                p_start = virt_to_page(h);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        if (copy_skb)
                kfree_skb(copy_skb);
        goto drop_n_restore;
}

#endif
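
/*
 * User-space sketch (illustrative; ring, frame_size, i, pfd and
 * consume() are assumptions made for this example) of the tp_status
 * handshake driven by tpacket_rcv() above: the kernel hands a frame
 * over by setting TP_STATUS_USER, the reader consumes it and returns
 * it with TP_STATUS_KERNEL.
 *
 *      struct tpacket_hdr *h = (struct tpacket_hdr *)(ring + i * frame_size);
 *
 *      while (!(h->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);
 *      consume((char *)h + h->tp_mac, h->tp_snaplen);
 *      h->tp_status = TP_STATUS_KERNEL;
 */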


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        unsigned short proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_opt *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb==NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb->nh.raw = skb->data;

        if (dev->hard_header) {
                int res;
                err = -EINVAL;
                res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
                if (sock->type != SOCK_DGRAM) {
                        skb->tail = skb->data;
                        skb->len = 0;
                } else if (res < 0)
                        goto out_free;
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_free;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);

        if (!sk)
                return 0;

        write_lock_bh(&packet_sklist_lock);
        sk_del_node_init(sk);
        write_unlock_bh(&packet_sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

#ifdef CONFIG_PACKET_MULTICAST
        packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
        if (po->pg_vec) {
                struct tpacket_req req;
                memset(&req, 0, sizeof(req));
                packet_set_ring(sk, &req, 1);
        }
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
        struct packet_opt *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (dev) {
                if (dev->flags&IFF_UP) {
                        dev_add_pack(&po->prot_hook);
                        sock_hold(sk);
                        po->running = 1;
                } else {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk->sk_error_report(sk);
                }
        } else {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

#ifdef CONFIG_SOCK_PACKET

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk=sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if(addr_len!=sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name,uaddr->sa_data,sizeof(name));

        dev = dev_get_by_name(name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}
#endif

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
        struct sock *sk=sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}
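
/*
 * User-space sketch (illustrative) of the sockaddr_ll layout that
 * packet_bind() above expects; only sll_family, sll_protocol and
 * sll_ifindex matter for bind.
 *
 *      struct sockaddr_ll sll;
 *
 *      memset(&sll, 0, sizeof(sll));
 *      sll.sll_family   = AF_PACKET;
 *      sll.sll_protocol = htons(ETH_P_ALL);
 *      sll.sll_ifindex  = if_nametoindex("eth0");
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */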


/*
 *      Create a packet socket.
 */

static int packet_create(struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_opt *po;
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
            && sock->type != SOCK_PACKET
#endif
            )
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(PF_PACKET, GFP_KERNEL, 1, NULL);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;
#endif
        sock_init_data(sock,sk);
        sk_set_owner(sk, THIS_MODULE);

        po = sk->sk_protinfo = kmalloc(sizeof(*po), GFP_KERNEL);
        if (!po)
                goto out_free;
        memset(po, 0, sizeof(*po));
        sk->sk_family = PF_PACKET;
        po->num = protocol;

        sk->sk_destruct = packet_sock_destruct;
        atomic_inc(&packet_socks_nr);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;
#endif
        po->prot_hook.af_packet_priv = sk;

        if (protocol) {
                po->prot_hook.type = protocol;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&packet_sklist_lock);
        sk_add_node(sk, &packet_sklist);
        write_unlock_bh(&packet_sklist_lock);
        return(0);

out_free:
        sk_free(sk);
out:
        return err;
}
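
/*
 * User-space sketch (illustrative) of the three socket types accepted
 * by packet_create() above; all of them require CAP_NET_RAW.
 *
 *      int raw  = socket(PF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *      int dg   = socket(PF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));
 *      int spkt = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 */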

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sizeof(struct sockaddr_ll);

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      It will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

        /*
         *      An error occurred so return it. Because skb_recv_datagram()
         *      handles the blocking, we don't have to worry about blocking
         *      retries.
         */

        if(skb==NULL)
                goto out;

        /*
         *      You lose any data beyond the buffer you gave. If it worries
         *      a user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len)
        {
                copied=len;
                msg->msg_flags|=MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

#ifdef CONFIG_SOCK_PACKET
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}
#endif

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = sizeof(*sll);

        return 0;
}

#ifdef CONFIG_PACKET_MULTICAST
static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
                dev_set_allmulti(dev, what);
                break;
        default:;
        }
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i=i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
{
        struct packet_opt *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        packet_dev_mc(dev, i, +1);

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = dev_get_by_index(ml->ifindex);
                                if (dev) {
                                        packet_dev_mc(dev, ml, -1);
                                        dev_put(dev);
                                }
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_opt *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
                        packet_dev_mc(dev, ml, -1);
                        dev_put(dev);
                }
                kfree(ml);
        }
        rtnl_unlock();
}
#endif
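
/*
 * User-space sketch (illustrative) of the refcounted interface
 * implemented above: promiscuous mode is requested per socket through
 * PACKET_ADD_MEMBERSHIP instead of toggling IFF_PROMISC directly, and
 * the reference is dropped automatically when the socket is closed.
 *
 *      struct packet_mreq mreq;
 *
 *      memset(&mreq, 0, sizeof(mreq));
 *      mreq.mr_ifindex = if_nametoindex("eth0");
 *      mreq.mr_type    = PACKET_MR_PROMISC;
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 */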

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch(optname) {
#ifdef CONFIG_PACKET_MULTICAST
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq mreq;
                if (optlen<sizeof(mreq))
                        return -EINVAL;
                if (copy_from_user(&mreq,optval,sizeof(mreq)))
                        return -EFAULT;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }
#endif
#ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        {
                struct tpacket_req req;

                if (optlen<sizeof(req))
                        return -EINVAL;
                if (copy_from_user(&req,optval,sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen!=sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val,optval,sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
#endif
        default:
                return -ENOPROTOOPT;
        }
}

int packet_getsockopt(struct socket *sock, int level, int optname,
                      char __user *optval, int __user *optlen)
{
        int len;
        struct sock *sk = sock->sk;
        struct packet_opt *po = pkt_sk(sk);

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len,optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch(optname) {
        case PACKET_STATISTICS:
        {
                struct tpacket_stats st;

                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                if (copy_to_user(optval, &st, len))
                        return -EFAULT;
                break;
        }
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
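
/*
 * User-space sketch (illustrative) of reading the counters maintained
 * in packet_rcv()/tpacket_rcv(). The call also clears them, and the
 * reported tp_packets already includes tp_drops, as arranged above.
 *
 *      struct tpacket_stats st;
 *      socklen_t len = sizeof(st);
 *
 *      getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 */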
1390
1391
1392 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1393 {
1394         struct sock *sk;
1395         struct hlist_node *node;
1396         struct net_device *dev = (struct net_device*)data;
1397
1398         read_lock(&packet_sklist_lock);
1399         sk_for_each(sk, node, &packet_sklist) {
1400                 struct packet_opt *po = pkt_sk(sk);
1401
1402                 switch (msg) {
1403                 case NETDEV_UNREGISTER:
1404 #ifdef CONFIG_PACKET_MULTICAST
1405                         if (po->mclist)
1406                                 packet_dev_mclist(dev, po->mclist, -1);
1407                         // fallthrough
1408 #endif
1409                 case NETDEV_DOWN:
1410                         if (dev->ifindex == po->ifindex) {
1411                                 spin_lock(&po->bind_lock);
1412                                 if (po->running) {
1413                                         __dev_remove_pack(&po->prot_hook);
1414                                         __sock_put(sk);
1415                                         po->running = 0;
1416                                         sk->sk_err = ENETDOWN;
1417                                         if (!sock_flag(sk, SOCK_DEAD))
1418                                                 sk->sk_error_report(sk);
1419                                 }
1420                                 if (msg == NETDEV_UNREGISTER) {
1421                                         po->ifindex = -1;
1422                                         po->prot_hook.dev = NULL;
1423                                 }
1424                                 spin_unlock(&po->bind_lock);
1425                         }
1426                         break;
1427                 case NETDEV_UP:
1428                         spin_lock(&po->bind_lock);
1429                         if (dev->ifindex == po->ifindex && po->num &&
1430                             !po->running) {
1431                                 dev_add_pack(&po->prot_hook);
1432                                 sock_hold(sk);
1433                                 po->running = 1;
1434                         }
1435                         spin_unlock(&po->bind_lock);
1436                         break;
1437                 }
1438         }
1439         read_unlock(&packet_sklist_lock);
1440         return NOTIFY_DONE;
1441 }
1442
1443
1444 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1445                         unsigned long arg)
1446 {
1447         struct sock *sk = sock->sk;
1448
1449         switch(cmd) {
1450                 case SIOCOUTQ:
1451                 {
1452                         int amount = atomic_read(&sk->sk_wmem_alloc);
1453                         return put_user(amount, (int __user *)arg);
1454                 }
1455                 case SIOCINQ:
1456                 {
1457                         struct sk_buff *skb;
1458                         int amount = 0;
1459
1460                         spin_lock_bh(&sk->sk_receive_queue.lock);
1461                         skb = skb_peek(&sk->sk_receive_queue);
1462                         if (skb)
1463                                 amount = skb->len;
1464                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1465                         return put_user(amount, (int __user *)arg);
1466                 }
1467                 case SIOCGSTAMP:
1468                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1469                         
1470 #ifdef CONFIG_INET
1471                 case SIOCADDRT:
1472                 case SIOCDELRT:
1473                 case SIOCDARP:
1474                 case SIOCGARP:
1475                 case SIOCSARP:
1476                 case SIOCGIFADDR:
1477                 case SIOCSIFADDR:
1478                 case SIOCGIFBRDADDR:
1479                 case SIOCSIFBRDADDR:
1480                 case SIOCGIFNETMASK:
1481                 case SIOCSIFNETMASK:
1482                 case SIOCGIFDSTADDR:
1483                 case SIOCSIFDSTADDR:
1484                 case SIOCSIFFLAGS:
1485                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1486 #endif
1487
1488                 default:
1489                         return dev_ioctl(cmd, (void __user *)arg);
1490         }
1491         return 0;
1492 }
1493
1494 #ifndef CONFIG_PACKET_MMAP
1495 #define packet_mmap sock_no_mmap
1496 #define packet_poll datagram_poll
1497 #else
1498
1499 unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wait)
1500 {
1501         struct sock *sk = sock->sk;
1502         struct packet_opt *po = pkt_sk(sk);
1503         unsigned int mask = datagram_poll(file, sock, wait);
1504
1505         spin_lock_bh(&sk->sk_receive_queue.lock);
1506         if (po->pg_vec) {
1507                 unsigned last = po->head ? po->head-1 : po->frame_max;
1508                 struct tpacket_hdr *h;
1509
1510                 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1511
1512                 if (h->tp_status)
1513                         mask |= POLLIN | POLLRDNORM;
1514         }
1515         spin_unlock_bh(&sk->sk_receive_queue.lock);
1516         return mask;
1517 }
1518
1519
1520 /* Dirty? Well, I still did not learn better way to account
1521  * for user mmaps.
1522  */
1523
1524 static void packet_mm_open(struct vm_area_struct *vma)
1525 {
1526         struct file *file = vma->vm_file;
1527         struct inode *inode = file->f_dentry->d_inode;
1528         struct socket * sock = SOCKET_I(inode);
1529         struct sock *sk = sock->sk;
1530         
1531         if (sk)
1532                 atomic_inc(&pkt_sk(sk)->mapped);
1533 }
1534
1535 static void packet_mm_close(struct vm_area_struct *vma)
1536 {
1537         struct file *file = vma->vm_file;
1538         struct inode *inode = file->f_dentry->d_inode;
1539         struct socket * sock = SOCKET_I(inode);
1540         struct sock *sk = sock->sk;
1541         
1542         if (sk)
1543                 atomic_dec(&pkt_sk(sk)->mapped);
1544 }
1545
1546 static struct vm_operations_struct packet_mmap_ops = {
1547         .open = packet_mm_open,
1548         .close =packet_mm_close,
1549 };
1550
1551 static void free_pg_vec(unsigned long *pg_vec, unsigned order, unsigned len)
1552 {
1553         int i;
1554
1555         for (i=0; i<len; i++) {
1556                 if (pg_vec[i]) {
1557                         struct page *page, *pend;
1558
1559                         pend = virt_to_page(pg_vec[i] + (PAGE_SIZE << order) - 1);
1560                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1561                                 ClearPageReserved(page);
1562                         free_pages(pg_vec[i], order);
1563                 }
1564         }
1565         kfree(pg_vec);
1566 }
1567
1568
1569 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1570 {
1571         unsigned long *pg_vec = NULL;
1572         struct packet_opt *po = pkt_sk(sk);
1573         int was_running, num, order = 0;
1574         int err = 0;
1575         
1576         if (req->tp_block_nr) {
1577                 int i, l;
1578
1579                 /* Sanity tests and some calculations */
1580
1581                 if (po->pg_vec)
1582                         return -EBUSY;
1583
1584                 if ((int)req->tp_block_size <= 0)
1585                         return -EINVAL;
1586                 if (req->tp_block_size&(PAGE_SIZE-1))
1587                         return -EINVAL;
1588                 if (req->tp_frame_size < TPACKET_HDRLEN)
1589                         return -EINVAL;
1590                 if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
1591                         return -EINVAL;
1592
1593                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1594                 if (po->frames_per_block <= 0)
1595                         return -EINVAL;
1596                 if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
1597                         return -EINVAL;
1598                 /* OK! */
1599
1600                 /* Allocate page vector */
1601                 while ((PAGE_SIZE<<order) < req->tp_block_size)
1602                         order++;
1603
1604                 err = -ENOMEM;
1605
1606                 pg_vec = kmalloc(req->tp_block_nr*sizeof(unsigned long*), GFP_KERNEL);
1607                 if (pg_vec == NULL)
1608                         goto out;
1609                 memset(pg_vec, 0, req->tp_block_nr*sizeof(unsigned long*));
1610
1611                 for (i=0; i<req->tp_block_nr; i++) {
1612                         struct page *page, *pend;
1613                         pg_vec[i] = __get_free_pages(GFP_KERNEL, order);
1614                         if (!pg_vec[i])
1615                                 goto out_free_pgvec;
1616
1617                         pend = virt_to_page(pg_vec[i] + (PAGE_SIZE << order) - 1);
1618                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1619                                 SetPageReserved(page);
1620                 }
1621                 /* Page vector is allocated */
1622
1623                 l = 0;
1624                 for (i=0; i<req->tp_block_nr; i++) {
1625                         unsigned long ptr = pg_vec[i];
1626                         struct tpacket_hdr *header;
1627                         int k;
1628
1629                         for (k=0; k<po->frames_per_block; k++) {
1630                                 
1631                                 header = (struct tpacket_hdr*)ptr;
1632                                 header->tp_status = TP_STATUS_KERNEL;
1633                                 ptr += req->tp_frame_size;
1634                         }
1635                 }
1636                 /* Done */
1637         } else {
1638                 if (req->tp_frame_nr)
1639                         return -EINVAL;
1640         }
1641
1642         lock_sock(sk);
1643
1644         /* Detach socket from network */
1645         spin_lock(&po->bind_lock);
1646         was_running = po->running;
1647         num = po->num;
1648         if (was_running) {
1649                 __dev_remove_pack(&po->prot_hook);
1650                 po->num = 0;
1651                 po->running = 0;
1652                 __sock_put(sk);
1653         }
1654         spin_unlock(&po->bind_lock);
1655                 
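        /* Make sure no CPU is still inside the old prot_hook receive
         * path before touching the ring; after synchronize_net() the
         * swap below cannot race packet_rcv/tpacket_rcv. */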
1656         synchronize_net();
1657
1658         err = -EBUSY;
1659         if (closing || atomic_read(&po->mapped) == 0) {
1660                 err = 0;
1661 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
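                /* XC(a, b): store b into a and evaluate to a's previous
                 * value.  Used below to exchange the live ring state for
                 * the new one; the displaced ring is then released at
                 * out_free_pgvec. */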
1662
1663                 spin_lock_bh(&sk->sk_receive_queue.lock);
1664                 pg_vec = XC(po->pg_vec, pg_vec);
1665                 po->frame_max = req->tp_frame_nr-1;
1666                 po->head = 0;
1667                 po->frame_size = req->tp_frame_size;
1668                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1669
1670                 order = XC(po->pg_vec_order, order);
1671                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1672
1673                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1674                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1675                 skb_queue_purge(&sk->sk_receive_queue);
1676 #undef XC
1677                 if (atomic_read(&po->mapped))
1678                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1679         }
1680
1681         spin_lock(&po->bind_lock);
1682         if (was_running && !po->running) {
1683                 sock_hold(sk);
1684                 po->running = 1;
1685                 po->num = num;
1686                 dev_add_pack(&po->prot_hook);
1687         }
1688         spin_unlock(&po->bind_lock);
1689
1690         release_sock(sk);
1691
1692 out_free_pgvec:
1693         if (pg_vec)
1694                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1695 out:
1696         return err;
1697 }
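/*
 * Illustrative userspace sketch (not part of this file; the sizes are
 * examples only): the ring above is driven by PACKET_RX_RING plus a
 * single mmap() spanning every block:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,		(multiple of PAGE_SIZE)
 *		.tp_frame_size = 2048,		(TPACKET_ALIGNMENT aligned)
 *		.tp_block_nr   = 32,
 *		.tp_frame_nr   = 64,		(frames/block * block_nr)
 *	};
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(0, req.tp_block_nr * req.tp_block_size,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The reader then watches each frame's tp_status for TP_STATUS_USER
 * and hands the frame back by writing TP_STATUS_KERNEL.
 */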
1698
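/*
 * Map the whole ring into the caller's address space in one go:
 * vm_pgoff must be zero and the requested size must equal
 * pg_vec_len * pg_vec_pages * PAGE_SIZE.  po->mapped is raised only
 * after every block has been remapped; while it is non-zero a
 * non-closing packet_set_ring() refuses to resize or free the ring.
 */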
1699 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1700 {
1701         struct sock *sk = sock->sk;
1702         struct packet_opt *po = pkt_sk(sk);
1703         unsigned long size;
1704         unsigned long start;
1705         int err = -EINVAL;
1706         int i;
1707
1708         if (vma->vm_pgoff)
1709                 return -EINVAL;
1710
1711         size = vma->vm_end - vma->vm_start;
1712
1713         lock_sock(sk);
1714         if (po->pg_vec == NULL)
1715                 goto out;
1716         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1717                 goto out;
1718
1719         start = vma->vm_start;
1720         err = -EAGAIN;
1721         for (i=0; i<po->pg_vec_len; i++) {
1722                 if (remap_page_range(vma, start, __pa(po->pg_vec[i]),
1723                                      po->pg_vec_pages*PAGE_SIZE,
1724                                      vma->vm_page_prot))
1725                         goto out;
1726                 start += po->pg_vec_pages*PAGE_SIZE;
1727         }
1728         atomic_inc(&po->mapped);
1729         vma->vm_ops = &packet_mmap_ops;
1730         err = 0;
1731
1732 out:
1733         release_sock(sk);
1734         return err;
1735 }
1736 #endif
1737
1738
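/*
 * Two operation tables: packet_ops_spkt backs the legacy SOCK_PACKET
 * interface, which has no socket options and no mmap, so the ring code
 * above is unreachable there; packet_ops backs ordinary PF_PACKET
 * sockets.
 */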
1739 #ifdef CONFIG_SOCK_PACKET
1740 struct proto_ops packet_ops_spkt = {
1741         .family =       PF_PACKET,
1742         .owner =        THIS_MODULE,
1743         .release =      packet_release,
1744         .bind =         packet_bind_spkt,
1745         .connect =      sock_no_connect,
1746         .socketpair =   sock_no_socketpair,
1747         .accept =       sock_no_accept,
1748         .getname =      packet_getname_spkt,
1749         .poll =         datagram_poll,
1750         .ioctl =        packet_ioctl,
1751         .listen =       sock_no_listen,
1752         .shutdown =     sock_no_shutdown,
1753         .setsockopt =   sock_no_setsockopt,
1754         .getsockopt =   sock_no_getsockopt,
1755         .sendmsg =      packet_sendmsg_spkt,
1756         .recvmsg =      packet_recvmsg,
1757         .mmap =         sock_no_mmap,
1758         .sendpage =     sock_no_sendpage,
1759 };
1760 #endif
1761
1762 struct proto_ops packet_ops = {
1763         .family =       PF_PACKET,
1764         .owner =        THIS_MODULE,
1765         .release =      packet_release,
1766         .bind =         packet_bind,
1767         .connect =      sock_no_connect,
1768         .socketpair =   sock_no_socketpair,
1769         .accept =       sock_no_accept,
1770         .getname =      packet_getname, 
1771         .poll =         packet_poll,
1772         .ioctl =        packet_ioctl,
1773         .listen =       sock_no_listen,
1774         .shutdown =     sock_no_shutdown,
1775         .setsockopt =   packet_setsockopt,
1776         .getsockopt =   packet_getsockopt,
1777         .sendmsg =      packet_sendmsg,
1778         .recvmsg =      packet_recvmsg,
1779         .mmap =         packet_mmap,
1780         .sendpage =     sock_no_sendpage,
1781 };
1782
1783 static struct net_proto_family packet_family_ops = {
1784         .family =       PF_PACKET,
1785         .create =       packet_create,
1786         .owner  =       THIS_MODULE,
1787 };
1788
1789 static struct notifier_block packet_netdev_notifier = {
1790         .notifier_call = packet_notifier,
1791 };
1792
1793 #ifdef CONFIG_PROC_FS
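/*
 * /proc/net/packet: iterate packet_sklist under its read lock with the
 * standard seq_file start/next/stop/show callbacks, printing one
 * socket per line in the format given by the header in
 * packet_seq_show().
 */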
1794 static inline struct sock *packet_seq_idx(loff_t off)
1795 {
1796         struct sock *s;
1797         struct hlist_node *node;
1798
1799         sk_for_each(s, node, &packet_sklist) {
1800                 if (!off--)
1801                         return s;
1802         }
1803         return NULL;
1804 }
1805
1806 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1807 {
1808         read_lock(&packet_sklist_lock);
1809         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1810 }
1811
1812 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1813 {
1814         ++*pos;
1815         return (v == SEQ_START_TOKEN)
1816                 ? sk_head(&packet_sklist)
1817                 : sk_next((struct sock*)v);
1818 }
1819
1820 static void packet_seq_stop(struct seq_file *seq, void *v)
1821 {
1822         read_unlock(&packet_sklist_lock);               
1823 }
1824
1825 static int packet_seq_show(struct seq_file *seq, void *v) 
1826 {
1827         if (v == SEQ_START_TOKEN)
1828                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1829         else {
1830                 struct sock *s = v;
1831                 const struct packet_opt *po = pkt_sk(s);
1832
1833                 seq_printf(seq,
1834                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1835                            s,
1836                            atomic_read(&s->sk_refcnt),
1837                            s->sk_type,
1838                            ntohs(po->num),
1839                            po->ifindex,
1840                            po->running,
1841                            atomic_read(&s->sk_rmem_alloc),
1842                            sock_i_uid(s),
1843                            sock_i_ino(s));
1844         }
1845
1846         return 0;
1847 }
1848
1849 static struct seq_operations packet_seq_ops = {
1850         .start  = packet_seq_start,
1851         .next   = packet_seq_next,
1852         .stop   = packet_seq_stop,
1853         .show   = packet_seq_show,
1854 };
1855
1856 static int packet_seq_open(struct inode *inode, struct file *file)
1857 {
1858         return seq_open(file, &packet_seq_ops);
1859 }
1860
1861 static struct file_operations packet_seq_fops = {
1862         .owner          = THIS_MODULE,
1863         .open           = packet_seq_open,
1864         .read           = seq_read,
1865         .llseek         = seq_lseek,
1866         .release        = seq_release,
1867 };
1868
1869 #endif
1870
1871 static void __exit packet_exit(void)
1872 {
1873         proc_net_remove("packet");
1874         unregister_netdevice_notifier(&packet_netdev_notifier);
1875         sock_unregister(PF_PACKET);
1876         return;
1877 }
1878
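/*
 * Module bring-up registers the address family, the netdevice notifier
 * and the /proc entry in that order; packet_exit() unwinds them in
 * reverse.
 */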
1879 static int __init packet_init(void)
1880 {
1881         sock_register(&packet_family_ops);
1882         register_netdevice_notifier(&packet_netdev_notifier);
1883         proc_net_fops_create("packet", 0, &packet_seq_fops);
1884
1885         return 0;
1886 }
1887
1888 module_init(packet_init);
1889 module_exit(packet_exit);
1890 MODULE_LICENSE("GPL");
1891 MODULE_ALIAS_NETPROTO(PF_PACKET);