Revert to Fedora kernel-2.6.17-1.2187_FC5 patched with vs2.0.2.1; there are too many...
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:       
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and 
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll 
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51  
52 #include <linux/config.h>
53 #include <linux/types.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/capability.h>
57 #include <linux/fcntl.h>
58 #include <linux/socket.h>
59 #include <linux/in.h>
60 #include <linux/inet.h>
61 #include <linux/netdevice.h>
62 #include <linux/if_packet.h>
63 #include <linux/wireless.h>
64 #include <linux/kmod.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85
86 #define CONFIG_SOCK_PACKET      1
87
88 /*
89    Proposed replacement for SIOC{ADD,DEL}MULTI and
90    IFF_PROMISC, IFF_ALLMULTI flags.
91
92    It is more expensive, but I believe,
93    it is really correct solution: reentereble, safe and fault tolerant.
94
95    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
96    reference count and global flag, so that real status is
97    (gflag|(count != 0)), so that we can use obsolete faulty interface
98    not harming clever users.
99  */
100 #define CONFIG_PACKET_MULTICAST 1
101
102 /*
103    Assumptions:
104    - if device has no dev->hard_header routine, it adds and removes ll header
105      inside itself. In this case ll header is invisible outside of device,
106      but higher levels still should reserve dev->hard_header_len.
107      Some devices are enough clever to reallocate skb, when header
108      will not fit to reserved space (tunnel), another ones are silly
109      (PPP).
110    - packet socket receives packets with pulled ll header,
111      so that SOCK_RAW should push it back.
112
113 On receive:
114 -----------
115
116 Incoming, dev->hard_header!=NULL
117    mac.raw -> ll header
118    data    -> data
119
120 Outgoing, dev->hard_header!=NULL
121    mac.raw -> ll header
122    data    -> ll header
123
124 Incoming, dev->hard_header==NULL
125    mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
126               PPP makes it, that is wrong, because introduce assymetry
127               between rx and tx paths.
128    data    -> data
129
130 Outgoing, dev->hard_header==NULL
131    mac.raw -> data. ll header is still not built!
132    data    -> data
133
134 Resume
135   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
136
137
138 On transmit:
139 ------------
140
141 dev->hard_header != NULL
142    mac.raw -> ll header
143    data    -> ll header
144
145 dev->hard_header == NULL (ll header is added by device, we cannot control it)
146    mac.raw -> data
147    data -> data
148
149    We should set nh.raw on output to correct posistion,
150    packet classifier depends on it.
151  */
152
153 /* List of all packet sockets. */
154 static HLIST_HEAD(packet_sklist);
155 static DEFINE_RWLOCK(packet_sklist_lock);
156
157 static atomic_t packet_socks_nr;
158
159
160 /* Private packet socket structures. */
161
162 #ifdef CONFIG_PACKET_MULTICAST
/* One multicast/promiscuity request made via PACKET_ADD_MEMBERSHIP,
 * kept on a per-socket singly linked list (packet_sock.mclist). */
struct packet_mclist
{
	struct packet_mclist	*next;
	int			ifindex;		/* device the request applies to */
	int			count;			/* times this same entry was added */
	unsigned short		type;			/* PACKET_MR_* request type */
	unsigned short		alen;			/* valid bytes in addr[] */
	unsigned char		addr[MAX_ADDR_LEN];	/* hardware address for MR_MULTICAST */
};
/* identical to struct packet_mreq except it has
 * a longer address field (MAX_ADDR_LEN, see the Eric Biederman note
 * in the file header about >8 byte hardware addresses).
 */
struct packet_mreq_max
{
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
182 #endif
183 #ifdef CONFIG_PACKET_MMAP
184 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
185 #endif
186
187 static void packet_flush_mclist(struct sock *sk);
188
/* AF_PACKET socket private state.  Embedding struct sock first lets a
 * struct sock * for this family be cast directly (see pkt_sk()). */
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;		/* tp_packets / tp_drops counters */
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;	/* array of ring-buffer blocks */
	unsigned int		head;		/* next ring frame index to fill */
	unsigned int		frames_per_block;
	unsigned int		frame_size;	/* bytes per ring frame */
	unsigned int		frame_max;	/* last valid frame index (head wraps after it) */
	int			copy_thresh;	/* also queue full skb when frame overflows slot */
#endif
	struct packet_type	prot_hook;	/* our hook in the device rx path */
	spinlock_t		bind_lock;	/* protects running/num/prot_hook */
	char			running;	/* prot_hook is attached*/
	int			ifindex;	/* bound device		*/
	unsigned short		num;		/* bound protocol id (network byte order) */
#ifdef CONFIG_PACKET_MULTICAST
	struct packet_mclist	*mclist;	/* list of multicast requests */
#endif
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;		/* number of active mmap()ings of the ring */
	unsigned int		pg_vec_order;	/* page order of each pg_vec block */
	unsigned int		pg_vec_pages;	/* pages per pg_vec block */
	unsigned int		pg_vec_len;	/* number of blocks in pg_vec */
#endif
};
216
217 #ifdef CONFIG_PACKET_MMAP
218
219 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
220 {
221         unsigned int pg_vec_pos, frame_offset;
222         char *frame;
223
224         pg_vec_pos = position / po->frames_per_block;
225         frame_offset = position % po->frames_per_block;
226
227         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
228         
229         return frame;
230 }
231 #endif
232
/* Downcast to the containing packet_sock; valid because struct sock is
 * required to be the first member of struct packet_sock. */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
237
/* sk->sk_destruct callback: sanity-check that no rx/tx memory is still
 * charged to the socket, then drop the global packet-socket count. */
static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Must only run after sock_orphan(); bail out loudly otherwise. */
		printk("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}
253
254
/* With the vnet patch the ops table must be visible (and exported) to
 * the vnet module; otherwise it stays file-local and const.
 * NOTE(review): in the VNET branch packet_ops ends up tentatively
 * defined twice (here and on the shared line below) -- legal C, but
 * confirm this was the patch author's intent. */
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
struct proto_ops packet_ops;
EXPORT_SYMBOL(packet_ops);
#else
static const
#endif
struct proto_ops packet_ops;
262
263 #ifdef CONFIG_SOCK_PACKET
264 static const struct proto_ops packet_ops_spkt;
265
/*
 *	SOCK_PACKET receive hook.  Restores the link-level header, stores
 *	the source-device info as a sockaddr_pkt in skb->cb, and charges
 *	the skb to the owning socket.  Always returns 0; the skb is
 *	either queued or freed here (or was never ours, on OOM).
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;
	
	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* We are about to mangle the skb; get a private copy if shared. */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	/* Address info for recvmsg() travels in the skb control block. */
	spkt = (struct sockaddr_pkt*)skb->cb;

	skb_push(skb, skb->data-skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk,skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
327
328
329 /*
330  *      Output a raw packet to a device layer. This bypasses all the other
331  *      protocol layers and you must therefore supply it with a complete frame
332  */
333  
334 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
335                                struct msghdr *msg, size_t len)
336 {
337         struct sock *sk = sock->sk;
338         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
339         struct sk_buff *skb;
340         struct net_device *dev;
341         unsigned short proto=0;
342         int err;
343         
344         /*
345          *      Get and verify the address. 
346          */
347
348         if (saddr)
349         {
350                 if (msg->msg_namelen < sizeof(struct sockaddr))
351                         return(-EINVAL);
352                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
353                         proto=saddr->spkt_protocol;
354         }
355         else
356                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
357
358         /*
359          *      Find the device first to size check it 
360          */
361
362         saddr->spkt_device[13] = 0;
363         dev = dev_get_by_name(saddr->spkt_device);
364         err = -ENODEV;
365         if (dev == NULL)
366                 goto out_unlock;
367         
368         /*
369          *      You may not queue a frame bigger than the mtu. This is the lowest level
370          *      raw protocol and you must do your own fragmentation at this level.
371          */
372          
373         err = -EMSGSIZE;
374         if (len > dev->mtu + dev->hard_header_len)
375                 goto out_unlock;
376
377         err = -ENOBUFS;
378         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
379
380         /*
381          *      If the write buffer is full, then tough. At this level the user gets to
382          *      deal with the problem - do your own algorithmic backoffs. That's far
383          *      more flexible.
384          */
385          
386         if (skb == NULL) 
387                 goto out_unlock;
388
389         /*
390          *      Fill it in 
391          */
392          
393         /* FIXME: Save some space for broken drivers that write a
394          * hard header at transmission time by themselves. PPP is the
395          * notable one here. This should really be fixed at the driver level.
396          */
397         skb_reserve(skb, LL_RESERVED_SPACE(dev));
398         skb->nh.raw = skb->data;
399
400         /* Try to align data part correctly */
401         if (dev->hard_header) {
402                 skb->data -= dev->hard_header_len;
403                 skb->tail -= dev->hard_header_len;
404                 if (len < dev->hard_header_len)
405                         skb->nh.raw = skb->data;
406         }
407
408         /* Returns -EFAULT on error */
409         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
410         skb->protocol = proto;
411         skb->dev = dev;
412         skb->priority = sk->sk_priority;
413         if (err)
414                 goto out_free;
415
416         err = -ENETDOWN;
417         if (!(dev->flags & IFF_UP))
418                 goto out_free;
419
420         /*
421          *      Now send it
422          */
423
424         dev_queue_xmit(skb);
425         dev_put(dev);
426         return(len);
427
428 out_free:
429         kfree_skb(skb);
430 out_unlock:
431         if (dev)
432                 dev_put(dev);
433         return err;
434 }
435 #endif
436
437 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
438 {
439         struct sk_filter *filter;
440
441         bh_lock_sock(sk);
442         filter = sk->sk_filter;
443         /*
444          * Our caller already checked that filter != NULL but we need to
445          * verify that under bh_lock_sock() to be safe
446          */
447         if (likely(filter != NULL))
448                 res = sk_run_filter(skb, filter->insns, filter->len);
449         bh_unlock_sock(sk);
450
451         return res;
452 }
453
454 /*
455    This function makes lazy skb cloning in hope that most of packets
456    are discarded by BPF.
457
458    Note tricky part: we DO mangle shared skb! skb->data, skb->len
459    and skb->cb are mangled. It works because (and until) packets
460    falling here are owned by current CPU. Output packets are cloned
461    by dev_queue_xmit_nit(), input packets are processed by net_bh
462    sequencially, so that if we return skb to original state on exit,
463    we will not harm anyone.
464  */
465
/* Generic AF_PACKET receive hook (SOCK_RAW/SOCK_DGRAM, non-mmap).
 * May mangle a *shared* skb (data/len) but restores it before returning,
 * per the lazy-cloning scheme described above.  Always returns 0. */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 * skb_head = skb->data;	/* saved so a shared skb can be restored on exit */
	int skb_len = skb->len;
	unsigned snaplen;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
	/* vserver/vnet isolation: drop packets whose context tag does not
	 * match this socket's xid (presumably set by the vnet patch). */
	if (vnet_active &&
	    (int) sk->sk_xid > 0 && sk->sk_xid != skb->xid)
		goto drop;
#endif

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides datails of it frame
		   structure, so that corresponding packet head
		   never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	snaplen = skb->len;

	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;	/* filter requested a shorter snapshot */
	}

	/* Enforce the receive-buffer limit before cloning or queueing. */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		/* Never queue a shared skb: take a private clone and hand
		 * the restored original back to the stack. */
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	/* Build the link-level source address for recvmsg() in skb->cb. */
	sll = (struct sockaddr_ll*)skb->cb;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	/* Truncate to the snapshot length chosen above. */
	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	/* Account the drop so PACKET_STATISTICS reports it. */
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	/* Undo our mangling before the shared skb returns to the stack. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}
575
576 #ifdef CONFIG_PACKET_MMAP
/* PACKET_RX_RING receive hook: copies each accepted frame directly into
 * the mmap()ed ring instead of queueing skbs.  The skb is always freed
 * (after restoring it if shared).  Always returns 0. */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 * skb_head = skb->data;	/* saved so a shared skb can be restored */
	int skb_len = skb->len;
	unsigned snaplen;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;	/* frame-relative offsets of mac/net headers */
	struct sk_buff *copy_skb = NULL;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
			/* Checksum not yet computed by hardware offload. */
			if (skb->ip_summed == CHECKSUM_HW)
				status |= TP_STATUS_CSUMNOTREADY;
		}
	}

	snaplen = skb->len;

	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;
	}

	/* Decide where the copied data lands inside the ring frame. */
	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb->nh.raw - skb->data;
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		/* Frame overflows the ring slot: optionally keep a full
		 * copy on the receive queue (copy_thresh), then truncate. */
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}
	/* memcpy below only reads linear data; clamp to the linear part. */
	if (snaplen > skb->len-skb->data_len)
		snaplen = skb->len-skb->data_len;

	spin_lock(&sk->sk_receive_queue.lock);
	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
	
	/* Non-zero status means userspace still owns this slot. */
	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;	/* advance with wrap */
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	memcpy((u8*)h + macoff, skb->data, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->tstamp.off_sec == 0) { 
		/* No timestamp recorded yet: stamp the skb now. */
		__net_timestamp(skb);
		sock_enable_timestamp(sk);
	}
	h->tp_sec = skb->tstamp.off_sec;
	h->tp_usec = skb->tstamp.off_usec;

	/* Link-level address immediately follows the tpacket header. */
	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;

	h->tp_status = status;
	/* Frame contents must be globally visible before the status word
	 * hands the slot over to userspace. */
	mb();

	{
		/* Flush the data cache for every page the frame spans so the
		 * mmap()ed user view is coherent on non-coherent archs. */
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the ring. */
	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}
718
719 #endif
720
721
/*
 *	sendmsg() for SOCK_RAW / SOCK_DGRAM packet sockets.  Uses the bound
 *	device and protocol unless msg_name supplies a sockaddr_ll.  For
 *	SOCK_DGRAM the device builds the link-level header from 'addr';
 *	for SOCK_RAW the caller's data must already contain the full frame.
 *	Returns len on success or a negative errno.
 */
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned short proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address. 
	 */
	 
	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		/* No explicit destination: use what bind() recorded. */
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		/* The claimed hardware-address length must fit in msg_name. */
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;	/* caller's data includes the ll header */

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			/* SOCK_RAW: discard the header just built -- the user
			 * data copied below already contains it.  Building it
			 * first still positions skb->nh.raw correctly. */
			skb->tail = skb->data;
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_free;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
818
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	/* Remove from the global socket list first. */
	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);	/* drop the reference the hook held */
	}

#ifdef CONFIG_PACKET_MULTICAST
	packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
	/* An all-zero request makes packet_set_ring() tear the ring down. */
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	sock_put(sk);
	return 0;
}
878
/*
 *	Attach a packet hook: bind the socket to 'protocol' on 'dev'
 *	(NULL dev = all interfaces; protocol 0 = leave unbound).
 *	Always returns 0; a down device is reported via sk_err instead.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		/* NOTE(review): bind_lock is dropped around dev_remove_pack()
		 * -- apparently it must not run under this spinlock (it
		 * synchronizes with receivers); confirm against its docs. */
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	/* protocol 0 means "unbound": record the device but stay detached. */
	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);	/* the installed hook holds a socket ref */
			po->running = 1;
		} else {
			/* Device is down: keep the binding, signal ENETDOWN. */
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		/* No device: receive from every interface. */
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
932
933 /*
934  *      Bind a packet socket to a device
935  */
936
937 #ifdef CONFIG_SOCK_PACKET
938
939 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
940 {
941         struct sock *sk=sock->sk;
942         char name[15];
943         struct net_device *dev;
944         int err = -ENODEV;
945         
946         /*
947          *      Check legality
948          */
949          
950         if (addr_len != sizeof(struct sockaddr))
951                 return -EINVAL;
952         strlcpy(name,uaddr->sa_data,sizeof(name));
953
954         dev = dev_get_by_name(name);
955         if (dev) {
956                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
957                 dev_put(dev);
958         }
959         return err;
960 }
961 #endif
962
963 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
964 {
965         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
966         struct sock *sk=sock->sk;
967         struct net_device *dev = NULL;
968         int err;
969
970
971         /*
972          *      Check legality
973          */
974          
975         if (addr_len < sizeof(struct sockaddr_ll))
976                 return -EINVAL;
977         if (sll->sll_family != AF_PACKET)
978                 return -EINVAL;
979
980         if (sll->sll_ifindex) {
981                 err = -ENODEV;
982                 dev = dev_get_by_index(sll->sll_ifindex);
983                 if (dev == NULL)
984                         goto out;
985         }
986         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
987         if (dev)
988                 dev_put(dev);
989
990 out:
991         return err;
992 }
993
/* Allocation parameters for PF_PACKET sockets (used by sk_alloc()). */
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
999
1000 /*
1001  *      Create a packet of type SOCK_PACKET. 
1002  */
1003
static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	int err;

	/* Packet sockets tap raw traffic — privileged operation. */
	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	/* Legacy SOCK_PACKET sockets use their own compat ops table. */
	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = protocol;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	if (protocol) {
		/* Non-zero protocol: start receiving right away.  The hook
		 * holds a socket reference (cf. packet_do_bind()). */
		po->prot_hook.type = protocol;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	/* Make the socket visible to /proc and the netdev notifier. */
	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
1066
1067 /*
1068  *      Pull a packet from our receive queue and hand it to the user.
1069  *      If necessary we block.
1070  */
1071
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram() 
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	/* The link-level address lives in skb->cb — NOTE(review): presumably
	 * stashed there by the receive hook (packet_rcv); confirm. */
	sll = (struct sockaddr_ll*)skb->cb;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	/* On truncation report the full packet length so the caller can
	 * size its buffer correctly next time. */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1153
1154 #ifdef CONFIG_SOCK_PACKET
1155 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1156                                int *uaddr_len, int peer)
1157 {
1158         struct net_device *dev;
1159         struct sock *sk = sock->sk;
1160
1161         if (peer)
1162                 return -EOPNOTSUPP;
1163
1164         uaddr->sa_family = AF_PACKET;
1165         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1166         if (dev) {
1167                 strlcpy(uaddr->sa_data, dev->name, 15);
1168                 dev_put(dev);
1169         } else
1170                 memset(uaddr->sa_data, 0, 14);
1171         *uaddr_len = sizeof(*uaddr);
1172
1173         return 0;
1174 }
1175 #endif
1176
1177 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1178                           int *uaddr_len, int peer)
1179 {
1180         struct net_device *dev;
1181         struct sock *sk = sock->sk;
1182         struct packet_sock *po = pkt_sk(sk);
1183         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1184
1185         if (peer)
1186                 return -EOPNOTSUPP;
1187
1188         sll->sll_family = AF_PACKET;
1189         sll->sll_ifindex = po->ifindex;
1190         sll->sll_protocol = po->num;
1191         dev = dev_get_by_index(po->ifindex);
1192         if (dev) {
1193                 sll->sll_hatype = dev->type;
1194                 sll->sll_halen = dev->addr_len;
1195                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1196                 dev_put(dev);
1197         } else {
1198                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1199                 sll->sll_halen = 0;
1200         }
1201         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1202
1203         return 0;
1204 }
1205
1206 #ifdef CONFIG_PACKET_MULTICAST
1207 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1208 {
1209         switch (i->type) {
1210         case PACKET_MR_MULTICAST:
1211                 if (what > 0)
1212                         dev_mc_add(dev, i->addr, i->alen, 0);
1213                 else
1214                         dev_mc_delete(dev, i->addr, i->alen, 0);
1215                 break;
1216         case PACKET_MR_PROMISC:
1217                 dev_set_promiscuity(dev, what);
1218                 break;
1219         case PACKET_MR_ALLMULTI:
1220                 dev_set_allmulti(dev, what);
1221                 break;
1222         default:;
1223         }
1224 }
1225
1226 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1227 {
1228         for ( ; i; i=i->next) {
1229                 if (i->ifindex == dev->ifindex)
1230                         packet_dev_mc(dev, i, what);
1231         }
1232 }
1233
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	/* rtnl protects both the device lookup and po->mclist here. */
	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	/* Allocate up front so the duplicate scan need not be repeated. */
	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	/* Existing identical membership: just bump its refcount. */
	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	/* New membership: link it in and apply it to the device. */
	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}
1283
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	/* Walk with a pointer-to-link so unlinking needs no prev pointer. */
	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			/* Only drop the device state when the last reference
			 * to this membership goes away. */
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
1312
1313 static void packet_flush_mclist(struct sock *sk)
1314 {
1315         struct packet_sock *po = pkt_sk(sk);
1316         struct packet_mclist *ml;
1317
1318         if (!po->mclist)
1319                 return;
1320
1321         rtnl_lock();
1322         while ((ml = po->mclist) != NULL) {
1323                 struct net_device *dev;
1324
1325                 po->mclist = ml->next;
1326                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1327                         packet_dev_mc(dev, ml, -1);
1328                         dev_put(dev);
1329                 }
1330                 kfree(ml);
1331         }
1332         rtnl_unlock();
1333 }
1334 #endif
1335
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname) {
#ifdef CONFIG_PACKET_MULTICAST
	case PACKET_ADD_MEMBERSHIP:	
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		/* Accept both the old short struct packet_mreq and the newer
		 * packet_mreq_max: zero-fill, then copy what the caller gave
		 * us, capped at the larger layout. */
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq,optval,len))
			return -EFAULT;
		/* The claimed address length must fit in the bytes copied. */
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
#endif
#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}
}
1396
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;
		
	switch(optname) {
	case PACKET_STATISTICS:
	{
		struct tpacket_stats st;

		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		/* Snapshot and reset the counters under the receive-queue
		 * lock — NOTE(review): presumably the receive path updates
		 * po->stats under the same lock; confirm. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		/* tp_packets reports everything seen, drops included. */
		st.tp_packets += st.tp_drops;

		if (copy_to_user(optval, &st, len))
			return -EFAULT;
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	/* Tell the caller how many bytes were actually written. */
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1438
1439
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = (struct net_device*)data;

	/* React to device state changes on behalf of every packet socket. */
	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
			/* Undo memberships taken on the disappearing device. */
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			// fallthrough
#endif
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* Unhook and signal ENETDOWN; po->num
					 * is kept so NETDEV_UP can re-arm. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					/* Device is gone for good: forget it. */
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			/* Re-arm a binding that was disabled by DOWN above. */
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
1490
1491
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			/* Bytes currently allocated to the send queue. */
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			/* Length of the next pending packet, 0 if none. */
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
			
#ifdef CONFIG_INET
		/* Interface/route/ARP ioctls are delegated to the inet
		 * datagram implementation. */
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	return 0;
}
1541
1542 #ifndef CONFIG_PACKET_MMAP
1543 #define packet_mmap sock_no_mmap
1544 #define packet_poll datagram_poll
1545 #else
1546
static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	/* With a ring mapped, also report readable when the most recently
	 * filled slot (the one just before head, wrapping) has a non-zero
	 * status word, i.e. has been handed to user space. */
	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
1567
1568
1569 /* Dirty? Well, I still did not learn better way to account
1570  * for user mmaps.
1571  */
1572
1573 static void packet_mm_open(struct vm_area_struct *vma)
1574 {
1575         struct file *file = vma->vm_file;
1576         struct socket * sock = file->private_data;
1577         struct sock *sk = sock->sk;
1578         
1579         if (sk)
1580                 atomic_inc(&pkt_sk(sk)->mapped);
1581 }
1582
1583 static void packet_mm_close(struct vm_area_struct *vma)
1584 {
1585         struct file *file = vma->vm_file;
1586         struct socket * sock = file->private_data;
1587         struct sock *sk = sock->sk;
1588         
1589         if (sk)
1590                 atomic_dec(&pkt_sk(sk)->mapped);
1591 }
1592
/* Keeps po->mapped in sync with the number of live user mappings. */
static struct vm_operations_struct packet_mmap_ops = {
	.open =	packet_mm_open,
	.close =packet_mm_close,
};
1597
/* Page backing the last byte of a (PAGE_SIZE << order) block. */
static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
{
	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
}
1602
1603 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1604 {
1605         int i;
1606
1607         for (i = 0; i < len; i++) {
1608                 if (likely(pg_vec[i]))
1609                         free_pages((unsigned long) pg_vec[i], order);
1610         }
1611         kfree(pg_vec);
1612 }
1613
/* One zeroed, compound block of 2^order pages for the ring. */
static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}
1619
1620 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1621 {
1622         unsigned int block_nr = req->tp_block_nr;
1623         char **pg_vec;
1624         int i;
1625
1626         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1627         if (unlikely(!pg_vec))
1628                 goto out;
1629
1630         for (i = 0; i < block_nr; i++) {
1631                 pg_vec[i] = alloc_one_pg_vec_page(order);
1632                 if (unlikely(!pg_vec[i]))
1633                         goto out_free_pgvec;
1634         }
1635
1636 out:
1637         return pg_vec;
1638
1639 out_free_pgvec:
1640         free_pg_vec(pg_vec, order, block_nr);
1641         pg_vec = NULL;
1642         goto out;
1643 }
1644
/*
 *	Install (or, with tp_block_nr == 0, tear down) the mmap'ed receive
 *	ring.  `closing` permits the swap even while user mappings exist
 *	(socket release path).
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, num, order = 0;
	int err = 0;
	
	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		/* Mark every frame as kernel-owned (empty). */
		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		/* Teardown request: no frames may be specified. */
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);
		
	/* Wait for in-flight receivers before touching the ring. */
	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		/* Swap the ring in under the receive-queue lock so the
		 * receive path never observes a half-updated ring. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	/* Re-attach the protocol hook if we detached it above. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	/* After the XC swaps, pg_vec/order/tp_block_nr describe whichever
	 * ring we are NOT keeping (the old one, or the unused new one). */
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
1755
/*
 *	Map the receive ring set up by packet_set_ring() into the caller's
 *	address space as one contiguous region.
 */
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	/* The whole ring must be mapped from offset zero. */
	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	/* The mapping must cover the ring exactly. */
	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
		goto out;

	/* Insert every page of every block, in ring order. */
	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	/* Record the mapping; packet_mm_open/close keep the count. */
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
1796 #endif
1797
1798
1799 #ifdef CONFIG_SOCK_PACKET
/* proto_ops for legacy SOCK_PACKET sockets (compat API, no mmap ring). */
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
1820 #endif
1821
/* proto_ops for SOCK_RAW/SOCK_DGRAM packet sockets.  Non-const and
 * non-static when VNET is built, so that code can reference it —
 * NOTE(review): confirm against the VNET patchset. */
#if !defined(CONFIG_VNET) && !defined(CONFIG_VNET_MODULE)
static const
#endif
struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname, 
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
1845
/*
 * PF_PACKET family descriptor handed to sock_register().
 *
 * Under VNET a tentative declaration precedes the initialized
 * definition (legal C: tentative definition + definition of the same
 * object) so the symbol can be exported to the vnet module; without
 * VNET the object is file-private.
 */
#if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
struct net_proto_family packet_family_ops;
EXPORT_SYMBOL(packet_family_ops);
#else
static
#endif
struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
1857
/*
 * Netdevice event notifier; packet_notifier() reacts to device state
 * changes (handler body is defined earlier in this file).
 */
static struct notifier_block packet_netdev_notifier = {
	.notifier_call =packet_notifier,
};
1861
1862 #ifdef CONFIG_PROC_FS
1863 static inline struct sock *packet_seq_idx(loff_t off)
1864 {
1865         struct sock *s;
1866         struct hlist_node *node;
1867
1868         sk_for_each(s, node, &packet_sklist) {
1869                 if (!off--)
1870                         return s;
1871         }
1872         return NULL;
1873 }
1874
1875 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1876 {
1877         read_lock(&packet_sklist_lock);
1878         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1879 }
1880
1881 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1882 {
1883         ++*pos;
1884         return  (v == SEQ_START_TOKEN) 
1885                 ? sk_head(&packet_sklist) 
1886                 : sk_next((struct sock*)v) ;
1887 }
1888
/* seq_file .stop: drop the list lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}
1893
1894 static int packet_seq_show(struct seq_file *seq, void *v) 
1895 {
1896         if (v == SEQ_START_TOKEN)
1897                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1898         else {
1899                 struct sock *s = v;
1900                 const struct packet_sock *po = pkt_sk(s);
1901
1902                 seq_printf(seq,
1903                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1904                            s,
1905                            atomic_read(&s->sk_refcnt),
1906                            s->sk_type,
1907                            ntohs(po->num),
1908                            po->ifindex,
1909                            po->running,
1910                            atomic_read(&s->sk_rmem_alloc),
1911                            sock_i_uid(s),
1912                            sock_i_ino(s) );
1913         }
1914
1915         return 0;
1916 }
1917
/* Iterator callbacks for /proc/net/packet. */
static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
1924
/* Open /proc/net/packet: attach the iterator ops to the seq_file core. */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}
1929
/* File operations for /proc/net/packet (standard seq_file plumbing). */
static struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1937
1938 #endif
1939
/*
 * Module unload: tear down in reverse order of packet_init() --
 * /proc entry first, then the netdevice notifier, the socket family,
 * and finally the proto itself.
 */
static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
1947
1948 static int __init packet_init(void)
1949 {
1950         int rc = proto_register(&packet_proto, 0);
1951
1952         if (rc != 0)
1953                 goto out;
1954
1955         sock_register(&packet_family_ops);
1956         register_netdevice_notifier(&packet_netdev_notifier);
1957         proc_net_fops_create("packet", 0, &packet_seq_fops);
1958 out:
1959         return rc;
1960 }
1961
1962 module_init(packet_init);
1963 module_exit(packet_exit);
1964 MODULE_LICENSE("GPL");
1965 MODULE_ALIAS_NETPROTO(PF_PACKET);