vserver 2.0 rc7
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:       
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and 
38  *                                      packet_set_ring memory leak.
39  *
40  *              This program is free software; you can redistribute it and/or
41  *              modify it under the terms of the GNU General Public License
42  *              as published by the Free Software Foundation; either version
43  *              2 of the License, or (at your option) any later version.
44  *
45  */
46  
47 #include <linux/config.h>
48 #include <linux/types.h>
49 #include <linux/sched.h>
50 #include <linux/mm.h>
51 #include <linux/fcntl.h>
52 #include <linux/socket.h>
53 #include <linux/in.h>
54 #include <linux/inet.h>
55 #include <linux/netdevice.h>
56 #include <linux/if_packet.h>
57 #include <linux/wireless.h>
58 #include <linux/kmod.h>
59 #include <net/ip.h>
60 #include <net/protocol.h>
61 #include <linux/skbuff.h>
62 #include <net/sock.h>
63 #include <linux/errno.h>
64 #include <linux/timer.h>
65 #include <asm/system.h>
66 #include <asm/uaccess.h>
67 #include <asm/ioctls.h>
68 #include <asm/page.h>
69 #include <asm/io.h>
70 #include <linux/proc_fs.h>
71 #include <linux/seq_file.h>
72 #include <linux/poll.h>
73 #include <linux/module.h>
74 #include <linux/init.h>
75
76 #ifdef CONFIG_INET
77 #include <net/inet_common.h>
78 #endif
79
80 #define CONFIG_SOCK_PACKET      1
81
82 /*
83    Proposed replacement for SIOC{ADD,DEL}MULTI and
84    IFF_PROMISC, IFF_ALLMULTI flags.
85
86    It is more expensive, but I believe
87    it is really the correct solution: reentrant, safe and fault tolerant.
88
89    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
90    reference count and global flag, so that real status is
91    (gflag|(count != 0)), so that we can use obsolete faulty interface
92    not harming clever users.
93  */
94 #define CONFIG_PACKET_MULTICAST 1
95
96 /*
97    Assumptions:
98    - if device has no dev->hard_header routine, it adds and removes ll header
99      inside itself. In this case ll header is invisible outside of device,
100      but higher levels still should reserve dev->hard_header_len.
101      Some devices are enough clever to reallocate skb, when header
102      will not fit to reserved space (tunnel), another ones are silly
103      (PPP).
104    - packet socket receives packets with pulled ll header,
105      so that SOCK_RAW should push it back.
106
107 On receive:
108 -----------
109
110 Incoming, dev->hard_header!=NULL
111    mac.raw -> ll header
112    data    -> data
113
114 Outgoing, dev->hard_header!=NULL
115    mac.raw -> ll header
116    data    -> ll header
117
118 Incoming, dev->hard_header==NULL
119    mac.raw -> UNKNOWN position. It is very likely that it points to ll header.
120               PPP does this, which is wrong, because it introduces asymmetry
121               between rx and tx paths.
122    data    -> data
123
124 Outgoing, dev->hard_header==NULL
125    mac.raw -> data. ll header is still not built!
126    data    -> data
127
128 Resume
129   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
130
131
132 On transmit:
133 ------------
134
135 dev->hard_header != NULL
136    mac.raw -> ll header
137    data    -> ll header
138
139 dev->hard_header == NULL (ll header is added by device, we cannot control it)
140    mac.raw -> data
141    data -> data
142
143    We should set nh.raw on output to the correct position,
144    packet classifier depends on it.
145  */
146
147 /* List of all packet sockets. */
148 static HLIST_HEAD(packet_sklist);
/* Protects packet_sklist; packet_release() takes it with write_lock_bh()
 * around sk_del_node_init(). */
149 static DEFINE_RWLOCK(packet_sklist_lock);
150
/* Number of packet sockets currently alive: incremented in packet_create(),
 * decremented in packet_sock_destruct(). */
151 static atomic_t packet_socks_nr;
152
153
154 /* Private packet socket structures. */
155
156 #ifdef CONFIG_PACKET_MULTICAST
/*
 * One node of a packet socket's singly linked multicast list
 * (struct packet_sock.mclist points at the first node).
 * NOTE(review): the code that fills and walks these nodes is not in this
 * part of the file; the per-field notes below are read off the names and
 * should be confirmed against packet_mc_add()/packet_flush_mclist().
 */
157 struct packet_mclist
158 {
159         struct packet_mclist    *next;          /* next node, NULL-terminated presumably */
160         int                     ifindex;        /* presumably the interface this entry applies to */
161         int                     count;          /* presumably a reference count for the entry */
162         unsigned short          type;
163         unsigned short          alen;           /* presumably the number of valid bytes in addr[] */
164         unsigned char           addr[8];
165 };
166 #endif
167 #ifdef CONFIG_PACKET_MMAP
/* Defined later in the file; packet_release() calls it with a zeroed request
 * and closing == 1 when po->pg_vec is set. */
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170
/* Defined later in the file; packet_release() calls it under
 * CONFIG_PACKET_MULTICAST. */
171 static void packet_flush_mclist(struct sock *sk);
172
/*
 * Per-socket state of an AF_PACKET socket. Allocated through packet_proto
 * (obj_size == sizeof(struct packet_sock)) and reached from a struct sock
 * pointer with pkt_sk(), which is a plain cast.
 */
173 struct packet_sock {
174         /* struct sock has to be the first member of packet_sock */
175         struct sock             sk;
            /* tp_packets / tp_drops are bumped by the receive hooks while
             * holding sk_receive_queue.lock. */
176         struct tpacket_stats    stats;
177 #ifdef CONFIG_PACKET_MMAP
            /* Ring buffer geometry, consumed by packet_lookup_frame() and
             * tpacket_rcv(): pg_vec[] holds one pointer per block, each block
             * holds frames_per_block frames of frame_size bytes. */
178         char *                  *pg_vec;
179         unsigned int            head;           /* index of the next frame tpacket_rcv() fills */
180         unsigned int            frames_per_block;
181         unsigned int            frame_size;
182         unsigned int            frame_max;      /* head wraps to 0 after reaching this index */
            /* Non-zero lets tpacket_rcv() also queue a reference/clone of a
             * packet that does not fit into one frame. */
183         int                     copy_thresh;
184 #endif
185         struct packet_type      prot_hook;      /* hook passed to dev_add_pack()/dev_remove_pack() */
186         spinlock_t              bind_lock;      /* taken in packet_do_bind() around hook changes */
187         char                    running;        /* prot_hook is attached*/
188         int                     ifindex;        /* bound device         */
189         unsigned short          num;            /* protocol number; 0 means no hook is attached */
190 #ifdef CONFIG_PACKET_MULTICAST
191         struct packet_mclist    *mclist;
192 #endif
193 #ifdef CONFIG_PACKET_MMAP
            /* NOTE(review): the users of the four fields below are not in
             * this part of the file; confirm their meaning against
             * packet_set_ring() and packet_mmap(). */
194         atomic_t                mapped;
195         unsigned int            pg_vec_order;
196         unsigned int            pg_vec_pages;
197         unsigned int            pg_vec_len;
198 #endif
199 };
200
201 #ifdef CONFIG_PACKET_MMAP
202
203 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
204 {
205         unsigned int pg_vec_pos, frame_offset;
206         char *frame;
207
208         pg_vec_pos = position / po->frames_per_block;
209         frame_offset = position % po->frames_per_block;
210
211         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
212         
213         return frame;
214 }
215 #endif
216
/*
 * View a struct sock as the packet_sock that contains it. This is a plain
 * pointer cast, which is why struct sock must stay the first member of
 * struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}
221
222 static void packet_sock_destruct(struct sock *sk)
223 {
224         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
225         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
226
227         if (!sock_flag(sk, SOCK_DEAD)) {
228                 printk("Attempt to release alive packet socket: %p\n", sk);
229                 return;
230         }
231
232         atomic_dec(&packet_socks_nr);
233 #ifdef PACKET_REFCNT_DEBUG
234         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
235 #endif
236 }
237
238
/* Socket operations for SOCK_RAW/SOCK_DGRAM packet sockets; packet_create()
 * stores its address in sock->ops. The definition is later in the file. */
239 static struct proto_ops packet_ops;
240
241 #ifdef CONFIG_SOCK_PACKET
/* Socket operations used instead of packet_ops when sock->type is
 * SOCK_PACKET (see packet_create()). The definition is later in the file. */
242 static struct proto_ops packet_ops_spkt;
243
/*
 *      Receive hook for SOCK_PACKET sockets; packet_create() installs it as
 *      prot_hook.func when sock->type == SOCK_PACKET.
 *
 *      skb - frame handed to the hook
 *      dev - device the frame is associated with
 *      pt  - the hook itself; pt->af_packet_priv is the owning socket
 *
 *      PACKET_LOOPBACK frames are freed unseen. For anything else the link
 *      level header is pushed back, a struct sockaddr_pkt describing the
 *      frame is written into skb->cb and the skb is queued on the socket.
 *
 *      Always returns 0.
 */
244 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
245 {
246         struct sock *sk;
247         struct sockaddr_pkt *spkt;
248
249         /*
250          *      When we registered the protocol we saved the socket in the data
251          *      field for just this event.
252          */
253
254         sk = pt->af_packet_priv;
255         
256         /*
257          *      Yank back the headers [hope the device set this
258          *      right or kerboom...]
259          *
260          *      Incoming packets have ll header pulled,
261          *      push it back.
262          *
263          *      For outgoing ones skb->data == skb->mac.raw
264          *      so that this procedure is noop.
265          */
266
267         if (skb->pkt_type == PACKET_LOOPBACK)
268                 goto out;
269
            /* We are about to modify the skb (dst, cb, data pointer), so get
             * a private one if it is shared. On failure skb is NULL here and
             * there is nothing left for this function to free. */
270         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
271                 goto oom;
272
273         /* drop any routing info */
274         dst_release(skb->dst);
275         skb->dst = NULL;
276
            /* The source address is stashed in the skb control buffer;
             * presumably the recvmsg side reads it back from there. */
277         spkt = (struct sockaddr_pkt*)skb->cb;
278
279         skb_push(skb, skb->data-skb->mac.raw);
280
281         /*
282          *      The SOCK_PACKET socket receives _all_ frames.
283          */
284
285         spkt->spkt_family = dev->type;
286         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
287         spkt->spkt_protocol = skb->protocol;
288
289         /*
290          *      Charge the memory to the socket. This is done specifically
291          *      to prevent sockets using all the memory up.
292          */
293
294         if (sock_queue_rcv_skb(sk,skb) == 0)
295                 return 0;
296
            /* Reached for loopback frames and when queueing failed. */
297 out:
298         kfree_skb(skb);
299 oom:
300         return 0;
301 }
302
303
304 /*
305  *      Output a raw packet to a device layer. This bypasses all the other
306  *      protocol layers and you must therefore supply it with a complete frame
307  */
308  
/*
 *      sendmsg for SOCK_PACKET sockets.
 *
 *      iocb - not used by this function
 *      sock - the socket being written to
 *      msg  - msg_name must hold a struct sockaddr_pkt naming the output
 *             device; msg_iov supplies the complete frame
 *      len  - number of bytes to send
 *
 *      Returns len on success or a negative errno:
 *        -ENOTCONN  no address given
 *        -EINVAL    address shorter than struct sockaddr
 *        -ENODEV    no device with the given name
 *        -EMSGSIZE  len exceeds dev->mtu + dev->hard_header_len
 *        -ENOBUFS   sock_wmalloc() could not provide an skb
 *        -ENETDOWN  device is not IFF_UP
 *        or whatever memcpy_fromiovec() returned on failure.
 *
 *      NOTE(review): the return value of dev_queue_xmit() is ignored here,
 *      whereas packet_sendmsg() propagates it.
 */
309 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
310                                struct msghdr *msg, size_t len)
311 {
312         struct sock *sk = sock->sk;
313         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
314         struct sk_buff *skb;
315         struct net_device *dev;
316         unsigned short proto=0;
317         int err;
318         
319         /*
320          *      Get and verify the address. 
321          */
322
323         if (saddr)
324         {
325                 if (msg->msg_namelen < sizeof(struct sockaddr))
326                         return(-EINVAL);
                    /* The protocol is only taken from the address when the
                     * caller passed a full sockaddr_pkt; otherwise it stays 0. */
327                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
328                         proto=saddr->spkt_protocol;
329         }
330         else
331                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
332
333         /*
334          *      Find the device first to size check it 
335          */
336
            /* Force NUL termination of the device name before the lookup. */
337         saddr->spkt_device[13] = 0;
338         dev = dev_get_by_name(saddr->spkt_device);
339         err = -ENODEV;
            /* out_unlock copes with dev == NULL, so a failed lookup can use
             * the common exit path. */
340         if (dev == NULL)
341                 goto out_unlock;
342         
343         /*
344          *      You may not queue a frame bigger than the mtu. This is the lowest level
345          *      raw protocol and you must do your own fragmentation at this level.
346          */
347          
348         err = -EMSGSIZE;
349         if(len>dev->mtu+dev->hard_header_len)
350                 goto out_unlock;
351
352         err = -ENOBUFS;
353         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
354
355         /*
356          *      If the write buffer is full, then tough. At this level the user gets to
357          *      deal with the problem - do your own algorithmic backoffs. That's far
358          *      more flexible.
359          */
360          
361         if (skb == NULL) 
362                 goto out_unlock;
363
364         /*
365          *      Fill it in 
366          */
367          
368         /* FIXME: Save some space for broken drivers that write a
369          * hard header at transmission time by themselves. PPP is the
370          * notable one here. This should really be fixed at the driver level.
371          */
372         skb_reserve(skb, LL_RESERVED_SPACE(dev));
373         skb->nh.raw = skb->data;
374
375         /* Try to align data part correctly */
376         if (dev->hard_header) {
                    /* The user frame already starts with the link-level
                     * header, so move data/tail back by its length: the
                     * header then sits in front of nh.raw. */
377                 skb->data -= dev->hard_header_len;
378                 skb->tail -= dev->hard_header_len;
                    /* Frame shorter than a full header: keep nh.raw inside
                     * the data actually supplied. */
379                 if (len < dev->hard_header_len)
380                         skb->nh.raw = skb->data;
381         }
382
383         /* Returns -EFAULT on error */
384         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
385         skb->protocol = proto;
386         skb->dev = dev;
387         skb->priority = sk->sk_priority;
388         if (err)
389                 goto out_free;
390
391         err = -ENETDOWN;
392         if (!(dev->flags & IFF_UP))
393                 goto out_free;
394
395         /*
396          *      Now send it
397          */
398
399         dev_queue_xmit(skb);
            /* Drop the device reference obtained by dev_get_by_name(). */
400         dev_put(dev);
401         return(len);
402
403 out_free:
404         kfree_skb(skb);
405 out_unlock:
406         if (dev)
407                 dev_put(dev);
408         return err;
409 }
410 #endif
411
412 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
413 {
414         struct sk_filter *filter;
415
416         bh_lock_sock(sk);
417         filter = sk->sk_filter;
418         /*
419          * Our caller already checked that filter != NULL but we need to
420          * verify that under bh_lock_sock() to be safe
421          */
422         if (likely(filter != NULL))
423                 res = sk_run_filter(skb, filter->insns, filter->len);
424         bh_unlock_sock(sk);
425
426         return res;
427 }
428
429 /*
430    This function makes lazy skb cloning in hope that most of packets
431    are discarded by BPF.
432
433    Note tricky part: we DO mangle shared skb! skb->data, skb->len
434    and skb->cb are mangled. It works because (and until) packets
435    falling here are owned by current CPU. Output packets are cloned
436    by dev_queue_xmit_nit(), input packets are processed by net_bh
437    sequentially, so that if we return skb to original state on exit,
438    we will not harm anyone.
439  */
440
/*
 * Default receive hook for packet sockets (packet_create() installs it as
 * prot_hook.func unless the socket is SOCK_PACKET).
 *
 *   skb - frame handed to the hook; may be shared with other users
 *   dev - device the frame is associated with
 *   pt  - the hook; pt->af_packet_priv is the owning socket
 *
 * The frame is positioned according to the socket type, run through the
 * socket filter, checked against the receive buffer limit, cloned if it is
 * shared, tagged with a struct sockaddr_ll in skb->cb, trimmed to the
 * filter's snap length and appended to sk_receive_queue.
 *
 * skb->data and skb->len of a shared skb are modified temporarily; the
 * values saved in skb_head/skb_len are put back before the reference is
 * dropped.
 *
 * Always returns 0.
 */
441 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
442 {
443         struct sock *sk;
444         struct sockaddr_ll *sll;
445         struct packet_sock *po;
            /* Original data pointer and length, kept for the restore below. */
446         u8 * skb_head = skb->data;
447         int skb_len = skb->len;
448         unsigned snaplen;
449
450         if (skb->pkt_type == PACKET_LOOPBACK)
451                 goto drop;
452
453         sk = pt->af_packet_priv;
454         po = pkt_sk(sk);
455
456         skb->dev = dev;
457
458         if (dev->hard_header) {
459                 /* The device has an explicit notion of ll header,
460                    exported to higher levels.
461
462                    Otherwise, the device hides details of its frame
463                    structure, so that the corresponding packet head
464                    is never delivered to the user.
465                  */
                    /* Non-DGRAM sockets see the ll header: expose it again. */
466                 if (sk->sk_type != SOCK_DGRAM)
467                         skb_push(skb, skb->data - skb->mac.raw);
468                 else if (skb->pkt_type == PACKET_OUTGOING) {
469                         /* Special case: outgoing packets have ll header at head */
470                         skb_pull(skb, skb->nh.raw - skb->data);
471                 }
472         }
473
474         snaplen = skb->len;
475
476         if (sk->sk_filter) {
                    /* A zero result means "drop"; otherwise the result caps
                     * how many bytes are delivered. */
477                 unsigned res = run_filter(skb, sk, snaplen);
478                 if (res == 0)
479                         goto drop_n_restore;
480                 if (snaplen > res)
481                         snaplen = res;
482         }
483
            /* Receive buffer would overflow: count a drop. */
484         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
485             (unsigned)sk->sk_rcvbuf)
486                 goto drop_n_acct;
487
488         if (skb_shared(skb)) {
                    /* Only now, after the filter accepted the packet, pay for
                     * a private clone. */
489                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
490                 if (nskb == NULL)
491                         goto drop_n_acct;
492
                    /* Put the shared original back the way we found it
                     * before dropping our reference to it. */
493                 if (skb_head != skb->data) {
494                         skb->data = skb_head;
495                         skb->len = skb_len;
496                 }
497                 kfree_skb(skb);
498                 skb = nskb;
499         }
500
            /* Describe the frame's origin in the skb control buffer. */
501         sll = (struct sockaddr_ll*)skb->cb;
502         sll->sll_family = AF_PACKET;
503         sll->sll_hatype = dev->type;
504         sll->sll_protocol = skb->protocol;
505         sll->sll_pkttype = skb->pkt_type;
506         sll->sll_ifindex = dev->ifindex;
507         sll->sll_halen = 0;
508
509         if (dev->hard_header_parse)
510                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
511
512         if (pskb_trim(skb, snaplen))
513                 goto drop_n_acct;
514
515         skb_set_owner_r(skb, sk);
516         skb->dev = NULL;
517         dst_release(skb->dst);
518         skb->dst = NULL;
519
            /* The queue lock also covers the statistics counter. */
520         spin_lock(&sk->sk_receive_queue.lock);
521         po->stats.tp_packets++;
522         __skb_queue_tail(&sk->sk_receive_queue, skb);
523         spin_unlock(&sk->sk_receive_queue.lock);
524         sk->sk_data_ready(sk, skb->len);
525         return 0;
526
527 drop_n_acct:
528         spin_lock(&sk->sk_receive_queue.lock);
529         po->stats.tp_drops++;
530         spin_unlock(&sk->sk_receive_queue.lock);
531
532 drop_n_restore:
            /* Undo our data/len adjustment if someone else still holds the skb. */
533         if (skb_head != skb->data && skb_shared(skb)) {
534                 skb->data = skb_head;
535                 skb->len = skb_len;
536         }
537 drop:
538         kfree_skb(skb);
539         return 0;
540 }
541
542 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook that copies each frame into the next free slot of the
 * socket's frame ring (po->pg_vec) instead of queueing the skb.
 * NOTE(review): the place where this function is installed as
 * prot_hook.func is not in this part of the file.
 *
 *   skb - frame handed to the hook; may be shared with other users
 *   dev - device the frame is associated with
 *   pt  - the hook; pt->af_packet_priv is the owning socket
 *
 * Each slot starts with a struct tpacket_hdr, followed at
 * TPACKET_ALIGN(sizeof(hdr)) by a struct sockaddr_ll, with the packet bytes
 * at offset tp_mac/tp_net. A slot whose tp_status is non-zero is treated as
 * still in use, in which case the packet is counted as dropped.
 *
 * The original skb is released on every path; always returns 0.
 */
543 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
544 {
545         struct sock *sk;
546         struct packet_sock *po;
547         struct sockaddr_ll *sll;
548         struct tpacket_hdr *h;
            /* Original data pointer and length, kept for the restore below. */
549         u8 * skb_head = skb->data;
550         int skb_len = skb->len;
551         unsigned snaplen;
            /* LOSING is cleared further down when no drops were recorded. */
552         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
553         unsigned short macoff, netoff;
554         struct sk_buff *copy_skb = NULL;
555
556         if (skb->pkt_type == PACKET_LOOPBACK)
557                 goto drop;
558
559         sk = pt->af_packet_priv;
560         po = pkt_sk(sk);
561
562         if (dev->hard_header) {
563                 if (sk->sk_type != SOCK_DGRAM)
564                         skb_push(skb, skb->data - skb->mac.raw);
565                 else if (skb->pkt_type == PACKET_OUTGOING) {
566                         /* Special case: outgoing packets have ll header at head */
567                         skb_pull(skb, skb->nh.raw - skb->data);
568                         if (skb->ip_summed == CHECKSUM_HW)
569                                 status |= TP_STATUS_CSUMNOTREADY;
570                 }
571         }
572
573         snaplen = skb->len;
574
575         if (sk->sk_filter) {
                    /* A zero result means "drop"; otherwise the result caps
                     * how many bytes are captured. */
576                 unsigned res = run_filter(skb, sk, snaplen);
577                 if (res == 0)
578                         goto drop_n_restore;
579                 if (snaplen > res)
580                         snaplen = res;
581         }
582
            /* Work out where inside the slot the packet bytes go. At least
             * 16 bytes are left after the header for the link-level part. */
583         if (sk->sk_type == SOCK_DGRAM) {
584                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
585         } else {
586                 unsigned maclen = skb->nh.raw - skb->data;
587                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
588                 macoff = netoff - maclen;
589         }
590
591         if (macoff + snaplen > po->frame_size) {
                    /* Packet does not fit in one slot. If copy_thresh is set
                     * and the receive buffer has room, keep a reference to
                     * the whole packet for the receive queue as well. */
592                 if (po->copy_thresh &&
593                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
594                     (unsigned)sk->sk_rcvbuf) {
595                         if (skb_shared(skb)) {
596                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
597                         } else {
598                                 copy_skb = skb_get(skb);
                                    /* Makes the restore at drop_n_restore a
                                     * no-op for this skb; presumably the queued
                                     * reference must keep the adjusted data
                                     * pointer. */
599                                 skb_head = skb->data;
600                         }
601                         if (copy_skb)
602                                 skb_set_owner_r(copy_skb, sk);
603                 }
                    /* Truncate what goes into the slot. */
604                 snaplen = po->frame_size - macoff;
605                 if ((int)snaplen < 0)
606                         snaplen = 0;
607         }
            /* Never copy more than the linear part of the skb, because the
             * copy below is a plain memcpy() from skb->data. */
608         if (snaplen > skb->len-skb->data_len)
609                 snaplen = skb->len-skb->data_len;
610
611         spin_lock(&sk->sk_receive_queue.lock);
612         h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
613         
            /* Non-zero status: the slot at head has not been freed yet. */
614         if (h->tp_status)
615                 goto ring_is_full;
            /* Claim the slot and advance head, wrapping after frame_max. */
616         po->head = po->head != po->frame_max ? po->head+1 : 0;
617         po->stats.tp_packets++;
618         if (copy_skb) {
619                 status |= TP_STATUS_COPY;
620                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
621         }
622         if (!po->stats.tp_drops)
623                 status &= ~TP_STATUS_LOSING;
624         spin_unlock(&sk->sk_receive_queue.lock);
625
626         memcpy((u8*)h + macoff, skb->data, snaplen);
627
628         h->tp_len = skb->len;
629         h->tp_snaplen = snaplen;
630         h->tp_mac = macoff;
631         h->tp_net = netoff;
            /* No timestamp on the skb yet: take one now and turn on
             * timestamping for the socket. */
632         if (skb->stamp.tv_sec == 0) { 
633                 do_gettimeofday(&skb->stamp);
634                 sock_enable_timestamp(sk);
635         }
636         h->tp_sec = skb->stamp.tv_sec;
637         h->tp_usec = skb->stamp.tv_usec;
638
639         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
640         sll->sll_halen = 0;
641         if (dev->hard_header_parse)
642                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
643         sll->sll_family = AF_PACKET;
644         sll->sll_hatype = dev->type;
645         sll->sll_protocol = skb->protocol;
646         sll->sll_pkttype = skb->pkt_type;
647         sll->sll_ifindex = dev->ifindex;
648
            /* Status is written last, after the rest of the slot is filled in. */
649         h->tp_status = status;
650         mb();
651
652         {
                    /* Flush every page touched, from the start of the slot
                     * header to the last packet byte copied. */
653                 struct page *p_start, *p_end;
654                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
655
656                 p_start = virt_to_page(h);
657                 p_end = virt_to_page(h_end);
658                 while (p_start <= p_end) {
659                         flush_dcache_page(p_start);
660                         p_start++;
661                 }
662         }
663
664         sk->sk_data_ready(sk, 0);
665
            /* The success path falls through: the data has been copied, so
             * our reference to the skb is released here as well. */
666 drop_n_restore:
667         if (skb_head != skb->data && skb_shared(skb)) {
668                 skb->data = skb_head;
669                 skb->len = skb_len;
670         }
671 drop:
672         kfree_skb(skb);
673         return 0;
674
675 ring_is_full:
            /* Still holding sk_receive_queue.lock here. */
676         po->stats.tp_drops++;
677         spin_unlock(&sk->sk_receive_queue.lock);
678
679         sk->sk_data_ready(sk, 0);
680         if (copy_skb)
681                 kfree_skb(copy_skb);
682         goto drop_n_restore;
683 }
684
685 #endif
686
687
/*
 *      sendmsg for SOCK_RAW / SOCK_DGRAM packet sockets.
 *
 *      iocb - not used by this function
 *      sock - the socket being written to
 *      msg  - optional struct sockaddr_ll in msg_name selecting interface,
 *             protocol and destination address; msg_iov supplies the data
 *      len  - number of bytes to send
 *
 *      Without an address the interface index and protocol the socket was
 *      bound with (po->ifindex, po->num) are used and no destination
 *      address is passed to the device's header builder.
 *
 *      Returns len on success or a negative errno:
 *        -EINVAL    address shorter than struct sockaddr_ll, or the header
 *                   builder failed on a SOCK_DGRAM socket
 *        -ENXIO     no device with that interface index
 *        -EMSGSIZE  len exceeds dev->mtu (+ hard_header_len for SOCK_RAW)
 *        -ENETDOWN  device is not IFF_UP
 *        or the error reported by sock_alloc_send_skb(),
 *        memcpy_fromiovec() or dev_queue_xmit().
 */
688 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
689                           struct msghdr *msg, size_t len)
690 {
691         struct sock *sk = sock->sk;
692         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
693         struct sk_buff *skb;
694         struct net_device *dev;
695         unsigned short proto;
696         unsigned char *addr;
697         int ifindex, err, reserve = 0;
698
699         /*
700          *      Get and verify the address. 
701          */
702          
703         if (saddr == NULL) {
704                 struct packet_sock *po = pkt_sk(sk);
705
706                 ifindex = po->ifindex;
707                 proto   = po->num;
708                 addr    = NULL;
709         } else {
710                 err = -EINVAL;
711                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
712                         goto out;
713                 ifindex = saddr->sll_ifindex;
714                 proto   = saddr->sll_protocol;
715                 addr    = saddr->sll_addr;
716         }
717
718
719         dev = dev_get_by_index(ifindex);
720         err = -ENXIO;
            /* out_unlock copes with dev == NULL. */
721         if (dev == NULL)
722                 goto out_unlock;
            /* SOCK_RAW callers supply the link-level header themselves, so
             * it counts towards the allowed length. */
723         if (sock->type == SOCK_RAW)
724                 reserve = dev->hard_header_len;
725
726         err = -EMSGSIZE;
727         if (len > dev->mtu+reserve)
728                 goto out_unlock;
729
            /* On failure sock_alloc_send_skb() stores the reason in err. */
730         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
731                                 msg->msg_flags & MSG_DONTWAIT, &err);
732         if (skb==NULL)
733                 goto out_unlock;
734
735         skb_reserve(skb, LL_RESERVED_SPACE(dev));
736         skb->nh.raw = skb->data;
737
738         if (dev->hard_header) {
739                 int res;
740                 err = -EINVAL;
741                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
742                 if (sock->type != SOCK_DGRAM) {
                            /* Non-DGRAM: discard the header just built by
                             * resetting tail/len, so the user's own header
                             * (copied below) takes its place. The result of
                             * hard_header() is not checked in this case. */
743                         skb->tail = skb->data;
744                         skb->len = 0;
745                 } else if (res < 0)
746                         goto out_free;
747         }
748
749         /* Returns -EFAULT on error */
750         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
751         if (err)
752                 goto out_free;
753
754         skb->protocol = proto;
755         skb->dev = dev;
756         skb->priority = sk->sk_priority;
757
758         err = -ENETDOWN;
759         if (!(dev->flags & IFF_UP))
760                 goto out_free;
761
762         /*
763          *      Now send it
764          */
765
            /* A positive return is translated by net_xmit_errno() and is a
             * failure only if it maps to a non-zero errno. The skb is not
             * freed here on that path (out_unlock, not out_free). */
766         err = dev_queue_xmit(skb);
767         if (err > 0 && (err = net_xmit_errno(err)) != 0)
768                 goto out_unlock;
769
770         dev_put(dev);
771
772         return(len);
773
774 out_free:
775         kfree_skb(skb);
776 out_unlock:
777         if (dev)
778                 dev_put(dev);
779 out:
780         return err;
781 }
782
783 /*
784  *      Close a PACKET socket. This is fairly simple. We immediately go
785  *      to 'closed' state and remove our protocol entry in the device list.
786  */
787
/*
 *      Release handler for packet sockets.
 *
 *      Unlinks the socket from packet_sklist, detaches the receive hook if
 *      one is attached, flushes the multicast list, tears down the frame
 *      ring if one exists, orphans the socket, purges its receive queue and
 *      drops a reference to it.
 *
 *      Returns 0 always; a socket without an sk is a no-op.
 *
 *      NOTE(review): po->running is tested and cleared here without taking
 *      po->bind_lock, unlike packet_do_bind().
 */
788 static int packet_release(struct socket *sock)
789 {
790         struct sock *sk = sock->sk;
791         struct packet_sock *po;
792
793         if (!sk)
794                 return 0;
795
796         po = pkt_sk(sk);
797
798         write_lock_bh(&packet_sklist_lock);
799         sk_del_node_init(sk);
800         write_unlock_bh(&packet_sklist_lock);
801
802         /*
803          *      Unhook packet receive handler.
804          */
805
806         if (po->running) {
807                 /*
808                  *      Remove the protocol hook
809                  */
810                 dev_remove_pack(&po->prot_hook);
811                 po->running = 0;
812                 po->num = 0;
                    /* Drops the reference taken with sock_hold() when the
                     * hook was attached. */
813                 __sock_put(sk);
814         }
815
816 #ifdef CONFIG_PACKET_MULTICAST
817         packet_flush_mclist(sk);
818 #endif
819
820 #ifdef CONFIG_PACKET_MMAP
821         if (po->pg_vec) {
                    /* An all-zero request with closing == 1 is passed to
                     * packet_set_ring(); presumably this frees the ring. */
822                 struct tpacket_req req;
823                 memset(&req, 0, sizeof(req));
824                 packet_set_ring(sk, &req, 1);
825         }
826 #endif
827
828         /*
829          *      Now the socket is dead. No more input will appear.
830          */
831
832         sock_orphan(sk);
833         sock->sk = NULL;
834
835         /* Purge queues */
836
837         skb_queue_purge(&sk->sk_receive_queue);
838
839         sock_put(sk);
840         return 0;
841 }
842
843 /*
844  *      Attach a packet hook.
845  */
846
/*
 *      (Re)attach the socket's receive hook.
 *
 *      sk       - the packet socket
 *      dev      - device to bind to, or NULL for no particular device
 *      protocol - protocol value stored in po->num and prot_hook.type;
 *                 0 leaves the socket without a hook
 *
 *      Any hook currently attached is removed first. The new hook is only
 *      registered when protocol is non-zero and either no device was given
 *      or the device is IFF_UP; for a device that is down sk_err is set to
 *      ENETDOWN instead and the error is reported on the socket.
 *
 *      A socket reference is held for as long as the hook is attached
 *      (sock_hold() on attach, __sock_put() on detach).
 *
 *      Returns 0 always.
 */
847 static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
848 {
849         struct packet_sock *po = pkt_sk(sk);
850         /*
851          *      Detach an existing hook if present.
852          */
853
854         lock_sock(sk);
855
856         spin_lock(&po->bind_lock);
857         if (po->running) {
858                 __sock_put(sk);
859                 po->running = 0;
860                 po->num = 0;
                    /* bind_lock is dropped around dev_remove_pack() and
                     * re-taken afterwards; presumably because that call may
                     * sleep. TODO confirm. */
861                 spin_unlock(&po->bind_lock);
862                 dev_remove_pack(&po->prot_hook);
863                 spin_lock(&po->bind_lock);
864         }
865
866         po->num = protocol;
867         po->prot_hook.type = protocol;
868         po->prot_hook.dev = dev;
869
870         po->ifindex = dev ? dev->ifindex : 0;
871
            /* Protocol 0: record the binding but do not attach a hook. */
872         if (protocol == 0)
873                 goto out_unlock;
874
875         if (dev) {
876                 if (dev->flags&IFF_UP) {
877                         dev_add_pack(&po->prot_hook);
878                         sock_hold(sk);
879                         po->running = 1;
880                 } else {
                            /* Device is down: stay detached and flag the
                             * error on the socket; the function itself still
                             * returns 0. */
881                         sk->sk_err = ENETDOWN;
882                         if (!sock_flag(sk, SOCK_DEAD))
883                                 sk->sk_error_report(sk);
884                 }
885         } else {
886                 dev_add_pack(&po->prot_hook);
887                 sock_hold(sk);
888                 po->running = 1;
889         }
890
891 out_unlock:
892         spin_unlock(&po->bind_lock);
893         release_sock(sk);
894         return 0;
895 }
896
897 /*
898  *      Bind a packet socket to a device
899  */
900
901 #ifdef CONFIG_SOCK_PACKET
902
903 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
904 {
905         struct sock *sk=sock->sk;
906         char name[15];
907         struct net_device *dev;
908         int err = -ENODEV;
909         
910         /*
911          *      Check legality
912          */
913          
914         if(addr_len!=sizeof(struct sockaddr))
915                 return -EINVAL;
916         strlcpy(name,uaddr->sa_data,sizeof(name));
917
918         dev = dev_get_by_name(name);
919         if (dev) {
920                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
921                 dev_put(dev);
922         }
923         return err;
924 }
925 #endif
926
927 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
928 {
929         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
930         struct sock *sk=sock->sk;
931         struct net_device *dev = NULL;
932         int err;
933
934
935         /*
936          *      Check legality
937          */
938          
939         if (addr_len < sizeof(struct sockaddr_ll))
940                 return -EINVAL;
941         if (sll->sll_family != AF_PACKET)
942                 return -EINVAL;
943
944         if (sll->sll_ifindex) {
945                 err = -ENODEV;
946                 dev = dev_get_by_index(sll->sll_ifindex);
947                 if (dev == NULL)
948                         goto out;
949         }
950         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
951         if (dev)
952                 dev_put(dev);
953
954 out:
955         return err;
956 }
957
958 static struct proto packet_proto = {
959         .name     = "PACKET",
960         .owner    = THIS_MODULE,
961         .obj_size = sizeof(struct packet_sock),
962 };
963
964 /*
965  *      Create a packet of type SOCK_PACKET. 
966  */
967
968 static int packet_create(struct socket *sock, int protocol)
969 {
970         struct sock *sk;
971         struct packet_sock *po;
972         int err;
973
974         if (!capable(CAP_NET_RAW))
975                 return -EPERM;
976         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
977 #ifdef CONFIG_SOCK_PACKET
978             && sock->type != SOCK_PACKET
979 #endif
980             )
981                 return -ESOCKTNOSUPPORT;
982
983         sock->state = SS_UNCONNECTED;
984
985         err = -ENOBUFS;
986         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
987         if (sk == NULL)
988                 goto out;
989
990         sock->ops = &packet_ops;
991 #ifdef CONFIG_SOCK_PACKET
992         if (sock->type == SOCK_PACKET)
993                 sock->ops = &packet_ops_spkt;
994 #endif
995         sock_init_data(sock, sk);
996
997         po = pkt_sk(sk);
998         sk->sk_family = PF_PACKET;
999         po->num = protocol;
1000
1001         sk->sk_destruct = packet_sock_destruct;
1002         atomic_inc(&packet_socks_nr);
1003
1004         /*
1005          *      Attach a protocol block
1006          */
1007
1008         spin_lock_init(&po->bind_lock);
1009         po->prot_hook.func = packet_rcv;
1010 #ifdef CONFIG_SOCK_PACKET
1011         if (sock->type == SOCK_PACKET)
1012                 po->prot_hook.func = packet_rcv_spkt;
1013 #endif
1014         po->prot_hook.af_packet_priv = sk;
1015
1016         if (protocol) {
1017                 po->prot_hook.type = protocol;
1018                 dev_add_pack(&po->prot_hook);
1019                 sock_hold(sk);
1020                 po->running = 1;
1021         }
1022
1023         write_lock_bh(&packet_sklist_lock);
1024         sk_add_node(sk, &packet_sklist);
1025         write_unlock_bh(&packet_sklist_lock);
1026         return(0);
1027 out:
1028         return err;
1029 }
1030
1031 /*
1032  *      Pull a packet from our receive queue and hand it to the user.
1033  *      If necessary we block.
1034  */
1035
1036 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1037                           struct msghdr *msg, size_t len, int flags)
1038 {
1039         struct sock *sk = sock->sk;
1040         struct sk_buff *skb;
1041         int copied, err;
1042
1043         err = -EINVAL;
1044         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1045                 goto out;
1046
1047 #if 0
1048         /* What error should we return now? EUNATTACH? */
1049         if (pkt_sk(sk)->ifindex < 0)
1050                 return -ENODEV;
1051 #endif
1052
1053         /*
1054          *      If the address length field is there to be filled in, we fill
1055          *      it in now.
1056          */
1057
1058         if (sock->type == SOCK_PACKET)
1059                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1060         else
1061                 msg->msg_namelen = sizeof(struct sockaddr_ll);
1062
1063         /*
1064          *      Call the generic datagram receiver. This handles all sorts
1065          *      of horrible races and re-entrancy so we can forget about it
1066          *      in the protocol layers.
1067          *
1068          *      Now it will return ENETDOWN, if device have just gone down,
1069          *      but then it will block.
1070          */
1071
1072         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1073
1074         /*
1075          *      An error occurred so return it. Because skb_recv_datagram() 
1076          *      handles the blocking we don't see and worry about blocking
1077          *      retries.
1078          */
1079
1080         if(skb==NULL)
1081                 goto out;
1082
1083         /*
1084          *      You lose any data beyond the buffer you gave. If it worries a
1085          *      user program they can ask the device for its MTU anyway.
1086          */
1087
1088         copied = skb->len;
1089         if (copied > len)
1090         {
1091                 copied=len;
1092                 msg->msg_flags|=MSG_TRUNC;
1093         }
1094
1095         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1096         if (err)
1097                 goto out_free;
1098
1099         sock_recv_timestamp(msg, sk, skb);
1100
1101         if (msg->msg_name)
1102                 memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
1103
1104         /*
1105          *      Free or return the buffer as appropriate. Again this
1106          *      hides all the races and re-entrancy issues from us.
1107          */
1108         err = (flags&MSG_TRUNC) ? skb->len : copied;
1109
1110 out_free:
1111         skb_free_datagram(sk, skb);
1112 out:
1113         return err;
1114 }
1115
1116 #ifdef CONFIG_SOCK_PACKET
1117 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1118                                int *uaddr_len, int peer)
1119 {
1120         struct net_device *dev;
1121         struct sock *sk = sock->sk;
1122
1123         if (peer)
1124                 return -EOPNOTSUPP;
1125
1126         uaddr->sa_family = AF_PACKET;
1127         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1128         if (dev) {
1129                 strlcpy(uaddr->sa_data, dev->name, 15);
1130                 dev_put(dev);
1131         } else
1132                 memset(uaddr->sa_data, 0, 14);
1133         *uaddr_len = sizeof(*uaddr);
1134
1135         return 0;
1136 }
1137 #endif
1138
1139 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1140                           int *uaddr_len, int peer)
1141 {
1142         struct net_device *dev;
1143         struct sock *sk = sock->sk;
1144         struct packet_sock *po = pkt_sk(sk);
1145         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1146
1147         if (peer)
1148                 return -EOPNOTSUPP;
1149
1150         sll->sll_family = AF_PACKET;
1151         sll->sll_ifindex = po->ifindex;
1152         sll->sll_protocol = po->num;
1153         dev = dev_get_by_index(po->ifindex);
1154         if (dev) {
1155                 sll->sll_hatype = dev->type;
1156                 sll->sll_halen = dev->addr_len;
1157                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1158                 dev_put(dev);
1159         } else {
1160                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1161                 sll->sll_halen = 0;
1162         }
1163         *uaddr_len = sizeof(*sll);
1164
1165         return 0;
1166 }
1167
1168 #ifdef CONFIG_PACKET_MULTICAST
1169 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1170 {
1171         switch (i->type) {
1172         case PACKET_MR_MULTICAST:
1173                 if (what > 0)
1174                         dev_mc_add(dev, i->addr, i->alen, 0);
1175                 else
1176                         dev_mc_delete(dev, i->addr, i->alen, 0);
1177                 break;
1178         case PACKET_MR_PROMISC:
1179                 dev_set_promiscuity(dev, what);
1180                 break;
1181         case PACKET_MR_ALLMULTI:
1182                 dev_set_allmulti(dev, what);
1183                 break;
1184         default:;
1185         }
1186 }
1187
1188 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1189 {
1190         for ( ; i; i=i->next) {
1191                 if (i->ifindex == dev->ifindex)
1192                         packet_dev_mc(dev, i, what);
1193         }
1194 }
1195
1196 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
1197 {
1198         struct packet_sock *po = pkt_sk(sk);
1199         struct packet_mclist *ml, *i;
1200         struct net_device *dev;
1201         int err;
1202
1203         rtnl_lock();
1204
1205         err = -ENODEV;
1206         dev = __dev_get_by_index(mreq->mr_ifindex);
1207         if (!dev)
1208                 goto done;
1209
1210         err = -EINVAL;
1211         if (mreq->mr_alen > dev->addr_len)
1212                 goto done;
1213
1214         err = -ENOBUFS;
1215         i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
1216         if (i == NULL)
1217                 goto done;
1218
1219         err = 0;
1220         for (ml = po->mclist; ml; ml = ml->next) {
1221                 if (ml->ifindex == mreq->mr_ifindex &&
1222                     ml->type == mreq->mr_type &&
1223                     ml->alen == mreq->mr_alen &&
1224                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1225                         ml->count++;
1226                         /* Free the new element ... */
1227                         kfree(i);
1228                         goto done;
1229                 }
1230         }
1231
1232         i->type = mreq->mr_type;
1233         i->ifindex = mreq->mr_ifindex;
1234         i->alen = mreq->mr_alen;
1235         memcpy(i->addr, mreq->mr_address, i->alen);
1236         i->count = 1;
1237         i->next = po->mclist;
1238         po->mclist = i;
1239         packet_dev_mc(dev, i, +1);
1240
1241 done:
1242         rtnl_unlock();
1243         return err;
1244 }
1245
1246 static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
1247 {
1248         struct packet_mclist *ml, **mlp;
1249
1250         rtnl_lock();
1251
1252         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1253                 if (ml->ifindex == mreq->mr_ifindex &&
1254                     ml->type == mreq->mr_type &&
1255                     ml->alen == mreq->mr_alen &&
1256                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1257                         if (--ml->count == 0) {
1258                                 struct net_device *dev;
1259                                 *mlp = ml->next;
1260                                 dev = dev_get_by_index(ml->ifindex);
1261                                 if (dev) {
1262                                         packet_dev_mc(dev, ml, -1);
1263                                         dev_put(dev);
1264                                 }
1265                                 kfree(ml);
1266                         }
1267                         rtnl_unlock();
1268                         return 0;
1269                 }
1270         }
1271         rtnl_unlock();
1272         return -EADDRNOTAVAIL;
1273 }
1274
1275 static void packet_flush_mclist(struct sock *sk)
1276 {
1277         struct packet_sock *po = pkt_sk(sk);
1278         struct packet_mclist *ml;
1279
1280         if (!po->mclist)
1281                 return;
1282
1283         rtnl_lock();
1284         while ((ml = po->mclist) != NULL) {
1285                 struct net_device *dev;
1286
1287                 po->mclist = ml->next;
1288                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1289                         packet_dev_mc(dev, ml, -1);
1290                         dev_put(dev);
1291                 }
1292                 kfree(ml);
1293         }
1294         rtnl_unlock();
1295 }
1296 #endif
1297
1298 static int
1299 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1300 {
1301         struct sock *sk = sock->sk;
1302         int ret;
1303
1304         if (level != SOL_PACKET)
1305                 return -ENOPROTOOPT;
1306
1307         switch(optname) {
1308 #ifdef CONFIG_PACKET_MULTICAST
1309         case PACKET_ADD_MEMBERSHIP:     
1310         case PACKET_DROP_MEMBERSHIP:
1311         {
1312                 struct packet_mreq mreq;
1313                 if (optlen<sizeof(mreq))
1314                         return -EINVAL;
1315                 if (copy_from_user(&mreq,optval,sizeof(mreq)))
1316                         return -EFAULT;
1317                 if (optname == PACKET_ADD_MEMBERSHIP)
1318                         ret = packet_mc_add(sk, &mreq);
1319                 else
1320                         ret = packet_mc_drop(sk, &mreq);
1321                 return ret;
1322         }
1323 #endif
1324 #ifdef CONFIG_PACKET_MMAP
1325         case PACKET_RX_RING:
1326         {
1327                 struct tpacket_req req;
1328
1329                 if (optlen<sizeof(req))
1330                         return -EINVAL;
1331                 if (copy_from_user(&req,optval,sizeof(req)))
1332                         return -EFAULT;
1333                 return packet_set_ring(sk, &req, 0);
1334         }
1335         case PACKET_COPY_THRESH:
1336         {
1337                 int val;
1338
1339                 if (optlen!=sizeof(val))
1340                         return -EINVAL;
1341                 if (copy_from_user(&val,optval,sizeof(val)))
1342                         return -EFAULT;
1343
1344                 pkt_sk(sk)->copy_thresh = val;
1345                 return 0;
1346         }
1347 #endif
1348         default:
1349                 return -ENOPROTOOPT;
1350         }
1351 }
1352
1353 static int packet_getsockopt(struct socket *sock, int level, int optname,
1354                              char __user *optval, int __user *optlen)
1355 {
1356         int len;
1357         struct sock *sk = sock->sk;
1358         struct packet_sock *po = pkt_sk(sk);
1359
1360         if (level != SOL_PACKET)
1361                 return -ENOPROTOOPT;
1362
1363         if (get_user(len,optlen))
1364                 return -EFAULT;
1365
1366         if (len < 0)
1367                 return -EINVAL;
1368                 
1369         switch(optname) {
1370         case PACKET_STATISTICS:
1371         {
1372                 struct tpacket_stats st;
1373
1374                 if (len > sizeof(struct tpacket_stats))
1375                         len = sizeof(struct tpacket_stats);
1376                 spin_lock_bh(&sk->sk_receive_queue.lock);
1377                 st = po->stats;
1378                 memset(&po->stats, 0, sizeof(st));
1379                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1380                 st.tp_packets += st.tp_drops;
1381
1382                 if (copy_to_user(optval, &st, len))
1383                         return -EFAULT;
1384                 break;
1385         }
1386         default:
1387                 return -ENOPROTOOPT;
1388         }
1389
1390         if (put_user(len, optlen))
1391                 return -EFAULT;
1392         return 0;
1393 }
1394
1395
1396 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1397 {
1398         struct sock *sk;
1399         struct hlist_node *node;
1400         struct net_device *dev = (struct net_device*)data;
1401
1402         read_lock(&packet_sklist_lock);
1403         sk_for_each(sk, node, &packet_sklist) {
1404                 struct packet_sock *po = pkt_sk(sk);
1405
1406                 switch (msg) {
1407                 case NETDEV_UNREGISTER:
1408 #ifdef CONFIG_PACKET_MULTICAST
1409                         if (po->mclist)
1410                                 packet_dev_mclist(dev, po->mclist, -1);
1411                         // fallthrough
1412 #endif
1413                 case NETDEV_DOWN:
1414                         if (dev->ifindex == po->ifindex) {
1415                                 spin_lock(&po->bind_lock);
1416                                 if (po->running) {
1417                                         __dev_remove_pack(&po->prot_hook);
1418                                         __sock_put(sk);
1419                                         po->running = 0;
1420                                         sk->sk_err = ENETDOWN;
1421                                         if (!sock_flag(sk, SOCK_DEAD))
1422                                                 sk->sk_error_report(sk);
1423                                 }
1424                                 if (msg == NETDEV_UNREGISTER) {
1425                                         po->ifindex = -1;
1426                                         po->prot_hook.dev = NULL;
1427                                 }
1428                                 spin_unlock(&po->bind_lock);
1429                         }
1430                         break;
1431                 case NETDEV_UP:
1432                         spin_lock(&po->bind_lock);
1433                         if (dev->ifindex == po->ifindex && po->num &&
1434                             !po->running) {
1435                                 dev_add_pack(&po->prot_hook);
1436                                 sock_hold(sk);
1437                                 po->running = 1;
1438                         }
1439                         spin_unlock(&po->bind_lock);
1440                         break;
1441                 }
1442         }
1443         read_unlock(&packet_sklist_lock);
1444         return NOTIFY_DONE;
1445 }
1446
1447
1448 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1449                         unsigned long arg)
1450 {
1451         struct sock *sk = sock->sk;
1452
1453         switch(cmd) {
1454                 case SIOCOUTQ:
1455                 {
1456                         int amount = atomic_read(&sk->sk_wmem_alloc);
1457                         return put_user(amount, (int __user *)arg);
1458                 }
1459                 case SIOCINQ:
1460                 {
1461                         struct sk_buff *skb;
1462                         int amount = 0;
1463
1464                         spin_lock_bh(&sk->sk_receive_queue.lock);
1465                         skb = skb_peek(&sk->sk_receive_queue);
1466                         if (skb)
1467                                 amount = skb->len;
1468                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1469                         return put_user(amount, (int __user *)arg);
1470                 }
1471                 case SIOCGSTAMP:
1472                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1473                         
1474 #ifdef CONFIG_INET
1475                 case SIOCADDRT:
1476                 case SIOCDELRT:
1477                 case SIOCDARP:
1478                 case SIOCGARP:
1479                 case SIOCSARP:
1480                 case SIOCGIFADDR:
1481                 case SIOCSIFADDR:
1482                 case SIOCGIFBRDADDR:
1483                 case SIOCSIFBRDADDR:
1484                 case SIOCGIFNETMASK:
1485                 case SIOCSIFNETMASK:
1486                 case SIOCGIFDSTADDR:
1487                 case SIOCSIFDSTADDR:
1488                 case SIOCSIFFLAGS:
1489                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1490 #endif
1491
1492                 default:
1493                         return dev_ioctl(cmd, (void __user *)arg);
1494         }
1495         return 0;
1496 }
1497
1498 #ifndef CONFIG_PACKET_MMAP
1499 #define packet_mmap sock_no_mmap
1500 #define packet_poll datagram_poll
1501 #else
1502
1503 static unsigned int packet_poll(struct file * file, struct socket *sock,
1504                                 poll_table *wait)
1505 {
1506         struct sock *sk = sock->sk;
1507         struct packet_sock *po = pkt_sk(sk);
1508         unsigned int mask = datagram_poll(file, sock, wait);
1509
1510         spin_lock_bh(&sk->sk_receive_queue.lock);
1511         if (po->pg_vec) {
1512                 unsigned last = po->head ? po->head-1 : po->frame_max;
1513                 struct tpacket_hdr *h;
1514
1515                 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1516
1517                 if (h->tp_status)
1518                         mask |= POLLIN | POLLRDNORM;
1519         }
1520         spin_unlock_bh(&sk->sk_receive_queue.lock);
1521         return mask;
1522 }
1523
1524
1525 /* Dirty? Well, I still did not learn better way to account
1526  * for user mmaps.
1527  */
1528
1529 static void packet_mm_open(struct vm_area_struct *vma)
1530 {
1531         struct file *file = vma->vm_file;
1532         struct inode *inode = file->f_dentry->d_inode;
1533         struct socket * sock = SOCKET_I(inode);
1534         struct sock *sk = sock->sk;
1535         
1536         if (sk)
1537                 atomic_inc(&pkt_sk(sk)->mapped);
1538 }
1539
1540 static void packet_mm_close(struct vm_area_struct *vma)
1541 {
1542         struct file *file = vma->vm_file;
1543         struct inode *inode = file->f_dentry->d_inode;
1544         struct socket * sock = SOCKET_I(inode);
1545         struct sock *sk = sock->sk;
1546         
1547         if (sk)
1548                 atomic_dec(&pkt_sk(sk)->mapped);
1549 }
1550
1551 static struct vm_operations_struct packet_mmap_ops = {
1552         .open = packet_mm_open,
1553         .close =packet_mm_close,
1554 };
1555
1556 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1557 {
1558         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1559 }
1560
1561 static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
1562 {
1563         int i;
1564
1565         for (i=0; i<len; i++) {
1566                 if (pg_vec[i]) {
1567                         struct page *page, *pend;
1568
1569                         pend = pg_vec_endpage(pg_vec[i], order);
1570                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1571                                 ClearPageReserved(page);
1572                         free_pages((unsigned long)pg_vec[i], order);
1573                 }
1574         }
1575         kfree(pg_vec);
1576 }
1577
1578
1579 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1580 {
1581         char **pg_vec = NULL;
1582         struct packet_sock *po = pkt_sk(sk);
1583         int was_running, num, order = 0;
1584         int err = 0;
1585         
1586         if (req->tp_block_nr) {
1587                 int i, l;
1588
1589                 /* Sanity tests and some calculations */
1590
1591                 if (po->pg_vec)
1592                         return -EBUSY;
1593
1594                 if ((int)req->tp_block_size <= 0)
1595                         return -EINVAL;
1596                 if (req->tp_block_size&(PAGE_SIZE-1))
1597                         return -EINVAL;
1598                 if (req->tp_frame_size < TPACKET_HDRLEN)
1599                         return -EINVAL;
1600                 if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
1601                         return -EINVAL;
1602
1603                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1604                 if (po->frames_per_block <= 0)
1605                         return -EINVAL;
1606                 if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
1607                         return -EINVAL;
1608                 /* OK! */
1609
1610                 /* Allocate page vector */
1611                 while ((PAGE_SIZE<<order) < req->tp_block_size)
1612                         order++;
1613
1614                 err = -ENOMEM;
1615
1616                 pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
1617                 if (pg_vec == NULL)
1618                         goto out;
1619                 memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));
1620
1621                 for (i=0; i<req->tp_block_nr; i++) {
1622                         struct page *page, *pend;
1623                         pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
1624                         if (!pg_vec[i])
1625                                 goto out_free_pgvec;
1626
1627                         pend = pg_vec_endpage(pg_vec[i], order);
1628                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1629                                 SetPageReserved(page);
1630                 }
1631                 /* Page vector is allocated */
1632
1633                 l = 0;
1634                 for (i=0; i<req->tp_block_nr; i++) {
1635                         char *ptr = pg_vec[i];
1636                         struct tpacket_hdr *header;
1637                         int k;
1638
1639                         for (k=0; k<po->frames_per_block; k++) {
1640                                 
1641                                 header = (struct tpacket_hdr*)ptr;
1642                                 header->tp_status = TP_STATUS_KERNEL;
1643                                 ptr += req->tp_frame_size;
1644                         }
1645                 }
1646                 /* Done */
1647         } else {
1648                 if (req->tp_frame_nr)
1649                         return -EINVAL;
1650         }
1651
1652         lock_sock(sk);
1653
1654         /* Detach socket from network */
1655         spin_lock(&po->bind_lock);
1656         was_running = po->running;
1657         num = po->num;
1658         if (was_running) {
1659                 __dev_remove_pack(&po->prot_hook);
1660                 po->num = 0;
1661                 po->running = 0;
1662                 __sock_put(sk);
1663         }
1664         spin_unlock(&po->bind_lock);
1665                 
1666         synchronize_net();
1667
1668         err = -EBUSY;
1669         if (closing || atomic_read(&po->mapped) == 0) {
1670                 err = 0;
1671 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1672
1673                 spin_lock_bh(&sk->sk_receive_queue.lock);
1674                 pg_vec = XC(po->pg_vec, pg_vec);
1675                 po->frame_max = req->tp_frame_nr-1;
1676                 po->head = 0;
1677                 po->frame_size = req->tp_frame_size;
1678                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1679
1680                 order = XC(po->pg_vec_order, order);
1681                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1682
1683                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1684                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1685                 skb_queue_purge(&sk->sk_receive_queue);
1686 #undef XC
1687                 if (atomic_read(&po->mapped))
1688                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1689         }
1690
1691         spin_lock(&po->bind_lock);
1692         if (was_running && !po->running) {
1693                 sock_hold(sk);
1694                 po->running = 1;
1695                 po->num = num;
1696                 dev_add_pack(&po->prot_hook);
1697         }
1698         spin_unlock(&po->bind_lock);
1699
1700         release_sock(sk);
1701
1702 out_free_pgvec:
1703         if (pg_vec)
1704                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1705 out:
1706         return err;
1707 }
1708
1709 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1710 {
1711         struct sock *sk = sock->sk;
1712         struct packet_sock *po = pkt_sk(sk);
1713         unsigned long size;
1714         unsigned long start;
1715         int err = -EINVAL;
1716         int i;
1717
1718         if (vma->vm_pgoff)
1719                 return -EINVAL;
1720
1721         size = vma->vm_end - vma->vm_start;
1722
1723         lock_sock(sk);
1724         if (po->pg_vec == NULL)
1725                 goto out;
1726         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1727                 goto out;
1728
1729         atomic_inc(&po->mapped);
1730         start = vma->vm_start;
1731         err = -EAGAIN;
1732         for (i=0; i<po->pg_vec_len; i++) {
1733                 if (remap_pfn_range(vma, start,
1734                                      __pa(po->pg_vec[i]) >> PAGE_SHIFT,
1735                                      po->pg_vec_pages*PAGE_SIZE,
1736                                      vma->vm_page_prot))
1737                         goto out;
1738                 start += po->pg_vec_pages*PAGE_SIZE;
1739         }
1740         vma->vm_ops = &packet_mmap_ops;
1741         err = 0;
1742
1743 out:
1744         release_sock(sk);
1745         return err;
1746 }
1747 #endif
1748
1749
1750 #ifdef CONFIG_SOCK_PACKET
1751 static struct proto_ops packet_ops_spkt = {
1752         .family =       PF_PACKET,
1753         .owner =        THIS_MODULE,
1754         .release =      packet_release,
1755         .bind =         packet_bind_spkt,
1756         .connect =      sock_no_connect,
1757         .socketpair =   sock_no_socketpair,
1758         .accept =       sock_no_accept,
1759         .getname =      packet_getname_spkt,
1760         .poll =         datagram_poll,
1761         .ioctl =        packet_ioctl,
1762         .listen =       sock_no_listen,
1763         .shutdown =     sock_no_shutdown,
1764         .setsockopt =   sock_no_setsockopt,
1765         .getsockopt =   sock_no_getsockopt,
1766         .sendmsg =      packet_sendmsg_spkt,
1767         .recvmsg =      packet_recvmsg,
1768         .mmap =         sock_no_mmap,
1769         .sendpage =     sock_no_sendpage,
1770 };
1771 #endif
1772
1773 static struct proto_ops packet_ops = {
1774         .family =       PF_PACKET,
1775         .owner =        THIS_MODULE,
1776         .release =      packet_release,
1777         .bind =         packet_bind,
1778         .connect =      sock_no_connect,
1779         .socketpair =   sock_no_socketpair,
1780         .accept =       sock_no_accept,
1781         .getname =      packet_getname, 
1782         .poll =         packet_poll,
1783         .ioctl =        packet_ioctl,
1784         .listen =       sock_no_listen,
1785         .shutdown =     sock_no_shutdown,
1786         .setsockopt =   packet_setsockopt,
1787         .getsockopt =   packet_getsockopt,
1788         .sendmsg =      packet_sendmsg,
1789         .recvmsg =      packet_recvmsg,
1790         .mmap =         packet_mmap,
1791         .sendpage =     sock_no_sendpage,
1792 };
1793
1794 static struct net_proto_family packet_family_ops = {
1795         .family =       PF_PACKET,
1796         .create =       packet_create,
1797         .owner  =       THIS_MODULE,
1798 };
1799
1800 static struct notifier_block packet_netdev_notifier = {
1801         .notifier_call =packet_notifier,
1802 };
1803
1804 #ifdef CONFIG_PROC_FS
1805 static inline struct sock *packet_seq_idx(loff_t off)
1806 {
1807         struct sock *s;
1808         struct hlist_node *node;
1809
1810         sk_for_each(s, node, &packet_sklist) {
1811                 if (!off--)
1812                         return s;
1813         }
1814         return NULL;
1815 }
1816
1817 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1818 {
1819         read_lock(&packet_sklist_lock);
1820         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1821 }
1822
1823 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1824 {
1825         ++*pos;
1826         return  (v == SEQ_START_TOKEN) 
1827                 ? sk_head(&packet_sklist) 
1828                 : sk_next((struct sock*)v) ;
1829 }
1830
1831 static void packet_seq_stop(struct seq_file *seq, void *v)
1832 {
1833         read_unlock(&packet_sklist_lock);               
1834 }
1835
1836 static int packet_seq_show(struct seq_file *seq, void *v) 
1837 {
1838         if (v == SEQ_START_TOKEN)
1839                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1840         else {
1841                 struct sock *s = v;
1842                 const struct packet_sock *po = pkt_sk(s);
1843
1844                 seq_printf(seq,
1845                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1846                            s,
1847                            atomic_read(&s->sk_refcnt),
1848                            s->sk_type,
1849                            ntohs(po->num),
1850                            po->ifindex,
1851                            po->running,
1852                            atomic_read(&s->sk_rmem_alloc),
1853                            sock_i_uid(s),
1854                            sock_i_ino(s) );
1855         }
1856
1857         return 0;
1858 }
1859
1860 static struct seq_operations packet_seq_ops = {
1861         .start  = packet_seq_start,
1862         .next   = packet_seq_next,
1863         .stop   = packet_seq_stop,
1864         .show   = packet_seq_show,
1865 };
1866
1867 static int packet_seq_open(struct inode *inode, struct file *file)
1868 {
1869         return seq_open(file, &packet_seq_ops);
1870 }
1871
1872 static struct file_operations packet_seq_fops = {
1873         .owner          = THIS_MODULE,
1874         .open           = packet_seq_open,
1875         .read           = seq_read,
1876         .llseek         = seq_lseek,
1877         .release        = seq_release,
1878 };
1879
1880 #endif
1881
1882 static void __exit packet_exit(void)
1883 {
1884         proc_net_remove("packet");
1885         unregister_netdevice_notifier(&packet_netdev_notifier);
1886         sock_unregister(PF_PACKET);
1887         proto_unregister(&packet_proto);
1888 }
1889
1890 static int __init packet_init(void)
1891 {
1892         int rc = proto_register(&packet_proto, 0);
1893
1894         if (rc != 0)
1895                 goto out;
1896
1897         sock_register(&packet_family_ops);
1898         register_netdevice_notifier(&packet_netdev_notifier);
1899         proc_net_fops_create("packet", 0, &packet_seq_fops);
1900 out:
1901         return rc;
1902 }
1903
1904 module_init(packet_init);
1905 module_exit(packet_exit);
1906 MODULE_LICENSE("GPL");
1907 MODULE_ALIAS_NETPROTO(PF_PACKET);