2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
8 * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
15 * Alan Cox : verify_area() now used correctly
16 * Alan Cox : new skbuff lists, look ma no backlogs!
17 * Alan Cox : tidied skbuff lists.
18 * Alan Cox : Now uses generic datagram routines I
19 * added. Also fixed the peek/read crash
20 * from all old Linux datagram code.
21 * Alan Cox : Uses the improved datagram code.
22 * Alan Cox : Added NULL's for socket options.
23 * Alan Cox : Re-commented the code.
24 * Alan Cox : Use new kernel side addressing
25 * Rob Janssen : Correct MTU usage.
26 * Dave Platt : Counter leaks caused by incorrect
27 * interrupt locking and some slightly
28 * dubious gcc output. Can you read
29 * compiler: it said _VOLATILE_
30 * Richard Kooijman : Timestamp fixes.
31 * Alan Cox : New buffers. Use sk->mac.raw.
32 * Alan Cox : sendmsg/recvmsg support.
33 * Alan Cox : Protocol setting support
34 * Alexey Kuznetsov : Untied from IPv4 stack.
35 * Cyrus Durgin : Fixed kerneld for kmod.
36 * Michal Ostrowski : Module initialization cleanup.
37 * Ulises Alonso : Frame number limit removal and
38 * packet_set_ring memory leak.
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
47 #include <linux/config.h>
48 #include <linux/types.h>
49 #include <linux/sched.h>
51 #include <linux/fcntl.h>
52 #include <linux/socket.h>
54 #include <linux/inet.h>
55 #include <linux/netdevice.h>
56 #include <linux/if_packet.h>
57 #include <linux/wireless.h>
58 #include <linux/kmod.h>
60 #include <net/protocol.h>
61 #include <linux/skbuff.h>
63 #include <linux/errno.h>
64 #include <linux/timer.h>
65 #include <asm/system.h>
66 #include <asm/uaccess.h>
67 #include <asm/ioctls.h>
68 #include <linux/proc_fs.h>
69 #include <linux/seq_file.h>
70 #include <linux/poll.h>
71 #include <linux/module.h>
72 #include <linux/init.h>
75 #include <net/inet_common.h>
78 #define CONFIG_SOCK_PACKET 1
81 Proposed replacement for SIOC{ADD,DEL}MULTI and
82 IFF_PROMISC, IFF_ALLMULTI flags.
84 It is more expensive, but, I believe,
85 it is a really correct solution: re-entrant, safe and fault tolerant.
87 IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
88 reference count and global flag, so that real status is
89 (gflag|(count != 0)), so that we can use obsolete faulty interface
90 not harming clever users.
92 #define CONFIG_PACKET_MULTICAST 1
96 - if device has no dev->hard_header routine, it adds and removes ll header
97 inside itself. In this case ll header is invisible outside of device,
98 but higher levels still should reserve dev->hard_header_len.
99 Some devices are clever enough to reallocate the skb when the header
100 will not fit into the reserved space (tunnel); others are silly
102 - packet socket receives packets with pulled ll header,
103 so that SOCK_RAW should push it back.
108 Incoming, dev->hard_header!=NULL
112 Outgoing, dev->hard_header!=NULL
116 Incoming, dev->hard_header==NULL
117 mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
118 PPP does this, which is wrong, because it introduces asymmetry
119 between the rx and tx paths.
122 Outgoing, dev->hard_header==NULL
123 mac.raw -> data. ll header is still not built!
127 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
133 dev->hard_header != NULL
137 dev->hard_header == NULL (ll header is added by device, we cannot control it)
141 We should set nh.raw on output to the correct position;
142 the packet classifier depends on it.
145 /* List of all packet sockets. */
146 HLIST_HEAD(packet_sklist);
/* Protects packet_sklist: readers take read_lock, writers write_lock_bh. */
147 static rwlock_t packet_sklist_lock = RW_LOCK_UNLOCKED;
/* Count of live packet sockets; decremented in packet_sock_destruct(),
 * incremented in packet_create(), reported under PACKET_REFCNT_DEBUG. */
149 atomic_t packet_socks_nr;
152 /* Private packet socket structures. */
/*
 * Per-socket multicast membership entry, one per group joined with
 * PACKET_ADD_MEMBERSHIP.  NOTE(review): the struct header and several
 * fields (ifindex, count, type, alen) are elided in this view.
 */
154 #ifdef CONFIG_PACKET_MULTICAST
157 struct packet_mclist *next;
162 unsigned char addr[8];
165 #ifdef CONFIG_PACKET_MMAP
166 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 static void packet_flush_mclist(struct sock *sk);
/*
 * Private state of a packet socket (struct packet_opt), reached from a
 * struct sock via the pkt_sk() macro below.
 */
173 struct tpacket_stats stats;
174 #ifdef CONFIG_PACKET_MMAP
/* Vector of page-block addresses backing the mmap()ed RX ring. */
175 unsigned long *pg_vec;
177 unsigned int frames_per_block;
178 unsigned int frame_size;
/* Highest valid ring frame index, i.e. tp_frame_nr - 1. */
179 unsigned int frame_max;
182 struct packet_type prot_hook;
/* Serializes attach/detach of prot_hook (bind, notifier, set_ring). */
183 spinlock_t bind_lock;
184 char running; /* prot_hook is attached*/
185 int ifindex; /* bound device */
187 #ifdef CONFIG_PACKET_MULTICAST
188 struct packet_mclist *mclist;
190 #ifdef CONFIG_PACKET_MMAP
/* Allocation order, pages per block, and block count of pg_vec. */
192 unsigned int pg_vec_order;
193 unsigned int pg_vec_pages;
194 unsigned int pg_vec_len;
198 #ifdef CONFIG_PACKET_MMAP
/*
 * Map a ring frame index to its kernel virtual address.  Frames are
 * packed frames_per_block per page block in po->pg_vec.
 * NOTE(review): the local 'frame' declaration and the return statement
 * are elided in this view of the source.
 */
200 static inline unsigned long packet_lookup_frame(struct packet_opt *po, unsigned int position)
202 unsigned int pg_vec_pos, frame_offset;
/* Which block holds the frame, and the frame's offset inside it. */
205 pg_vec_pos = position / po->frames_per_block;
206 frame_offset = position % po->frames_per_block;
208 frame = (unsigned long) (po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
/* Fetch the packet-socket private state hanging off a struct sock. */
214 #define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)
/*
 * sk->sk_destruct callback.  Sanity-checks that no receive or write
 * memory is still charged to the socket, then drops the global live
 * socket count.
 */
216 void packet_sock_destruct(struct sock *sk)
218 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
219 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
/* Destructing a socket that was never marked dead is a bug. */
221 if (!sock_flag(sk, SOCK_DEAD)) {
222 printk("Attempt to release alive packet socket: %p\n", sk);
228 atomic_dec(&packet_socks_nr);
229 #ifdef PACKET_REFCNT_DEBUG
230 printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
235 extern struct proto_ops packet_ops;
237 #ifdef CONFIG_SOCK_PACKET
238 extern struct proto_ops packet_ops_spkt;
/*
 * Receive handler for obsolete SOCK_PACKET sockets: queues every frame
 * seen on the device onto the owning socket's receive queue, filling a
 * sockaddr_pkt in skb->cb for recvmsg to copy out.
 */
240 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
243 struct sockaddr_pkt *spkt;
246 * When we registered the protocol we saved the socket in the data
247 * field for just this event.
250 sk = pt->af_packet_priv;
253 * Yank back the headers [hope the device set this
254 * right or kerboom...]
256 * Incoming packets have ll header pulled,
259 * For outgoing ones skb->data == skb->mac.raw
260 * so that this procedure is noop.
/* Never deliver looped-back packets to SOCK_PACKET listeners. */
263 if (skb->pkt_type == PACKET_LOOPBACK)
/* Unshare before mangling skb->data/cb below. */
266 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
269 /* drop any routing info */
270 dst_release(skb->dst)
273 spkt = (struct sockaddr_pkt*)skb->cb;
/* Push the link-layer header back in front of skb->data. */
275 skb_push(skb, skb->data-skb->mac.raw);
278 * The SOCK_PACKET socket receives _all_ frames.
281 spkt->spkt_family = dev->type;
282 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
283 spkt->spkt_protocol = skb->protocol;
286 * Charge the memory to the socket. This is done specifically
287 * to prevent sockets using all the memory up.
290 if (sock_queue_rcv_skb(sk,skb) == 0)
301 * Output a raw packet to a device layer. This bypasses all the other
302 * protocol layers and you must therefore supply it with a complete frame
/*
 * SOCK_PACKET sendmsg: resolve the device named in the sockaddr_pkt,
 * size-check the frame against MTU + hard header, copy the user data
 * and transmit.  NOTE(review): error/exit paths are elided in this view.
 */
305 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
306 struct msghdr *msg, size_t len)
308 struct sock *sk = sock->sk;
309 struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
311 struct net_device *dev;
312 unsigned short proto=0;
316 * Get and verify the address.
321 if (msg->msg_namelen < sizeof(struct sockaddr))
323 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
324 proto=saddr->spkt_protocol;
327 return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
330 * Find the device first to size check it
/* Force NUL termination of the user-supplied device name. */
333 saddr->spkt_device[13] = 0;
334 dev = dev_get_by_name(saddr->spkt_device);
340 * You may not queue a frame bigger than the mtu. This is the lowest level
341 * raw protocol and you must do your own fragmentation at this level.
345 if(len>dev->mtu+dev->hard_header_len)
349 skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
352 * If the write buffer is full, then tough. At this level the user gets to
353 * deal with the problem - do your own algorithmic backoffs. That's far
364 /* FIXME: Save some space for broken drivers that write a
365 * hard header at transmission time by themselves. PPP is the
366 * notable one here. This should really be fixed at the driver level.
368 skb_reserve(skb, LL_RESERVED_SPACE(dev));
369 skb->nh.raw = skb->data;
371 /* Try to align data part correctly */
372 if (dev->hard_header) {
/* User supplies the ll header too: expose the reserved headroom. */
373 skb->data -= dev->hard_header_len;
374 skb->tail -= dev->hard_header_len;
375 if (len < dev->hard_header_len)
376 skb->nh.raw = skb->data;
379 /* Returns -EFAULT on error */
380 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
381 skb->protocol = proto;
383 skb->priority = sk->sk_priority;
/* Device must be up to transmit. */
388 if (!(dev->flags & IFF_UP))
/*
 * Run the socket's attached BPF filter over the skb and return the
 * snap length it decides (0 = drop).  'res' is the default returned
 * when no filter is attached.
 */
408 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
410 struct sk_filter *filter;
413 filter = sk->sk_filter;
415 * Our caller already checked that filter != NULL but we need to
416 * verify that under bh_lock_sock() to be safe
418 if (likely(filter != NULL))
419 res = sk_run_filter(skb, filter->insns, filter->len);
426 This function makes lazy skb cloning in hope that most of packets
427 are discarded by BPF.
429 Note tricky part: we DO mangle shared skb! skb->data, skb->len
430 and skb->cb are mangled. It works because (and until) packets
431 falling here are owned by current CPU. Output packets are cloned
432 by dev_queue_xmit_nit(), input packets are processed by net_bh
433 sequentially, so that if we return skb to original state on exit,
434 we will not harm anyone.
/*
 * Main receive hook for AF_PACKET sockets (non-mmap path).  Restores or
 * strips the ll header per socket type, runs the BPF filter, builds a
 * sockaddr_ll in skb->cb and queues the skb on sk_receive_queue.
 * Shared skbs are mangled and restored on exit (see comment above).
 * NOTE(review): drop/exit labels are elided in this view of the source.
 */
437 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
440 struct sockaddr_ll *sll;
441 struct packet_opt *po;
/* Remember original data/len so a shared skb can be restored on exit. */
442 u8 * skb_head = skb->data;
443 int skb_len = skb->len;
446 if (skb->pkt_type == PACKET_LOOPBACK)
449 sk = pt->af_packet_priv;
454 if (dev->hard_header) {
455 /* The device has an explicit notion of ll header,
456 exported to higher levels.
458 Otherwise, the device hides details of its frame
459 structure, so that the corresponding packet head is
460 never delivered to the user.
/* SOCK_RAW sees the ll header; SOCK_DGRAM does not. */
462 if (sk->sk_type != SOCK_DGRAM)
463 skb_push(skb, skb->data - skb->mac.raw);
464 else if (skb->pkt_type == PACKET_OUTGOING) {
465 /* Special case: outgoing packets have ll header at head */
466 skb_pull(skb, skb->nh.raw - skb->data);
/* BPF filter decides the snap length; 0 means drop. */
473 unsigned res = run_filter(skb, sk, snaplen);
/* Receive-buffer accounting: drop when rcvbuf would overflow. */
480 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
481 (unsigned)sk->sk_rcvbuf)
484 if (skb_shared(skb)) {
485 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
/* Restore the shared original before switching to the clone. */
489 if (skb_head != skb->data) {
490 skb->data = skb_head;
/* Build the link-level address info for recvmsg in skb->cb. */
497 sll = (struct sockaddr_ll*)skb->cb;
498 sll->sll_family = AF_PACKET;
499 sll->sll_hatype = dev->type;
500 sll->sll_protocol = skb->protocol;
501 sll->sll_pkttype = skb->pkt_type;
502 sll->sll_ifindex = dev->ifindex;
505 if (dev->hard_header_parse)
506 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
/* Trim to the filter's snap length before charging the socket. */
508 if (pskb_trim(skb, snaplen))
511 skb_set_owner_r(skb, sk);
513 dst_release(skb->dst);
516 spin_lock(&sk->sk_receive_queue.lock);
517 po->stats.tp_packets++;
518 __skb_queue_tail(&sk->sk_receive_queue, skb);
519 spin_unlock(&sk->sk_receive_queue.lock);
520 sk->sk_data_ready(sk, skb->len);
/* Drop path: count the loss under the queue lock. */
524 spin_lock(&sk->sk_receive_queue.lock);
525 po->stats.tp_drops++;
526 spin_unlock(&sk->sk_receive_queue.lock);
/* Undo our mangling of a shared skb before returning it. */
529 if (skb_head != skb->data && skb_shared(skb)) {
530 skb->data = skb_head;
538 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook for mmap()ed (PACKET_RX_RING) sockets.  Copies each
 * accepted frame into the next free ring slot, stamps the tpacket_hdr
 * and hands the slot to user space via tp_status.  Frames larger than
 * a slot may additionally be queued as a normal skb (TP_STATUS_COPY)
 * when copy_thresh allows.  NOTE(review): ring-full and error paths
 * are partly elided in this view of the source.
 */
539 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
542 struct packet_opt *po;
543 struct sockaddr_ll *sll;
544 struct tpacket_hdr *h;
/* Remember original data/len to restore a shared skb on exit. */
545 u8 * skb_head = skb->data;
546 int skb_len = skb->len;
548 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
549 unsigned short macoff, netoff;
550 struct sk_buff *copy_skb = NULL;
552 if (skb->pkt_type == PACKET_LOOPBACK)
555 sk = pt->af_packet_priv;
558 if (dev->hard_header) {
559 if (sk->sk_type != SOCK_DGRAM)
560 skb_push(skb, skb->data - skb->mac.raw);
561 else if (skb->pkt_type == PACKET_OUTGOING) {
562 /* Special case: outgoing packets have ll header at head */
563 skb_pull(skb, skb->nh.raw - skb->data);
/* Outgoing hw-checksummed frames carry no final checksum yet. */
564 if (skb->ip_summed == CHECKSUM_HW)
565 status |= TP_STATUS_CSUMNOTREADY;
572 unsigned res = run_filter(skb, sk, snaplen);
/* Compute where the mac and net headers land inside the ring slot. */
579 if (sk->sk_type == SOCK_DGRAM) {
580 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
582 unsigned maclen = skb->nh.raw - skb->data;
583 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
584 macoff = netoff - maclen;
/* Frame does not fit in a slot: optionally keep a full skb copy. */
587 if (macoff + snaplen > po->frame_size) {
588 if (po->copy_thresh &&
589 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
590 (unsigned)sk->sk_rcvbuf) {
591 if (skb_shared(skb)) {
592 copy_skb = skb_clone(skb, GFP_ATOMIC);
594 copy_skb = skb_get(skb);
595 skb_head = skb->data;
598 skb_set_owner_r(copy_skb, sk);
/* Truncate the in-slot copy to whatever fits. */
600 snaplen = po->frame_size - macoff;
601 if ((int)snaplen < 0)
/* Never copy from paged (non-linear) data with memcpy below. */
604 if (snaplen > skb->len-skb->data_len)
605 snaplen = skb->len-skb->data_len;
607 spin_lock(&sk->sk_receive_queue.lock);
608 h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
/* Advance the ring head, wrapping at frame_max. */
612 po->head = po->head != po->frame_max ? po->head+1 : 0;
613 po->stats.tp_packets++;
615 status |= TP_STATUS_COPY;
616 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
618 if (!po->stats.tp_drops)
619 status &= ~TP_STATUS_LOSING;
620 spin_unlock(&sk->sk_receive_queue.lock);
622 memcpy((u8*)h + macoff, skb->data, snaplen);
624 h->tp_len = skb->len;
625 h->tp_snaplen = snaplen;
/* Stamp the frame; enable future hw timestamping if none present. */
628 if (skb->stamp.tv_sec == 0) {
629 do_gettimeofday(&skb->stamp);
630 sock_enable_timestamp(sk);
632 h->tp_sec = skb->stamp.tv_sec;
633 h->tp_usec = skb->stamp.tv_usec;
/* Link-level address info lives right after the aligned header. */
635 sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
637 if (dev->hard_header_parse)
638 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
639 sll->sll_family = AF_PACKET;
640 sll->sll_hatype = dev->type;
641 sll->sll_protocol = skb->protocol;
642 sll->sll_pkttype = skb->pkt_type;
643 sll->sll_ifindex = dev->ifindex;
/* Publish the slot to user space last, after all fields are written. */
645 h->tp_status = status;
/* Flush the written range so a user-space mapping sees the data. */
649 struct page *p_start, *p_end;
650 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
652 p_start = virt_to_page(h);
653 p_end = virt_to_page(h_end);
654 while (p_start <= p_end) {
655 flush_dcache_page(p_start);
660 sk->sk_data_ready(sk, 0);
663 if (skb_head != skb->data && skb_shared(skb)) {
664 skb->data = skb_head;
/* Ring overrun: account the drop and still wake the reader. */
672 po->stats.tp_drops++;
673 spin_unlock(&sk->sk_receive_queue.lock);
675 sk->sk_data_ready(sk, 0);
/*
 * sendmsg for AF_PACKET sockets.  Uses the bound ifindex/protocol when
 * no address is given, otherwise the sockaddr_ll supplied by the user.
 * Builds the ll header via dev->hard_header for SOCK_RAW/SOCK_DGRAM and
 * transmits with dev_queue_xmit.  NOTE(review): error/exit paths are
 * elided in this view of the source.
 */
684 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
685 struct msghdr *msg, size_t len)
687 struct sock *sk = sock->sk;
688 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
690 struct net_device *dev;
691 unsigned short proto;
693 int ifindex, err, reserve = 0;
696 * Get and verify the address.
/* No address supplied: fall back to the socket's bound device/proto. */
700 struct packet_opt *po = pkt_sk(sk);
702 ifindex = po->ifindex;
707 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
709 ifindex = saddr->sll_ifindex;
710 proto = saddr->sll_protocol;
711 addr = saddr->sll_addr;
715 dev = dev_get_by_index(ifindex);
/* SOCK_RAW frames include the ll header in the MTU budget. */
719 if (sock->type == SOCK_RAW)
720 reserve = dev->hard_header_len;
723 if (len > dev->mtu+reserve)
726 skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
727 msg->msg_flags & MSG_DONTWAIT, &err);
731 skb_reserve(skb, LL_RESERVED_SPACE(dev));
732 skb->nh.raw = skb->data;
734 if (dev->hard_header) {
/* Let the device build the ll header in the reserved headroom. */
737 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
738 if (sock->type != SOCK_DGRAM) {
739 skb->tail = skb->data;
745 /* Returns -EFAULT on error */
746 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
750 skb->protocol = proto;
752 skb->priority = sk->sk_priority;
755 if (!(dev->flags & IFF_UP))
762 err = dev_queue_xmit(skb);
/* dev_queue_xmit may return a positive congestion code. */
763 if (err > 0 && (err = net_xmit_errno(err)) != 0)
780 * Close a PACKET socket. This is fairly simple. We immediately go
781 * to 'closed' state and remove our protocol entry in the device list.
/*
 * Unlinks the socket from packet_sklist, detaches the receive hook,
 * flushes multicast memberships, tears down any mmap ring and purges
 * queued skbs.  NOTE(review): final sock_put/cleanup lines are elided.
 */
784 static int packet_release(struct socket *sock)
786 struct sock *sk = sock->sk;
787 struct packet_opt *po = pkt_sk(sk);
/* Remove from the global socket list first. */
792 write_lock_bh(&packet_sklist_lock);
793 sk_del_node_init(sk);
794 write_unlock_bh(&packet_sklist_lock);
797 * Unhook packet receive handler.
802 * Remove the protocol hook
804 dev_remove_pack(&po->prot_hook);
810 #ifdef CONFIG_PACKET_MULTICAST
811 packet_flush_mclist(sk);
/* A zeroed tpacket_req with closing=1 frees the RX ring, if any. */
814 #ifdef CONFIG_PACKET_MMAP
816 struct tpacket_req req;
817 memset(&req, 0, sizeof(req));
818 packet_set_ring(sk, &req, 1);
823 * Now the socket is dead. No more input will appear.
831 skb_queue_purge(&sk->sk_receive_queue);
838 * Attach a packet hook.
/*
 * Rebind the socket's packet_type hook to (dev, protocol).  Detaches
 * any existing hook first; only re-attaches while the device is up,
 * otherwise signals ENETDOWN to the owner.  Runs under bind_lock.
 */
841 static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
843 struct packet_opt *po = pkt_sk(sk);
845 * Detach an existing hook if present.
850 spin_lock(&po->bind_lock);
/* dev_remove_pack may sleep/synchronize; drop the lock around it. */
855 spin_unlock(&po->bind_lock);
856 dev_remove_pack(&po->prot_hook);
857 spin_lock(&po->bind_lock);
861 po->prot_hook.type = protocol;
862 po->prot_hook.dev = dev;
/* dev == NULL means "bound to all devices". */
864 po->ifindex = dev ? dev->ifindex : 0;
870 if (dev->flags&IFF_UP) {
871 dev_add_pack(&po->prot_hook);
/* Device is down: leave unhooked and report ENETDOWN. */
875 sk->sk_err = ENETDOWN;
876 if (!sock_flag(sk, SOCK_DEAD))
877 sk->sk_error_report(sk);
880 dev_add_pack(&po->prot_hook);
886 spin_unlock(&po->bind_lock);
892 * Bind a packet socket to a device
895 #ifdef CONFIG_SOCK_PACKET
/*
 * bind() for obsolete SOCK_PACKET sockets: address is a plain
 * sockaddr whose sa_data names the device.
 */
897 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
899 struct sock *sk=sock->sk;
901 struct net_device *dev;
908 if(addr_len!=sizeof(struct sockaddr))
/* Bounded copy of the device name; strlcpy guarantees termination. */
910 strlcpy(name,uaddr->sa_data,sizeof(name));
912 dev = dev_get_by_name(name);
914 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
/*
 * bind() for AF_PACKET sockets proper: a sockaddr_ll selects the
 * interface by index and optionally a new protocol number.
 */
921 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
923 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
924 struct sock *sk=sock->sk;
925 struct net_device *dev = NULL;
933 if (addr_len < sizeof(struct sockaddr_ll))
935 if (sll->sll_family != AF_PACKET)
/* ifindex 0 means bind to all interfaces (dev stays NULL). */
938 if (sll->sll_ifindex) {
940 dev = dev_get_by_index(sll->sll_ifindex);
/* Keep the current protocol when sll_protocol is 0. */
944 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
954 * Create a packet of type SOCK_PACKET.
/*
 * socket(PF_PACKET, ...) handler.  Requires CAP_NET_RAW, allocates the
 * sock and its packet_opt, installs the receive hook for the requested
 * protocol and links the socket into packet_sklist.
 */
957 static int packet_create(struct socket *sock, int protocol)
960 struct packet_opt *po;
/* Raw link-layer access is privileged. */
963 if (!capable(CAP_NET_RAW))
965 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
966 #ifdef CONFIG_SOCK_PACKET
967 && sock->type != SOCK_PACKET
970 return -ESOCKTNOSUPPORT;
972 sock->state = SS_UNCONNECTED;
975 sk = sk_alloc(PF_PACKET, GFP_KERNEL, 1, NULL);
979 sock->ops = &packet_ops;
980 #ifdef CONFIG_SOCK_PACKET
/* Legacy SOCK_PACKET sockets use the restricted spkt ops table. */
981 if (sock->type == SOCK_PACKET)
982 sock->ops = &packet_ops_spkt;
984 sock_init_data(sock,sk);
985 sk_set_owner(sk, THIS_MODULE);
/* Private state hangs off sk_protinfo (see pkt_sk()). */
987 po = sk->sk_protinfo = kmalloc(sizeof(*po), GFP_KERNEL);
990 memset(po, 0, sizeof(*po));
991 sk->sk_family = PF_PACKET;
994 sk->sk_destruct = packet_sock_destruct;
995 atomic_inc(&packet_socks_nr);
998 * Attach a protocol block
1001 spin_lock_init(&po->bind_lock);
1002 po->prot_hook.func = packet_rcv;
1003 #ifdef CONFIG_SOCK_PACKET
1004 if (sock->type == SOCK_PACKET)
1005 po->prot_hook.func = packet_rcv_spkt;
/* Stash the socket so the rcv hooks can find it from the packet_type. */
1007 po->prot_hook.af_packet_priv = sk;
1010 po->prot_hook.type = protocol;
1011 dev_add_pack(&po->prot_hook);
1016 write_lock_bh(&packet_sklist_lock);
1017 sk_add_node(sk, &packet_sklist);
1018 write_unlock_bh(&packet_sklist_lock);
1028 * Pull a packet from our receive queue and hand it to the user.
1029 * If necessary we block.
/*
 * recvmsg for both AF_PACKET and SOCK_PACKET sockets.  Dequeues via
 * skb_recv_datagram, copies the data (truncating to the user buffer)
 * and the address stored in skb->cb by the receive hooks.
 */
1032 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1033 struct msghdr *msg, size_t len, int flags)
1035 struct sock *sk = sock->sk;
1036 struct sk_buff *skb;
1040 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC))
1044 /* What error should we return now? EUNATTACH? */
1045 if (pkt_sk(sk)->ifindex < 0)
1050 * If the address length field is there to be filled in, we fill
/* Address size depends on the socket flavour. */
1054 if (sock->type == SOCK_PACKET)
1055 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1057 msg->msg_namelen = sizeof(struct sockaddr_ll);
1060 * Call the generic datagram receiver. This handles all sorts
1061 * of horrible races and re-entrancy so we can forget about it
1062 * in the protocol layers.
1064 * Now it will return ENETDOWN, if device have just gone down,
1065 * but then it will block.
1068 skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1071 * An error occurred so return it. Because skb_recv_datagram()
1072 * handles the blocking we don't see and worry about blocking
1080 * You lose any data beyond the buffer you gave. If it worries a
1081 * user program they can ask the device for its MTU anyway.
1088 msg->msg_flags|=MSG_TRUNC;
1091 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1095 sock_recv_timestamp(msg, sk, skb);
/* The rcv hook left the peer address in skb->cb. */
1098 memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
1101 * Free or return the buffer as appropriate. Again this
1102 * hides all the races and re-entrancy issues from us.
/* MSG_TRUNC reports the full wire length rather than bytes copied. */
1104 err = (flags&MSG_TRUNC) ? skb->len : copied;
1107 skb_free_datagram(sk, skb);
1112 #ifdef CONFIG_SOCK_PACKET
/*
 * getsockname() for SOCK_PACKET sockets: reports the bound device
 * name (or zeros when unbound) in a plain sockaddr.
 */
1113 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1114 int *uaddr_len, int peer)
1116 struct net_device *dev;
1117 struct sock *sk = sock->sk;
1122 uaddr->sa_family = AF_PACKET;
1123 dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1125 strlcpy(uaddr->sa_data, dev->name, 15);
/* No bound device: return an all-zero name. */
1128 memset(uaddr->sa_data, 0, 14);
1129 *uaddr_len = sizeof(*uaddr);
/*
 * getsockname() for AF_PACKET sockets: fills a sockaddr_ll with the
 * bound ifindex/protocol and the device's hardware address.
 */
1135 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1136 int *uaddr_len, int peer)
1138 struct net_device *dev;
1139 struct sock *sk = sock->sk;
1140 struct packet_opt *po = pkt_sk(sk);
1141 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1146 sll->sll_family = AF_PACKET;
1147 sll->sll_ifindex = po->ifindex;
1148 sll->sll_protocol = po->num;
1149 dev = dev_get_by_index(po->ifindex);
1151 sll->sll_hatype = dev->type;
1152 sll->sll_halen = dev->addr_len;
1153 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1156 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1159 *uaddr_len = sizeof(*sll);
1164 #ifdef CONFIG_PACKET_MULTICAST
/*
 * Apply a membership entry to a device.  'what' is +1 to add the
 * reference and -1 to drop it, switched on the membership type.
 */
1165 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1168 case PACKET_MR_MULTICAST:
1170 dev_mc_add(dev, i->addr, i->alen, 0);
1172 dev_mc_delete(dev, i->addr, i->alen, 0);
/* Promisc/allmulti are refcounted by the device layer. */
1174 case PACKET_MR_PROMISC:
1175 dev_set_promiscuity(dev, what);
1177 case PACKET_MR_ALLMULTI:
1178 dev_set_allmulti(dev, what);
/*
 * Apply 'what' to every membership entry on the list that refers to
 * this device (used from the netdev notifier on UNREGISTER).
 */
1184 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1186 for ( ; i; i=i->next) {
1187 if (i->ifindex == dev->ifindex)
1188 packet_dev_mc(dev, i, what);
/*
 * PACKET_ADD_MEMBERSHIP: add (or refcount) a multicast/promisc/allmulti
 * membership for this socket and push it down to the device.
 * NOTE(review): validation and unlock/exit lines are elided here.
 */
1192 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
1194 struct packet_opt *po = pkt_sk(sk);
1195 struct packet_mclist *ml, *i;
1196 struct net_device *dev;
1202 dev = __dev_get_by_index(mreq->mr_ifindex);
1207 if (mreq->mr_alen > dev->addr_len)
/* Allocate the new entry up front, before scanning the list. */
1211 i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
/* Existing identical membership: just bump its refcount. */
1216 for (ml = po->mclist; ml; ml = ml->next) {
1217 if (ml->ifindex == mreq->mr_ifindex &&
1218 ml->type == mreq->mr_type &&
1219 ml->alen == mreq->mr_alen &&
1220 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1222 /* Free the new element ... */
1228 i->type = mreq->mr_type;
1229 i->ifindex = mreq->mr_ifindex;
1230 i->alen = mreq->mr_alen;
1231 memcpy(i->addr, mreq->mr_address, i->alen);
1233 i->next = po->mclist;
/* New entry: take one device-level reference. */
1235 packet_dev_mc(dev, i, +1);
/*
 * PACKET_DROP_MEMBERSHIP: find the matching entry, drop one reference,
 * and release the device-level membership when the count hits zero.
 */
1242 static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
1244 struct packet_mclist *ml, **mlp;
1248 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1249 if (ml->ifindex == mreq->mr_ifindex &&
1250 ml->type == mreq->mr_type &&
1251 ml->alen == mreq->mr_alen &&
1252 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1253 if (--ml->count == 0) {
1254 struct net_device *dev;
1256 dev = dev_get_by_index(ml->ifindex);
/* Device may already be gone; only undo if it still exists. */
1258 packet_dev_mc(dev, ml, -1);
/* No matching membership found. */
1268 return -EADDRNOTAVAIL;
/*
 * Release every membership held by the socket (called on release()).
 */
1271 static void packet_flush_mclist(struct sock *sk)
1273 struct packet_opt *po = pkt_sk(sk);
1274 struct packet_mclist *ml;
1280 while ((ml = po->mclist) != NULL) {
1281 struct net_device *dev;
1283 po->mclist = ml->next;
1284 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1285 packet_dev_mc(dev, ml, -1);
/*
 * setsockopt(SOL_PACKET, ...): membership add/drop, RX ring setup and
 * the copy threshold for oversized mmap frames.
 * NOTE(review): the return type line and some returns are elided.
 */
1295 packet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen)
1297 struct sock *sk = sock->sk;
1300 if (level != SOL_PACKET)
1301 return -ENOPROTOOPT;
1304 #ifdef CONFIG_PACKET_MULTICAST
1305 case PACKET_ADD_MEMBERSHIP:
1306 case PACKET_DROP_MEMBERSHIP:
1308 struct packet_mreq mreq;
1309 if (optlen<sizeof(mreq))
1311 if (copy_from_user(&mreq,optval,sizeof(mreq)))
1313 if (optname == PACKET_ADD_MEMBERSHIP)
1314 ret = packet_mc_add(sk, &mreq);
1316 ret = packet_mc_drop(sk, &mreq);
1320 #ifdef CONFIG_PACKET_MMAP
/* Configure (or tear down, with tp_block_nr==0) the mmap RX ring. */
1321 case PACKET_RX_RING:
1323 struct tpacket_req req;
1325 if (optlen<sizeof(req))
1327 if (copy_from_user(&req,optval,sizeof(req)))
1329 return packet_set_ring(sk, &req, 0);
/* Size above which tpacket_rcv also queues a full skb copy. */
1331 case PACKET_COPY_THRESH:
1335 if (optlen!=sizeof(val))
1337 if (copy_from_user(&val,optval,sizeof(val)))
1340 pkt_sk(sk)->copy_thresh = val;
1345 return -ENOPROTOOPT;
/*
 * getsockopt(SOL_PACKET, ...).  PACKET_STATISTICS returns and resets
 * the packet/drop counters gathered by the receive hooks.
 */
1349 int packet_getsockopt(struct socket *sock, int level, int optname,
1350 char *optval, int *optlen)
1353 struct sock *sk = sock->sk;
1354 struct packet_opt *po = pkt_sk(sk);
1356 if (level != SOL_PACKET)
1357 return -ENOPROTOOPT;
1359 if (get_user(len,optlen))
1366 case PACKET_STATISTICS:
1368 struct tpacket_stats st;
1370 if (len > sizeof(struct tpacket_stats))
1371 len = sizeof(struct tpacket_stats);
/* Snapshot and zero the counters atomically wrt the rcv path. */
1372 spin_lock_bh(&sk->sk_receive_queue.lock);
1374 memset(&po->stats, 0, sizeof(st));
1375 spin_unlock_bh(&sk->sk_receive_queue.lock);
/* tp_packets reported to user space includes dropped frames. */
1376 st.tp_packets += st.tp_drops;
1378 if (copy_to_user(optval, &st, len))
1383 return -ENOPROTOOPT;
1386 if (put_user(len, optlen))
/*
 * Netdevice notifier: walks all packet sockets and reacts to state
 * changes of the device each one is bound to — detach the hook on
 * GOINGDOWN/UNREGISTER, re-attach on UP.
 */
1392 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1395 struct hlist_node *node;
1396 struct net_device *dev = (struct net_device*)data;
1398 read_lock(&packet_sklist_lock);
1399 sk_for_each(sk, node, &packet_sklist) {
1400 struct packet_opt *po = pkt_sk(sk);
1403 case NETDEV_UNREGISTER:
1404 #ifdef CONFIG_PACKET_MULTICAST
/* Device is going away: release its multicast references. */
1406 packet_dev_mclist(dev, po->mclist, -1);
1410 if (dev->ifindex == po->ifindex) {
1411 spin_lock(&po->bind_lock);
1413 __dev_remove_pack(&po->prot_hook);
/* Tell the owner its bound device went down. */
1416 sk->sk_err = ENETDOWN;
1417 if (!sock_flag(sk, SOCK_DEAD))
1418 sk->sk_error_report(sk);
/* Unregister also clears the stale device pointer. */
1420 if (msg == NETDEV_UNREGISTER) {
1422 po->prot_hook.dev = NULL;
1424 spin_unlock(&po->bind_lock);
/* NETDEV_UP: re-attach the hook of sockets bound to this device. */
1428 spin_lock(&po->bind_lock);
1429 if (dev->ifindex == po->ifindex && po->num &&
1431 dev_add_pack(&po->prot_hook);
1435 spin_unlock(&po->bind_lock);
1439 read_unlock(&packet_sklist_lock);
/*
 * ioctl handler: queue-size queries, timestamping, a pass-through of
 * inet address ioctls to inet_dgram_ops, and dev_ioctl for the rest.
 */
1444 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1447 struct sock *sk = sock->sk;
/* SIOCOUTQ: bytes currently charged to the send buffer. */
1452 int amount = atomic_read(&sk->sk_wmem_alloc);
1453 return put_user(amount, (int *)arg);
/* SIOCINQ: size of the next packet waiting in the receive queue. */
1457 struct sk_buff *skb;
1460 spin_lock_bh(&sk->sk_receive_queue.lock);
1461 skb = skb_peek(&sk->sk_receive_queue);
1464 spin_unlock_bh(&sk->sk_receive_queue.lock);
1465 return put_user(amount, (int *)arg);
1468 return sock_get_timestamp(sk, (struct timeval *)arg);
/* Interface address ioctls are delegated to the inet layer. */
1478 case SIOCGIFBRDADDR:
1479 case SIOCSIFBRDADDR:
1480 case SIOCGIFNETMASK:
1481 case SIOCSIFNETMASK:
1482 case SIOCGIFDSTADDR:
1483 case SIOCSIFDSTADDR:
1485 return inet_dgram_ops.ioctl(sock, cmd, arg);
/* Everything else goes to the generic device ioctl handler. */
1489 return dev_ioctl(cmd, (void *)arg);
1494 #ifndef CONFIG_PACKET_MMAP
/* Without mmap support, fall back to the generic implementations. */
1495 #define packet_mmap sock_no_mmap
1496 #define packet_poll datagram_poll
/*
 * poll() for mmap sockets: in addition to datagram_poll, report
 * readability when the most recently filled ring slot has been handed
 * to user space (tp_status != TP_STATUS_KERNEL).
 */
1499 unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wait)
1501 struct sock *sk = sock->sk;
1502 struct packet_opt *po = pkt_sk(sk);
1503 unsigned int mask = datagram_poll(file, sock, wait);
1505 spin_lock_bh(&sk->sk_receive_queue.lock);
/* 'last' is the slot just before the ring head (wrapping). */
1507 unsigned last = po->head ? po->head-1 : po->frame_max;
1508 struct tpacket_hdr *h;
1510 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1513 mask |= POLLIN | POLLRDNORM;
1515 spin_unlock_bh(&sk->sk_receive_queue.lock);
1520 /* Dirty? Well, I still did not learn better way to account
/*
 * VMA open/close callbacks: track how many mappings of the ring exist
 * in po->mapped, so packet_set_ring can refuse to free a mapped ring.
 */
1524 static void packet_mm_open(struct vm_area_struct *vma)
1526 struct file *file = vma->vm_file;
1527 struct inode *inode = file->f_dentry->d_inode;
1528 struct socket * sock = SOCKET_I(inode);
1529 struct sock *sk = sock->sk;
1532 atomic_inc(&pkt_sk(sk)->mapped);
1535 static void packet_mm_close(struct vm_area_struct *vma)
1537 struct file *file = vma->vm_file;
1538 struct inode *inode = file->f_dentry->d_inode;
1539 struct socket * sock = SOCKET_I(inode);
1540 struct sock *sk = sock->sk;
1543 atomic_dec(&pkt_sk(sk)->mapped);
/* Installed on the VMA by packet_mmap(). */
1546 static struct vm_operations_struct packet_mmap_ops = {
1547 .open = packet_mm_open,
1548 .close =packet_mm_close,
/*
 * Free a page vector allocated by packet_set_ring: clear the Reserved
 * bit on every page of each block, then release the pages.
 */
1551 static void free_pg_vec(unsigned long *pg_vec, unsigned order, unsigned len)
1555 for (i=0; i<len; i++) {
1557 struct page *page, *pend;
1559 pend = virt_to_page(pg_vec[i] + (PAGE_SIZE << order) - 1);
1560 for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1561 ClearPageReserved(page);
1562 free_pages(pg_vec[i], order);
/*
 * Create or destroy the mmap RX ring described by 'req'.  With
 * tp_block_nr == 0 an existing ring is torn down ('closing' set when
 * called from release()).  Detaches the receive hook while swapping
 * rings so the rcv path never sees a half-built ring.
 * NOTE(review): several validation and exit lines are elided here.
 */
1569 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1571 unsigned long *pg_vec = NULL;
1572 struct packet_opt *po = pkt_sk(sk);
1573 int was_running, num, order = 0;
1576 if (req->tp_block_nr) {
1579 /* Sanity tests and some calculations */
1584 if ((int)req->tp_block_size <= 0)
1586 if (req->tp_block_size&(PAGE_SIZE-1))
1588 if (req->tp_frame_size < TPACKET_HDRLEN)
1590 if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
1593 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1594 if (po->frames_per_block <= 0)
/* Frame count must exactly cover the requested blocks. */
1596 if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
1600 /* Allocate page vector */
1601 while ((PAGE_SIZE<<order) < req->tp_block_size)
1606 pg_vec = kmalloc(req->tp_block_nr*sizeof(unsigned long*), GFP_KERNEL);
1609 memset(pg_vec, 0, req->tp_block_nr*sizeof(unsigned long*));
1611 for (i=0; i<req->tp_block_nr; i++) {
1612 struct page *page, *pend;
1613 pg_vec[i] = __get_free_pages(GFP_KERNEL, order);
1615 goto out_free_pgvec;
/* Mark pages Reserved so remap_page_range will accept them. */
1617 pend = virt_to_page(pg_vec[i] + (PAGE_SIZE << order) - 1);
1618 for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1619 SetPageReserved(page);
1621 /* Page vector is allocated */
/* Initialize every frame header as owned by the kernel. */
1624 for (i=0; i<req->tp_block_nr; i++) {
1625 unsigned long ptr = pg_vec[i];
1626 struct tpacket_hdr *header;
1629 for (k=0; k<po->frames_per_block; k++) {
1631 header = (struct tpacket_hdr*)ptr;
1632 header->tp_status = TP_STATUS_KERNEL;
1633 ptr += req->tp_frame_size;
1638 if (req->tp_frame_nr)
1644 /* Detach socket from network */
1645 spin_lock(&po->bind_lock);
1646 was_running = po->running;
1649 __dev_remove_pack(&po->prot_hook);
1654 spin_unlock(&po->bind_lock);
/* Refuse to swap rings while user space still has a mapping. */
1659 if (closing || atomic_read(&po->mapped) == 0) {
/* XC: exchange — store b into a, return the old a. */
1661 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1663 spin_lock_bh(&sk->sk_receive_queue.lock);
1664 pg_vec = XC(po->pg_vec, pg_vec);
1665 po->frame_max = req->tp_frame_nr-1;
1667 po->frame_size = req->tp_frame_size;
1668 spin_unlock_bh(&sk->sk_receive_queue.lock);
1670 order = XC(po->pg_vec_order, order);
1671 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1673 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
/* Ring present -> tpacket_rcv; ring gone -> plain packet_rcv. */
1674 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1675 skb_queue_purge(&sk->sk_receive_queue);
1677 if (atomic_read(&po->mapped))
1678 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
/* Re-attach the hook if it was attached when we started. */
1681 spin_lock(&po->bind_lock);
1682 if (was_running && !po->running) {
1686 dev_add_pack(&po->prot_hook);
1688 spin_unlock(&po->bind_lock);
/* Frees either the old ring (success) or the new one (failure). */
1694 free_pg_vec(pg_vec, order, req->tp_block_nr);
/*
 * mmap() handler: maps the previously configured RX ring (all blocks,
 * contiguously) into the caller's address space and installs the VMA
 * ops that keep po->mapped accurate.
 */
1699 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1701 struct sock *sk = sock->sk;
1702 struct packet_opt *po = pkt_sk(sk);
1704 unsigned long start;
1711 size = vma->vm_end - vma->vm_start;
1714 if (po->pg_vec == NULL)
/* The mapping must cover the whole ring exactly. */
1716 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1719 atomic_inc(&po->mapped);
1720 start = vma->vm_start;
1722 for (i=0; i<po->pg_vec_len; i++) {
1723 if (remap_page_range(vma, start, __pa(po->pg_vec[i]),
1724 po->pg_vec_pages*PAGE_SIZE,
1727 start += po->pg_vec_pages*PAGE_SIZE;
1729 vma->vm_ops = &packet_mmap_ops;
1739 #ifdef CONFIG_SOCK_PACKET
/* proto_ops for obsolete SOCK_PACKET sockets (no setsockopt/mmap). */
1740 struct proto_ops packet_ops_spkt = {
1741 .family = PF_PACKET,
1742 .owner = THIS_MODULE,
1743 .release = packet_release,
1744 .bind = packet_bind_spkt,
1745 .connect = sock_no_connect,
1746 .socketpair = sock_no_socketpair,
1747 .accept = sock_no_accept,
1748 .getname = packet_getname_spkt,
1749 .poll = datagram_poll,
1750 .ioctl = packet_ioctl,
1751 .listen = sock_no_listen,
1752 .shutdown = sock_no_shutdown,
1753 .setsockopt = sock_no_setsockopt,
1754 .getsockopt = sock_no_getsockopt,
1755 .sendmsg = packet_sendmsg_spkt,
1756 .recvmsg = packet_recvmsg,
1757 .mmap = sock_no_mmap,
1758 .sendpage = sock_no_sendpage,
/* proto_ops for SOCK_RAW/SOCK_DGRAM AF_PACKET sockets. */
1762 struct proto_ops packet_ops = {
1763 .family = PF_PACKET,
1764 .owner = THIS_MODULE,
1765 .release = packet_release,
1766 .bind = packet_bind,
1767 .connect = sock_no_connect,
1768 .socketpair = sock_no_socketpair,
1769 .accept = sock_no_accept,
1770 .getname = packet_getname,
1771 .poll = packet_poll,
1772 .ioctl = packet_ioctl,
1773 .listen = sock_no_listen,
1774 .shutdown = sock_no_shutdown,
1775 .setsockopt = packet_setsockopt,
1776 .getsockopt = packet_getsockopt,
1777 .sendmsg = packet_sendmsg,
1778 .recvmsg = packet_recvmsg,
1779 .mmap = packet_mmap,
1780 .sendpage = sock_no_sendpage,
/* Registered with sock_register() for PF_PACKET socket creation. */
1783 static struct net_proto_family packet_family_ops = {
1784 .family = PF_PACKET,
1785 .create = packet_create,
1786 .owner = THIS_MODULE,
/* Hook into netdevice events (up/down/unregister of bound devices). */
1789 static struct notifier_block packet_netdev_notifier = {
1790 .notifier_call =packet_notifier,
1793 #ifdef CONFIG_PROC_FS
/* /proc/net/packet: seq_file iteration over packet_sklist. */
1794 static inline struct sock *packet_seq_idx(loff_t off)
1797 struct hlist_node *node;
1799 sk_for_each(s, node, &packet_sklist) {
/* Holds packet_sklist_lock across the whole sequence (see _stop). */
1806 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1808 read_lock(&packet_sklist_lock);
1809 return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1812 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1815 return (v == SEQ_START_TOKEN)
1816 ? sk_head(&packet_sklist)
1817 : sk_next((struct sock*)v) ;
1820 static void packet_seq_stop(struct seq_file *seq, void *v)
1822 read_unlock(&packet_sklist_lock);
/* One line per socket; header row on the start token. */
1825 static int packet_seq_show(struct seq_file *seq, void *v)
1827 if (v == SEQ_START_TOKEN)
1828 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
1831 const struct packet_opt *po = pkt_sk(s);
1834 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1836 atomic_read(&s->sk_refcnt),
1841 atomic_read(&s->sk_rmem_alloc),
1849 static struct seq_operations packet_seq_ops = {
1850 .start = packet_seq_start,
1851 .next = packet_seq_next,
1852 .stop = packet_seq_stop,
1853 .show = packet_seq_show,
1856 static int packet_seq_open(struct inode *inode, struct file *file)
1858 return seq_open(file, &packet_seq_ops);
1861 static struct file_operations packet_seq_fops = {
1862 .owner = THIS_MODULE,
1863 .open = packet_seq_open,
1865 .llseek = seq_lseek,
1866 .release = seq_release,
/* Module teardown: undo the three registrations made by packet_init. */
1871 static void __exit packet_exit(void)
1873 proc_net_remove("packet");
1874 unregister_netdevice_notifier(&packet_netdev_notifier);
1875 sock_unregister(PF_PACKET);
/* Module init: register the PF_PACKET family, the netdev notifier and
 * the /proc/net/packet file. */
1879 static int __init packet_init(void)
1881 sock_register(&packet_family_ops);
1882 register_netdevice_notifier(&packet_netdev_notifier);
1883 proc_net_fops_create("packet", 0, &packet_seq_fops);
1888 module_init(packet_init);
1889 module_exit(packet_exit);
1890 MODULE_LICENSE("GPL");
1891 MODULE_ALIAS_NETPROTO(PF_PACKET);