2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
8 * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
15 * Alan Cox : verify_area() now used correctly
16 * Alan Cox : new skbuff lists, look ma no backlogs!
17 * Alan Cox : tidied skbuff lists.
18 * Alan Cox : Now uses generic datagram routines I
19 * added. Also fixed the peek/read crash
20 * from all old Linux datagram code.
21 * Alan Cox : Uses the improved datagram code.
22 * Alan Cox : Added NULL's for socket options.
23 * Alan Cox : Re-commented the code.
24 * Alan Cox : Use new kernel side addressing
25 * Rob Janssen : Correct MTU usage.
26 * Dave Platt : Counter leaks caused by incorrect
27 * interrupt locking and some slightly
28 * dubious gcc output. Can you read
29 * compiler: it said _VOLATILE_
30 * Richard Kooijman : Timestamp fixes.
31 * Alan Cox : New buffers. Use sk->mac.raw.
32 * Alan Cox : sendmsg/recvmsg support.
33 * Alan Cox : Protocol setting support
34 * Alexey Kuznetsov : Untied from IPv4 stack.
35 * Cyrus Durgin : Fixed kerneld for kmod.
36 * Michal Ostrowski : Module initialization cleanup.
37 * Ulises Alonso : Frame number limit removal and
38 * packet_set_ring memory leak.
39 * Eric Biederman : Allow for > 8 byte hardware addresses.
40 * The convention is that longer addresses
41 * will simply extend the hardware address
42 * byte arrays at the end of sockaddr_ll
45 * This program is free software; you can redistribute it and/or
46 * modify it under the terms of the GNU General Public License
47 * as published by the Free Software Foundation; either version
48 * 2 of the License, or (at your option) any later version.
52 #include <linux/config.h>
53 #include <linux/types.h>
54 #include <linux/sched.h>
56 #include <linux/capability.h>
57 #include <linux/fcntl.h>
58 #include <linux/socket.h>
60 #include <linux/inet.h>
61 #include <linux/netdevice.h>
62 #include <linux/if_packet.h>
63 #include <linux/wireless.h>
64 #include <linux/kmod.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
83 #include <net/inet_common.h>
86 #define CONFIG_SOCK_PACKET 1
89 Proposed replacement for SIOC{ADD,DEL}MULTI and
90 IFF_PROMISC, IFF_ALLMULTI flags.
92 It is more expensive, but I believe,
93 it is really the correct solution: reentrant, safe and fault tolerant.
95 IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
96 reference count and global flag, so that real status is
97 (gflag|(count != 0)), so that we can use obsolete faulty interface
98 not harming clever users.
100 #define CONFIG_PACKET_MULTICAST 1
104 - if device has no dev->hard_header routine, it adds and removes ll header
105 inside itself. In this case ll header is invisible outside of device,
106 but higher levels still should reserve dev->hard_header_len.
107 Some devices are clever enough to reallocate the skb when the header
108 will not fit into the reserved space (tunnel); others are silly
110 - packet socket receives packets with pulled ll header,
111 so that SOCK_RAW should push it back.
116 Incoming, dev->hard_header!=NULL
120 Outgoing, dev->hard_header!=NULL
124 Incoming, dev->hard_header==NULL
125 mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
126 PPP does this, which is wrong, because it introduces asymmetry
127 between the rx and tx paths.
130 Outgoing, dev->hard_header==NULL
131 mac.raw -> data. ll header is still not built!
135 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
141 dev->hard_header != NULL
145 dev->hard_header == NULL (ll header is added by device, we cannot control it)
149 We should set nh.raw on output to the correct position,
150 packet classifier depends on it.
153 /* List of all packet sockets. */
154 static HLIST_HEAD(packet_sklist);
155 static DEFINE_RWLOCK(packet_sklist_lock);
157 static atomic_t packet_socks_nr;
160 /* Private packet socket structures. */
162 #ifdef CONFIG_PACKET_MULTICAST
/* One multicast/promisc/allmulti request held by a socket; kept in a
 * singly-linked list hanging off packet_sock->mclist. */
165 struct packet_mclist *next;
170 unsigned char addr[MAX_ADDR_LEN];
172 /* identical to struct packet_mreq except it has
173 * a longer address field.
175 struct packet_mreq_max
178 unsigned short mr_type;
179 unsigned short mr_alen;
180 unsigned char mr_address[MAX_ADDR_LEN];
183 #ifdef CONFIG_PACKET_MMAP
184 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
187 static void packet_flush_mclist(struct sock *sk);
190 /* struct sock has to be the first member of packet_sock */
192 struct tpacket_stats stats;
193 #ifdef CONFIG_PACKET_MMAP
/* Geometry of the mmap'ed rx ring: frames per block, bytes per frame,
 * and the highest valid frame index (frame count - 1). */
196 unsigned int frames_per_block;
197 unsigned int frame_size;
198 unsigned int frame_max;
201 struct packet_type prot_hook;
202 spinlock_t bind_lock;
203 char running; /* prot_hook is attached*/
204 int ifindex; /* bound device */
206 #ifdef CONFIG_PACKET_MULTICAST
207 struct packet_mclist *mclist;
209 #ifdef CONFIG_PACKET_MMAP
/* Page-vector bookkeeping for the ring: allocation order per block,
 * pages per block, and number of blocks. */
211 unsigned int pg_vec_order;
212 unsigned int pg_vec_pages;
213 unsigned int pg_vec_len;
217 #ifdef CONFIG_PACKET_MMAP
/*
 * Map a ring-frame index to its address: find the block containing the
 * frame, then offset by whole frames within that block.
 */
219 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
221 unsigned int pg_vec_pos, frame_offset;
224 pg_vec_pos = position / po->frames_per_block;
225 frame_offset = position % po->frames_per_block;
227 frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
/* Downcast is valid because struct sock is the first member of
 * struct packet_sock (see comment at the struct definition). */
233 static inline struct packet_sock *pkt_sk(struct sock *sk)
235 return (struct packet_sock *)sk;
/*
 * sk->sk_destruct callback: sanity-check that no rx/tx memory is still
 * charged to the socket and drop the global socket count.
 */
238 static void packet_sock_destruct(struct sock *sk)
240 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
241 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
243 if (!sock_flag(sk, SOCK_DEAD)) {
244 printk("Attempt to release alive packet socket: %p\n", sk);
248 atomic_dec(&packet_socks_nr);
249 #ifdef PACKET_REFCNT_DEBUG
250 printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
255 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
256 struct proto_ops packet_ops;
257 EXPORT_SYMBOL(packet_ops);
261 struct proto_ops packet_ops;
263 #ifdef CONFIG_SOCK_PACKET
264 static const struct proto_ops packet_ops_spkt;
/*
 * Receive handler for obsolete SOCK_PACKET sockets: restore the
 * link-level header in front of skb->data, record the origin device in
 * skb->cb as a sockaddr_pkt, and queue the frame on the socket.
 * Runs in softirq context (hence GFP_ATOMIC below).
 */
266 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
269 struct sockaddr_pkt *spkt;
272 * When we registered the protocol we saved the socket in the data
273 * field for just this event.
276 sk = pt->af_packet_priv;
279 * Yank back the headers [hope the device set this
280 * right or kerboom...]
282 * Incoming packets have ll header pulled,
285 * For outgoing ones skb->data == skb->mac.raw
286 * so that this procedure is noop.
289 if (skb->pkt_type == PACKET_LOOPBACK)
292 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
295 /* drop any routing info */
296 dst_release(skb->dst);
299 /* drop conntrack reference */
302 spkt = (struct sockaddr_pkt*)skb->cb;
/* Re-expose the ll header: for incoming frames data sits past mac.raw. */
304 skb_push(skb, skb->data-skb->mac.raw);
307 * The SOCK_PACKET socket receives _all_ frames.
310 spkt->spkt_family = dev->type;
311 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
312 spkt->spkt_protocol = skb->protocol;
315 * Charge the memory to the socket. This is done specifically
316 * to prevent sockets using all the memory up.
319 if (sock_queue_rcv_skb(sk,skb) == 0)
330 * Output a raw packet to a device layer. This bypasses all the other
331 * protocol layers and you must therefore supply it with a complete frame
/*
 * sendmsg() for SOCK_PACKET sockets: the caller must supply a complete
 * frame (including ll header) and a sockaddr_pkt naming the device.
 * Returns the number of bytes sent or a negative errno.
 */
334 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
335 struct msghdr *msg, size_t len)
337 struct sock *sk = sock->sk;
338 struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
340 struct net_device *dev;
341 unsigned short proto=0;
345 * Get and verify the address.
350 if (msg->msg_namelen < sizeof(struct sockaddr))
352 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
353 proto=saddr->spkt_protocol;
356 return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
359 * Find the device first to size check it
/* Force NUL termination of the caller-supplied 14-byte device name. */
362 saddr->spkt_device[13] = 0;
363 dev = dev_get_by_name(saddr->spkt_device);
369 * You may not queue a frame bigger than the mtu. This is the lowest level
370 * raw protocol and you must do your own fragmentation at this level.
374 if (len > dev->mtu + dev->hard_header_len)
378 skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
381 * If the write buffer is full, then tough. At this level the user gets to
382 * deal with the problem - do your own algorithmic backoffs. That's far
393 /* FIXME: Save some space for broken drivers that write a
394 * hard header at transmission time by themselves. PPP is the
395 * notable one here. This should really be fixed at the driver level.
397 skb_reserve(skb, LL_RESERVED_SPACE(dev));
398 skb->nh.raw = skb->data;
400 /* Try to align data part correctly */
401 if (dev->hard_header) {
402 skb->data -= dev->hard_header_len;
403 skb->tail -= dev->hard_header_len;
404 if (len < dev->hard_header_len)
405 skb->nh.raw = skb->data;
408 /* Returns -EFAULT on error */
409 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
410 skb->protocol = proto;
412 skb->priority = sk->sk_priority;
417 if (!(dev->flags & IFF_UP))
/*
 * Run the socket's attached BPF filter over the skb.  Returns the
 * filter verdict (snap length; 0 means drop), or the caller-supplied
 * default `res` when no filter is attached.
 */
437 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
439 struct sk_filter *filter;
442 filter = sk->sk_filter;
444 * Our caller already checked that filter != NULL but we need to
445 * verify that under bh_lock_sock() to be safe
447 if (likely(filter != NULL))
448 res = sk_run_filter(skb, filter->insns, filter->len);
455 This function makes lazy skb cloning in hope that most of packets
456 are discarded by BPF.
458 Note tricky part: we DO mangle shared skb! skb->data, skb->len
459 and skb->cb are mangled. It works because (and until) packets
460 falling here are owned by current CPU. Output packets are cloned
461 by dev_queue_xmit_nit(), input packets are processed by net_bh
462 sequentially, so that if we return skb to original state on exit,
463 we will not harm anyone.
/*
 * Main receive handler for AF_PACKET sockets (non-mmap path).
 * Lazily clones shared skbs, runs the BPF filter, fills in a
 * sockaddr_ll in skb->cb and queues the frame.  On exit the (possibly
 * shared) skb is restored to its original data/len so other taps are
 * unaffected — see the "tricky part" comment above.
 */
466 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
469 struct sockaddr_ll *sll;
470 struct packet_sock *po;
471 u8 * skb_head = skb->data;
472 int skb_len = skb->len;
475 if (skb->pkt_type == PACKET_LOOPBACK)
478 sk = pt->af_packet_priv;
481 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
483 (int) sk->sk_xid > 0 && sk->sk_xid != skb->xid)
489 if (dev->hard_header) {
490 /* The device has an explicit notion of ll header,
491 exported to higher levels.
493 Otherwise, the device hides details of its frame
494 structure, so that corresponding packet head
495 never delivered to user.
497 if (sk->sk_type != SOCK_DGRAM)
498 skb_push(skb, skb->data - skb->mac.raw);
499 else if (skb->pkt_type == PACKET_OUTGOING) {
500 /* Special case: outgoing packets have ll header at head */
501 skb_pull(skb, skb->nh.raw - skb->data);
508 unsigned res = run_filter(skb, sk, snaplen);
/* Drop early if queueing would exceed the socket's rcvbuf budget. */
515 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
516 (unsigned)sk->sk_rcvbuf)
519 if (skb_shared(skb)) {
520 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
/* Restore the shared original before switching to our clone. */
524 if (skb_head != skb->data) {
525 skb->data = skb_head;
532 sll = (struct sockaddr_ll*)skb->cb;
533 sll->sll_family = AF_PACKET;
534 sll->sll_hatype = dev->type;
535 sll->sll_protocol = skb->protocol;
536 sll->sll_pkttype = skb->pkt_type;
537 sll->sll_ifindex = dev->ifindex;
540 if (dev->hard_header_parse)
541 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
543 if (pskb_trim(skb, snaplen))
546 skb_set_owner_r(skb, sk);
548 dst_release(skb->dst);
551 /* drop conntrack reference */
554 spin_lock(&sk->sk_receive_queue.lock);
555 po->stats.tp_packets++;
556 __skb_queue_tail(&sk->sk_receive_queue, skb);
557 spin_unlock(&sk->sk_receive_queue.lock);
558 sk->sk_data_ready(sk, skb->len);
/* drop path: account the loss under the queue lock */
562 spin_lock(&sk->sk_receive_queue.lock);
563 po->stats.tp_drops++;
564 spin_unlock(&sk->sk_receive_queue.lock);
567 if (skb_head != skb->data && skb_shared(skb)) {
568 skb->data = skb_head;
576 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive handler used when a PACKET_RX_RING is configured: copies the
 * (possibly truncated) frame into the next free ring slot, fills in
 * the tpacket_hdr / sockaddr_ll metadata, and publishes the slot to
 * userspace by writing tp_status last.  Frames too large for a slot may
 * optionally be queued whole via the copy_thresh side channel.
 */
577 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
580 struct packet_sock *po;
581 struct sockaddr_ll *sll;
582 struct tpacket_hdr *h;
583 u8 * skb_head = skb->data;
584 int skb_len = skb->len;
586 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
587 unsigned short macoff, netoff;
588 struct sk_buff *copy_skb = NULL;
590 if (skb->pkt_type == PACKET_LOOPBACK)
593 sk = pt->af_packet_priv;
596 if (dev->hard_header) {
597 if (sk->sk_type != SOCK_DGRAM)
598 skb_push(skb, skb->data - skb->mac.raw);
599 else if (skb->pkt_type == PACKET_OUTGOING) {
600 /* Special case: outgoing packets have ll header at head */
601 skb_pull(skb, skb->nh.raw - skb->data);
602 if (skb->ip_summed == CHECKSUM_HW)
603 status |= TP_STATUS_CSUMNOTREADY;
610 unsigned res = run_filter(skb, sk, snaplen);
/* Compute where the mac and net headers land inside the ring frame. */
617 if (sk->sk_type == SOCK_DGRAM) {
618 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
620 unsigned maclen = skb->nh.raw - skb->data;
621 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
622 macoff = netoff - maclen;
/* Frame does not fit: optionally keep a full copy on the rcv queue. */
625 if (macoff + snaplen > po->frame_size) {
626 if (po->copy_thresh &&
627 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
628 (unsigned)sk->sk_rcvbuf) {
629 if (skb_shared(skb)) {
630 copy_skb = skb_clone(skb, GFP_ATOMIC);
632 copy_skb = skb_get(skb);
633 skb_head = skb->data;
636 skb_set_owner_r(copy_skb, sk);
638 snaplen = po->frame_size - macoff;
639 if ((int)snaplen < 0)
642 if (snaplen > skb->len-skb->data_len)
643 snaplen = skb->len-skb->data_len;
645 spin_lock(&sk->sk_receive_queue.lock);
646 h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
/* Advance the ring head, wrapping at frame_max. */
650 po->head = po->head != po->frame_max ? po->head+1 : 0;
651 po->stats.tp_packets++;
653 status |= TP_STATUS_COPY;
654 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
656 if (!po->stats.tp_drops)
657 status &= ~TP_STATUS_LOSING;
658 spin_unlock(&sk->sk_receive_queue.lock);
660 memcpy((u8*)h + macoff, skb->data, snaplen);
662 h->tp_len = skb->len;
663 h->tp_snaplen = snaplen;
/* Stamp the frame; enable socket timestamping lazily on first use. */
666 if (skb->tstamp.off_sec == 0) {
667 __net_timestamp(skb);
668 sock_enable_timestamp(sk);
670 h->tp_sec = skb->tstamp.off_sec;
671 h->tp_usec = skb->tstamp.off_usec;
673 sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
675 if (dev->hard_header_parse)
676 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
677 sll->sll_family = AF_PACKET;
678 sll->sll_hatype = dev->type;
679 sll->sll_protocol = skb->protocol;
680 sll->sll_pkttype = skb->pkt_type;
681 sll->sll_ifindex = dev->ifindex;
/* Publish the slot: userspace polls tp_status for TP_STATUS_USER. */
683 h->tp_status = status;
687 struct page *p_start, *p_end;
688 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
/* Flush every page the frame touches so mmap'ed readers see the data. */
690 p_start = virt_to_page(h);
691 p_end = virt_to_page(h_end);
692 while (p_start <= p_end) {
693 flush_dcache_page(p_start);
698 sk->sk_data_ready(sk, 0);
701 if (skb_head != skb->data && skb_shared(skb)) {
702 skb->data = skb_head;
/* ring_is_full / drop path */
710 po->stats.tp_drops++;
711 spin_unlock(&sk->sk_receive_queue.lock);
713 sk->sk_data_ready(sk, 0);
/*
 * sendmsg() for AF_PACKET sockets (SOCK_RAW / SOCK_DGRAM): resolve the
 * target device from msg_name or the bound ifindex, build the ll
 * header for SOCK_DGRAM via dev->hard_header, copy the payload and
 * transmit.  Returns bytes sent or a negative errno.
 */
722 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
723 struct msghdr *msg, size_t len)
725 struct sock *sk = sock->sk;
726 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
728 struct net_device *dev;
729 unsigned short proto;
731 int ifindex, err, reserve = 0;
734 * Get and verify the address.
/* No address given: fall back to the socket's bound device/protocol. */
738 struct packet_sock *po = pkt_sk(sk);
740 ifindex = po->ifindex;
745 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
747 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
749 ifindex = saddr->sll_ifindex;
750 proto = saddr->sll_protocol;
751 addr = saddr->sll_addr;
755 dev = dev_get_by_index(ifindex);
/* SOCK_RAW callers supply the ll header themselves; budget for it. */
759 if (sock->type == SOCK_RAW)
760 reserve = dev->hard_header_len;
763 if (len > dev->mtu+reserve)
766 skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
767 msg->msg_flags & MSG_DONTWAIT, &err);
771 skb_reserve(skb, LL_RESERVED_SPACE(dev));
772 skb->nh.raw = skb->data;
774 if (dev->hard_header) {
777 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
778 if (sock->type != SOCK_DGRAM) {
779 skb->tail = skb->data;
785 /* Returns -EFAULT on error */
786 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
790 skb->protocol = proto;
792 skb->priority = sk->sk_priority;
795 if (!(dev->flags & IFF_UP))
/* net_xmit_errno() maps soft tx congestion codes to real errnos. */
802 err = dev_queue_xmit(skb);
803 if (err > 0 && (err = net_xmit_errno(err)) != 0)
820 * Close a PACKET socket. This is fairly simple. We immediately go
821 * to 'closed' state and remove our protocol entry in the device list.
824 static int packet_release(struct socket *sock)
826 struct sock *sk = sock->sk;
827 struct packet_sock *po;
/* Unlink from the global socket list first so no new lookups find us. */
834 write_lock_bh(&packet_sklist_lock);
835 sk_del_node_init(sk);
836 write_unlock_bh(&packet_sklist_lock);
839 * Unhook packet receive handler.
844 * Remove the protocol hook
846 dev_remove_pack(&po->prot_hook);
852 #ifdef CONFIG_PACKET_MULTICAST
853 packet_flush_mclist(sk);
856 #ifdef CONFIG_PACKET_MMAP
/* Tear down any mmap ring by "setting" an all-zero request. */
858 struct tpacket_req req;
859 memset(&req, 0, sizeof(req));
860 packet_set_ring(sk, &req, 1);
865 * Now the socket is dead. No more input will appear.
873 skb_queue_purge(&sk->sk_receive_queue);
880 * Attach a packet hook.
/*
 * Attach the socket's protocol hook to (dev, protocol).  Detaches any
 * existing hook first; dev_remove_pack() must be called with bind_lock
 * dropped because it can sleep/synchronize.  If the device is down the
 * hook is not added yet and the socket is flagged ENETDOWN.
 */
883 static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
885 struct packet_sock *po = pkt_sk(sk);
887 * Detach an existing hook if present.
892 spin_lock(&po->bind_lock);
897 spin_unlock(&po->bind_lock);
898 dev_remove_pack(&po->prot_hook);
899 spin_lock(&po->bind_lock);
903 po->prot_hook.type = protocol;
904 po->prot_hook.dev = dev;
906 po->ifindex = dev ? dev->ifindex : 0;
912 if (dev->flags&IFF_UP) {
913 dev_add_pack(&po->prot_hook);
917 sk->sk_err = ENETDOWN;
918 if (!sock_flag(sk, SOCK_DEAD))
919 sk->sk_error_report(sk);
922 dev_add_pack(&po->prot_hook);
928 spin_unlock(&po->bind_lock);
934 * Bind a packet socket to a device
937 #ifdef CONFIG_SOCK_PACKET
/*
 * bind() for SOCK_PACKET: the address is a plain struct sockaddr whose
 * sa_data names the device; keep the socket's current protocol number.
 */
939 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
941 struct sock *sk=sock->sk;
943 struct net_device *dev;
950 if (addr_len != sizeof(struct sockaddr))
952 strlcpy(name,uaddr->sa_data,sizeof(name));
954 dev = dev_get_by_name(name);
956 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
/*
 * bind() for AF_PACKET: a sockaddr_ll selects the device by ifindex
 * (0 = all devices) and optionally overrides the protocol number.
 */
963 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
965 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
966 struct sock *sk=sock->sk;
967 struct net_device *dev = NULL;
975 if (addr_len < sizeof(struct sockaddr_ll))
977 if (sll->sll_family != AF_PACKET)
980 if (sll->sll_ifindex) {
982 dev = dev_get_by_index(sll->sll_ifindex);
986 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
/* Proto descriptor for sk_alloc(): sizes each sock as a packet_sock. */
994 static struct proto packet_proto = {
996 .owner = THIS_MODULE,
997 .obj_size = sizeof(struct packet_sock),
1001 * Create a packet of type SOCK_PACKET.
/*
 * socket(PF_PACKET, ...) entry point.  Requires CAP_NET_RAW; supports
 * SOCK_RAW, SOCK_DGRAM and (if configured) the legacy SOCK_PACKET.
 * Sets up the ops table, the protocol hook and registers the socket on
 * the global list.
 */
1004 static int packet_create(struct socket *sock, int protocol)
1007 struct packet_sock *po;
1010 if (!capable(CAP_NET_RAW))
1012 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
1013 #ifdef CONFIG_SOCK_PACKET
1014 && sock->type != SOCK_PACKET
1017 return -ESOCKTNOSUPPORT;
1019 sock->state = SS_UNCONNECTED;
1022 sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1026 sock->ops = &packet_ops;
1027 #ifdef CONFIG_SOCK_PACKET
1028 if (sock->type == SOCK_PACKET)
1029 sock->ops = &packet_ops_spkt;
1031 sock_init_data(sock, sk);
1034 sk->sk_family = PF_PACKET;
1037 sk->sk_destruct = packet_sock_destruct;
1038 atomic_inc(&packet_socks_nr);
1041 * Attach a protocol block
1044 spin_lock_init(&po->bind_lock);
1045 po->prot_hook.func = packet_rcv;
1046 #ifdef CONFIG_SOCK_PACKET
1047 if (sock->type == SOCK_PACKET)
1048 po->prot_hook.func = packet_rcv_spkt;
1050 po->prot_hook.af_packet_priv = sk;
/* Non-zero protocol: start receiving immediately. */
1053 po->prot_hook.type = protocol;
1054 dev_add_pack(&po->prot_hook);
1059 write_lock_bh(&packet_sklist_lock);
1060 sk_add_node(sk, &packet_sklist);
1061 write_unlock_bh(&packet_sklist_lock);
1068 * Pull a packet from our receive queue and hand it to the user.
1069 * If necessary we block.
/*
 * recvmsg(): pull one frame off the receive queue via the generic
 * datagram code, copy the sockaddr (pkt or ll flavour) from skb->cb
 * into msg_name, and copy up to `len` bytes of payload to the user.
 */
1072 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1073 struct msghdr *msg, size_t len, int flags)
1075 struct sock *sk = sock->sk;
1076 struct sk_buff *skb;
1078 struct sockaddr_ll *sll;
1081 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1085 /* What error should we return now? EUNATTACH? */
1086 if (pkt_sk(sk)->ifindex < 0)
1091 * Call the generic datagram receiver. This handles all sorts
1092 * of horrible races and re-entrancy so we can forget about it
1093 * in the protocol layers.
1095 * Now it will return ENETDOWN, if device have just gone down,
1096 * but then it will block.
1099 skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1102 * An error occurred so return it. Because skb_recv_datagram()
1103 * handles the blocking we don't see and worry about blocking
1111 * If the address length field is there to be filled in, we fill
1115 sll = (struct sockaddr_ll*)skb->cb;
1116 if (sock->type == SOCK_PACKET)
1117 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1119 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1122 * You lose any data beyond the buffer you gave. If it worries a
1123 * user program they can ask the device for its MTU anyway.
1130 msg->msg_flags|=MSG_TRUNC;
1133 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1137 sock_recv_timestamp(msg, sk, skb);
1140 memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
1143 * Free or return the buffer as appropriate. Again this
1144 * hides all the races and re-entrancy issues from us.
/* MSG_TRUNC asks for the full frame length, not the copied length. */
1146 err = (flags&MSG_TRUNC) ? skb->len : copied;
1149 skb_free_datagram(sk, skb);
1154 #ifdef CONFIG_SOCK_PACKET
/*
 * getsockname() for SOCK_PACKET: report AF_PACKET plus the bound
 * device's name (or zeros when unbound/gone).
 */
1155 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1156 int *uaddr_len, int peer)
1158 struct net_device *dev;
1159 struct sock *sk = sock->sk;
1164 uaddr->sa_family = AF_PACKET;
1165 dev = dev_get_by_index(pkt_sk(sk)->ifindex);
/* NOTE(review): sa_data is 14 bytes but the bound here is 15 —
 * looks like an off-by-one that can write past sa_data; confirm. */
1167 strlcpy(uaddr->sa_data, dev->name, 15);
1170 memset(uaddr->sa_data, 0, 14);
1171 *uaddr_len = sizeof(*uaddr);
/*
 * getsockname() for AF_PACKET: fill a sockaddr_ll with the bound
 * ifindex/protocol and, if the device still exists, its hardware type
 * and address.  Returned length is trimmed to the actual halen.
 */
1177 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1178 int *uaddr_len, int peer)
1180 struct net_device *dev;
1181 struct sock *sk = sock->sk;
1182 struct packet_sock *po = pkt_sk(sk);
1183 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1188 sll->sll_family = AF_PACKET;
1189 sll->sll_ifindex = po->ifindex;
1190 sll->sll_protocol = po->num;
1191 dev = dev_get_by_index(po->ifindex);
1193 sll->sll_hatype = dev->type;
1194 sll->sll_halen = dev->addr_len;
1195 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1198 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1201 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1206 #ifdef CONFIG_PACKET_MULTICAST
/*
 * Apply (+1) or revert (-1) one multicast-list entry on a device:
 * add/remove a multicast address, or bump promiscuity / allmulti.
 */
1207 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1210 case PACKET_MR_MULTICAST:
1212 dev_mc_add(dev, i->addr, i->alen, 0);
1214 dev_mc_delete(dev, i->addr, i->alen, 0);
1216 case PACKET_MR_PROMISC:
1217 dev_set_promiscuity(dev, what);
1219 case PACKET_MR_ALLMULTI:
1220 dev_set_allmulti(dev, what);
/* Apply `what` to every list entry that targets this device. */
1226 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1228 for ( ; i; i=i->next) {
1229 if (i->ifindex == dev->ifindex)
1230 packet_dev_mc(dev, i, what);
/*
 * PACKET_ADD_MEMBERSHIP: add (or refcount) a multicast/promisc entry
 * on the socket's mclist and apply it to the device.
 */
1234 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1236 struct packet_sock *po = pkt_sk(sk);
1237 struct packet_mclist *ml, *i;
1238 struct net_device *dev;
1244 dev = __dev_get_by_index(mreq->mr_ifindex);
1249 if (mreq->mr_alen > dev->addr_len)
1253 i = kmalloc(sizeof(*i), GFP_KERNEL);
/* An identical entry already present just gets its refcount bumped. */
1258 for (ml = po->mclist; ml; ml = ml->next) {
1259 if (ml->ifindex == mreq->mr_ifindex &&
1260 ml->type == mreq->mr_type &&
1261 ml->alen == mreq->mr_alen &&
1262 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1264 /* Free the new element ... */
1270 i->type = mreq->mr_type;
1271 i->ifindex = mreq->mr_ifindex;
1272 i->alen = mreq->mr_alen;
1273 memcpy(i->addr, mreq->mr_address, i->alen);
1275 i->next = po->mclist;
1277 packet_dev_mc(dev, i, +1);
/*
 * PACKET_DROP_MEMBERSHIP: find the matching mclist entry; when its
 * refcount reaches zero, revert the device state and unlink it.
 * Returns -EADDRNOTAVAIL when no matching membership exists.
 */
1284 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1286 struct packet_mclist *ml, **mlp;
1290 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1291 if (ml->ifindex == mreq->mr_ifindex &&
1292 ml->type == mreq->mr_type &&
1293 ml->alen == mreq->mr_alen &&
1294 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1295 if (--ml->count == 0) {
1296 struct net_device *dev;
1298 dev = dev_get_by_index(ml->ifindex);
1300 packet_dev_mc(dev, ml, -1);
1310 return -EADDRNOTAVAIL;
/*
 * Drop every multicast membership the socket holds (used on release):
 * revert the device state for each entry that still has a live device.
 */
1313 static void packet_flush_mclist(struct sock *sk)
1315 struct packet_sock *po = pkt_sk(sk);
1316 struct packet_mclist *ml;
1322 while ((ml = po->mclist) != NULL) {
1323 struct net_device *dev;
1325 po->mclist = ml->next;
1326 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1327 packet_dev_mc(dev, ml, -1);
/*
 * setsockopt(SOL_PACKET): multicast membership management, rx-ring
 * configuration (PACKET_RX_RING) and the ring copy threshold.
 */
1337 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1339 struct sock *sk = sock->sk;
1342 if (level != SOL_PACKET)
1343 return -ENOPROTOOPT;
1346 #ifdef CONFIG_PACKET_MULTICAST
1347 case PACKET_ADD_MEMBERSHIP:
1348 case PACKET_DROP_MEMBERSHIP:
/* Accept both the short legacy packet_mreq and the long _max form. */
1350 struct packet_mreq_max mreq;
1352 memset(&mreq, 0, sizeof(mreq));
1353 if (len < sizeof(struct packet_mreq))
1355 if (len > sizeof(mreq))
1357 if (copy_from_user(&mreq,optval,len))
1359 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1361 if (optname == PACKET_ADD_MEMBERSHIP)
1362 ret = packet_mc_add(sk, &mreq);
1364 ret = packet_mc_drop(sk, &mreq);
1368 #ifdef CONFIG_PACKET_MMAP
1369 case PACKET_RX_RING:
1371 struct tpacket_req req;
1373 if (optlen<sizeof(req))
1375 if (copy_from_user(&req,optval,sizeof(req)))
1377 return packet_set_ring(sk, &req, 0);
1379 case PACKET_COPY_THRESH:
1383 if (optlen!=sizeof(val))
1385 if (copy_from_user(&val,optval,sizeof(val)))
1388 pkt_sk(sk)->copy_thresh = val;
1393 return -ENOPROTOOPT;
/*
 * getsockopt(SOL_PACKET): currently only PACKET_STATISTICS, which
 * reads-and-resets the packet/drop counters under the rx-queue lock.
 */
1397 static int packet_getsockopt(struct socket *sock, int level, int optname,
1398 char __user *optval, int __user *optlen)
1401 struct sock *sk = sock->sk;
1402 struct packet_sock *po = pkt_sk(sk);
1404 if (level != SOL_PACKET)
1405 return -ENOPROTOOPT;
1407 if (get_user(len, optlen))
1414 case PACKET_STATISTICS:
1416 struct tpacket_stats st;
1418 if (len > sizeof(struct tpacket_stats))
1419 len = sizeof(struct tpacket_stats);
1420 spin_lock_bh(&sk->sk_receive_queue.lock);
/* Snapshot then zero the counters atomically w.r.t. the rx path. */
1422 memset(&po->stats, 0, sizeof(st));
1423 spin_unlock_bh(&sk->sk_receive_queue.lock);
/* ABI quirk: tp_packets reported to userspace includes drops. */
1424 st.tp_packets += st.tp_drops;
1426 if (copy_to_user(optval, &st, len))
1431 return -ENOPROTOOPT;
1434 if (put_user(len, optlen))
/*
 * netdevice notifier: on NETDEV_UNREGISTER/DOWN detach affected
 * sockets' hooks (flagging ENETDOWN); on the device coming back up,
 * re-add the hook for sockets still bound to that ifindex.
 */
1440 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1443 struct hlist_node *node;
1444 struct net_device *dev = (struct net_device*)data;
1446 read_lock(&packet_sklist_lock);
1447 sk_for_each(sk, node, &packet_sklist) {
1448 struct packet_sock *po = pkt_sk(sk);
1451 case NETDEV_UNREGISTER:
1452 #ifdef CONFIG_PACKET_MULTICAST
1454 packet_dev_mclist(dev, po->mclist, -1);
1458 if (dev->ifindex == po->ifindex) {
1459 spin_lock(&po->bind_lock);
1461 __dev_remove_pack(&po->prot_hook);
1464 sk->sk_err = ENETDOWN;
1465 if (!sock_flag(sk, SOCK_DEAD))
1466 sk->sk_error_report(sk);
/* Device going away for good: forget the stale dev pointer. */
1468 if (msg == NETDEV_UNREGISTER) {
1470 po->prot_hook.dev = NULL;
1472 spin_unlock(&po->bind_lock);
/* NETDEV_UP: re-attach hooks for sockets bound to this device. */
1476 spin_lock(&po->bind_lock);
1477 if (dev->ifindex == po->ifindex && po->num &&
1479 dev_add_pack(&po->prot_hook);
1483 spin_unlock(&po->bind_lock);
1487 read_unlock(&packet_sklist_lock);
/*
 * ioctl(): queue-size queries (TIOCOUTQ/FIONREAD style), timestamp
 * retrieval, and pass-through of interface-address ioctls to the
 * inet_dgram ops.  Unknown commands fall through to -ENOIOCTLCMD.
 */
1492 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1495 struct sock *sk = sock->sk;
/* outq: bytes currently charged to the send buffer */
1500 int amount = atomic_read(&sk->sk_wmem_alloc);
1501 return put_user(amount, (int __user *)arg);
/* inq: length of the first queued frame, under the queue lock */
1505 struct sk_buff *skb;
1508 spin_lock_bh(&sk->sk_receive_queue.lock);
1509 skb = skb_peek(&sk->sk_receive_queue);
1512 spin_unlock_bh(&sk->sk_receive_queue.lock);
1513 return put_user(amount, (int __user *)arg);
1516 return sock_get_timestamp(sk, (struct timeval __user *)arg);
1526 case SIOCGIFBRDADDR:
1527 case SIOCSIFBRDADDR:
1528 case SIOCGIFNETMASK:
1529 case SIOCSIFNETMASK:
1530 case SIOCGIFDSTADDR:
1531 case SIOCSIFDSTADDR:
1533 return inet_dgram_ops.ioctl(sock, cmd, arg);
1537 return -ENOIOCTLCMD;
1542 #ifndef CONFIG_PACKET_MMAP
1543 #define packet_mmap sock_no_mmap
1544 #define packet_poll datagram_poll
/*
 * poll() with an rx ring configured: besides the generic datagram
 * readiness, report POLLIN when the most recently filled ring slot has
 * been handed to userspace (tp_status != TP_STATUS_KERNEL).
 */
1547 static unsigned int packet_poll(struct file * file, struct socket *sock,
1550 struct sock *sk = sock->sk;
1551 struct packet_sock *po = pkt_sk(sk);
1552 unsigned int mask = datagram_poll(file, sock, wait);
1554 spin_lock_bh(&sk->sk_receive_queue.lock);
/* `last` is the slot filled just before the current head (with wrap). */
1556 unsigned last = po->head ? po->head-1 : po->frame_max;
1557 struct tpacket_hdr *h;
1559 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1562 mask |= POLLIN | POLLRDNORM;
1564 spin_unlock_bh(&sk->sk_receive_queue.lock);
1569 /* Dirty? Well, I still did not learn better way to account
/* VMA open: another mapping of the ring exists — bump the map count. */
1573 static void packet_mm_open(struct vm_area_struct *vma)
1575 struct file *file = vma->vm_file;
1576 struct socket * sock = file->private_data;
1577 struct sock *sk = sock->sk;
1580 atomic_inc(&pkt_sk(sk)->mapped);
/* VMA close: drop the map count; the ring may be freed when it hits 0. */
1583 static void packet_mm_close(struct vm_area_struct *vma)
1585 struct file *file = vma->vm_file;
1586 struct socket * sock = file->private_data;
1587 struct sock *sk = sock->sk;
1590 atomic_dec(&pkt_sk(sk)->mapped);
1593 static struct vm_operations_struct packet_mmap_ops = {
1594 .open = packet_mm_open,
1595 .close =packet_mm_close,
/* Last page of a ring block, used for bounds checks on the mapping. */
1598 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1600 return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
/* Free every allocated block of the page vector, then the vector. */
1603 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1607 for (i = 0; i < len; i++) {
1608 if (likely(pg_vec[i]))
1609 free_pages((unsigned long) pg_vec[i], order);
/* One ring block: zeroed, compound pages so they can be mmap'ed. */
1614 static inline char *alloc_one_pg_vec_page(unsigned long order)
1616 return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
/*
 * Allocate the vector of ring blocks described by `req`.  On partial
 * failure all blocks allocated so far are released.
 */
1620 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1622 unsigned int block_nr = req->tp_block_nr;
1626 pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1627 if (unlikely(!pg_vec))
1630 for (i = 0; i < block_nr; i++) {
1631 pg_vec[i] = alloc_one_pg_vec_page(order);
1632 if (unlikely(!pg_vec[i]))
1633 goto out_free_pgvec;
1640 free_pg_vec(pg_vec, order, block_nr);
/*
 * Configure (or, with an all-zero req / closing=1, tear down) the
 * PACKET_RX_RING.  Validates the block/frame geometry, allocates the
 * page vector, quiesces the receive hook while swapping the old and
 * new rings, then re-attaches the hook with the matching rcv function.
 */
1645 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1647 char **pg_vec = NULL;
1648 struct packet_sock *po = pkt_sk(sk);
1649 int was_running, num, order = 0;
1652 if (req->tp_block_nr) {
1655 /* Sanity tests and some calculations */
1657 if (unlikely(po->pg_vec))
1660 if (unlikely((int)req->tp_block_size <= 0))
1662 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1664 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1666 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1669 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1670 if (unlikely(po->frames_per_block <= 0))
1672 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1677 order = get_order(req->tp_block_size);
1678 pg_vec = alloc_pg_vec(req, order);
1679 if (unlikely(!pg_vec))
/* Pre-mark every frame as owned by the kernel. */
1683 for (i = 0; i < req->tp_block_nr; i++) {
1684 char *ptr = pg_vec[i];
1685 struct tpacket_hdr *header;
1688 for (k = 0; k < po->frames_per_block; k++) {
1689 header = (struct tpacket_hdr *) ptr;
1690 header->tp_status = TP_STATUS_KERNEL;
1691 ptr += req->tp_frame_size;
1696 if (unlikely(req->tp_frame_nr))
1702 /* Detach socket from network */
1703 spin_lock(&po->bind_lock);
1704 was_running = po->running;
1707 __dev_remove_pack(&po->prot_hook);
1712 spin_unlock(&po->bind_lock);
/* Swap only when nothing has the old ring mmap'ed (or we're closing). */
1717 if (closing || atomic_read(&po->mapped) == 0) {
1719 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1721 spin_lock_bh(&sk->sk_receive_queue.lock);
1722 pg_vec = XC(po->pg_vec, pg_vec);
1723 po->frame_max = (req->tp_frame_nr - 1);
1725 po->frame_size = req->tp_frame_size;
1726 spin_unlock_bh(&sk->sk_receive_queue.lock);
1728 order = XC(po->pg_vec_order, order);
1729 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1731 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
/* Pick the matching receive path for ring vs. queue delivery. */
1732 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1733 skb_queue_purge(&sk->sk_receive_queue);
1735 if (atomic_read(&po->mapped))
1736 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
/* Re-attach the hook if it was running before we detached it. */
1739 spin_lock(&po->bind_lock);
1740 if (was_running && !po->running) {
1744 dev_add_pack(&po->prot_hook);
1746 spin_unlock(&po->bind_lock);
/* Frees either the new vector (on failure) or the replaced old one. */
1751 free_pg_vec(pg_vec, order, req->tp_block_nr);
/*
 * mmap() handler: map the socket's receive ring into the caller's VMA.
 * Requires a ring to exist (po->pg_vec) and the VMA length to match the
 * ring size exactly.  Each block's pages are inserted one page at a
 * time with vm_insert_page(); po->mapped counts live mappings so
 * packet_set_ring() can refuse to swap a ring that is still mapped.
 * NOTE(review): locking around this walk and the error path after a
 * failed vm_insert_page are elided from this excerpt.
 */
1756 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1758 struct sock *sk = sock->sk;
1759 struct packet_sock *po = pkt_sk(sk);
1761 unsigned long start;
1768 size = vma->vm_end - vma->vm_start;
1771 if (po->pg_vec == NULL)
1773 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1776 start = vma->vm_start;
1777 for (i = 0; i < po->pg_vec_len; i++) {
1778 struct page *page = virt_to_page(po->pg_vec[i]);
1781 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1782 err = vm_insert_page(vma, start, page);
1788 atomic_inc(&po->mapped);
1789 vma->vm_ops = &packet_mmap_ops;
/*
 * proto_ops for the legacy SOCK_PACKET socket type: bind/getname/sendmsg
 * use the _spkt variants; mmap and most stream-style ops are stubbed
 * with sock_no_*.
 */
1799 #ifdef CONFIG_SOCK_PACKET
1800 static const struct proto_ops packet_ops_spkt = {
1801 .family = PF_PACKET,
1802 .owner = THIS_MODULE,
1803 .release = packet_release,
1804 .bind = packet_bind_spkt,
1805 .connect = sock_no_connect,
1806 .socketpair = sock_no_socketpair,
1807 .accept = sock_no_accept,
1808 .getname = packet_getname_spkt,
1809 .poll = datagram_poll,
1810 .ioctl = packet_ioctl,
1811 .listen = sock_no_listen,
1812 .shutdown = sock_no_shutdown,
1813 .setsockopt = sock_no_setsockopt,
1814 .getsockopt = sock_no_getsockopt,
1815 .sendmsg = packet_sendmsg_spkt,
1816 .recvmsg = packet_recvmsg,
1817 .mmap = sock_no_mmap,
1818 .sendpage = sock_no_sendpage,
/*
 * proto_ops for SOCK_RAW/SOCK_DGRAM PF_PACKET sockets: full sockopt
 * support and packet_mmap for the rx ring.  Made non-static (and not
 * const) when VNET is configured, presumably so the vnet module can
 * reference it — TODO confirm against the vnet code.
 */
1822 #if !defined(CONFIG_VNET) && !defined(CONFIG_VNET_MODULE)
1825 struct proto_ops packet_ops = {
1826 .family = PF_PACKET,
1827 .owner = THIS_MODULE,
1828 .release = packet_release,
1829 .bind = packet_bind,
1830 .connect = sock_no_connect,
1831 .socketpair = sock_no_socketpair,
1832 .accept = sock_no_accept,
1833 .getname = packet_getname,
1834 .poll = packet_poll,
1835 .ioctl = packet_ioctl,
1836 .listen = sock_no_listen,
1837 .shutdown = sock_no_shutdown,
1838 .setsockopt = packet_setsockopt,
1839 .getsockopt = packet_getsockopt,
1840 .sendmsg = packet_sendmsg,
1841 .recvmsg = packet_recvmsg,
1842 .mmap = packet_mmap,
1843 .sendpage = sock_no_sendpage,
/*
 * PF_PACKET address-family registration record.  With VNET configured
 * it is forward-declared and exported so the vnet module can see it;
 * the initializer binds packet_create as the socket creation hook.
 */
1846 #if defined(CONFIG_VNET) || defined(CONFIG_VNET_MODULE)
1847 struct net_proto_family packet_family_ops;
1848 EXPORT_SYMBOL(packet_family_ops);
1852 struct net_proto_family packet_family_ops = {
1853 .family = PF_PACKET,
1854 .create = packet_create,
1855 .owner = THIS_MODULE,
/* Netdevice event notifier: packet_notifier reacts to device changes. */
1858 static struct notifier_block packet_netdev_notifier = {
1859 .notifier_call =packet_notifier,
1862 #ifdef CONFIG_PROC_FS
/*
 * Walk the global packet socket list and return the socket at position
 * 'off' (caller holds packet_sklist_lock); body partly elided here.
 */
1863 static inline struct sock *packet_seq_idx(loff_t off)
1866 struct hlist_node *node;
1868 sk_for_each(s, node, &packet_sklist) {
/*
 * seq_file start: take the socket-list read lock (released in
 * packet_seq_stop).  *pos == 0 yields SEQ_START_TOKEN so show() can
 * print the header line; otherwise resume at entry pos-1.
 */
1875 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1877 read_lock(&packet_sklist_lock);
1878 return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file next: after the header token, start at the list head;
 * otherwise advance to the following socket in packet_sklist.
 */
1881 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1884 return (v == SEQ_START_TOKEN)
1885 ? sk_head(&packet_sklist)
1886 : sk_next((struct sock*)v) ;
/* seq_file stop: drop the read lock taken in packet_seq_start. */
1889 static void packet_seq_stop(struct seq_file *seq, void *v)
1891 read_unlock(&packet_sklist_lock);
/*
 * seq_file show: print the column header for SEQ_START_TOKEN, else one
 * /proc/net/packet line per socket (address, refcount, type, protocol,
 * ifindex, running flag, rmem, uid, inode — some fields elided here).
 */
1894 static int packet_seq_show(struct seq_file *seq, void *v)
1896 if (v == SEQ_START_TOKEN)
1897 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
1900 const struct packet_sock *po = pkt_sk(s);
1903 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1905 atomic_read(&s->sk_refcnt),
1910 atomic_read(&s->sk_rmem_alloc),
/* seq_file iterator ops for /proc/net/packet. */
1918 static struct seq_operations packet_seq_ops = {
1919 .start = packet_seq_start,
1920 .next = packet_seq_next,
1921 .stop = packet_seq_stop,
1922 .show = packet_seq_show,
/* open() for /proc/net/packet: attach the seq_file iterator. */
1925 static int packet_seq_open(struct inode *inode, struct file *file)
1927 return seq_open(file, &packet_seq_ops);
/* file_operations for /proc/net/packet (read op elided in excerpt). */
1930 static struct file_operations packet_seq_fops = {
1931 .owner = THIS_MODULE,
1932 .open = packet_seq_open,
1934 .llseek = seq_lseek,
1935 .release = seq_release,
/*
 * Module unload: unwind packet_init in reverse order — remove the proc
 * entry, the netdevice notifier, the PF_PACKET family, then the proto.
 */
1940 static void __exit packet_exit(void)
1942 proc_net_remove("packet");
1943 unregister_netdevice_notifier(&packet_netdev_notifier);
1944 sock_unregister(PF_PACKET);
1945 proto_unregister(&packet_proto);
/*
 * Module load: register the packet proto, the PF_PACKET family, the
 * netdevice notifier and /proc/net/packet.  The early-return check on
 * rc is elided from this excerpt.
 */
1948 static int __init packet_init(void)
1950 int rc = proto_register(&packet_proto, 0);
1955 sock_register(&packet_family_ops);
1956 register_netdevice_notifier(&packet_netdev_notifier);
1957 proc_net_fops_create("packet", 0, &packet_seq_fops);
/* Module entry/exit hooks, license, and PF_PACKET protocol alias. */
1962 module_init(packet_init);
1963 module_exit(packet_exit);
1964 MODULE_LICENSE("GPL");
1965 MODULE_ALIAS_NETPROTO(PF_PACKET);