patch-2_6_7-vs1_9_1_12
[linux-2.6.git] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64
65 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66 #define CONFIG_IP_PIMSM 1
67 #endif
68
69 static struct sock *mroute_socket;
70
71
72 /* Big lock, protecting vif table, mrt cache and mroute socket state.
73    Note that the changes are semaphored via rtnl_lock.
74  */
75
76 static rwlock_t mrt_lock = RW_LOCK_UNLOCKED;
77
78 /*
79  *      Multicast router control variables
80  */
81
82 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
83 static int maxvif;
84
85 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86
87 static int mroute_do_assert;                            /* Set in PIM assert    */
88 static int mroute_do_pim;
89
90 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
91
92 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
93 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
94
95 /* Special spinlock for queue of unresolved entries */
96 static spinlock_t mfc_unres_lock = SPIN_LOCK_UNLOCKED;
97
98 /* We return to original Alan's scheme. Hash table of resolved
99    entries is changed only in process context and protected
100    with weak lock mrt_lock. Queue of unresolved entries is protected
101    with strong spinlock mfc_unres_lock.
102
103    In this case data path is free of exclusive locks at all.
104  */
105
106 static kmem_cache_t *mrt_cachep;
107
108 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111
112 static struct inet_protocol pim_protocol;
113
114 static struct timer_list ipmr_expire_timer;
115
116 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
117
/* Create (or reuse) the DVMRP IPIP tunnel device "dvmrp%d" backing a
 * VIFF_TUNNEL vif.  Drives the tunl0 SIOCADDTUNNEL ioctl from kernel
 * context, temporarily widening the address limit with set_fs(KERNEL_DS)
 * because do_ioctl expects a user-space parameter block.
 * Returns the opened device, or NULL on any failure.  Runs under rtnl_lock
 * (see the unlock/relock in the failure path).
 */
118 static
119 struct net_device *ipmr_new_tunnel(struct vifctl *v)
120 {
121         struct net_device  *dev;
122
123         dev = __dev_get_by_name("tunl0");
124
125         if (dev) {
126                 int err;
127                 struct ifreq ifr;
128                 mm_segment_t    oldfs;
129                 struct ip_tunnel_parm p;
130                 struct in_device  *in_dev;
131
132                 memset(&p, 0, sizeof(p));
133                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
134                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
135                 p.iph.version = 4;
136                 p.iph.ihl = 5;
137                 p.iph.protocol = IPPROTO_IPIP;
138                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
139                 ifr.ifr_ifru.ifru_data = (void*)&p;
140
                    /* The tunnel ioctl copies its argument from "user" space;
                     * point the limit at kernel memory for the duration. */
141                 oldfs = get_fs(); set_fs(KERNEL_DS);
142                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
143                 set_fs(oldfs);
144
145                 dev = NULL;
146
147                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
148                         dev->flags |= IFF_MULTICAST;
149
                            /* Ensure the tunnel has IPv4 state and disable
                             * reverse-path filtering on it. */
150                         in_dev = __in_dev_get(dev);
151                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
152                                 goto failure;
153                         in_dev->cnf.rp_filter = 0;
154
155                         if (dev_open(dev))
156                                 goto failure;
157                 }
158         }
159         return dev;
160
161 failure:
162         /* allow the register to be completed before unregistering. */
163         rtnl_unlock();
164         rtnl_lock();
165
166         unregister_netdevice(dev);
167         return NULL;
168 }
169
170 #ifdef CONFIG_IP_PIMSM
171
172 static int reg_vif_num = -1;
173
/* hard_start_xmit handler for the "pimreg" device: every packet routed out
 * of the register vif is delivered whole to the user-space PIM daemon as an
 * IGMPMSG_WHOLEPKT upcall instead of hitting a wire.  mrt_lock is held for
 * reading so reg_vif_num stays stable across the report.
 */
174 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
175 {
176         read_lock(&mrt_lock);
177         ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
178         ((struct net_device_stats*)dev->priv)->tx_packets++;
179         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
180         read_unlock(&mrt_lock);
            /* ipmr_cache_report() worked on its own copy
             * (skb_realloc_headroom); the original skb is consumed here. */
181         kfree_skb(skb);
182         return 0;
183 }
184
185 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
186 {
187         return (struct net_device_stats*)dev->priv;
188 }
189
/* Initializer passed to alloc_netdev() for the PIM register device. */
190 static void reg_vif_setup(struct net_device *dev)
191 {
192         dev->type               = ARPHRD_PIMREG;
            /* Leave room for an outer IP header plus the 8-byte PIM
             * register header on encapsulated packets. */
193         dev->mtu                = 1500 - sizeof(struct iphdr) - 8;
194         dev->flags              = IFF_NOARP;
195         dev->hard_start_xmit    = reg_vif_xmit;
196         dev->get_stats          = reg_vif_get_stats;
            /* Device memory is released automatically on unregister. */
197         dev->destructor         = free_netdev;
198 }
199
/* Allocate, register and open the "pimreg" device used by the VIFF_REGISTER
 * vif.  Returns the live device or NULL.  Runs under rtnl_lock; on failure
 * after registration the lock is cycled so the pending register completes
 * before unregister_netdevice() tears the device back down.
 */
200 static struct net_device *ipmr_reg_vif(void)
201 {
202         struct net_device *dev;
203         struct in_device *in_dev;
204
205         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
206                            reg_vif_setup);
207
208         if (dev == NULL)
209                 return NULL;
210
211         if (register_netdevice(dev)) {
212                 free_netdev(dev);
213                 return NULL;
214         }
215         dev->iflink = 0;
216
            /* Give the device IPv4 state with rp_filter off, as for tunnels. */
217         if ((in_dev = inetdev_init(dev)) == NULL)
218                 goto failure;
219
220         in_dev->cnf.rp_filter = 0;
221
222         if (dev_open(dev))
223                 goto failure;
224
225         return dev;
226
227 failure:
228         /* allow the register to be completed before unregistering. */
229         rtnl_unlock();
230         rtnl_lock();
231
232         unregister_netdevice(dev);
233         return NULL;
234 }
235 #endif
236
237 /*
238  *      Delete a VIF entry
239  */
240  
/* Tear down vif @vifi: unhook it from vif_table under the writer side of
 * mrt_lock, then undo the device-side state (allmulti, mc_forwarding)
 * outside the lock and drop the device reference taken by vif_add().
 * Returns 0 or -EADDRNOTAVAIL if the slot is out of range or empty.
 */
241 static int vif_delete(int vifi)
242 {
243         struct vif_device *v;
244         struct net_device *dev;
245         struct in_device *in_dev;
246
247         if (vifi < 0 || vifi >= maxvif)
248                 return -EADDRNOTAVAIL;
249
250         v = &vif_table[vifi];
251
252         write_lock_bh(&mrt_lock);
253         dev = v->dev;
254         v->dev = NULL;
255
256         if (!dev) {
257                 write_unlock_bh(&mrt_lock);
258                 return -EADDRNOTAVAIL;
259         }
260
261 #ifdef CONFIG_IP_PIMSM
262         if (vifi == reg_vif_num)
263                 reg_vif_num = -1;
264 #endif
265
            /* If the highest-numbered vif went away, shrink maxvif down to
             * the next occupied slot (tmp is -1 / maxvif 0 when none left). */
266         if (vifi+1 == maxvif) {
267                 int tmp;
268                 for (tmp=vifi-1; tmp>=0; tmp--) {
269                         if (VIF_EXISTS(tmp))
270                                 break;
271                 }
272                 maxvif = tmp+1;
273         }
274
275         write_unlock_bh(&mrt_lock);
276
277         dev_set_allmulti(dev, -1);
278
279         if ((in_dev = __in_dev_get(dev)) != NULL) {
280                 in_dev->cnf.mc_forwarding--;
281                 ip_rt_multicast_event(in_dev);
282         }
283
            /* Tunnel and register devices exist only for this vif; destroy
             * them along with it. */
284         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
285                 unregister_netdevice(dev);
286
287         dev_put(dev);
288         return 0;
289 }
290
291 /* Destroy an unresolved cache entry, killing queued skbs
292    and reporting error to netlink readers.
293  */
294
295 static void ipmr_destroy_unres(struct mfc_cache *c)
296 {
297         struct sk_buff *skb;
298
299         atomic_dec(&cache_resolve_queue_len);
300
            /* Drain every skb still parked on this unresolved entry.  An
             * iph->version of 0 marks a queued netlink route request (see
             * ipmr_cache_resolve); answer it with -ETIMEDOUT.  Real
             * packets are simply dropped. */
301         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
302                 if (skb->nh.iph->version == 0) {
303                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
304                         nlh->nlmsg_type = NLMSG_ERROR;
305                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
306                         skb_trim(skb, nlh->nlmsg_len);
307                         ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
308                         netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
309                 } else
310                         kfree_skb(skb);
311         }
312
313         kmem_cache_free(mrt_cachep, c);
314 }
315
316
317 /* Single timer process for all the unresolved queue. */
318
/* Timer handler that ages the unresolved-entry queue.  Runs in timer
 * (softirq) context, so it only trylocks mfc_unres_lock and re-arms itself
 * for HZ/10 later rather than spinning if the lock is busy.  Expired
 * entries are unlinked and destroyed; the timer is re-armed for the
 * earliest remaining expiry.
 */
319 static void ipmr_expire_process(unsigned long dummy)
320 {
321         unsigned long now;
322         unsigned long expires;
323         struct mfc_cache *c, **cp;
324
325         if (!spin_trylock(&mfc_unres_lock)) {
326                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
327                 return;
328         }
329
330         if (atomic_read(&cache_resolve_queue_len) == 0)
331                 goto out;
332
333         now = jiffies;
            /* Upper bound on the re-arm delay; shrunk below to the soonest
             * not-yet-expired entry. */
334         expires = 10*HZ;
335         cp = &mfc_unres_queue;
336
337         while ((c=*cp) != NULL) {
338                 if (time_after(c->mfc_un.unres.expires, now)) {
339                         unsigned long interval = c->mfc_un.unres.expires - now;
340                         if (interval < expires)
341                                 expires = interval;
342                         cp = &c->next;
343                         continue;
344                 }
345
346                 *cp = c->next;
347
348                 ipmr_destroy_unres(c);
349         }
350
351         if (atomic_read(&cache_resolve_queue_len))
352                 mod_timer(&ipmr_expire_timer, jiffies + expires);
353
354 out:
355         spin_unlock(&mfc_unres_lock);
356 }
357
358 /* Fill oifs list. It is called under write locked mrt_lock. */
359
/* Rebuild a cache entry's per-vif TTL thresholds from @ttls and recompute
 * its [minvif, maxvif) forwarding window.  A ttl of 0 or >= 255 means
 * "do not forward on this vif" (slot left at 255).
 * NOTE(review): the name is a historical misspelling of "thresholds";
 * kept as-is since both call sites in this file use it.
 */
360 static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
361 {
362         int vifi;
363
364         cache->mfc_un.res.minvif = MAXVIFS;
365         cache->mfc_un.res.maxvif = 0;
366         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
367
368         for (vifi=0; vifi<maxvif; vifi++) {
369                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
370                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
371                         if (cache->mfc_un.res.minvif > vifi)
372                                 cache->mfc_un.res.minvif = vifi;
373                         if (cache->mfc_un.res.maxvif <= vifi)
374                                 cache->mfc_un.res.maxvif = vifi + 1;
375                 }
376         }
377 }
378
/* Install the vif described by @vifc.  @mrtsock is true when the request
 * came from the mroute control socket; otherwise the vif is flagged
 * VIFF_STATIC so mroute_clean_tables() leaves it in place.  Runs under
 * rtnl_lock; the setsockopt caller has already checked vifc_vifi < MAXVIFS.
 */
379 static int vif_add(struct vifctl *vifc, int mrtsock)
380 {
381         int vifi = vifc->vifc_vifi;
382         struct vif_device *v = &vif_table[vifi];
383         struct net_device *dev;
384         struct in_device *in_dev;
385
386         /* Is vif busy ? */
387         if (VIF_EXISTS(vifi))
388                 return -EADDRINUSE;
389
            /* Pick the backing device for the requested vif flavour. */
390         switch (vifc->vifc_flags) {
391 #ifdef CONFIG_IP_PIMSM
392         case VIFF_REGISTER:
393                 /*
394                  * Special Purpose VIF in PIM
395                  * All the packets will be sent to the daemon
396                  */
397                 if (reg_vif_num >= 0)
398                         return -EADDRINUSE;
399                 dev = ipmr_reg_vif();
400                 if (!dev)
401                         return -ENOBUFS;
402                 break;
403 #endif
404         case VIFF_TUNNEL:       
405                 dev = ipmr_new_tunnel(vifc);
406                 if (!dev)
407                         return -ENOBUFS;
408                 break;
409         case 0:
410                 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
411                 if (!dev)
412                         return -EADDRNOTAVAIL;
                    /* ip_dev_find took a reference; the lasting one is
                     * taken below under mrt_lock via dev_hold(). */
413                 __dev_put(dev);
414                 break;
415         default:
416                 return -EINVAL;
417         }
418
419         if ((in_dev = __in_dev_get(dev)) == NULL)
420                 return -EADDRNOTAVAIL;
421         in_dev->cnf.mc_forwarding++;
422         dev_set_allmulti(dev, +1);
423         ip_rt_multicast_event(in_dev);
424
425         /*
426          *      Fill in the VIF structures
427          */
428         v->rate_limit=vifc->vifc_rate_limit;
429         v->local=vifc->vifc_lcl_addr.s_addr;
430         v->remote=vifc->vifc_rmt_addr.s_addr;
431         v->flags=vifc->vifc_flags;
432         if (!mrtsock)
433                 v->flags |= VIFF_STATIC;
434         v->threshold=vifc->vifc_threshold;
435         v->bytes_in = 0;
436         v->bytes_out = 0;
437         v->pkt_in = 0;
438         v->pkt_out = 0;
439         v->link = dev->ifindex;
440         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
441                 v->link = dev->iflink;
442
443         /* And finish update writing critical data */
444         write_lock_bh(&mrt_lock);
445         dev_hold(dev);
            /* Publishing v->dev last makes the vif visible to readers only
             * once fully initialised (VIF_EXISTS keys off dev). */
446         v->dev=dev;
447 #ifdef CONFIG_IP_PIMSM
448         if (v->flags&VIFF_REGISTER)
449                 reg_vif_num = vifi;
450 #endif
451         if (vifi+1 > maxvif)
452                 maxvif = vifi+1;
453         write_unlock_bh(&mrt_lock);
454         return 0;
455 }
456
457 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
458 {
459         int line=MFC_HASH(mcastgrp,origin);
460         struct mfc_cache *c;
461
462         for (c=mfc_cache_array[line]; c; c = c->next) {
463                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
464                         break;
465         }
466         return c;
467 }
468
469 /*
470  *      Allocate a multicast cache entry
471  */
472 static struct mfc_cache *ipmr_cache_alloc(void)
473 {
474         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
475         if(c==NULL)
476                 return NULL;
477         memset(c, 0, sizeof(*c));
478         c->mfc_un.res.minvif = MAXVIFS;
479         return c;
480 }
481
482 static struct mfc_cache *ipmr_cache_alloc_unres(void)
483 {
484         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
485         if(c==NULL)
486                 return NULL;
487         memset(c, 0, sizeof(*c));
488         skb_queue_head_init(&c->mfc_un.unres.unresolved);
489         c->mfc_un.unres.expires = jiffies + 10*HZ;
490         return c;
491 }
492
493 /*
494  *      A cache entry has gone into a resolved state from queued
495  */
496  
/* Replay everything queued on unresolved entry @uc now that resolved entry
 * @c exists.  Queued items are either real packets (forwarded normally) or
 * netlink route requests, recognised by a pseudo IP header whose version
 * field is 0, which get answered via rtnl.
 */
497 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
498 {
499         struct sk_buff *skb;
500
501         /*
502          *      Play the pending entries through our router
503          */
504
505         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
506                 if (skb->nh.iph->version == 0) {
507                         int err;
508                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
509
510                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
511                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
512                         } else {
                                    /* Route did not fit: turn the reply
                                     * into an NLMSG_ERROR of -EMSGSIZE. */
513                                 nlh->nlmsg_type = NLMSG_ERROR;
514                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
515                                 skb_trim(skb, nlh->nlmsg_len);
516                                 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
517                         }
518                         err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
519                 } else
520                         ip_mr_forward(skb, c, 0);
521         }
522 }
523
524 /*
525  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
526  *      expects the following bizarre scheme.
527  *
528  *      Called under mrt_lock.
529  */
530  
531 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
532 {
533         struct sk_buff *skb;
534         int ihl = pkt->nh.iph->ihl<<2;
535         struct igmphdr *igmp;
536         struct igmpmsg *msg;
537         int ret;
538
            /* WHOLEPKT upcalls carry the full packet with an extra outer IP
             * header prepended; other asserts send only a small synthetic
             * IGMP-framed notification. */
539 #ifdef CONFIG_IP_PIMSM
540         if (assert == IGMPMSG_WHOLEPKT)
541                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
542         else
543 #endif
544                 skb = alloc_skb(128, GFP_ATOMIC);
545
546         if(!skb)
547                 return -ENOBUFS;
548
549 #ifdef CONFIG_IP_PIMSM
550         if (assert == IGMPMSG_WHOLEPKT) {
551                 /* Ugly, but we have no choice with this interface.
552                    Duplicate old header, fix ihl, length etc.
553                    And all this only to mangle msg->im_msgtype and
554                    to set msg->im_mbz to "mbz" :-)
555                  */
556                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
557                 skb->nh.raw = skb->h.raw = (u8*)msg;
558                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
559                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
560                 msg->im_mbz = 0;
561                 msg->im_vif = reg_vif_num;
562                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
563                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
564         } else 
565 #endif
566         {       
567                 
568         /*
569          *      Copy the IP header
570          */
571
572         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
573         memcpy(skb->data,pkt->data,ihl);
574         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
575         msg = (struct igmpmsg*)skb->nh.iph;
576         msg->im_vif = vifi;
577         skb->dst = dst_clone(pkt->dst);
578
579         /*
580          *      Add our header
581          */
582
583         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
            /* One value stored to both fields: the mrouted protocol reuses
             * the IGMP type slot as the upcall message type. */
584         igmp->type      =
585         msg->im_msgtype = assert;
586         igmp->code      =       0;
587         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
588         skb->h.raw = skb->nh.raw;
589         }
590
591         if (mroute_socket == NULL) {
592                 kfree_skb(skb);
593                 return -EINVAL;
594         }
595
596         /*
597          *      Deliver to mrouted
598          */
599         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
600                 if (net_ratelimit())
601                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
602                 kfree_skb(skb);
603         }
604
605         return ret;
606 }
607
608 /*
609  *      Queue a packet for resolution. It gets locked cache entry!
610  */
611  
612 static int
613 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
614 {
615         int err;
616         struct mfc_cache *c;
617
            /* Find an existing unresolved entry for this (S,G) under
             * mfc_unres_lock, which is held for the rest of the function. */
618         spin_lock_bh(&mfc_unres_lock);
619         for (c=mfc_unres_queue; c; c=c->next) {
620                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
621                     c->mfc_origin == skb->nh.iph->saddr)
622                         break;
623         }
624
625         if (c == NULL) {
626                 /*
627                  *      Create a new entry if allowable
628                  */
629
                    /* Hard cap of 10 pending unresolved entries. */
630                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
631                     (c=ipmr_cache_alloc_unres())==NULL) {
632                         spin_unlock_bh(&mfc_unres_lock);
633
634                         kfree_skb(skb);
635                         return -ENOBUFS;
636                 }
637
638                 /*
639                  *      Fill in the new cache entry
640                  */
641                 c->mfc_parent=-1;
642                 c->mfc_origin=skb->nh.iph->saddr;
643                 c->mfc_mcastgrp=skb->nh.iph->daddr;
644
645                 /*
646                  *      Reflect first query at mrouted.
647                  */
648                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
649                         /* If the report failed throw the cache entry 
650                            out - Brad Parker
651                          */
652                         spin_unlock_bh(&mfc_unres_lock);
653
654                         kmem_cache_free(mrt_cachep, c);
655                         kfree_skb(skb);
656                         return err;
657                 }
658
659                 atomic_inc(&cache_resolve_queue_len);
660                 c->next = mfc_unres_queue;
661                 mfc_unres_queue = c;
662
                    /* Arm the expiry timer for this entry's deadline. */
663                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
664         }
665
666         /*
667          *      See if we can append the packet
668          */
                /* At most 4 packets may wait per unresolved entry. */
669         if (c->mfc_un.unres.unresolved.qlen>3) {
670                 kfree_skb(skb);
671                 err = -ENOBUFS;
672         } else {
673                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
674                 err = 0;
675         }
676
677         spin_unlock_bh(&mfc_unres_lock);
678         return err;
679 }
680
681 /*
682  *      MFC cache manipulation by user space mroute daemon
683  */
684
685 static int ipmr_mfc_delete(struct mfcctl *mfc)
686 {
687         int line;
688         struct mfc_cache *c, **cp;
689
690         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
691
692         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
693                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
694                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
695                         write_lock_bh(&mrt_lock);
696                         *cp = c->next;
697                         write_unlock_bh(&mrt_lock);
698
699                         kmem_cache_free(mrt_cachep, c);
700                         return 0;
701                 }
702         }
703         return -ENOENT;
704 }
705
/* Add or update a resolved MFC entry from user space.  If an unresolved
 * entry exists for the same (S,G), it is pulled off the unresolved queue
 * and its pending packets are replayed through the new route.  @mrtsock
 * distinguishes daemon-created entries from static ones.
 */
706 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
707 {
708         int line;
709         struct mfc_cache *uc, *c, **cp;
710
711         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
712
713         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
714                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
715                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
716                         break;
717         }
718
            /* Existing entry: just refresh parent and TTL thresholds. */
719         if (c != NULL) {
720                 write_lock_bh(&mrt_lock);
721                 c->mfc_parent = mfc->mfcc_parent;
722                 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
723                 if (!mrtsock)
724                         c->mfc_flags |= MFC_STATIC;
725                 write_unlock_bh(&mrt_lock);
726                 return 0;
727         }
728
729         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
730                 return -EINVAL;
731
732         c=ipmr_cache_alloc();
733         if (c==NULL)
734                 return -ENOMEM;
735
736         c->mfc_origin=mfc->mfcc_origin.s_addr;
737         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
738         c->mfc_parent=mfc->mfcc_parent;
739         ipmr_update_threshoulds(c, mfc->mfcc_ttls);
740         if (!mrtsock)
741                 c->mfc_flags |= MFC_STATIC;
742
743         write_lock_bh(&mrt_lock);
744         c->next = mfc_cache_array[line];
745         mfc_cache_array[line] = c;
746         write_unlock_bh(&mrt_lock);
747
748         /*
749          *      Check to see if we resolved a queued list. If so we
750          *      need to send on the frames and tidy up.
751          */
            /* uc is NULL here when the loop completes without a match. */
752         spin_lock_bh(&mfc_unres_lock);
753         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
754              cp = &uc->next) {
755                 if (uc->mfc_origin == c->mfc_origin &&
756                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
757                         *cp = uc->next;
758                         if (atomic_dec_and_test(&cache_resolve_queue_len))
759                                 del_timer(&ipmr_expire_timer);
760                         break;
761                 }
762         }
763         spin_unlock_bh(&mfc_unres_lock);
764
            /* Replay the packets queued on the now-detached entry. */
765         if (uc) {
766                 ipmr_cache_resolve(uc, c);
767                 kmem_cache_free(mrt_cachep, uc);
768         }
769         return 0;
770 }
771
772 /*
773  *      Close the multicast socket, and clear the vif tables etc
774  */
775  
776 static void mroute_clean_tables(struct sock *sk)
777 {
778         int i;
779                 
780         /*
781          *      Shut down all active vif entries
782          */
            /* Static (administratively pinned) vifs survive socket close. */
783         for(i=0; i<maxvif; i++) {
784                 if (!(vif_table[i].flags&VIFF_STATIC))
785                         vif_delete(i);
786         }
787
788         /*
789          *      Wipe the cache
790          */
791         for (i=0;i<MFC_LINES;i++) {
792                 struct mfc_cache *c, **cp;
793
794                 cp = &mfc_cache_array[i];
795                 while ((c = *cp) != NULL) {
796                         if (c->mfc_flags&MFC_STATIC) {
797                                 cp = &c->next;
798                                 continue;
799                         }
800                         write_lock_bh(&mrt_lock);
801                         *cp = c->next;
802                         write_unlock_bh(&mrt_lock);
803
804                         kmem_cache_free(mrt_cachep, c);
805                 }
806         }
807
            /* Drain the unresolved queue.  The lock is dropped around
             * ipmr_destroy_unres() because it may deliver netlink errors;
             * the queue head is re-read each iteration. */
808         if (atomic_read(&cache_resolve_queue_len) != 0) {
809                 struct mfc_cache *c;
810
811                 spin_lock_bh(&mfc_unres_lock);
812                 while (mfc_unres_queue != NULL) {
813                         c = mfc_unres_queue;
814                         mfc_unres_queue = c->next;
815                         spin_unlock_bh(&mfc_unres_lock);
816
817                         ipmr_destroy_unres(c);
818
819                         spin_lock_bh(&mfc_unres_lock);
820                 }
821                 spin_unlock_bh(&mfc_unres_lock);
822         }
823 }
824
/* ip_ra_control destructor for the mroute control socket: when the owning
 * socket goes away, detach it, drop the global mc_forwarding count and
 * clean out the non-static vifs and cache entries it installed.
 */
825 static void mrtsock_destruct(struct sock *sk)
826 {
827         rtnl_lock();
828         if (sk == mroute_socket) {
829                 ipv4_devconf.mc_forwarding--;
830
831                 write_lock_bh(&mrt_lock);
832                 mroute_socket=NULL;
833                 write_unlock_bh(&mrt_lock);
834
835                 mroute_clean_tables(sk);
836         }
837         rtnl_unlock();
838 }
839
840 /*
841  *      Socket options and virtual interface manipulation. The whole
842  *      virtual interface system is a complete heap, but unfortunately
843  *      that's how BSD mrouted happens to think. Maybe one day with a proper
844  *      MOSPF/PIM router set up we can clean this up.
845  */
846  
847 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
848 {
849         int ret;
850         struct vifctl vif;
851         struct mfcctl mfc;
852         
853         if(optname!=MRT_INIT)
854         {
855                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
856                         return -EACCES;
857         }
858
859         switch(optname)
860         {
861                 case MRT_INIT:
862                         if (sk->sk_type != SOCK_RAW ||
863                             inet_sk(sk)->num != IPPROTO_IGMP)
864                                 return -EOPNOTSUPP;
865                         if(optlen!=sizeof(int))
866                                 return -ENOPROTOOPT;
867
868                         rtnl_lock();
869                         if (mroute_socket) {
870                                 rtnl_unlock();
871                                 return -EADDRINUSE;
872                         }
873
874                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
875                         if (ret == 0) {
876                                 write_lock_bh(&mrt_lock);
877                                 mroute_socket=sk;
878                                 write_unlock_bh(&mrt_lock);
879
880                                 ipv4_devconf.mc_forwarding++;
881                         }
882                         rtnl_unlock();
883                         return ret;
884                 case MRT_DONE:
885                         if (sk!=mroute_socket)
886                                 return -EACCES;
887                         return ip_ra_control(sk, 0, NULL);
888                 case MRT_ADD_VIF:
889                 case MRT_DEL_VIF:
890                         if(optlen!=sizeof(vif))
891                                 return -EINVAL;
892                         if (copy_from_user(&vif,optval,sizeof(vif)))
893                                 return -EFAULT; 
894                         if(vif.vifc_vifi >= MAXVIFS)
895                                 return -ENFILE;
896                         rtnl_lock();
897                         if (optname==MRT_ADD_VIF) {
898                                 ret = vif_add(&vif, sk==mroute_socket);
899                         } else {
900                                 ret = vif_delete(vif.vifc_vifi);
901                         }
902                         rtnl_unlock();
903                         return ret;
904
905                 /*
906                  *      Manipulate the forwarding caches. These live
907                  *      in a sort of kernel/user symbiosis.
908                  */
909                 case MRT_ADD_MFC:
910                 case MRT_DEL_MFC:
911                         if(optlen!=sizeof(mfc))
912                                 return -EINVAL;
913                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
914                                 return -EFAULT;
915                         rtnl_lock();
916                         if (optname==MRT_DEL_MFC)
917                                 ret = ipmr_mfc_delete(&mfc);
918                         else
919                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
920                         rtnl_unlock();
921                         return ret;
922                 /*
923                  *      Control PIM assert.
924                  */
925                 case MRT_ASSERT:
926                 {
927                         int v;
928                         if(get_user(v,(int __user *)optval))
929                                 return -EFAULT;
930                         mroute_do_assert=(v)?1:0;
931                         return 0;
932                 }
933 #ifdef CONFIG_IP_PIMSM
934                 case MRT_PIM:
935                 {
936                         int v, ret;
937                         if(get_user(v,(int __user *)optval))
938                                 return -EFAULT;
939                         v = (v)?1:0;
940                         rtnl_lock();
941                         ret = 0;
942                         if (v != mroute_do_pim) {
943                                 mroute_do_pim = v;
944                                 mroute_do_assert = v;
945 #ifdef CONFIG_IP_PIMSM_V2
946                                 if (mroute_do_pim)
947                                         ret = inet_add_protocol(&pim_protocol,
948                                                                 IPPROTO_PIM);
949                                 else
950                                         ret = inet_del_protocol(&pim_protocol,
951                                                                 IPPROTO_PIM);
952                                 if (ret < 0)
953                                         ret = -EAGAIN;
954 #endif
955                         }
956                         rtnl_unlock();
957                         return ret;
958                 }
959 #endif
960                 /*
961                  *      Spurious command, or MRT_VERSION which you cannot
962                  *      set.
963                  */
964                 default:
965                         return -ENOPROTOOPT;
966         }
967 }
968
969 /*
970  *      Getsock opt support for the multicast routing system.
971  */
972  
973 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
974 {
975         int olr;
976         int val;
977
978         if(optname!=MRT_VERSION && 
979 #ifdef CONFIG_IP_PIMSM
980            optname!=MRT_PIM &&
981 #endif
982            optname!=MRT_ASSERT)
983                 return -ENOPROTOOPT;
984
985         if (get_user(olr, optlen))
986                 return -EFAULT;
987
988         olr = min_t(unsigned int, olr, sizeof(int));
989         if (olr < 0)
990                 return -EINVAL;
991                 
992         if(put_user(olr,optlen))
993                 return -EFAULT;
994         if(optname==MRT_VERSION)
995                 val=0x0305;
996 #ifdef CONFIG_IP_PIMSM
997         else if(optname==MRT_PIM)
998                 val=mroute_do_pim;
999 #endif
1000         else
1001                 val=mroute_do_assert;
1002         if(copy_to_user(optval,&val,olr))
1003                 return -EFAULT;
1004         return 0;
1005 }
1006
1007 /*
1008  *      The IP multicast ioctl support routines.
1009  */
1010  
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
	
	switch(cmd)
	{
		/* Read per-vif packet/byte counters. */
		case SIOCGETVIFCNT:
			if (copy_from_user(&vr,arg,sizeof(vr)))
				return -EFAULT; 
			if(vr.vifi>=maxvif)
				return -EINVAL;
			read_lock(&mrt_lock);
			vif=&vif_table[vr.vifi];
			if(VIF_EXISTS(vr.vifi)) {
				/* Snapshot the counters under mrt_lock, then drop
				 * the lock before the (sleepable) copy_to_user. */
				vr.icount=vif->pkt_in;
				vr.ocount=vif->pkt_out;
				vr.ibytes=vif->bytes_in;
				vr.obytes=vif->bytes_out;
				read_unlock(&mrt_lock);

				if (copy_to_user(arg,&vr,sizeof(vr)))
					return -EFAULT;
				return 0;
			}
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		/* Read per-(S,G) forwarding-cache counters. */
		case SIOCGETSGCNT:
			if (copy_from_user(&sr,arg,sizeof(sr)))
				return -EFAULT;

			read_lock(&mrt_lock);
			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
			if (c) {
				/* Same pattern: snapshot, unlock, then copy out. */
				sr.pktcnt = c->mfc_un.res.pkt;
				sr.bytecnt = c->mfc_un.res.bytes;
				sr.wrong_if = c->mfc_un.res.wrong_if;
				read_unlock(&mrt_lock);

				if (copy_to_user(arg,&sr,sizeof(sr)))
					return -EFAULT;
				return 0;
			}
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		default:
			return -ENOIOCTLCMD;
	}
}
1062
1063
1064 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1065 {
1066         struct vif_device *v;
1067         int ct;
1068         if (event != NETDEV_UNREGISTER)
1069                 return NOTIFY_DONE;
1070         v=&vif_table[0];
1071         for(ct=0;ct<maxvif;ct++,v++) {
1072                 if (v->dev==ptr)
1073                         vif_delete(ct);
1074         }
1075         return NOTIFY_DONE;
1076 }
1077
1078
/* Hooks ipmr_device_event() into the netdevice notification chain. */
static struct notifier_block ip_mr_notifier={
	.notifier_call = ipmr_device_event,
};
1082
1083 /*
1084  *      Encapsulate a packet by attaching a valid IPIP header to it.
1085  *      This avoids tunnel drivers and other mess and gives us the speed so
1086  *      important for multicast video.
1087  */
1088  
1089 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1090 {
1091         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1092
1093         iph->version    =       4;
1094         iph->tos        =       skb->nh.iph->tos;
1095         iph->ttl        =       skb->nh.iph->ttl;
1096         iph->frag_off   =       0;
1097         iph->daddr      =       daddr;
1098         iph->saddr      =       saddr;
1099         iph->protocol   =       IPPROTO_IPIP;
1100         iph->ihl        =       5;
1101         iph->tot_len    =       htons(skb->len);
1102         ip_select_ident(iph, skb->dst, NULL);
1103         ip_send_check(iph);
1104
1105         skb->h.ipiph = skb->nh.iph;
1106         skb->nh.iph = iph;
1107         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1108 #ifdef CONFIG_NETFILTER
1109         nf_conntrack_put(skb->nfct);
1110         skb->nfct = NULL;
1111 #endif
1112 }
1113
/*
 *	Final step of multicast forwarding: bump the forwarding statistics,
 *	process any IP options, and hand the skb to the output path.
 */
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options * opt = &(IPCB(skb)->opt);

	IP_INC_STATS_BH(OutForwDatagrams);

	/* Options are rare on forwarded multicast; optlen == 0 skips them. */
	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}
1125
1126 /*
1127  *      Processing handlers for ipmr_forward
1128  */
1129
/*
 *	Transmit one multicast packet on vif 'vifi'.  Consumes skb on every
 *	path: it is either queued through NF_HOOK or freed at out_free.
 *	Presumably called with mrt_lock held for read — TODO confirm against
 *	callers (ip_mr_forward).
 */
static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	struct iphdr *iph = skb->nh.iph;
	struct vif_device *vif = &vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		/* PIM register vif: never transmitted on the wire.  Account
		 * the traffic, send the whole packet to the routing daemon
		 * as an IGMPMSG_WHOLEPKT upcall, and drop the original. */
		vif->pkt_out++;
		vif->bytes_out+=skb->len;
		((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
		((struct net_device_stats*)vif->dev->priv)->tx_packets++;
		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
		kfree_skb(skb);
		return;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		/* Tunnel vif: route to the remote tunnel endpoint and
		 * reserve room for the outer IPIP header. */
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		/* Physical vif: route by the packet's own destination. */
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_pmtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow to send ICMP, so that packets will disappear
		   to blackhole.
		 */

		IP_INC_STATS_BH(FragFails);
		ip_rt_put(rt);
		goto out_free;
	}

	/* Headroom needed: link-layer header plus any dst header plus
	 * the outer IPIP header for tunnel vifs. */
	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out+=skb->len;

	/* Replace the input route with the freshly resolved output route;
	 * the rt reference is transferred to skb->dst. */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	iph = skb->nh.iph;	/* skb_cow may have reallocated the header */
	ip_decrease_ttl(iph);

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
		((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
	}

	/* Mark as already forwarded so a looped-back copy is not
	 * forwarded a second time (see ip_mr_input). */
	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces. On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}
1231
1232 static int ipmr_find_vif(struct net_device *dev)
1233 {
1234         int ct;
1235         for (ct=maxvif-1; ct>=0; ct--) {
1236                 if (vif_table[ct].dev == dev)
1237                         break;
1238         }
1239         return ct;
1240 }
1241
1242 /* "local" means that we should preserve one skb (for local delivery) */
1243
/*
 *	Forward one multicast packet according to its cache entry.
 *	'local' means the caller still needs the skb for local delivery,
 *	so we must only forward clones and never consume the original.
 *	Presumably called with mrt_lock held for read — TODO confirm
 *	against ip_mr_input.
 */
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (((struct rtable*)skb->dst)->fl.iif == 0) {
			/* It is our own packet, looped back.
			   Very complicated situation...

			   The best workaround until routing daemons will be
			   fixed is not to redistribute packet, if it was
			   send through wrong interface. It means, that
			   multicast applications WILL NOT work for
			   (S,G), which have default multicast route pointing
			   to wrong oif. In any case, it is not a good
			   idea to use multicasting applications on router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		/* Rate-limited (MFC_ASSERT_THRESH) WRONGVIF upcall to the
		 * daemon, only when asserts are enabled. */
		if (true_vifi >= 0 && mroute_do_assert &&
		    /* pimsm uses asserts, when switching from RPT to SPT,
		       so that we cannot check that packet arrived on an oif.
		       It is bad, but otherwise we would need to move pretty
		       large chunk of pimd to kernel. Ough... --ANK
		     */
		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies, 
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	vif_table[vif].pkt_in++;
	vif_table[vif].bytes_in+=skb->len;

	/*
	 *	Forward the frame
	 */
	/* Walk oifs high-to-low, always transmitting the PREVIOUS match as
	 * a clone; the last (lowest) match is left in psend so the original
	 * skb itself can be consumed below when !local. */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend=ct;
		}
	}
	if (psend != -1) {
		if (local) {
			/* Caller keeps the original for local delivery. */
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			/* Hand the original skb itself to the last vif. */
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}
1324
1325
1326 /*
1327  *      Multicast packets for forwarding arrive here
1328  */
1329
/*
 *	Entry point for multicast packets that may need forwarding.
 *	Consumes skb on every path (forwarded, locally delivered, handed
 *	to the mroute daemon, queued as unresolved, or freed).
 */
int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

	/* Packet is looped back after forward, it should not be
	   forwarded second time, but still can be delivered locally.
	 */
	if (IPCB(skb)->flags&IPSKB_FORWARDED)
		goto dont_forward;

	if (!local) {
		    if (IPCB(skb)->opt.router_alert) {
			    /* Router-alert packets go to the raw-socket
			     * chain (IGMP, RSVP, ...). */
			    if (ip_call_ra_chain(skb))
				    return 0;
		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
			    /* IGMPv1 (and broken IGMPv2 implementations sort of
			       Cisco IOS <= 11.2(8)) do not put router alert
			       option to IGMP packets destined to routable
			       groups. It is very bad, because it means
			       that we can forward NO IGMP messages.
			     */
			    read_lock(&mrt_lock);
			    if (mroute_socket) {
				    raw_rcv(mroute_socket, skb);
				    read_unlock(&mrt_lock);
				    return 0;
			    }
			    read_unlock(&mrt_lock);
		    }
	}

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache==NULL) {
		int vif;

		if (local) {
			/* Deliver the original locally; keep working on a
			 * clone so the unresolved queue owns its own copy. */
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (skb2 == NULL) {
				read_unlock(&mrt_lock);
				return -ENOBUFS;
			}
			skb = skb2;
		}

		vif = ipmr_find_vif(skb->dev);
		if (vif >= 0) {
			/* Queue for resolution; the daemon gets an upcall. */
			int err = ipmr_cache_unresolved(vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	/* Cache hit: forward while still holding mrt_lock for read. */
	ip_mr_forward(skb, cache, local);

	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}
1408
1409 #ifdef CONFIG_IP_PIMSM_V1
1410 /*
1411  * Handle IGMP messages of PIMv1
1412  */
1413
/*
 *	Receive a PIMv1 REGISTER carried in an IGMP message: validate it,
 *	unwrap the encapsulated IP packet and re-inject it as if it had
 *	arrived on the register vif.  Always returns 0; the skb is either
 *	re-queued via netif_rx or freed.
 */
int pim_rcv_v1(struct sk_buff * skb)
{
	struct igmphdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
		goto drop;

	pim = (struct igmphdr*)skb->h.raw;

	/* Only meaningful when the daemon enabled PIM mode. */
	if (!mroute_do_pim ||
	    skb->len < sizeof(*pim) + sizeof(*encap) ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
		goto drop;

	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
	/*
	   Check that:
	   a. packet is really destinted to a multicast group
	   b. packet is not a NULL-REGISTER
	   c. packet is not truncated
	 */
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
		goto drop;

	/* Look up the register vif's device under mrt_lock and hold a
	 * reference so it cannot vanish after we drop the lock. */
	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL) 
		goto drop;

	/* Strip the outer headers and re-parent the inner packet onto the
	 * register device, then feed it back into the receive path. */
	skb->mac.raw = skb->nh.raw;
	skb_pull(skb, (u8*)encap - skb->data);
	skb->nh.iph = (struct iphdr *)skb->data;
	skb->dev = reg_dev;
	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;	/* force fresh checksum validation */
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;
	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
#ifdef CONFIG_NETFILTER
	nf_conntrack_put(skb->nfct);
	skb->nfct = NULL;
#endif
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
1475 #endif
1476
1477 #ifdef CONFIG_IP_PIMSM_V2
/*
 *	Receive a PIMv2 REGISTER: validate type/flags/checksum, unwrap the
 *	encapsulated IP packet and re-inject it on the register vif.
 *	Mirrors pim_rcv_v1() but for the native PIM protocol header.
 */
static int pim_rcv(struct sk_buff * skb)
{
	struct pimreghdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
		goto drop;

	pim = (struct pimreghdr*)skb->h.raw;
	/* Accept either a header-only checksum or a whole-packet checksum:
	 * some peers checksum only the PIM header (see file header note). */
	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
	    (pim->flags&PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
	if (!MULTICAST(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
		goto drop;

	/* Grab and hold the register vif's device under mrt_lock. */
	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL) 
		goto drop;

	/* Strip outer headers, re-parent onto the register device and
	 * feed the inner packet back into the receive path. */
	skb->mac.raw = skb->nh.raw;
	skb_pull(skb, (u8*)encap - skb->data);
	skb->nh.iph = (struct iphdr *)skb->data;
	skb->dev = reg_dev;
	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;	/* force fresh checksum validation */
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
	skb->dst = NULL;
#ifdef CONFIG_NETFILTER
	nf_conntrack_put(skb->nfct);
	skb->nfct = NULL;
#endif
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
1534 #endif
1535
/*
 *	Fill a route message with the multicast forwarding entry 'c':
 *	RTA_IIF for the parent interface plus an RTA_MULTIPATH nexthop per
 *	active oif.  Returns 1 on success, -EMSGSIZE when the skb is full
 *	(partial attributes are trimmed off).  Presumably called with
 *	mrt_lock held — TODO confirm against ipmr_get_route.
 */
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = vif_table[c->mfc_parent].dev;
	u8 *b = skb->tail;	/* rollback point for rtattr_failure */
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		/* ttl == 255 marks an unused oif slot. */
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			/* NOTE(review): vif_table[ct].dev is dereferenced
			 * unconditionally here — verify it cannot be NULL
			 * for an oif with ttl < 255. */
			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	/* Patch the multipath header length now that all nexthops are in. */
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb->tail - (u8*)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -EMSGSIZE;
}
1570
/*
 *	rtnetlink route lookup for a multicast destination.  On a cache hit
 *	the entry is serialized via ipmr_fill_mroute().  On a miss we either
 *	fail fast (-EAGAIN when 'nowait') or queue a synthetic packet on the
 *	unresolved list so the daemon resolves the (S,G) pair.
 */
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = (struct rtable*)skb->dst;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache==NULL) {
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		/* Build a minimal fake IP header in front of the netlink
		 * payload; version 0 marks it as a synthetic packet for the
		 * cache-resolve path rather than a real datagram. */
		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
		skb->nh.iph->saddr = rt->rt_src;
		skb->nh.iph->daddr = rt->rt_dst;
		skb->nh.iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}
1610
1611 #ifdef CONFIG_PROC_FS   
1612 /*
1613  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1614  */
/* Cursor for the /proc/net/ip_mr_vif seq_file: current vif index. */
struct ipmr_vif_iter {
	int ct;
};
1618
1619 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1620                                            loff_t pos)
1621 {
1622         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1623                 if(!VIF_EXISTS(iter->ct))
1624                         continue;
1625                 if (pos-- == 0) 
1626                         return &vif_table[iter->ct];
1627         }
1628         return NULL;
1629 }
1630
/*
 *	seq_file start: takes mrt_lock for read; it stays held across the
 *	whole iteration and is released in ipmr_vif_seq_stop().
 */
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&mrt_lock);
	/* *pos == 0 yields the header token; otherwise resume at pos-1. */
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 
		: SEQ_START_TOKEN;
}
1637
1638 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1639 {
1640         struct ipmr_vif_iter *iter = seq->private;
1641
1642         ++*pos;
1643         if (v == SEQ_START_TOKEN)
1644                 return ipmr_vif_seq_idx(iter, 0);
1645         
1646         while (++iter->ct < maxvif) {
1647                 if(!VIF_EXISTS(iter->ct))
1648                         continue;
1649                 return &vif_table[iter->ct];
1650         }
1651         return NULL;
1652 }
1653
/* seq_file stop: drops the mrt_lock taken in ipmr_vif_seq_start(). */
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&mrt_lock);
}
1658
/*
 *	seq_file show: print the column header for the start token, else
 *	one formatted line per vif (index, device name, counters, flags,
 *	local/remote addresses).
 */
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, 
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		/* Index is recovered by pointer arithmetic into vif_table. */
		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - vif_table,
			   name, vif->bytes_in, vif->pkt_in, 
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}
1677
/* seq_file iterator callbacks for /proc/net/ip_mr_vif. */
static struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};
1684
1685 static int ipmr_vif_open(struct inode *inode, struct file *file)
1686 {
1687         struct seq_file *seq;
1688         int rc = -ENOMEM;
1689         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1690        
1691         if (!s)
1692                 goto out;
1693
1694         rc = seq_open(file, &ipmr_vif_seq_ops);
1695         if (rc)
1696                 goto out_kfree;
1697
1698         s->ct = 0;
1699         seq = file->private_data;
1700         seq->private = s;
1701 out:
1702         return rc;
1703 out_kfree:
1704         kfree(s);
1705         goto out;
1706
1707 }
1708
/* File operations for /proc/net/ip_mr_vif (standard seq_file plumbing). */
static struct file_operations ipmr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
1716
/*
 * Cursor for /proc/net/ip_mr_cache: 'cache' points at the table being
 * walked (mfc_cache_array or &mfc_unres_queue, NULL when done) and 'ct'
 * is the current hash bucket within mfc_cache_array.
 */
struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};
1721
1722
/*
 *	Return the pos-th cache entry, walking the resolved hash table
 *	first and then the unresolved queue.  NOTE: on a non-NULL return
 *	the matching lock (mrt_lock or mfc_unres_lock) is still HELD; it
 *	is released later by ipmr_mfc_seq_next()/ipmr_mfc_seq_stop(),
 *	which use it->cache to know which lock is owned.
 */
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 
		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 
			if (pos-- == 0) 
				return mfc;	/* mrt_lock still held */
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) 
		if (pos-- == 0)
			return mfc;	/* mfc_unres_lock still held */
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}
1745
1746
/*
 *	seq_file start: header token at *pos == 0, otherwise resume at the
 *	(pos-1)-th entry.  Locking is managed inside ipmr_mfc_seq_idx().
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 
		: SEQ_START_TOKEN;
}
1752
/*
 *	seq_file next: follow the current hash chain, then later buckets,
 *	then hand over from the resolved table (mrt_lock) to the
 *	unresolved queue (mfc_unres_lock).  Whichever lock corresponds to
 *	it->cache is held on a non-NULL return and released at end of list
 *	or in ipmr_mfc_seq_stop().
 */
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	/* More entries in the current chain? */
	if (mfc->next)
		return mfc->next;
	
	if (it->cache == &mfc_unres_queue) 
		goto end_of_list;

	BUG_ON(it->cache != mfc_cache_array);

	/* Advance to the next non-empty hash bucket. */
	while (++it->ct < MFC_LINES) {
		mfc = mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;
		
	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc) 
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}
1793
1794 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1795 {
1796         struct ipmr_mfc_iter *it = seq->private;
1797
1798         if (it->cache == &mfc_unres_queue)
1799                 spin_unlock_bh(&mfc_unres_lock);
1800         else if (it->cache == mfc_cache_array)
1801                 read_unlock(&mrt_lock);
1802 }
1803
1804 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1805 {
1806         int n;
1807
1808         if (v == SEQ_START_TOKEN) {
1809                 seq_puts(seq, 
1810                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1811         } else {
1812                 const struct mfc_cache *mfc = v;
1813                 const struct ipmr_mfc_iter *it = seq->private;
1814                 
1815                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1816                            (unsigned long) mfc->mfc_mcastgrp,
1817                            (unsigned long) mfc->mfc_origin,
1818                            mfc->mfc_parent,
1819                            mfc->mfc_un.res.pkt,
1820                            mfc->mfc_un.res.bytes,
1821                            mfc->mfc_un.res.wrong_if);
1822
1823                 if (it->cache != &mfc_unres_queue) {
1824                         for(n = mfc->mfc_un.res.minvif; 
1825                             n < mfc->mfc_un.res.maxvif; n++ ) {
1826                                 if(VIF_EXISTS(n) 
1827                                    && mfc->mfc_un.res.ttls[n] < 255)
1828                                 seq_printf(seq, 
1829                                            " %2d:%-3d", 
1830                                            n, mfc->mfc_un.res.ttls[n]);
1831                         }
1832                 }
1833                 seq_putc(seq, '\n');
1834         }
1835         return 0;
1836 }
1837
/* seq_file iterator operations for /proc/net/ip_mr_cache. */
static struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};
1844
1845 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1846 {
1847         struct seq_file *seq;
1848         int rc = -ENOMEM;
1849         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1850        
1851         if (!s)
1852                 goto out;
1853
1854         rc = seq_open(file, &ipmr_mfc_seq_ops);
1855         if (rc)
1856                 goto out_kfree;
1857
1858         memset(s, 0, sizeof(*s));
1859         seq = file->private_data;
1860         seq->private = s;
1861 out:
1862         return rc;
1863 out_kfree:
1864         kfree(s);
1865         goto out;
1866
1867 }
1868
/* file_operations for /proc/net/ip_mr_cache (read-only seq_file). */
static struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
1876 #endif  
1877
#ifdef CONFIG_IP_PIMSM_V2
/* inet protocol entry that steers incoming PIM packets to pim_rcv(). */
static struct inet_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif
1883
1884
1885 /*
1886  *      Setup for IP multicast routing
1887  */
1888  
/*
 * Boot-time initialization of IP multicast routing: create the slab
 * cache for mfc_cache entries (panicking if that fails, since the
 * subsystem cannot work without it), set up the expiry timer's
 * callback, register for netdevice events, and create the
 * /proc/net entries.
 */
void __init ip_mr_init(void)
{
        /* Slab cache for multicast forwarding-cache entries. */
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN,
                                       NULL, NULL);
        if (!mrt_cachep)
                panic("cannot allocate ip_mrt_cache");

        /* Timer handler only; the timer is not armed here. */
        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function=ipmr_expire_process;
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS   
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif  
}