/*
 * xfrm_policy.c
 *
 * Changes:
 *      Mitsuru KANDA @USAGI
 *      Kazunori MIYAZAWA @USAGI
 *      Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *              IPv6 support
 *      Kazunori MIYAZAWA @USAGI
 *      YOSHIFUJI Hideaki
 *              Split up af-specific portion
 *      Derek Atkins <derek@ihtfp.com>          Add the post_input processor
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <net/xfrm.h>
#include <net/ip.h>

DECLARE_MUTEX(xfrm_cfg_sem);

static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED;

struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];

static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED;
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

kmem_cache_t *xfrm_dst_cache;

static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
        LIST_HEAD_INIT(xfrm_policy_gc_list);
static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED;

int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (likely(typemap->map[type->proto] == NULL))
                typemap->map[type->proto] = type;
        else
                err = -EEXIST;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}
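
/* Illustrative sketch, not part of the original file: a transform module
 * (ESP, AH, IPcomp) registers its handler once at module init, claiming
 * typemap->map[proto] for one address family.  The names and ops below
 * are hypothetical (compare esp4.c for the real thing); compiled out.
 */
#if 0
static struct xfrm_type example_type = {
        .description    = "EXAMPLE",
        .owner          = THIS_MODULE,
        .proto          = IPPROTO_ESP,  /* hypothetical: protocol slot to claim */
        .init_state     = example_init_state,
        .destructor     = example_destroy,
        .input          = example_input,
        .output         = example_output,
};

static int __init example_init(void)
{
        /* Fails with -EEXIST if another module already owns the slot. */
        return xfrm_register_type(&example_type, AF_INET);
}

static void __exit example_exit(void)
{
        xfrm_unregister_type(&example_type, AF_INET);
}
#endif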

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (unlikely(typemap->map[type->proto] != type))
                err = -ENOENT;
        else
                typemap->map[type->proto] = NULL;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        struct xfrm_type_map *typemap;
        struct xfrm_type *type;
        int modload_attempted = 0;

retry:
        afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return NULL;
        typemap = afinfo->type_map;

        read_lock(&typemap->lock);
        type = typemap->map[proto];
        if (unlikely(type && !try_module_get(type->owner)))
                type = NULL;
        read_unlock(&typemap->lock);
        if (!type && !modload_attempted) {
                xfrm_policy_put_afinfo(afinfo);
                request_module("xfrm-type-%d-%d",
                               (int) family, (int) proto);
                modload_attempted = 1;
                goto retry;
        }

        xfrm_policy_put_afinfo(afinfo);
        return type;
}
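
/* Illustrative note: the request_module() above demand-loads the module
 * that owns a missing type.  For that to work, the module must advertise
 * an alias matching the "xfrm-type-<family>-<proto>" string; the sketch
 * below shows the convention for ESP over IPv4 (AF_INET == 2,
 * IPPROTO_ESP == 50).  Compiled out; whether a given module carries such
 * an alias is up to that module.
 */
#if 0
MODULE_ALIAS("xfrm-type-2-50");
#endif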

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
                    unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        if (likely(afinfo->dst_lookup != NULL))
                err = afinfo->dst_lookup(dst, fl);
        else
                err = -EINVAL;
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

void xfrm_put_type(struct xfrm_type *type)
{
        module_put(type->owner);
}

static inline unsigned long make_jiffies(long secs)
{
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
                return MAX_SCHEDULE_TIMEOUT-1;
        else
                return secs*HZ;
}
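
/* Example of the clamp above, assuming HZ == 1000: MAX_SCHEDULE_TIMEOUT is
 * LONG_MAX, so any secs >= (LONG_MAX-1)/1000 would overflow secs*HZ and is
 * pinned to MAX_SCHEDULE_TIMEOUT-1; smaller values convert normally, e.g.
 * make_jiffies(30) == 30000 jiffies.
 */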

static void xfrm_policy_timer(unsigned long data)
{
        struct xfrm_policy *xp = (struct xfrm_policy*)data;
        unsigned long now = (unsigned long)xtime.tv_sec;
        long next = LONG_MAX;
        int warn = 0;
        int dir;

        if (xp->dead)
                goto out;

        dir = xp->index & 7;

        if (xp->lft.hard_add_expires_seconds) {
                long tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.hard_use_expires_seconds) {
                long tmo = xp->lft.hard_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_add_expires_seconds) {
                long tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_use_expires_seconds) {
                long tmo = xp->lft.soft_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0);
        if (next != LONG_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        xfrm_pol_put(xp);
        return;

expired:
        km_policy_expired(xp, dir, 1);
        xfrm_policy_delete(xp, dir);
        xfrm_pol_put(xp);
}
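
/* Worked example of the timer logic above: with
 * lft.hard_add_expires_seconds == 3600 and curlft.add_time == now - 3500,
 * tmo == 100, so the timer is re-armed roughly 100 seconds out.  Once
 * add_time falls 3600 seconds or more behind now, tmo <= 0 and the policy
 * takes the "expired" path (km notification plus xfrm_policy_delete).
 * Soft limits instead set warn and re-arm after XFRM_KM_TIMEOUT.
 */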


/* Allocate xfrm_policy. Not used here; it is intended for use by pfkeyv2
 * SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(int gfp)
{
        struct xfrm_policy *policy;

        policy = kmalloc(sizeof(struct xfrm_policy), gfp);

        if (policy) {
                memset(policy, 0, sizeof(struct xfrm_policy));
                atomic_set(&policy->refcnt, 1);
                policy->lock = RW_LOCK_UNLOCKED;
                init_timer(&policy->timer);
                policy->timer.data = (unsigned long)policy;
                policy->timer.function = xfrm_policy_timer;
        }
        return policy;
}
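
/* Illustrative sketch, not part of the original file: a keying path such
 * as pfkeyv2 would allocate a policy, fill in the selector, lifetimes and
 * templates, and hand it to xfrm_policy_insert().  The field values here
 * are hypothetical; compiled out via #if 0.
 */
#if 0
static int example_add_policy(void)
{
        struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL);

        if (!xp)
                return -ENOMEM;
        xp->action = XFRM_POLICY_ALLOW;
        xp->lft.soft_byte_limit = XFRM_INF;
        xp->lft.hard_byte_limit = XFRM_INF;
        xp->lft.soft_packet_limit = XFRM_INF;
        xp->lft.hard_packet_limit = XFRM_INF;
        /* xp->selector and xp->xfrm_vec[] would be filled in here */
        return xfrm_policy_insert(XFRM_POLICY_OUT, xp, 1 /* excl */);
}
#endif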

/* Destroy xfrm_policy: descendant resources must have been released by
 * this point. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
        if (!policy->dead)
                BUG();

        if (policy->bundles)
                BUG();

        if (del_timer(&policy->timer))
                BUG();

        kfree(policy);
}

static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
        struct dst_entry *dst;

        while ((dst = policy->bundles) != NULL) {
                policy->bundles = dst->next;
                dst_free(dst);
        }

        if (del_timer(&policy->timer))
                atomic_dec(&policy->refcnt);

        if (atomic_read(&policy->refcnt) > 1)
                flow_cache_flush();

        xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(void *data)
{
        struct xfrm_policy *policy;
        struct list_head *entry, *tmp;
        struct list_head gc_list = LIST_HEAD_INIT(gc_list);

        spin_lock_bh(&xfrm_policy_gc_lock);
        list_splice_init(&xfrm_policy_gc_list, &gc_list);
        spin_unlock_bh(&xfrm_policy_gc_lock);

        list_for_each_safe(entry, tmp, &gc_list) {
                policy = list_entry(entry, struct xfrm_policy, list);
                xfrm_policy_gc_kill(policy);
        }
}

/* Rule must be locked. Release descendant resources, announce
 * the entry dead. The rule must already be unlinked from the lists.
 */

void xfrm_policy_kill(struct xfrm_policy *policy)
{
        write_lock_bh(&policy->lock);
        if (policy->dead)
                goto out;

        policy->dead = 1;

        spin_lock(&xfrm_policy_gc_lock);
        list_add(&policy->list, &xfrm_policy_gc_list);
        spin_unlock(&xfrm_policy_gc_lock);
        schedule_work(&xfrm_policy_gc_work);

out:
        write_unlock_bh(&policy->lock);
}

/* Generate new index... KAME seems to generate them ordered by cost
 * of an absolute unpredictability of ordering of rules. This will not pass. */
static u32 xfrm_gen_index(int dir)
{
        u32 idx;
        struct xfrm_policy *p;
        static u32 idx_generator;

        for (;;) {
                idx = (idx_generator | dir);
                idx_generator += 8;
                if (idx == 0)
                        idx = 8;
                for (p = xfrm_policy_list[dir]; p; p = p->next) {
                        if (p->index == idx)
                                break;
                }
                if (!p)
                        return idx;
        }
}
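
/* Index layout produced above: the low three bits carry the direction
 * (dir == index & 7, which xfrm_policy_timer and xfrm_policy_byid rely
 * on) and the generator advances in steps of 8, so e.g. dir ==
 * XFRM_POLICY_OUT (1) yields indices 1, 9, 17, ..., skipping any value
 * already in use on that list.
 */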

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct xfrm_policy *pol, **p;
        struct xfrm_policy *delpol = NULL;
        struct xfrm_policy **newpos = NULL;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
                        if (excl) {
                                write_unlock_bh(&xfrm_policy_lock);
                                return -EEXIST;
                        }
                        *p = pol->next;
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority)
                        continue;
                if (!newpos)
                        newpos = p;
                if (delpol)
                        break;
        }
        if (newpos)
                p = newpos;
        xfrm_pol_hold(policy);
        policy->next = *p;
        *p = policy;
        atomic_inc(&flow_cache_genid);
        policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
        policy->curlft.add_time = (unsigned long)xtime.tv_sec;
        policy->curlft.use_time = 0;
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        write_unlock_bh(&xfrm_policy_lock);

        if (delpol) {
                xfrm_policy_kill(delpol);
        }
        return 0;
}

struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
                                      int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
                if (pol->index == id) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

void xfrm_policy_flush(void)
{
        struct xfrm_policy *xp;
        int dir;

        write_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                while ((xp = xfrm_policy_list[dir]) != NULL) {
                        xfrm_policy_list[dir] = xp->next;
                        write_unlock_bh(&xfrm_policy_lock);

                        xfrm_policy_kill(xp);

                        write_lock_bh(&xfrm_policy_lock);
                }
        }
        atomic_inc(&flow_cache_genid);
        write_unlock_bh(&xfrm_policy_lock);
}

int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *xp;
        int dir;
        int count = 0;
        int error = 0;

        read_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
                        count++;
        }

        if (count == 0) {
                error = -ENOENT;
                goto out;
        }

        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
                        error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
                        if (error)
                                goto out;
                }
        }

out:
        read_unlock_bh(&xfrm_policy_lock);
        return error;
}
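
/* Illustrative sketch, not part of the original file: a dump callback for
 * xfrm_policy_walk().  The walk passes each policy, its direction, and a
 * countdown of remaining entries; returning nonzero aborts the walk and
 * propagates the error.  Compiled out via #if 0.
 */
#if 0
static int example_dump_one(struct xfrm_policy *xp, int dir, int count,
                            void *ptr)
{
        printk(KERN_DEBUG "policy index %u dir %d (%d left)\n",
               xp->index, dir, count);
        return 0;
}

/* usage: err = xfrm_policy_walk(example_dump_one, NULL); */
#endif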


/* Find policy to apply to this flow. */

static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
                               void **objp, atomic_t **obj_refp)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
                struct xfrm_selector *sel = &pol->selector;
                int match;

                if (pol->family != family)
                        continue;

                match = xfrm_selector_match(sel, fl, family);
                if (match) {
                        xfrm_pol_hold(pol);
                        break;
                }
        }
        read_unlock_bh(&xfrm_policy_lock);
        if ((*objp = (void *) pol) != NULL)
                *obj_refp = &pol->refcnt;
}

struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        if ((pol = sk->sk_policy[dir]) != NULL) {
                int match = xfrm_selector_match(&pol->selector, fl,
                                                sk->sk_family);
                if (match)
                        xfrm_pol_hold(pol);
                else
                        pol = NULL;
        }
        read_unlock_bh(&xfrm_policy_lock);
        return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        pol->next = xfrm_policy_list[dir];
        xfrm_policy_list[dir] = pol;
        xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct xfrm_policy **polp;

        for (polp = &xfrm_policy_list[dir];
             *polp != NULL; polp = &(*polp)->next) {
                if (*polp == pol) {
                        *polp = pol->next;
                        return pol;
                }
        }
        return NULL;
}

void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        write_lock_bh(&xfrm_policy_lock);
        pol = __xfrm_policy_unlink(pol, dir);
        write_unlock_bh(&xfrm_policy_lock);
        if (pol)
                xfrm_policy_kill(pol);
}

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct xfrm_policy *old_pol;

        write_lock_bh(&xfrm_policy_lock);
        old_pol = sk->sk_policy[dir];
        sk->sk_policy[dir] = pol;
        if (pol) {
                pol->curlft.add_time = (unsigned long)xtime.tv_sec;
                pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
                __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
        }
        if (old_pol)
                __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
        write_unlock_bh(&xfrm_policy_lock);

        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
        struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

        if (newp) {
                newp->selector = old->selector;
                newp->lft = old->lft;
                newp->curlft = old->curlft;
                newp->action = old->action;
                newp->flags = old->flags;
                newp->xfrm_nr = old->xfrm_nr;
                newp->index = old->index;
                memcpy(newp->xfrm_vec, old->xfrm_vec,
                       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
                write_lock_bh(&xfrm_policy_lock);
                __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
                write_unlock_bh(&xfrm_policy_lock);
                xfrm_pol_put(newp);
        }
        return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
        struct xfrm_policy *p0 = sk->sk_policy[0],
                           *p1 = sk->sk_policy[1];

        sk->sk_policy[0] = sk->sk_policy[1] = NULL;
        if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
                return -ENOMEM;
        if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
                return -ENOMEM;
        return 0;
}

/* Resolve list of templates for the flow, given policy. */

static int
xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
                  struct xfrm_state **xfrm,
                  unsigned short family)
{
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);

        for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                if (tmpl->mode) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                }

                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        for (nx--; nx>=0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
{
        struct dst_entry *x;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return ERR_PTR(-EINVAL);
        x = afinfo->find_bundle(fl, policy);
        xfrm_policy_put_afinfo(afinfo);
        return x;
}

/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
                   struct flowi *fl, struct dst_entry **dst_p,
                   unsigned short family)
{
        int err;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

static inline int policy_to_flow_dir(int dir)
{
        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
            XFRM_POLICY_FWD == FLOW_DIR_FWD)
                return dir;
        switch (dir) {
        default:
        case XFRM_POLICY_IN:
                return FLOW_DIR_IN;
        case XFRM_POLICY_OUT:
                return FLOW_DIR_OUT;
        case XFRM_POLICY_FWD:
                return FLOW_DIR_FWD;
        }
}

static int stale_bundle(struct dst_entry *dst);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
                struct sock *sk, int flags)
{
        struct xfrm_policy *policy;
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
        struct rtable *rt = (struct rtable*)*dst_p;
        struct dst_entry *dst;
        int nx = 0;
        int err;
        u32 genid;
        u16 family = (*dst_p)->ops->family;

        switch (family) {
        case AF_INET:
                if (!fl->fl4_src)
                        fl->fl4_src = rt->rt_src;
                if (!fl->fl4_dst)
                        fl->fl4_dst = rt->rt_dst;
                /* fall through */
        case AF_INET6:
                /* Still not clear... */
        default:
                /* nothing */;
        }

restart:
        genid = atomic_read(&flow_cache_genid);
        policy = NULL;
        if (sk && sk->sk_policy[1])
                policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);

        if (!policy) {
                /* To accelerate a bit...  */
                if ((rt->u.dst.flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
                        return 0;

                policy = flow_cache_lookup(fl, family,
                                           policy_to_flow_dir(XFRM_POLICY_OUT),
                                           xfrm_policy_lookup);
        }

        if (!policy)
                return 0;

        policy->curlft.use_time = (unsigned long)xtime.tv_sec;

        switch (policy->action) {
        case XFRM_POLICY_BLOCK:
                /* Prohibit the flow */
                xfrm_pol_put(policy);
                return -EPERM;

        case XFRM_POLICY_ALLOW:
                if (policy->xfrm_nr == 0) {
                        /* Flow passes not transformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                /* Try to find matching bundle.
                 *
                 * LATER: help from flow cache. It is optional, this
                 * is required only for output policy.
                 */
                dst = xfrm_find_bundle(fl, policy, family);
                if (IS_ERR(dst)) {
                        xfrm_pol_put(policy);
                        return PTR_ERR(dst);
                }

                if (dst)
                        break;

                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                if (unlikely(nx<0)) {
                        err = nx;
                        if (err == -EAGAIN && flags) {
                                DECLARE_WAITQUEUE(wait, current);

                                add_wait_queue(&km_waitq, &wait);
                                set_current_state(TASK_INTERRUPTIBLE);
                                schedule();
                                set_current_state(TASK_RUNNING);
                                remove_wait_queue(&km_waitq, &wait);

                                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                                if (nx == -EAGAIN && signal_pending(current)) {
                                        err = -ERESTART;
                                        goto error;
                                }
                                if (nx == -EAGAIN ||
                                    genid != atomic_read(&flow_cache_genid)) {
                                        xfrm_pol_put(policy);
                                        goto restart;
                                }
                                err = nx;
                        }
                        if (err < 0)
                                goto error;
                }
                if (nx == 0) {
                        /* Flow passes not transformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                dst = &rt->u.dst;
                err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

                if (unlikely(err)) {
                        int i;
                        for (i=0; i<nx; i++)
                                xfrm_state_put(xfrm[i]);
                        goto error;
                }

                write_lock_bh(&policy->lock);
                if (unlikely(policy->dead || stale_bundle(dst))) {
                        /* Wow! While we worked on resolving, this
                         * policy has gone. Retry. It is not paranoia,
                         * we just cannot enlist new bundle to dead object.
                         * We can't enlist stale bundles either.
                         */
                        write_unlock_bh(&policy->lock);

                        xfrm_pol_put(policy);
                        if (dst)
                                dst_free(dst);
                        goto restart;
                }
                dst->next = policy->bundles;
                policy->bundles = dst;
                dst_hold(dst);
                write_unlock_bh(&policy->lock);
        }
        *dst_p = dst;
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        return 0;

error:
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        *dst_p = NULL;
        return err;
}
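
/* Illustrative note: callers normally reach xfrm_lookup() through the
 * routing layer rather than directly.  The common output-path pattern of
 * this era is roughly the sketch below (simplified, error handling
 * elided, names hypothetical); compiled out via #if 0.
 */
#if 0
static int example_output_route(struct sock *sk, struct flowi *fl,
                                struct dst_entry **dstp)
{
        struct rtable *rt;
        int err = ip_route_output_key(&rt, fl);  /* plain IP route first */

        if (err)
                return err;
        *dstp = &rt->u.dst;
        /* then overlay IPsec: may replace *dstp with an xfrm bundle */
        return xfrm_lookup(dstp, fl, sk, 0);
}
#endif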

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we do this in a maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have the policy cached at them.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
              unsigned short family)
{
        if (xfrm_state_kern(x))
                return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
        return  x->id.proto == tmpl->id.proto &&
                (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
                (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
                x->props.mode == tmpl->mode &&
                (tmpl->aalgos & (1<<x->props.aalgo)) &&
                !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
}

static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
               unsigned short family)
{
        int idx = start;

        if (tmpl->optional) {
                if (!tmpl->mode)
                        return start;
        } else
                start = -1;
        for (; idx < sp->len; idx++) {
                if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family))
                        return ++idx;
                if (sp->x[idx].xvec->props.mode)
                        break;
        }
        return start;
}

static int
_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        afinfo->decode_session(skb, fl);
        xfrm_policy_put_afinfo(afinfo);
        return 0;
}

int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct xfrm_policy *pol;
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        /* First, check the SAs used against their selectors. */
        if (skb->sp) {
                int i;

                for (i=skb->sp->len-1; i>=0; i--) {
                        struct sec_decap_state *xvec = &(skb->sp->x[i]);
                        if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
                                return 0;

                        /* If there is a post_input processor, try running it */
                        if (xvec->xvec->type->post_input &&
                            (xvec->xvec->type->post_input)(xvec->xvec,
                                                           &(xvec->decap),
                                                           skb) != 0)
                                return 0;
                }
        }

        pol = NULL;
        if (sk && sk->sk_policy[dir])
                pol = xfrm_sk_policy_lookup(sk, dir, &fl);

        if (!pol)
                pol = flow_cache_lookup(&fl, family,
                                        policy_to_flow_dir(dir),
                                        xfrm_policy_lookup);

        if (!pol)
                return !skb->sp;

        pol->curlft.use_time = (unsigned long)xtime.tv_sec;

        if (pol->action == XFRM_POLICY_ALLOW) {
                struct sec_path *sp;
                static struct sec_path dummy;
                int i, k;

                if ((sp = skb->sp) == NULL)
                        sp = &dummy;

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find corresponding xfrm.
                 * Order is _important_. Later we will implement
                 * some barriers, but at the moment barriers
                 * are implied between each two transformations.
                 */
                for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
                        if (k < 0)
                                goto reject;
                }

                for (; k < sp->len; k++) {
                        if (sp->x[k].xvec->props.mode)
                                goto reject;
                }

                xfrm_pol_put(pol);
                return 1;
        }

reject:
        xfrm_pol_put(pol);
        return 0;
}

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!stale_bundle(dst))
                return dst;

        dst_release(dst);
        return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
        struct dst_entry *child = dst;

        while (child) {
                if (child->obsolete > 0 ||
                    (child->dev && !netif_running(child->dev)) ||
                    (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
                        return 1;
                }
                child = child->child;
        }

        return 0;
}

static void xfrm_dst_destroy(struct dst_entry *dst)
{
        /* Partially constructed bundles can be freed before a state was
         * attached, so tolerate dst->xfrm == NULL here. */
        if (!dst->xfrm)
                return;
        xfrm_state_put(dst->xfrm);
        dst->xfrm = NULL;
}

static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible. Such dst must be popped before it reaches the
         * point of failure. */
        return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
        if (dst) {
                if (dst->obsolete) {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
{
        int i;
        struct xfrm_policy *pol;
        struct dst_entry *dst, **dstp, *gc_list = NULL;

        read_lock_bh(&xfrm_policy_lock);
        for (i=0; i<2*XFRM_POLICY_MAX; i++) {
                for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
                        write_lock(&pol->lock);
                        dstp = &pol->bundles;
                        while ((dst=*dstp) != NULL) {
                                if (func(dst)) {
                                        *dstp = dst->next;
                                        dst->next = gc_list;
                                        gc_list = dst;
                                } else {
                                        dstp = &dst->next;
                                }
                        }
                        write_unlock(&pol->lock);
                }
        }
        read_unlock_bh(&xfrm_policy_lock);

        while (gc_list) {
                dst = gc_list;
                gc_list = dst->next;
                dst_free(dst);
        }
}

static int unused_bundle(struct dst_entry *dst)
{
        return !atomic_read(&dst->__refcnt);
}

static void __xfrm_garbage_collect(void)
{
        xfrm_prune_bundles(unused_bundle);
}

int xfrm_flush_bundles(void)
{
        xfrm_prune_bundles(stale_bundle);
        return 0;
}

/* Well... that's _TASK_. We need to scan through the transformation
 * list and figure out what MSS TCP should generate so that the
 * final datagram fits the MTU. Mama mia... :-)
 *
 * Apparently, some easy way exists, but we used to choose the most
 * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
 *
 * Consider this function as something like dark humour. :-)
 */
static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
{
        int res = mtu - dst->header_len;

        for (;;) {
                struct dst_entry *d = dst;
                int m = res;

                do {
                        struct xfrm_state *x = d->xfrm;
                        if (x) {
                                spin_lock_bh(&x->lock);
                                if (x->km.state == XFRM_STATE_VALID &&
                                    x->type && x->type->get_max_size)
                                        m = x->type->get_max_size(d->xfrm, m);
                                else
                                        m += x->props.header_len;
                                spin_unlock_bh(&x->lock);
                        }
                } while ((d = d->child) != NULL);

                if (m <= mtu)
                        break;
                res -= (m - mtu);
                if (res < 88)
                        return mtu;
        }

        return res + dst->header_len;
}
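
/* Worked example of the search above: with mtu == 1500, res starts at
 * 1500 minus the bundle's header_len.  Each pass asks every state in the
 * chain to expand the candidate payload (padding, IV, trailer via
 * get_max_size, or a flat header_len otherwise); if the transformed size
 * m still exceeds the mtu, res shrinks by the overshoot and the loop
 * retries.  The 88-byte floor keeps pathological chains from driving the
 * MSS toward zero, falling back to the raw mtu instead.
 */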

int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
                err = -ENOBUFS;
        else {
                struct dst_ops *dst_ops = afinfo->dst_ops;
                if (likely(dst_ops->kmem_cachep == NULL))
                        dst_ops->kmem_cachep = xfrm_dst_cache;
                if (likely(dst_ops->check == NULL))
                        dst_ops->check = xfrm_dst_check;
                if (likely(dst_ops->destroy == NULL))
                        dst_ops->destroy = xfrm_dst_destroy;
                if (likely(dst_ops->negative_advice == NULL))
                        dst_ops->negative_advice = xfrm_negative_advice;
                if (likely(dst_ops->link_failure == NULL))
                        dst_ops->link_failure = xfrm_link_failure;
                if (likely(dst_ops->get_mss == NULL))
                        dst_ops->get_mss = xfrm_get_mss;
                if (likely(afinfo->garbage_collect == NULL))
                        afinfo->garbage_collect = __xfrm_garbage_collect;
                xfrm_policy_afinfo[afinfo->family] = afinfo;
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}
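
/* Illustrative sketch, not part of the original file: an address-family
 * module (compare xfrm4_policy.c) fills in its afinfo and registers it,
 * inheriting the generic dst_ops hooks patched in above.  All names here
 * are hypothetical; compiled out via #if 0.
 */
#if 0
static struct xfrm_policy_afinfo example_policy_afinfo = {
        .family         = AF_INET,
        .lock           = RW_LOCK_UNLOCKED,
        .type_map       = &example_type_map,
        .dst_ops        = &example_dst_ops,
        .dst_lookup     = example_dst_lookup,
        .find_bundle    = example_find_bundle,
        .bundle_create  = example_bundle_create,
        .decode_session = example_decode_session,
};

static void __init example_policy_init(void)
{
        xfrm_policy_register_afinfo(&example_policy_afinfo);
}
#endif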

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
                if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
                        err = -EINVAL;
                else {
                        struct dst_ops *dst_ops = afinfo->dst_ops;
                        xfrm_policy_afinfo[afinfo->family] = NULL;
                        dst_ops->kmem_cachep = NULL;
                        dst_ops->check = NULL;
                        dst_ops->destroy = NULL;
                        dst_ops->negative_advice = NULL;
                        dst_ops->link_failure = NULL;
                        dst_ops->get_mss = NULL;
                        afinfo->garbage_collect = NULL;
                }
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        if (unlikely(family >= NPROTO))
                return NULL;
        read_lock(&xfrm_policy_afinfo_lock);
        afinfo = xfrm_policy_afinfo[family];
        if (likely(afinfo != NULL))
                read_lock(&afinfo->lock);
        read_unlock(&xfrm_policy_afinfo_lock);
        return afinfo;
}

void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        if (unlikely(afinfo == NULL))
                return;
        read_unlock(&afinfo->lock);
}

static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        switch (event) {
        case NETDEV_DOWN:
                xfrm_flush_bundles();
        }
        return NOTIFY_DONE;
}

struct notifier_block xfrm_dev_notifier = {
        .notifier_call  = xfrm_dev_event,
};

void __init xfrm_policy_init(void)
{
        xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                                           sizeof(struct xfrm_dst),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
        if (!xfrm_dst_cache)
                panic("XFRM: failed to allocate xfrm_dst_cache\n");

        INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
        register_netdevice_notifier(&xfrm_dev_notifier);
}

void __init xfrm_init(void)
{
        xfrm_state_init();
        xfrm_policy_init();
        xfrm_input_init();
}