/*
 * xfrm_policy.c
 *
 * Changes:
 *      Mitsuru KANDA @USAGI
 *      Kazunori MIYAZAWA @USAGI
 *      Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *              IPv6 support
 *      Kazunori MIYAZAWA @USAGI
 *      YOSHIFUJI Hideaki
 *              Split up af-specific portion
 *      Derek Atkins <derek@ihtfp.com>          Add the post_input processor
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <net/xfrm.h>
#include <net/ip.h>

DECLARE_MUTEX(xfrm_cfg_sem);

static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED;

struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];

static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED;
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

kmem_cache_t *xfrm_dst_cache;

static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
        LIST_HEAD_INIT(xfrm_policy_gc_list);
static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED;

int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (likely(typemap->map[type->proto] == NULL))
                typemap->map[type->proto] = type;
        else
                err = -EEXIST;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (unlikely(typemap->map[type->proto] != type))
                err = -ENOENT;
        else
                typemap->map[type->proto] = NULL;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        struct xfrm_type_map *typemap;
        struct xfrm_type *type;
        int modload_attempted = 0;

retry:
        afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return NULL;
        typemap = afinfo->type_map;

        read_lock(&typemap->lock);
        type = typemap->map[proto];
        if (unlikely(type && !try_module_get(type->owner)))
                type = NULL;
        read_unlock(&typemap->lock);
        if (!type && !modload_attempted) {
                xfrm_policy_put_afinfo(afinfo);
                request_module("xfrm-type-%d-%d",
                               (int) family, (int) proto);
                modload_attempted = 1;
                goto retry;
        }

        xfrm_policy_put_afinfo(afinfo);
        return type;
}
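
/* A note on the autoload above: request_module("xfrm-type-%d-%d") can only
 * succeed if the transform module advertises a matching alias. As a purely
 * hypothetical sketch (the alias string is inferred from the format used
 * here, not taken from a real module), an ESP implementation for AF_INET
 * (family 2, proto 50) could make itself demand-loadable with:
 *
 *      MODULE_ALIAS("xfrm-type-2-50");
 */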

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
                    unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        if (likely(afinfo->dst_lookup != NULL))
                err = afinfo->dst_lookup(dst, fl);
        else
                err = -EINVAL;
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

void xfrm_put_type(struct xfrm_type *type)
{
        module_put(type->owner);
}

static inline unsigned long make_jiffies(long secs)
{
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
                return MAX_SCHEDULE_TIMEOUT-1;
        else
                return secs*HZ;
}

static void xfrm_policy_timer(unsigned long data)
{
        struct xfrm_policy *xp = (struct xfrm_policy*)data;
        unsigned long now = (unsigned long)xtime.tv_sec;
        long next = LONG_MAX;
        int warn = 0;
        int dir;

        read_lock(&xp->lock);

        if (xp->dead)
                goto out;

        dir = xp->index & 7;

        if (xp->lft.hard_add_expires_seconds) {
                long tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.hard_use_expires_seconds) {
                long tmo = xp->lft.hard_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_add_expires_seconds) {
                long tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_use_expires_seconds) {
                long tmo = xp->lft.soft_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0);
        if (next != LONG_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        read_unlock(&xp->lock);
        xfrm_pol_put(xp);
        return;

expired:
        read_unlock(&xp->lock);
        km_policy_expired(xp, dir, 1);
        xfrm_policy_delete(xp, dir);
        xfrm_pol_put(xp);
}
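
/* Lifetime handling above follows the usual xfrm split: a soft limit only
 * warns the key manager (km_policy_expired() with hard == 0) and re-arms
 * the timer XFRM_KM_TIMEOUT seconds out, while a hard limit reports
 * hard == 1 and deletes the policy outright.
 */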

/* Allocate xfrm_policy. Not used directly here; it is intended for use by
 * pfkeyv2 SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(int gfp)
{
        struct xfrm_policy *policy;

        policy = kmalloc(sizeof(struct xfrm_policy), gfp);

        if (policy) {
                memset(policy, 0, sizeof(struct xfrm_policy));
                atomic_set(&policy->refcnt, 1);
                policy->lock = RW_LOCK_UNLOCKED;
                init_timer(&policy->timer);
                policy->timer.data = (unsigned long)policy;
                policy->timer.function = xfrm_policy_timer;
        }
        return policy;
}
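
/* A minimal usage sketch, assuming a pfkeyv2-style caller (the variable
 * names are illustrative, not from this file): allocate, fill in the
 * selector, lifetimes and templates, then hand the policy to the SPD:
 *
 *      struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL);
 *      if (!xp)
 *              return -ENOMEM;
 *      xp->action = XFRM_POLICY_ALLOW;
 *      ... fill xp->selector, xp->lft, xp->xfrm_vec, xp->xfrm_nr ...
 *      err = xfrm_policy_insert(XFRM_POLICY_OUT, xp, 1);
 *
 * Note that xfrm_policy_insert() takes its own references; the initial
 * reference from xfrm_policy_alloc() remains the caller's to manage.
 */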

/* Destroy xfrm_policy: descendant resources must already have been released. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
        if (!policy->dead)
                BUG();

        if (policy->bundles)
                BUG();

        if (del_timer(&policy->timer))
                BUG();

        kfree(policy);
}

static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
        struct dst_entry *dst;

        while ((dst = policy->bundles) != NULL) {
                policy->bundles = dst->next;
                dst_free(dst);
        }

        if (del_timer(&policy->timer))
                atomic_dec(&policy->refcnt);

        if (atomic_read(&policy->refcnt) > 1)
                flow_cache_flush();

        xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(void *data)
{
        struct xfrm_policy *policy;
        struct list_head *entry, *tmp;
        struct list_head gc_list = LIST_HEAD_INIT(gc_list);

        spin_lock_bh(&xfrm_policy_gc_lock);
        list_splice_init(&xfrm_policy_gc_list, &gc_list);
        spin_unlock_bh(&xfrm_policy_gc_lock);

        list_for_each_safe(entry, tmp, &gc_list) {
                policy = list_entry(entry, struct xfrm_policy, list);
                xfrm_policy_gc_kill(policy);
        }
}
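
/* Dead policies are collected here, in a work item, rather than freed
 * directly in xfrm_policy_kill(): splicing the shared list under the
 * spinlock and then killing the entries lets bundle teardown and the final
 * put run in process context, outside the callers' locks.
 */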

/* Mark the entry dead and queue it for release of its descendant resources
 * via the GC work queue. The rule must already have been unlinked from the
 * lists.
 */

void xfrm_policy_kill(struct xfrm_policy *policy)
{
        write_lock_bh(&policy->lock);
        if (policy->dead)
                goto out;

        policy->dead = 1;

        spin_lock(&xfrm_policy_gc_lock);
        list_add(&policy->list, &xfrm_policy_gc_list);
        spin_unlock(&xfrm_policy_gc_lock);
        schedule_work(&xfrm_policy_gc_work);

out:
        write_unlock_bh(&policy->lock);
}

/* Generate a new index. KAME seems to generate them ordered by cost, at the
 * price of completely unpredictable rule ordering. That will not do here. */
static u32 xfrm_gen_index(int dir)
{
        u32 idx;
        struct xfrm_policy *p;
        static u32 idx_generator;

        for (;;) {
                idx = (idx_generator | dir);
                idx_generator += 8;
                if (idx == 0)
                        idx = 8;
                for (p = xfrm_policy_list[dir]; p; p = p->next) {
                        if (p->index == idx)
                                break;
                }
                if (!p)
                        return idx;
        }
}
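
/* The low three bits of an index encode the direction (xfrm_policy_timer()
 * recovers it with "index & 7"), which is why the generator advances in
 * steps of 8 and ORs the direction into each candidate.
 */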

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct xfrm_policy *pol, **p;
        struct xfrm_policy *delpol = NULL;
        struct xfrm_policy **newpos = NULL;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
                        if (excl) {
                                write_unlock_bh(&xfrm_policy_lock);
                                return -EEXIST;
                        }
                        *p = pol->next;
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority)
                        continue;
                if (!newpos)
                        newpos = p;
                if (delpol)
                        break;
        }
        if (newpos)
                p = newpos;
        xfrm_pol_hold(policy);
        policy->next = *p;
        *p = policy;
        atomic_inc(&flow_cache_genid);
        policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
        policy->curlft.add_time = (unsigned long)xtime.tv_sec;
        policy->curlft.use_time = 0;
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        write_unlock_bh(&xfrm_policy_lock);

        if (delpol) {
                xfrm_policy_kill(delpol);
        }
        return 0;
}
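
/* Insertion keeps each direction's list sorted by ascending priority. An
 * exact selector match replaces the old entry in place (inheriting its
 * index) unless "excl" demands that no such entry exist, and the flow cache
 * generation counter is bumped so stale cached lookups get revalidated.
 */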

struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
                                      int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
                if (pol->index == id) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

void xfrm_policy_flush(void)
{
        struct xfrm_policy *xp;
        int dir;

        write_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                while ((xp = xfrm_policy_list[dir]) != NULL) {
                        xfrm_policy_list[dir] = xp->next;
                        write_unlock_bh(&xfrm_policy_lock);

                        xfrm_policy_kill(xp);

                        write_lock_bh(&xfrm_policy_lock);
                }
        }
        atomic_inc(&flow_cache_genid);
        write_unlock_bh(&xfrm_policy_lock);
}

int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *xp;
        int dir;
        int count = 0;
        int error = 0;

        read_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
                        count++;
        }

        if (count == 0) {
                error = -ENOENT;
                goto out;
        }

        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
                        error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
                        if (error)
                                goto out;
                }
        }

out:
        read_unlock_bh(&xfrm_policy_lock);
        return error;
}
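
/* A hedged sketch of a walker callback (the name "dump_one" and its body
 * are illustrative only): each policy is visited with its direction and the
 * number of entries remaining after it, and a non-zero return aborts the
 * walk:
 *
 *      static int dump_one(struct xfrm_policy *xp, int dir, int count,
 *                          void *ptr)
 *      {
 *              printk(KERN_DEBUG "policy %u dir %d (%d left)\n",
 *                     xp->index, dir, count);
 *              return 0;
 *      }
 *
 *      err = xfrm_policy_walk(dump_one, NULL);
 */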

/* Find policy to apply to this flow. */

static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
                               void **objp, atomic_t **obj_refp)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
                struct xfrm_selector *sel = &pol->selector;
                int match;

                if (pol->family != family)
                        continue;

                match = xfrm_selector_match(sel, fl, family);
                if (match) {
                        xfrm_pol_hold(pol);
                        break;
                }
        }
        read_unlock_bh(&xfrm_policy_lock);
        if ((*objp = (void *) pol) != NULL)
                *obj_refp = &pol->refcnt;
}

struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        if ((pol = sk->sk_policy[dir]) != NULL) {
                int match = xfrm_selector_match(&pol->selector, fl,
                                                sk->sk_family);
                if (match)
                        xfrm_pol_hold(pol);
                else
                        pol = NULL;
        }
        read_unlock_bh(&xfrm_policy_lock);
        return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        pol->next = xfrm_policy_list[dir];
        xfrm_policy_list[dir] = pol;
        xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct xfrm_policy **polp;

        for (polp = &xfrm_policy_list[dir];
             *polp != NULL; polp = &(*polp)->next) {
                if (*polp == pol) {
                        *polp = pol->next;
                        return pol;
                }
        }
        return NULL;
}

void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        write_lock_bh(&xfrm_policy_lock);
        pol = __xfrm_policy_unlink(pol, dir);
        write_unlock_bh(&xfrm_policy_lock);
        if (pol) {
                if (dir < XFRM_POLICY_MAX)
                        atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
}

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct xfrm_policy *old_pol;

        write_lock_bh(&xfrm_policy_lock);
        old_pol = sk->sk_policy[dir];
        sk->sk_policy[dir] = pol;
        if (pol) {
                pol->curlft.add_time = (unsigned long)xtime.tv_sec;
                pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
                __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
        }
        if (old_pol)
                __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
        write_unlock_bh(&xfrm_policy_lock);

        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
        struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

        if (newp) {
                newp->selector = old->selector;
                newp->lft = old->lft;
                newp->curlft = old->curlft;
                newp->action = old->action;
                newp->flags = old->flags;
                newp->xfrm_nr = old->xfrm_nr;
                newp->index = old->index;
                memcpy(newp->xfrm_vec, old->xfrm_vec,
                       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
                write_lock_bh(&xfrm_policy_lock);
                __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
                write_unlock_bh(&xfrm_policy_lock);
                xfrm_pol_put(newp);
        }
        return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
        struct xfrm_policy *p0 = sk->sk_policy[0],
                           *p1 = sk->sk_policy[1];

        sk->sk_policy[0] = sk->sk_policy[1] = NULL;
        if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
                return -ENOMEM;
        if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
                return -ENOMEM;
        return 0;
}

/* Resolve the list of templates for the flow, given the policy. */

static int
xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
                  struct xfrm_state **xfrm,
                  unsigned short family)
{
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);

        for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                if (tmpl->mode) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                }

                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        for (nx--; nx>=0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}
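
/* Address chaining above: a tunnel-mode template supplies its own endpoint
 * addresses, while a transport-mode template inherits them from the
 * previous step, so daddr/saddr advance to the tunnel endpoints as each
 * state is resolved.
 */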

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
{
        struct dst_entry *x;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return ERR_PTR(-EINVAL);
        x = afinfo->find_bundle(fl, policy);
        xfrm_policy_put_afinfo(afinfo);
        return x;
}

/* Allocate a chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... In short, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
                   struct flowi *fl, struct dst_entry **dst_p,
                   unsigned short family)
{
        int err;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

static inline int policy_to_flow_dir(int dir)
{
        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
            XFRM_POLICY_FWD == FLOW_DIR_FWD)
                return dir;
        switch (dir) {
        default:
        case XFRM_POLICY_IN:
                return FLOW_DIR_IN;
        case XFRM_POLICY_OUT:
                return FLOW_DIR_OUT;
        case XFRM_POLICY_FWD:
                return FLOW_DIR_FWD;
        }
}
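
/* When the XFRM_POLICY_* and FLOW_DIR_* values happen to coincide, the
 * leading test is a compile-time constant and the whole switch is
 * optimized away.
 */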

static int stale_bundle(struct dst_entry *dst);

/* Main function: finds/creates a bundle for a given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
                struct sock *sk, int flags)
{
        struct xfrm_policy *policy;
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
        struct dst_entry *dst, *dst_orig = *dst_p;
        int nx = 0;
        int err;
        u32 genid;
        u16 family = dst_orig->ops->family;
restart:
        genid = atomic_read(&flow_cache_genid);
        policy = NULL;
        if (sk && sk->sk_policy[1])
                policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);

        if (!policy) {
                /* To accelerate a bit...  */
                if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
                        return 0;

                policy = flow_cache_lookup(fl, family,
                                           policy_to_flow_dir(XFRM_POLICY_OUT),
                                           xfrm_policy_lookup);
        }

        if (!policy)
                return 0;

        policy->curlft.use_time = (unsigned long)xtime.tv_sec;

        switch (policy->action) {
        case XFRM_POLICY_BLOCK:
                /* Prohibit the flow */
                xfrm_pol_put(policy);
                return -EPERM;

        case XFRM_POLICY_ALLOW:
                if (policy->xfrm_nr == 0) {
                        /* Flow passes untransformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                /* Try to find a matching bundle.
                 *
                 * LATER: help from flow cache. It is optional, this
                 * is required only for output policy.
                 */
                dst = xfrm_find_bundle(fl, policy, family);
                if (IS_ERR(dst)) {
                        xfrm_pol_put(policy);
                        return PTR_ERR(dst);
                }

                if (dst)
                        break;

                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                if (unlikely(nx<0)) {
                        err = nx;
                        if (err == -EAGAIN && flags) {
                                DECLARE_WAITQUEUE(wait, current);

                                add_wait_queue(&km_waitq, &wait);
                                set_current_state(TASK_INTERRUPTIBLE);
                                schedule();
                                set_current_state(TASK_RUNNING);
                                remove_wait_queue(&km_waitq, &wait);

                                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                                if (nx == -EAGAIN && signal_pending(current)) {
                                        err = -ERESTART;
                                        goto error;
                                }
                                if (nx == -EAGAIN ||
                                    genid != atomic_read(&flow_cache_genid)) {
                                        xfrm_pol_put(policy);
                                        goto restart;
                                }
                                err = nx;
                        }
                        if (err < 0)
                                goto error;
                }
                if (nx == 0) {
                        /* Flow passes untransformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                dst = dst_orig;
                err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

                if (unlikely(err)) {
                        int i;
                        for (i=0; i<nx; i++)
                                xfrm_state_put(xfrm[i]);
                        goto error;
                }

                write_lock_bh(&policy->lock);
                if (unlikely(policy->dead || stale_bundle(dst))) {
                        /* Wow! While we were resolving, this policy went
                         * away. Retry. It is not paranoia: we just cannot
                         * enlist a new bundle on a dead object, and we
                         * cannot enlist stale bundles either.
                         */
                        write_unlock_bh(&policy->lock);

                        xfrm_pol_put(policy);
                        if (dst)
                                dst_free(dst);
                        goto restart;
                }
                dst->next = policy->bundles;
                policy->bundles = dst;
                dst_hold(dst);
                write_unlock_bh(&policy->lock);
        }
        *dst_p = dst;
        dst_release(dst_orig);
        xfrm_pol_put(policy);
        return 0;

error:
        dst_release(dst_orig);
        xfrm_pol_put(policy);
        *dst_p = NULL;
        return err;
}
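
/* The slow path above, in outline: find a policy (socket policy first,
 * then the flow cache), try to reuse a cached bundle, otherwise resolve
 * the templates into states (optionally sleeping on km_waitq while a key
 * manager negotiates the missing SAs), build a new bundle, and enlist it
 * on the policy, rechecking that the policy did not die and the bundle did
 * not go stale in the meantime.
 */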

/* When the skb is transformed back to its "native" form, we have to check
 * policy restrictions. At the moment we do this in a maximally stupid way.
 * Shame on me. :-) Of course, connected sockets must have their policy
 * cached on them.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
              unsigned short family)
{
        if (xfrm_state_kern(x))
                return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
        return  x->id.proto == tmpl->id.proto &&
                (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
                (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
                x->props.mode == tmpl->mode &&
                (tmpl->aalgos & (1<<x->props.aalgo)) &&
                !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
}

static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
               unsigned short family)
{
        int idx = start;

        if (tmpl->optional) {
                if (!tmpl->mode)
                        return start;
        } else
                start = -1;
        for (; idx < sp->len; idx++) {
                if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family))
                        return ++idx;
                if (sp->x[idx].xvec->props.mode)
                        break;
        }
        return start;
}
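
/* xfrm_policy_ok() scans the sec_path from position "start" for a state
 * matching the template. An optional transport-mode template may simply be
 * skipped (start is returned unchanged); a mandatory template that is not
 * found yields -1, which the caller below treats as a reject.
 */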

static int
_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        afinfo->decode_session(skb, fl);
        xfrm_policy_put_afinfo(afinfo);
        return 0;
}

int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct xfrm_policy *pol;
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        /* First, check the used SAs against their selectors. */
        if (skb->sp) {
                int i;

                for (i=skb->sp->len-1; i>=0; i--) {
                        struct sec_decap_state *xvec = &(skb->sp->x[i]);
                        if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
                                return 0;

                        /* If there is a post_input processor, try running it */
                        if (xvec->xvec->type->post_input &&
                            (xvec->xvec->type->post_input)(xvec->xvec,
                                                           &(xvec->decap),
                                                           skb) != 0)
                                return 0;
                }
        }

        pol = NULL;
        if (sk && sk->sk_policy[dir])
                pol = xfrm_sk_policy_lookup(sk, dir, &fl);

        if (!pol)
                pol = flow_cache_lookup(&fl, family,
                                        policy_to_flow_dir(dir),
                                        xfrm_policy_lookup);

        if (!pol)
                return !skb->sp;

        pol->curlft.use_time = (unsigned long)xtime.tv_sec;

        if (pol->action == XFRM_POLICY_ALLOW) {
                struct sec_path *sp;
                static struct sec_path dummy;
                int i, k;

                if ((sp = skb->sp) == NULL)
                        sp = &dummy;

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find the corresponding xfrm.
                 * Order is _important_. Later we will implement some
                 * barriers, but at the moment barriers are implied
                 * between every two transformations.
                 */
                for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
                        if (k < 0)
                                goto reject;
                }

                for (; k < sp->len; k++) {
                        if (sp->x[k].xvec->props.mode)
                                goto reject;
                }

                xfrm_pol_put(pol);
                return 1;
        }

reject:
        xfrm_pol_put(pol);
        return 0;
}
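
/* With no matching policy at all, plain packets are accepted and anything
 * that arrived through a transformation is rejected: hence the
 * "return !skb->sp" above.
 */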

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!stale_bundle(dst))
                return dst;

        dst_release(dst);
        return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
        struct dst_entry *child = dst;

        while (child) {
                if (child->obsolete > 0 ||
                    (child->dev && !netif_running(child->dev)) ||
                    (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
                        return 1;
                }
                child = child->child;
        }

        return 0;
}

static void xfrm_dst_destroy(struct dst_entry *dst)
{
        if (!dst->xfrm)
                return;
        xfrm_state_put(dst->xfrm);
        dst->xfrm = NULL;
}

static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible. Such a dst must be popped before it reaches the
         * point of failure. */
        return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
        if (dst) {
                if (dst->obsolete) {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
{
        int i;
        struct xfrm_policy *pol;
        struct dst_entry *dst, **dstp, *gc_list = NULL;

        read_lock_bh(&xfrm_policy_lock);
        for (i=0; i<2*XFRM_POLICY_MAX; i++) {
                for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
                        write_lock(&pol->lock);
                        dstp = &pol->bundles;
                        while ((dst=*dstp) != NULL) {
                                if (func(dst)) {
                                        *dstp = dst->next;
                                        dst->next = gc_list;
                                        gc_list = dst;
                                } else {
                                        dstp = &dst->next;
                                }
                        }
                        write_unlock(&pol->lock);
                }
        }
        read_unlock_bh(&xfrm_policy_lock);

        while (gc_list) {
                dst = gc_list;
                gc_list = dst->next;
                dst_free(dst);
        }
}

static int unused_bundle(struct dst_entry *dst)
{
        return !atomic_read(&dst->__refcnt);
}

static void __xfrm_garbage_collect(void)
{
        xfrm_prune_bundles(unused_bundle);
}

int xfrm_flush_bundles(void)
{
        xfrm_prune_bundles(stale_bundle);
        return 0;
}

/* Well... that's _TASK_. We need to scan through the transformation list
 * and figure out what MSS TCP should generate in order for the final
 * datagram to fit the MTU. Mama mia... :-)
 *
 * Apparently, an easy way exists, but we used to choose the most
 * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
 *
 * Consider this function as something like dark humour. :-)
 */
static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
{
        int res = mtu - dst->header_len;

        for (;;) {
                struct dst_entry *d = dst;
                int m = res;

                do {
                        struct xfrm_state *x = d->xfrm;
                        if (x) {
                                spin_lock_bh(&x->lock);
                                if (x->km.state == XFRM_STATE_VALID &&
                                    x->type && x->type->get_max_size)
                                        m = x->type->get_max_size(d->xfrm, m);
                                else
                                        m += x->props.header_len;
                                spin_unlock_bh(&x->lock);
                        }
                } while ((d = d->child) != NULL);

                if (m <= mtu)
                        break;
                res -= (m - mtu);
                if (res < 88)
                        return mtu;
        }

        return res + dst->header_len;
}
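
/* In plainer terms: start from the mtu minus the outer header room, run
 * the candidate payload size through every transform's worst-case
 * expansion (get_max_size() where available, header_len otherwise), and
 * shrink the candidate by the overshoot until the expanded datagram fits.
 * Below a floor of 88 bytes the search gives up and returns the plain mtu.
 */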

int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
                err = -ENOBUFS;
        else {
                struct dst_ops *dst_ops = afinfo->dst_ops;
                if (likely(dst_ops->kmem_cachep == NULL))
                        dst_ops->kmem_cachep = xfrm_dst_cache;
                if (likely(dst_ops->check == NULL))
                        dst_ops->check = xfrm_dst_check;
                if (likely(dst_ops->destroy == NULL))
                        dst_ops->destroy = xfrm_dst_destroy;
                if (likely(dst_ops->negative_advice == NULL))
                        dst_ops->negative_advice = xfrm_negative_advice;
                if (likely(dst_ops->link_failure == NULL))
                        dst_ops->link_failure = xfrm_link_failure;
                if (likely(dst_ops->get_mss == NULL))
                        dst_ops->get_mss = xfrm_get_mss;
                if (likely(afinfo->garbage_collect == NULL))
                        afinfo->garbage_collect = __xfrm_garbage_collect;
                xfrm_policy_afinfo[afinfo->family] = afinfo;
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}
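
/* A hedged registration sketch (the "my_" names are illustrative; the real
 * per-family definitions live in the protocol code, e.g. the xfrm4/xfrm6
 * files): a family module fills in its callbacks and lets the defaults
 * above cover any dst_ops hooks it leaves NULL:
 *
 *      static struct xfrm_policy_afinfo my_policy_afinfo = {
 *              .family         = AF_INET,
 *              .lock           = RW_LOCK_UNLOCKED,
 *              .type_map       = &my_type_map,
 *              .dst_ops        = &my_dst_ops,
 *              .dst_lookup     = my_dst_lookup,
 *              .find_bundle    = my_find_bundle,
 *              .bundle_create  = my_bundle_create,
 *              .decode_session = my_decode_session,
 *      };
 *
 *      xfrm_policy_register_afinfo(&my_policy_afinfo);
 */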

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
                if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
                        err = -EINVAL;
                else {
                        struct dst_ops *dst_ops = afinfo->dst_ops;
                        xfrm_policy_afinfo[afinfo->family] = NULL;
                        dst_ops->kmem_cachep = NULL;
                        dst_ops->check = NULL;
                        dst_ops->destroy = NULL;
                        dst_ops->negative_advice = NULL;
                        dst_ops->link_failure = NULL;
                        dst_ops->get_mss = NULL;
                        afinfo->garbage_collect = NULL;
                }
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        if (unlikely(family >= NPROTO))
                return NULL;
        read_lock(&xfrm_policy_afinfo_lock);
        afinfo = xfrm_policy_afinfo[family];
        if (likely(afinfo != NULL))
                read_lock(&afinfo->lock);
        read_unlock(&xfrm_policy_afinfo_lock);
        return afinfo;
}

void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        if (unlikely(afinfo == NULL))
                return;
        read_unlock(&afinfo->lock);
}

static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        switch (event) {
        case NETDEV_DOWN:
                xfrm_flush_bundles();
        }
        return NOTIFY_DONE;
}

struct notifier_block xfrm_dev_notifier = {
        .notifier_call  = xfrm_dev_event,
};

void __init xfrm_policy_init(void)
{
        xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                                           sizeof(struct xfrm_dst),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
        if (!xfrm_dst_cache)
                panic("XFRM: failed to allocate xfrm_dst_cache\n");

        INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
        register_netdevice_notifier(&xfrm_dev_notifier);
}

void __init xfrm_init(void)
{
        xfrm_state_init();
        xfrm_policy_init();
        xfrm_input_init();
}