net/xfrm/xfrm_policy.c (Linux 2.6.8.1 with VServer 1.9.2, patch-2.6.8.1-vs1.9.2.diff)
/*
 * xfrm_policy.c
 *
 * Changes:
 *      Mitsuru KANDA @USAGI
 *      Kazunori MIYAZAWA @USAGI
 *      Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *              IPv6 support
 *      Kazunori MIYAZAWA @USAGI
 *      YOSHIFUJI Hideaki
 *              Split up af-specific portion
 *      Derek Atkins <derek@ihtfp.com>          Add the post_input processor
 *
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <net/xfrm.h>
#include <net/ip.h>

DECLARE_MUTEX(xfrm_cfg_sem);

static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED;

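/* The SPD proper: the first XFRM_POLICY_MAX chains hold the global
 * per-direction (in/out/fwd) policies; the remaining chains hold
 * per-socket policies, linked at index XFRM_POLICY_MAX+dir. */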
struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];

static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED;
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

kmem_cache_t *xfrm_dst_cache;

static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
        LIST_HEAD_INIT(xfrm_policy_gc_list);
static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED;

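/* Register the handler for one transform protocol within a family.
 * Illustrative sketch of a caller (not part of this file): an ESP-style
 * module would register itself at init time roughly like this:
 *
 *        static struct xfrm_type esp_type = {
 *                .description    = "ESP4",
 *                .owner          = THIS_MODULE,
 *                .proto          = IPPROTO_ESP,
 *                ...
 *        };
 *
 *        if (xfrm_register_type(&esp_type, AF_INET) < 0)
 *                ...
 */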
int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (likely(typemap->map[type->proto] == NULL))
                typemap->map[type->proto] = type;
        else
                err = -EEXIST;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct xfrm_type_map *typemap;
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;
        typemap = afinfo->type_map;

        write_lock(&typemap->lock);
        if (unlikely(typemap->map[type->proto] != type))
                err = -ENOENT;
        else
                typemap->map[type->proto] = NULL;
        write_unlock(&typemap->lock);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

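/* Look up the xfrm_type registered for (family, proto) and take a module
 * reference on it.  On a miss, request_module("xfrm-type-<family>-<proto>")
 * is tried once so that a module providing the type can be demand-loaded
 * (if it declares a matching module alias), and the lookup is retried. */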
struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        struct xfrm_type_map *typemap;
        struct xfrm_type *type;
        int modload_attempted = 0;

retry:
        afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return NULL;
        typemap = afinfo->type_map;

        read_lock(&typemap->lock);
        type = typemap->map[proto];
        if (unlikely(type && !try_module_get(type->owner)))
                type = NULL;
        read_unlock(&typemap->lock);
        if (!type && !modload_attempted) {
                xfrm_policy_put_afinfo(afinfo);
                request_module("xfrm-type-%d-%d",
                               (int) family, (int) proto);
                modload_attempted = 1;
                goto retry;
        }

        xfrm_policy_put_afinfo(afinfo);
        return type;
}

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
                    unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        int err = 0;

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        if (likely(afinfo->dst_lookup != NULL))
                err = afinfo->dst_lookup(dst, fl);
        else
                err = -EINVAL;
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

void xfrm_put_type(struct xfrm_type *type)
{
        module_put(type->owner);
}

static inline unsigned long make_jiffies(long secs)
{
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
                return MAX_SCHEDULE_TIMEOUT-1;
        else
                return secs*HZ;
}

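/* Per-policy lifetime timer.  A hard add/use expiry deletes the policy
 * and tells the key manager; a soft expiry only warns the key manager
 * via km_policy_expired() and re-arms the timer XFRM_KM_TIMEOUT seconds
 * later.  The direction is recovered from the low bits of the index. */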
static void xfrm_policy_timer(unsigned long data)
{
        struct xfrm_policy *xp = (struct xfrm_policy*)data;
        unsigned long now = (unsigned long)xtime.tv_sec;
        long next = LONG_MAX;
        int warn = 0;
        int dir;

        read_lock(&xp->lock);

        if (xp->dead)
                goto out;

        dir = xp->index & 7;

        if (xp->lft.hard_add_expires_seconds) {
                long tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.hard_use_expires_seconds) {
                long tmo = xp->lft.hard_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_add_expires_seconds) {
                long tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        if (xp->lft.soft_use_expires_seconds) {
                long tmo = xp->lft.soft_use_expires_seconds +
                        (xp->curlft.use_time ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0);
        if (next != LONG_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        read_unlock(&xp->lock);
        xfrm_pol_put(xp);
        return;

expired:
        read_unlock(&xp->lock);
        km_policy_expired(xp, dir, 1);
        xfrm_policy_delete(xp, dir);
        xfrm_pol_put(xp);
}


/* Allocate an xfrm_policy.  Not used here; it is supposed to be used
 * by pfkeyv2 SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(int gfp)
{
        struct xfrm_policy *policy;

        policy = kmalloc(sizeof(struct xfrm_policy), gfp);

        if (policy) {
                memset(policy, 0, sizeof(struct xfrm_policy));
                atomic_set(&policy->refcnt, 1);
                policy->lock = RW_LOCK_UNLOCKED;
                init_timer(&policy->timer);
                policy->timer.data = (unsigned long)policy;
                policy->timer.function = xfrm_policy_timer;
        }
        return policy;
}

/* Destroy xfrm_policy: descendant resources must already be released by this moment. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
        if (!policy->dead)
                BUG();

        if (policy->bundles)
                BUG();

        if (del_timer(&policy->timer))
                BUG();

        kfree(policy);
}

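/* Policy teardown is deferred: xfrm_policy_kill() marks the policy dead
 * and queues it on xfrm_policy_gc_list; the work item below then frees
 * its cached bundles, drops the pending timer reference, and flushes the
 * flow cache if references are still outstanding. */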
static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
        struct dst_entry *dst;

        while ((dst = policy->bundles) != NULL) {
                policy->bundles = dst->next;
                dst_free(dst);
        }

        if (del_timer(&policy->timer))
                atomic_dec(&policy->refcnt);

        if (atomic_read(&policy->refcnt) > 1)
                flow_cache_flush();

        xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(void *data)
{
        struct xfrm_policy *policy;
        struct list_head *entry, *tmp;
        struct list_head gc_list = LIST_HEAD_INIT(gc_list);

        spin_lock_bh(&xfrm_policy_gc_lock);
        list_splice_init(&xfrm_policy_gc_list, &gc_list);
        spin_unlock_bh(&xfrm_policy_gc_lock);

        list_for_each_safe(entry, tmp, &gc_list) {
                policy = list_entry(entry, struct xfrm_policy, list);
                xfrm_policy_gc_kill(policy);
        }
}

/* Rule must be locked.  Release descendant resources and announce the
 * entry dead.  The rule must already be unlinked from the lists by this
 * moment.
 */

void xfrm_policy_kill(struct xfrm_policy *policy)
{
        write_lock_bh(&policy->lock);
        if (policy->dead)
                goto out;

        policy->dead = 1;

        spin_lock(&xfrm_policy_gc_lock);
        list_add(&policy->list, &xfrm_policy_gc_list);
        spin_unlock(&xfrm_policy_gc_lock);
        schedule_work(&xfrm_policy_gc_work);

out:
        write_unlock_bh(&policy->lock);
}

/* Generate a new index.  KAME seems to generate them ordered by cost,
 * at the price of absolute unpredictability of rule ordering.  That
 * will not pass here: the low three bits of the index encode the
 * direction (idx & 7 == dir), which xfrm_policy_byid() relies on. */
static u32 xfrm_gen_index(int dir)
{
        u32 idx;
        struct xfrm_policy *p;
        static u32 idx_generator;

        for (;;) {
                idx = (idx_generator | dir);
                idx_generator += 8;
                if (idx == 0)
                        idx = 8;
                for (p = xfrm_policy_list[dir]; p; p = p->next) {
                        if (p->index == idx)
                                break;
                }
                if (!p)
                        return idx;
        }
}

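/* Insert a policy into the per-direction list, which is kept sorted by
 * ascending priority.  An existing policy with an identical selector is
 * replaced (the replacement inherits its index) unless excl is set, in
 * which case -EEXIST is returned.  The one-second timer kick starts
 * lifetime accounting for the new entry. */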
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct xfrm_policy *pol, **p;
        struct xfrm_policy *delpol = NULL;
        struct xfrm_policy **newpos = NULL;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
                        if (excl) {
                                write_unlock_bh(&xfrm_policy_lock);
                                return -EEXIST;
                        }
                        *p = pol->next;
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority)
                        continue;
                if (!newpos)
                        newpos = p;
                if (delpol)
                        break;
        }
        if (newpos)
                p = newpos;
        xfrm_pol_hold(policy);
        policy->next = *p;
        *p = policy;
        atomic_inc(&flow_cache_genid);
        policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
        policy->curlft.add_time = (unsigned long)xtime.tv_sec;
        policy->curlft.use_time = 0;
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        write_unlock_bh(&xfrm_policy_lock);

        if (delpol) {
                xfrm_policy_kill(delpol);
        }
        return 0;
}

struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
                                      int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
                if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
{
        struct xfrm_policy *pol, **p;

        write_lock_bh(&xfrm_policy_lock);
        for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
                if (pol->index == id) {
                        xfrm_pol_hold(pol);
                        if (delete)
                                *p = pol->next;
                        break;
                }
        }
        write_unlock_bh(&xfrm_policy_lock);

        if (pol && delete) {
                atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
        return pol;
}

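/* Remove every policy from the main SPD.  The global lock is dropped
 * around each xfrm_policy_kill() call, which takes the per-policy lock
 * and schedules the garbage-collection work. */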
void xfrm_policy_flush(void)
{
        struct xfrm_policy *xp;
        int dir;

        write_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                while ((xp = xfrm_policy_list[dir]) != NULL) {
                        xfrm_policy_list[dir] = xp->next;
                        write_unlock_bh(&xfrm_policy_lock);

                        xfrm_policy_kill(xp);

                        write_lock_bh(&xfrm_policy_lock);
                }
        }
        atomic_inc(&flow_cache_genid);
        write_unlock_bh(&xfrm_policy_lock);
}

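/* Walk the whole SPD, global and per-socket chains alike, calling
 * func(policy, dir, n, data) with n counting down to zero.  A non-zero
 * return from func aborts the walk; -ENOENT is returned if there are
 * no policies at all. */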
int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *xp;
        int dir;
        int count = 0;
        int error = 0;

        read_lock_bh(&xfrm_policy_lock);
        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
                        count++;
        }

        if (count == 0) {
                error = -ENOENT;
                goto out;
        }

        for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
                for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
                        error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
                        if (error)
                                goto out;
                }
        }

out:
        read_unlock_bh(&xfrm_policy_lock);
        return error;
}


/* Find policy to apply to this flow. */

static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
                               void **objp, atomic_t **obj_refp)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
                struct xfrm_selector *sel = &pol->selector;
                int match;

                if (pol->family != family)
                        continue;

                match = xfrm_selector_match(sel, fl, family);
                if (match) {
                        xfrm_pol_hold(pol);
                        break;
                }
        }
        read_unlock_bh(&xfrm_policy_lock);
        if ((*objp = (void *) pol) != NULL)
                *obj_refp = &pol->refcnt;
}

struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
        struct xfrm_policy *pol;

        read_lock_bh(&xfrm_policy_lock);
        if ((pol = sk->sk_policy[dir]) != NULL) {
                int match = xfrm_selector_match(&pol->selector, fl,
                                                sk->sk_family);
                if (match)
                        xfrm_pol_hold(pol);
                else
                        pol = NULL;
        }
        read_unlock_bh(&xfrm_policy_lock);
        return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        pol->next = xfrm_policy_list[dir];
        xfrm_policy_list[dir] = pol;
        xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct xfrm_policy **polp;

        for (polp = &xfrm_policy_list[dir];
             *polp != NULL; polp = &(*polp)->next) {
                if (*polp == pol) {
                        *polp = pol->next;
                        return pol;
                }
        }
        return NULL;
}

void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        write_lock_bh(&xfrm_policy_lock);
        pol = __xfrm_policy_unlink(pol, dir);
        write_unlock_bh(&xfrm_policy_lock);
        if (pol) {
                if (dir < XFRM_POLICY_MAX)
                        atomic_inc(&flow_cache_genid);
                xfrm_policy_kill(pol);
        }
}

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct xfrm_policy *old_pol;

        write_lock_bh(&xfrm_policy_lock);
        old_pol = sk->sk_policy[dir];
        sk->sk_policy[dir] = pol;
        if (pol) {
                pol->curlft.add_time = (unsigned long)xtime.tv_sec;
                pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
                __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
        }
        if (old_pol)
                __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
        write_unlock_bh(&xfrm_policy_lock);

        if (old_pol) {
                xfrm_policy_kill(old_pol);
        }
        return 0;
}

static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
        struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

        if (newp) {
                newp->selector = old->selector;
                newp->lft = old->lft;
                newp->curlft = old->curlft;
                newp->action = old->action;
                newp->flags = old->flags;
                newp->xfrm_nr = old->xfrm_nr;
                newp->index = old->index;
                memcpy(newp->xfrm_vec, old->xfrm_vec,
                       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
                write_lock_bh(&xfrm_policy_lock);
                __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
                write_unlock_bh(&xfrm_policy_lock);
                xfrm_pol_put(newp);
        }
        return newp;
}

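/* Duplicate a socket's policies onto a newly cloned socket; each clone
 * gets its own entry on the per-socket SPD chains. */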
int __xfrm_sk_clone_policy(struct sock *sk)
{
        struct xfrm_policy *p0 = sk->sk_policy[0],
                           *p1 = sk->sk_policy[1];

        sk->sk_policy[0] = sk->sk_policy[1] = NULL;
        if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
                return -ENOMEM;
        if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
                return -ENOMEM;
        return 0;
}

/* Resolve list of templates for the flow, given policy. */

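/* For each template, find a matching xfrm_state (xfrm_state_find() may
 * also ask the key manager to negotiate one).  A tunnel-mode template
 * switches the lookup addresses to its own outer endpoints for the
 * templates that follow; unresolvable optional templates are skipped.
 * Returns the number of states stored in xfrm[], or a negative error
 * (-EAGAIN when a required state is not yet available). */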
static int
xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
                  struct xfrm_state **xfrm,
                  unsigned short family)
{
        int nx;
        int i, error;
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);

        for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                if (tmpl->mode) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                }

                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                }

                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        for (nx--; nx>=0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
{
        struct dst_entry *x;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return ERR_PTR(-EINVAL);
        x = afinfo->find_bundle(fl, policy);
        xfrm_policy_put_afinfo(afinfo);
        return x;
}

/* Allocate a chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics...  In short, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
                   struct flowi *fl, struct dst_entry **dst_p,
                   unsigned short family)
{
        int err;
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
        xfrm_policy_put_afinfo(afinfo);
        return err;
}

static inline int policy_to_flow_dir(int dir)
{
        if (XFRM_POLICY_IN == FLOW_DIR_IN &&
            XFRM_POLICY_OUT == FLOW_DIR_OUT &&
            XFRM_POLICY_FWD == FLOW_DIR_FWD)
                return dir;
        switch (dir) {
        default:
        case XFRM_POLICY_IN:
                return FLOW_DIR_IN;
        case XFRM_POLICY_OUT:
                return FLOW_DIR_OUT;
        case XFRM_POLICY_FWD:
                return FLOW_DIR_FWD;
        };
}

static int stale_bundle(struct dst_entry *dst);

/* Main function: finds/creates a bundle for the given flow.
 *
 * At the moment we eat a raw IP route, mostly to speed up lookups
 * on interfaces with IPsec disabled.
 */
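/* Sketch of the typical call pattern (cf. __xfrm_route_forward() below):
 * the caller first obtains an ordinary route in *dst_p, then
 *
 *        err = xfrm_lookup(&skb->dst, &fl, sk, 0);
 *
 * either leaves the route untouched (no matching policy), swaps it for
 * an IPsec bundle, or fails, e.g. with -EPERM for a blocked flow. */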
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
                struct sock *sk, int flags)
{
        struct xfrm_policy *policy;
        struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
        struct rtable *rt = (struct rtable*)*dst_p;
        struct dst_entry *dst;
        int nx = 0;
        int err;
        u32 genid;
        u16 family = (*dst_p)->ops->family;

        switch (family) {
        case AF_INET:
                if (!fl->fl4_src)
                        fl->fl4_src = rt->rt_src;
                if (!fl->fl4_dst)
                        fl->fl4_dst = rt->rt_dst;
                /* fall through */
        case AF_INET6:
                /* Still not clear... */
        default:
                /* nothing */;
        }

restart:
        genid = atomic_read(&flow_cache_genid);
        policy = NULL;
        if (sk && sk->sk_policy[1])
                policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);

        if (!policy) {
                /* To accelerate a bit...  */
                if ((rt->u.dst.flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
                        return 0;

                policy = flow_cache_lookup(fl, family,
                                           policy_to_flow_dir(XFRM_POLICY_OUT),
                                           xfrm_policy_lookup);
        }

        if (!policy)
                return 0;

        policy->curlft.use_time = (unsigned long)xtime.tv_sec;

        switch (policy->action) {
        case XFRM_POLICY_BLOCK:
                /* Prohibit the flow */
                xfrm_pol_put(policy);
                return -EPERM;

        case XFRM_POLICY_ALLOW:
                if (policy->xfrm_nr == 0) {
                        /* Flow passes untransformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                /* Try to find a matching bundle.
                 *
                 * LATER: help from flow cache. It is optional, this
                 * is required only for output policy.
                 */
                dst = xfrm_find_bundle(fl, policy, family);
                if (IS_ERR(dst)) {
                        xfrm_pol_put(policy);
                        return PTR_ERR(dst);
                }

                if (dst)
                        break;

                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                if (unlikely(nx<0)) {
                        err = nx;
                        if (err == -EAGAIN && flags) {
                                DECLARE_WAITQUEUE(wait, current);

                                add_wait_queue(&km_waitq, &wait);
                                set_current_state(TASK_INTERRUPTIBLE);
                                schedule();
                                set_current_state(TASK_RUNNING);
                                remove_wait_queue(&km_waitq, &wait);

                                nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);

                                if (nx == -EAGAIN && signal_pending(current)) {
                                        err = -ERESTART;
                                        goto error;
                                }
                                if (nx == -EAGAIN ||
                                    genid != atomic_read(&flow_cache_genid)) {
                                        xfrm_pol_put(policy);
                                        goto restart;
                                }
                                err = nx;
                        }
                        if (err < 0)
                                goto error;
                }
                if (nx == 0) {
                        /* Flow passes untransformed. */
                        xfrm_pol_put(policy);
                        return 0;
                }

                dst = &rt->u.dst;
                err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

                if (unlikely(err)) {
                        int i;
                        for (i=0; i<nx; i++)
                                xfrm_state_put(xfrm[i]);
                        goto error;
                }

                write_lock_bh(&policy->lock);
                if (unlikely(policy->dead || stale_bundle(dst))) {
                        /* Wow! While we worked on resolving, this
                         * policy has gone. Retry. It is not paranoia,
                         * we just cannot enlist a new bundle to a dead
                         * object. We can't enlist stale bundles either.
                         */
                        write_unlock_bh(&policy->lock);

                        xfrm_pol_put(policy);
                        if (dst)
                                dst_free(dst);
                        goto restart;
                }
                dst->next = policy->bundles;
                policy->bundles = dst;
                dst_hold(dst);
                write_unlock_bh(&policy->lock);
        }
        *dst_p = dst;
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        return 0;

error:
        ip_rt_put(rt);
        xfrm_pol_put(policy);
        *dst_p = NULL;
        return err;
}

/* When the skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we do this in a maximally
 * stupid way. Shame on me. :-)  Of course, connected sockets must
 * have their policy cached.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
              unsigned short family)
{
        if (xfrm_state_kern(x))
                return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
        return  x->id.proto == tmpl->id.proto &&
                (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
                (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
                x->props.mode == tmpl->mode &&
                (tmpl->aalgos & (1<<x->props.aalgo)) &&
                !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
}

static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
               unsigned short family)
{
        int idx = start;

        if (tmpl->optional) {
                if (!tmpl->mode)
                        return start;
        } else
                start = -1;
        for (; idx < sp->len; idx++) {
                if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family))
                        return ++idx;
                if (sp->x[idx].xvec->props.mode)
                        break;
        }
        return start;
}

static int
_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

        if (unlikely(afinfo == NULL))
                return -EAFNOSUPPORT;

        afinfo->decode_session(skb, fl);
        xfrm_policy_put_afinfo(afinfo);
        return 0;
}

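/* Inbound policy check: each SA that was used on the packet must match
 * its own selector and pass its type's post_input hook; the resulting
 * sec_path must then satisfy the matching policy's template list.  When
 * no policy matches, the packet is acceptable only if it carries no
 * sec_path at all. */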
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct xfrm_policy *pol;
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        /* First, check used SA against their selectors. */
        if (skb->sp) {
                int i;

                for (i=skb->sp->len-1; i>=0; i--) {
                        struct sec_decap_state *xvec = &(skb->sp->x[i]);
                        if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
                                return 0;

                        /* If there is a post_input processor, try running it */
                        if (xvec->xvec->type->post_input &&
                            (xvec->xvec->type->post_input)(xvec->xvec,
                                                           &(xvec->decap),
                                                           skb) != 0)
                                return 0;
                }
        }

        pol = NULL;
        if (sk && sk->sk_policy[dir])
                pol = xfrm_sk_policy_lookup(sk, dir, &fl);

        if (!pol)
                pol = flow_cache_lookup(&fl, family,
                                        policy_to_flow_dir(dir),
                                        xfrm_policy_lookup);

        if (!pol)
                return !skb->sp;

        pol->curlft.use_time = (unsigned long)xtime.tv_sec;

        if (pol->action == XFRM_POLICY_ALLOW) {
                struct sec_path *sp;
                static struct sec_path dummy;
                int i, k;

                if ((sp = skb->sp) == NULL)
                        sp = &dummy;

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find corresponding xfrm.
                 * Order is _important_. Later we will implement
                 * some barriers, but at the moment barriers
                 * are implied between each two transformations.
                 */
                for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
                        if (k < 0)
                                goto reject;
                }

                for (; k < sp->len; k++) {
                        if (sp->x[k].xvec->props.mode)
                                goto reject;
                }

                xfrm_pol_put(pol);
                return 1;
        }

reject:
        xfrm_pol_put(pol);
        return 0;
}

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct flowi fl;

        if (_decode_session(skb, &fl, family) < 0)
                return 0;

        return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!stale_bundle(dst))
                return dst;

        dst_release(dst);
        return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
        struct dst_entry *child = dst;

        while (child) {
                if (child->obsolete > 0 ||
                    (child->dev && !netif_running(child->dev)) ||
                    (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
                        return 1;
                }
                child = child->child;
        }

        return 0;
}

static void xfrm_dst_destroy(struct dst_entry *dst)
{
        if (!dst->xfrm)
                return;
        xfrm_state_put(dst->xfrm);
        dst->xfrm = NULL;
}

static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible. Such a dst must be popped before it reaches the
         * point of failure. */
        return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
        if (dst) {
                if (dst->obsolete) {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

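/* Walk every policy's bundle list and move the bundles that func()
 * flags (stale or merely unused, depending on the caller) onto a
 * private list, then free them outside the policy locks. */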
static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
{
        int i;
        struct xfrm_policy *pol;
        struct dst_entry *dst, **dstp, *gc_list = NULL;

        read_lock_bh(&xfrm_policy_lock);
        for (i=0; i<2*XFRM_POLICY_MAX; i++) {
                for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
                        write_lock(&pol->lock);
                        dstp = &pol->bundles;
                        while ((dst=*dstp) != NULL) {
                                if (func(dst)) {
                                        *dstp = dst->next;
                                        dst->next = gc_list;
                                        gc_list = dst;
                                } else {
                                        dstp = &dst->next;
                                }
                        }
                        write_unlock(&pol->lock);
                }
        }
        read_unlock_bh(&xfrm_policy_lock);

        while (gc_list) {
                dst = gc_list;
                gc_list = dst->next;
                dst_free(dst);
        }
}

static int unused_bundle(struct dst_entry *dst)
{
        return !atomic_read(&dst->__refcnt);
}

static void __xfrm_garbage_collect(void)
{
        xfrm_prune_bundles(unused_bundle);
}

int xfrm_flush_bundles(void)
{
        xfrm_prune_bundles(stale_bundle);
        return 0;
}

/* Well... that's _TASK_. We need to scan through the transformation
 * list and figure out what MSS TCP should generate so that the final
 * datagram fits the MTU. Mama mia... :-)
 *
 * Apparently, some easy way exists, but we used to choose the most
 * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
 *
 * Consider this function as something like dark humour. :-)
 */
static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
{
        int res = mtu - dst->header_len;

        for (;;) {
                struct dst_entry *d = dst;
                int m = res;

                do {
                        struct xfrm_state *x = d->xfrm;
                        if (x) {
                                spin_lock_bh(&x->lock);
                                if (x->km.state == XFRM_STATE_VALID &&
                                    x->type && x->type->get_max_size)
                                        m = x->type->get_max_size(d->xfrm, m);
                                else
                                        m += x->props.header_len;
                                spin_unlock_bh(&x->lock);
                        }
                } while ((d = d->child) != NULL);

                if (m <= mtu)
                        break;
                res -= (m - mtu);
                if (res < 88)
                        return mtu;
        }

        return res + dst->header_len;
}

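/* Plug an address family's policy operations into the table.  Any
 * dst_ops hook the af module left NULL is filled in with the generic
 * xfrm default; unregistering clears them all again. */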
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
                err = -ENOBUFS;
        else {
                struct dst_ops *dst_ops = afinfo->dst_ops;
                if (likely(dst_ops->kmem_cachep == NULL))
                        dst_ops->kmem_cachep = xfrm_dst_cache;
                if (likely(dst_ops->check == NULL))
                        dst_ops->check = xfrm_dst_check;
                if (likely(dst_ops->destroy == NULL))
                        dst_ops->destroy = xfrm_dst_destroy;
                if (likely(dst_ops->negative_advice == NULL))
                        dst_ops->negative_advice = xfrm_negative_advice;
                if (likely(dst_ops->link_failure == NULL))
                        dst_ops->link_failure = xfrm_link_failure;
                if (likely(dst_ops->get_mss == NULL))
                        dst_ops->get_mss = xfrm_get_mss;
                if (likely(afinfo->garbage_collect == NULL))
                        afinfo->garbage_collect = __xfrm_garbage_collect;
                xfrm_policy_afinfo[afinfo->family] = afinfo;
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        int err = 0;
        if (unlikely(afinfo == NULL))
                return -EINVAL;
        if (unlikely(afinfo->family >= NPROTO))
                return -EAFNOSUPPORT;
        write_lock(&xfrm_policy_afinfo_lock);
        if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
                if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
                        err = -EINVAL;
                else {
                        struct dst_ops *dst_ops = afinfo->dst_ops;
                        xfrm_policy_afinfo[afinfo->family] = NULL;
                        dst_ops->kmem_cachep = NULL;
                        dst_ops->check = NULL;
                        dst_ops->destroy = NULL;
                        dst_ops->negative_advice = NULL;
                        dst_ops->link_failure = NULL;
                        dst_ops->get_mss = NULL;
                        afinfo->garbage_collect = NULL;
                }
        }
        write_unlock(&xfrm_policy_afinfo_lock);
        return err;
}

struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        struct xfrm_policy_afinfo *afinfo;
        if (unlikely(family >= NPROTO))
                return NULL;
        read_lock(&xfrm_policy_afinfo_lock);
        afinfo = xfrm_policy_afinfo[family];
        if (likely(afinfo != NULL))
                read_lock(&afinfo->lock);
        read_unlock(&xfrm_policy_afinfo_lock);
        return afinfo;
}

void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
        if (unlikely(afinfo == NULL))
                return;
        read_unlock(&afinfo->lock);
}

static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        switch (event) {
        case NETDEV_DOWN:
                xfrm_flush_bundles();
        }
        return NOTIFY_DONE;
}

struct notifier_block xfrm_dev_notifier = {
        xfrm_dev_event,
        NULL,
        0
};

void __init xfrm_policy_init(void)
{
        xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
                                           sizeof(struct xfrm_dst),
                                           0, SLAB_HWCACHE_ALIGN,
                                           NULL, NULL);
        if (!xfrm_dst_cache)
                panic("XFRM: failed to allocate xfrm_dst_cache\n");

        INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
        register_netdevice_notifier(&xfrm_dev_notifier);
}

void __init xfrm_init(void)
{
        xfrm_state_init();
        xfrm_policy_init();
        xfrm_input_init();
}
