X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fip_fragment.c;h=da734c4391796e017ef55d5e3e4730bfe16a4490;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=df083ded1109392721501a958087e9e652748e6f;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index df083ded1..da734c439 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -71,8 +75,9 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ + u32 user; u32 saddr; u32 daddr; u16 id; @@ -88,9 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -98,17 +104,15 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; -static rwlock_t ipfrag_lock = RW_LOCK_UNLOCKED; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; +static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -138,27 +142,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -206,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -243,13 +241,13 @@ static void ipq_kill(struct ipq *ipq) /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the threshold. */ -static void __ip_evictor(int threshold) +static void ip_evictor(void) { struct ipq *qp; struct list_head *tmp; int work; - work = atomic_read(&ip_frag_mem) - threshold; + work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh; if (work <= 0) return; @@ -274,11 +272,6 @@ static void __ip_evictor(int threshold) } } -static inline void ip_evictor(void) -{ - __ip_evictor(sysctl_ipfrag_low_thresh); -} - /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -311,21 +304,28 @@ out: /* Creation primitives. */ -static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +static struct ipq *ip_frag_intern(struct ipq *qp_in) { struct ipq *qp; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif + unsigned int hash; write_lock(&ipfrag_lock); + hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr, + qp_in->protocol); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && - qp->protocol == qp_in->protocol) { + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { atomic_inc(&qp->refcnt); write_unlock(&ipfrag_lock); qp_in->last_in |= COMPLETE; @@ -340,10 +340,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -352,7 +349,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) } /* Add an entry to the 'ipq' queue for a newly received IP datagram. */ -static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph) +static struct ipq *ip_frag_create(struct iphdr *iph, u32 user) { struct ipq *qp; @@ -364,43 +361,48 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph) qp->id = iph->id; qp->saddr = iph->saddr; qp->daddr = iph->daddr; + qp->user = user; qp->len = 0; qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); qp->timer.data = (unsigned long) qp; /* pointer to queue */ qp->timer.function = ip_expire; /* expire function */ - qp->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&qp->lock); atomic_set(&qp->refcnt, 1); - return ip_frag_intern(hash, qp); + return ip_frag_intern(qp); out_nomem: - NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); + LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); return NULL; } /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct iphdr *iph) +static inline struct ipq *ip_find(struct iphdr *iph, u32 user) { - __u16 id = iph->id; + __be16 id = iph->id; __u32 saddr = iph->saddr; __u32 daddr = iph->daddr; __u8 protocol = iph->protocol; - unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + unsigned int hash; struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hash = ipqhashfn(id, saddr, daddr, protocol); + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && - qp->protocol == protocol) { + qp->protocol == protocol && + qp->user == user) { atomic_inc(&qp->refcnt); read_unlock(&ipfrag_lock); return qp; @@ -408,7 +410,57 @@ static inline struct ipq *ip_find(struct iphdr *iph) } read_unlock(&ipfrag_lock); - return ip_frag_create(hash, iph); + return ip_frag_create(iph, user); +} + +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; } /* Add new segment to existing queue. */ @@ -421,6 +473,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; @@ -458,7 +516,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (pskb_pull(skb, ihl) == NULL) goto err; - if (pskb_trim(skb, end-offset)) + if (pskb_trim_rcsum(skb, end-offset)) goto err; /* Find out which fragments are in front and at the back of us @@ -534,7 +592,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - qp->stamp = skb->stamp; + skb_get_timestamp(skb, &qp->stamp); qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -616,7 +674,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - head->stamp = qp->stamp; + skb_set_timestamp(head, &qp->stamp); iph = head->nh.iph; iph->frag_off = 0; @@ -626,10 +684,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) return head; out_nomem: - NETDEBUG(if (net_ratelimit()) - printk(KERN_ERR - "IP: queue_glue: no memory for gluing queue %p\n", - qp)); + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); goto out_fail; out_oversize: if (net_ratelimit()) @@ -642,7 +698,7 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -struct sk_buff *ip_defrag(struct sk_buff *skb) +struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) { struct iphdr *iph = skb->nh.iph; struct ipq *qp; @@ -657,7 +713,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) dev = skb->dev; /* Lookup (or create) queue header */ - if ((qp = ip_find(iph)) != NULL) { + if ((qp = ip_find(iph, user)) != NULL) { struct sk_buff *ret = NULL; spin_lock(&qp->lock); @@ -689,10 +745,4 @@ void ipfrag_init(void) add_timer(&ipfrag_secret_timer); } -void ipfrag_flush(void) -{ - __ip_evictor(0); -} - EXPORT_SYMBOL(ip_defrag); -EXPORT_SYMBOL(ipfrag_flush);