X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=net%2Fipv4%2Fip_fragment.c;h=8ce00d3703dacdeb46a387f27a8d5a609f100e7a;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=1f007c67b5e7489d17deeaefbff5e022b926db5c;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1f007c67b..8ce00d370 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,7 +22,7 @@
 * Patrick McHardy : LRU queue of frag heads for evictor.
 */
 
-#include <linux/config.h>
+#include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/mm.h>
@@ -38,6 +38,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/checksum.h>
+#include <net/inetpeer.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/inet.h>
@@ -53,13 +54,15 @@
 * even the most extreme cases without allowing an attacker to measurably
 * harm machine performance.
 */
-int sysctl_ipfrag_high_thresh = 256*1024;
-int sysctl_ipfrag_low_thresh = 192*1024;
+int sysctl_ipfrag_high_thresh __read_mostly = 256*1024;
+int sysctl_ipfrag_low_thresh __read_mostly = 192*1024;
+
+int sysctl_ipfrag_max_dist __read_mostly = 64;
 
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
 */
-int sysctl_ipfrag_time = IP_FRAG_TIME;
+int sysctl_ipfrag_time __read_mostly = IP_FRAG_TIME;
 
 struct ipfrag_skb_cb
 {
@@ -71,11 +74,12 @@ struct ipfrag_skb_cb
 /* Describe an entry in the "incomplete datagrams" queue. */
 struct ipq {
- struct ipq *next; /* linked list pointers */
+ struct hlist_node list;
 struct list_head lru_list; /* lru list member */
- u32 saddr;
- u32 daddr;
- u16 id;
+ u32 user;
+ __be32 saddr;
+ __be32 daddr;
+ __be16 id;
 u8 protocol;
 u8 last_in;
#define COMPLETE 4
@@ -88,9 +92,10 @@ struct ipq {
 spinlock_t lock;
 atomic_t refcnt;
 struct timer_list timer; /* when will this queue expire? */
- struct ipq **pprev;
- int iif;
 struct timeval stamp;
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
 };
 
 /* Hash table. */
@@ -98,17 +103,15 @@ struct ipq {
#define IPQ_HASHSZ 64
 
 /* Per-bucket lock is easy to add now. */
-static struct ipq *ipq_hash[IPQ_HASHSZ];
-static rwlock_t ipfrag_lock = RW_LOCK_UNLOCKED;
+static struct hlist_head ipq_hash[IPQ_HASHSZ];
+static DEFINE_RWLOCK(ipfrag_lock);
 static u32 ipfrag_hash_rnd;
 static LIST_HEAD(ipq_lru_list);
 int ip_frag_nqueues = 0;
 
 static __inline__ void __ipq_unlink(struct ipq *qp)
 {
- if(qp->next)
- qp->next->pprev = qp->pprev;
- *qp->pprev = qp->next;
+ hlist_del(&qp->list);
 list_del(&qp->lru_list);
 ip_frag_nqueues--;
 }
@@ -120,14 +123,15 @@ static __inline__ void ipq_unlink(struct ipq *ipq)
 write_unlock(&ipfrag_lock);
 }
 
-static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
+static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 {
- return jhash_3words((u32)id << 16 | prot, saddr, daddr,
+ return jhash_3words((__force u32)id << 16 | prot,
+ (__force u32)saddr, (__force u32)daddr,
 ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
 }
 
 static struct timer_list ipfrag_secret_timer;
-int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
+int sysctl_ipfrag_secret_interval __read_mostly = 10 * 60 * HZ;
 
 static void ipfrag_secret_rebuild(unsigned long dummy)
 {
@@ -138,27 +142,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy)
 get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
 for (i = 0; i < IPQ_HASHSZ; i++) {
 struct ipq *q;
+ struct hlist_node *p, *n;
 
- q = ipq_hash[i];
- while (q) {
- struct ipq *next = q->next;
+ hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) {
 unsigned int hval = ipqhashfn(q->id, q->saddr,
 q->daddr, q->protocol);
 
 if (hval != i) {
- /* Unlink. */
- if (q->next)
- q->next->pprev = q->pprev;
- *q->pprev = q->next;
+ hlist_del(&q->list);
 
 /* Relink to new hash chain. */
- if ((q->next = ipq_hash[hval]) != NULL)
- q->next->pprev = &q->next;
- ipq_hash[hval] = q;
- q->pprev = &ipq_hash[hval];
+ hlist_add_head(&q->list, &ipq_hash[hval]);
 }
-
- q = next;
 }
 }
 write_unlock(&ipfrag_lock);
@@ -169,14 +164,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy)
 
 atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
 
 /* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct sk_buff *skb)
+static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
 {
+ if (work)
+ *work -= skb->truesize;
 atomic_sub(skb->truesize, &ip_frag_mem);
 kfree_skb(skb);
 }
 
-static __inline__ void frag_free_queue(struct ipq *qp)
+static __inline__ void frag_free_queue(struct ipq *qp, int *work)
 {
+ if (work)
+ *work -= sizeof(struct ipq);
 atomic_sub(sizeof(struct ipq), &ip_frag_mem);
 kfree(qp);
 }
@@ -195,30 +194,33 @@ static __inline__ struct ipq *frag_alloc_queue(void)
 
 /* Destruction primitives. */
 
 /* Complete destruction of ipq. */
-static void ip_frag_destroy(struct ipq *qp)
+static void ip_frag_destroy(struct ipq *qp, int *work)
 {
 struct sk_buff *fp;
 
 BUG_TRAP(qp->last_in&COMPLETE);
 BUG_TRAP(del_timer(&qp->timer) == 0);
 
+ if (qp->peer)
+ inet_putpeer(qp->peer);
+
 /* Release all fragment data. */
 fp = qp->fragments;
 while (fp) {
 struct sk_buff *xp = fp->next;
 
- frag_kfree_skb(fp);
+ frag_kfree_skb(fp, work);
 fp = xp;
 }
 
 /* Finally, release the queue descriptor itself. */
- frag_free_queue(qp);
+ frag_free_queue(qp, work);
 }
 
-static __inline__ void ipq_put(struct ipq *ipq)
+static __inline__ void ipq_put(struct ipq *ipq, int *work)
 {
 if (atomic_dec_and_test(&ipq->refcnt))
- ip_frag_destroy(ipq);
+ ip_frag_destroy(ipq, work);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -237,16 +239,19 @@ static void ipq_kill(struct ipq *ipq)
 }
 
 /* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the low threshold.
+ * fragment queue until we are back under the threshold.
 */
 static void ip_evictor(void)
 {
 struct ipq *qp;
 struct list_head *tmp;
+ int work;
 
- for(;;) {
- if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
- return;
+ work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
+ if (work <= 0)
+ return;
+
+ while (work > 0) {
 read_lock(&ipfrag_lock);
 if (list_empty(&ipq_lru_list)) {
 read_unlock(&ipfrag_lock);
@@ -262,8 +267,8 @@ static void ip_evictor(void)
 ipq_kill(qp);
 spin_unlock(&qp->lock);
 
- ipq_put(qp);
- IP_INC_STATS_BH(ReasmFails);
+ ipq_put(qp, &work);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
 }
 }
 
@@ -281,8 +286,8 @@ static void ip_expire(unsigned long arg)
 
 ipq_kill(qp);
 
- IP_INC_STATS_BH(ReasmTimeout);
- IP_INC_STATS_BH(ReasmFails);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
 
 if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
 struct sk_buff *head = qp->fragments;
@@ -294,30 +299,37 @@ static void ip_expire(unsigned long arg)
 }
out:
 spin_unlock(&qp->lock);
- ipq_put(qp);
+ ipq_put(qp, NULL);
 }
 
 /* Creation primitives. */
-static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
+static struct ipq *ip_frag_intern(struct ipq *qp_in)
 {
 struct ipq *qp;
+#ifdef CONFIG_SMP
+ struct hlist_node *n;
+#endif
+ unsigned int hash;
 
 write_lock(&ipfrag_lock);
+ hash = ipqhashfn(qp_in->id, qp_in->saddr, qp_in->daddr,
+ qp_in->protocol);
#ifdef CONFIG_SMP
 /* With SMP race we have to recheck hash table, because
 * such entry could be created on other cpu, while we
 * promoted read lock to write lock.
 */
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
 if(qp->id == qp_in->id &&
 qp->saddr == qp_in->saddr &&
 qp->daddr == qp_in->daddr &&
- qp->protocol == qp_in->protocol) {
+ qp->protocol == qp_in->protocol &&
+ qp->user == qp_in->user) {
 atomic_inc(&qp->refcnt);
 write_unlock(&ipfrag_lock);
 qp_in->last_in |= COMPLETE;
- ipq_put(qp_in);
+ ipq_put(qp_in, NULL);
 return qp;
 }
 }
@@ -328,10 +340,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
 atomic_inc(&qp->refcnt);
 
 atomic_inc(&qp->refcnt);
- if((qp->next = ipq_hash[hash]) != NULL)
- qp->next->pprev = &qp->next;
- ipq_hash[hash] = qp;
- qp->pprev = &ipq_hash[hash];
+ hlist_add_head(&qp->list, &ipq_hash[hash]);
 INIT_LIST_HEAD(&qp->lru_list);
 list_add_tail(&qp->lru_list, &ipq_lru_list);
 ip_frag_nqueues++;
@@ -340,7 +349,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
 }
 
 /* Add an entry to the 'ipq' queue for a newly received IP datagram. */
-static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph)
+static struct ipq *ip_frag_create(struct iphdr *iph, u32 user)
 {
 struct ipq *qp;
 
@@ -352,43 +361,48 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph)
 qp->id = iph->id;
 qp->saddr = iph->saddr;
 qp->daddr = iph->daddr;
+ qp->user = user;
 qp->len = 0;
 qp->meat = 0;
 qp->fragments = NULL;
 qp->iif = 0;
+ qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
 
 /* Initialize a timer for this entry. */
 init_timer(&qp->timer);
 qp->timer.data = (unsigned long) qp; /* pointer to queue */
 qp->timer.function = ip_expire; /* expire function */
- qp->lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&qp->lock);
 atomic_set(&qp->refcnt, 1);
 
- return ip_frag_intern(hash, qp);
+ return ip_frag_intern(qp);
 
out_nomem:
- NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
 return NULL;
 }
 
 /* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
*/
-static inline struct ipq *ip_find(struct iphdr *iph)
+static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 {
- __u16 id = iph->id;
- __u32 saddr = iph->saddr;
- __u32 daddr = iph->daddr;
+ __be16 id = iph->id;
+ __be32 saddr = iph->saddr;
+ __be32 daddr = iph->daddr;
 __u8 protocol = iph->protocol;
- unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+ unsigned int hash;
 struct ipq *qp;
+ struct hlist_node *n;
 
 read_lock(&ipfrag_lock);
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hash = ipqhashfn(id, saddr, daddr, protocol);
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
 if(qp->id == id &&
 qp->saddr == saddr &&
 qp->daddr == daddr &&
- qp->protocol == protocol) {
+ qp->protocol == protocol &&
+ qp->user == user) {
 atomic_inc(&qp->refcnt);
 read_unlock(&ipfrag_lock);
 return qp;
@@ -396,7 +410,57 @@ static inline struct ipq *ip_find(struct iphdr *iph)
 }
 read_unlock(&ipfrag_lock);
 
- return ip_frag_create(hash, iph);
+ return ip_frag_create(iph, user);
+}
+
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+ struct inet_peer *peer = qp->peer;
+ unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int start, end;
+
+ int rc;
+
+ if (!peer || !max)
+ return 0;
+
+ start = qp->rid;
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+ rc = qp->fragments && (end - start) > max;
+
+ if (rc) {
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ }
+
+ return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+ struct sk_buff *fp;
+
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+ atomic_inc(&qp->refcnt);
+ return -ETIMEDOUT;
+ }
+
+ fp = qp->fragments;
+ do {
+ struct sk_buff *xp = fp->next;
+ frag_kfree_skb(fp, NULL);
+ fp = xp;
+ } while (fp);
+
+ qp->last_in = 0;
+ qp->len = 0;
+ qp->meat = 0;
+ qp->fragments = NULL;
+ qp->iif = 0;
+
+ return 0;
 }
 
 /* Add new segment to existing queue. */
@@ -409,6 +473,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 if (qp->last_in & COMPLETE)
 goto err;
 
+ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+ ipq_kill(qp);
+ goto err;
+ }
+
 offset = ntohs(skb->nh.iph->frag_off);
 flags = offset & ~IP_OFFSET;
 offset &= IP_OFFSET;
@@ -446,7 +516,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 if (pskb_pull(skb, ihl) == NULL)
 goto err;
 
- if (pskb_trim(skb, end-offset))
+ if (pskb_trim_rcsum(skb, end-offset))
 goto err;
 
 /* Find out which fragments are in front and at the back of us
@@ -495,7 +565,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 } else {
 struct sk_buff *free_it = next;
 
- /* Old fragmnet is completely overridden with
+ /* Old fragment is completely overridden with
 * new one drop it.
*/ next = next->next; @@ -506,7 +576,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->fragments = next; qp->meat -= free_it->len; - frag_kfree_skb(free_it); + frag_kfree_skb(free_it, NULL); } } @@ -522,7 +592,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - qp->stamp = skb->stamp; + skb_get_timestamp(skb, &qp->stamp); qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -596,7 +666,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_HW) + else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; atomic_sub(fp->truesize, &ip_frag_mem); @@ -604,20 +674,18 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - head->stamp = qp->stamp; + skb_set_timestamp(head, &qp->stamp); iph = head->nh.iph; iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(ReasmOKs); + IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); qp->fragments = NULL; return head; out_nomem: - NETDEBUG(if (net_ratelimit()) - printk(KERN_ERR - "IP: queue_glue: no memory for gluing queue %p\n", - qp)); + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); goto out_fail; out_oversize: if (net_ratelimit()) @@ -625,18 +693,18 @@ out_oversize: "Oversized IP packet from %d.%d.%d.%d.\n", NIPQUAD(qp->saddr)); out_fail: - IP_INC_STATS_BH(ReasmFails); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); return NULL; } /* Process an incoming IP datagram fragment. */ -struct sk_buff *ip_defrag(struct sk_buff *skb) +struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) { struct iphdr *iph = skb->nh.iph; struct ipq *qp; struct net_device *dev; - IP_INC_STATS_BH(ReasmReqds); + IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) @@ -645,7 +713,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) dev = skb->dev; /* Lookup (or create) queue header */ - if ((qp = ip_find(iph)) != NULL) { + if ((qp = ip_find(iph, user)) != NULL) { struct sk_buff *ret = NULL; spin_lock(&qp->lock); @@ -657,11 +725,11 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ret = ip_frag_reasm(qp, dev); spin_unlock(&qp->lock); - ipq_put(qp); + ipq_put(qp, NULL); return ret; } - IP_INC_STATS_BH(ReasmFails); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return NULL; }
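
Usage note: after this patch, ip_defrag() takes a second argument naming the reassembly context, and ip_find() keys queue lookups on (id, saddr, daddr, protocol, user), so e.g. netfilter conntrack and local delivery never share a queue entry for the same fragments. A minimal caller sketch, modelled on ip_local_deliver() in the same tree; IP_DEFRAG_LOCAL_DELIVER comes from the ip_defrag_users enum in include/net/ip.h and is not shown in this diff:

	/* Reassemble fragments before local delivery.  ip_defrag()
	 * consumes the skb: it returns NULL while the datagram is still
	 * incomplete, and the glued-together skb once the last
	 * fragment has arrived. */
	if (skb->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
		if (!skb)
			return 0;	/* queued; wait for more fragments */
	}
	/* skb now holds the complete datagram */

Because each caller passes its own user value, a fragment queued by one context is invisible to the others, which is what the new qp->user comparison in ip_find() and ip_frag_intern() enforces.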