2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
6 * This file is released under the GPL v2.
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
11 * 02Feb2004 Initial version
14 #include <linux/init.h>
15 #include <linux/module.h>
17 #include <linux/prio_tree.h>
20 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
21 * which is useful for storing intervals, e.g., we can consider a vma as a closed
22 * interval of file pages [offset_begin, offset_end], and store all vmas that
23 * map a file in a PST. Then, using the PST, we can answer a stabbing query,
24 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
25 * given input interval X (a set of consecutive file pages), in "O(log n + m)"
26 * time where 'log n' is the height of the PST, and 'm' is the number of stored
27 * intervals (vmas) that overlap (map) with the input interval X (the set of
28 * consecutive file pages).
30 * In our implementation, we store closed intervals of the form [radix_index,
31 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
32 * is designed for storing intervals with unique radix indices, i.e., each
33 * interval has a different radix_index. However, this limitation can be easily
34 * overcome by using the size, i.e., heap_index - radix_index, as part of the
35 * index, so we index the tree using [(radix_index,size), heap_index].
37 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
38 * machine, the maximum height of a PST can be 64. We can use a balanced version
39 * of the priority search tree to optimize the tree height, but the balanced
40 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
44 * The following macros are used for implementing prio_tree for i_mmap
/*
 * Index helpers: a vma mapping file pages is treated as the closed
 * interval [RADIX_INDEX(vma), HEAP_INDEX(vma)] of file page offsets.
 */
/* First file page mapped by the vma (the interval's radix index). */
47 #define RADIX_INDEX(vma) ((vma)->vm_pgoff)
/* Number of file pages spanned by the vma. */
48 #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* Last file page mapped by the vma (the interval's heap index). */
50 #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
/*
 * Load the radix and heap indices of @vma into @radix and @heap.
 * NOTE(review): the do { ... } while (0) wrapper lines appear to have
 * been lost from this copy (embedded source lines 53/56 are missing) —
 * restore them before this file can compile.
 */
52 #define GET_INDEX_VMA(vma, radix, heap) \
54 radix = RADIX_INDEX(vma); \
55 heap = HEAP_INDEX(vma); \
/*
 * Load the indices of the vma embedding @node (node is the vma's
 * shared.prio_tree_node) into @radix and @heap.
 * NOTE(review): same missing-wrapper caveat as GET_INDEX_VMA above.
 */
58 #define GET_INDEX(node, radix, heap) \
60 struct vm_area_struct *__tmp = \
61 prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
62 GET_INDEX_VMA(__tmp, radix, heap); \
/*
 * Lookup table: index_bits_to_maxindex[b - 1] is the maximum heap index
 * that a PST using b index bits can hold (see prio_tree_maxindex()).
 */
65 static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
/*
 * Fill the table once at boot: entry i holds 2^(i+1) - 1.  The last entry
 * is written as ~0UL explicitly, since 1UL << BITS_PER_LONG would be
 * undefined behavior.
 * NOTE(review): the opening brace and the declaration of loop counter 'i'
 * are missing from this copy of the file (interior lines were elided).
 */
67 void __init prio_tree_init(void)
71 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
72 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
73 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
77 * Maximum heap_index that can be stored in a PST with index_bits bits
/*
 * Simple table lookup; @bits is presumably in [1, BITS_PER_LONG]
 * (callers pass root->index_bits) — bits == 0 would underflow the index.
 */
79 static inline unsigned long prio_tree_maxindex(unsigned int bits)
81 return index_bits_to_maxindex[bits - 1];
85 * Extend a priority search tree so that it can store a node with heap_index
86 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
87 * However, this function is used rarely and the common case performance is
90 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
91 struct prio_tree_node *node, unsigned long max_heap_index)
/* Forward declaration: prio_tree_remove() is defined later in this file. */
93 static void prio_tree_remove(struct prio_tree_root *,
94 struct prio_tree_node *);
95 struct prio_tree_node *first = NULL, *prev, *last = NULL;
/*
 * Grow root->index_bits until max_heap_index fits.  Each widening step
 * detaches the current root node so the tree's shape stays consistent
 * with the new bit width; the detached nodes are re-hung below @node.
 * NOTE(review): several interior lines (index_bits increments, loop
 * braces, 'prev' chaining, returns) are elided in this copy — the
 * control flow here is incomplete as shown.
 */
97 if (max_heap_index > prio_tree_maxindex(root->index_bits))
100 while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
103 if (prio_tree_empty(root))
/* Remember the first detached root; later chained under @node. */
107 first = root->prio_tree_node;
108 prio_tree_remove(root, root->prio_tree_node);
109 INIT_PRIO_TREE_NODE(first);
/* Subsequent detached roots; 'last' ends up deepest in the chain. */
113 last = root->prio_tree_node;
114 prio_tree_remove(root, root->prio_tree_node);
115 INIT_PRIO_TREE_NODE(last);
/* @node becomes the new root of the widened tree. */
121 INIT_PRIO_TREE_NODE(node);
125 first->parent = node;
/* Whatever remains of the old tree hangs off the deepest detached node. */
129 if (!prio_tree_empty(root)) {
130 last->left = root->prio_tree_node;
131 last->left->parent = last;
134 root->prio_tree_node = node;
139 * Replace a prio_tree_node with a new node and return the old node
141 static struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
142 struct prio_tree_node *old, struct prio_tree_node *node)
144 INIT_PRIO_TREE_NODE(node);
/* Root case: just repoint the root; index_bits is left untouched. */
146 if (prio_tree_root(old)) {
147 BUG_ON(root->prio_tree_node != old);
149 * We can reduce root->index_bits here. However, it is complex
150 * and does not help much to improve performance (IMO).
153 root->prio_tree_node = node;
/* Non-root: splice @node into @old's parent link (left or right). */
155 node->parent = old->parent;
156 if (old->parent->left == old)
157 old->parent->left = node;
159 old->parent->right = node;
/* Adopt @old's children, if any, fixing their parent back-pointers. */
162 if (!prio_tree_left_empty(old)) {
163 node->left = old->left;
164 old->left->parent = node;
167 if (!prio_tree_right_empty(old)) {
168 node->right = old->right;
169 old->right->parent = node;
176 * Insert a prio_tree_node @node into a radix priority search tree @root. The
177 * algorithm typically takes O(log n) time where 'log n' is the number of bits
178 * required to represent the maximum heap_index. In the worst case, the algo
179 * can take O((log n)^2) - check prio_tree_expand.
181 * If a prior node with same radix_index and heap_index is already found in
182 * the tree, then returns the address of the prior node. Otherwise, inserts
183 * @node into the tree and returns @node.
185 static struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
186 struct prio_tree_node *node)
188 struct prio_tree_node *cur, *res = node;
189 unsigned long radix_index, heap_index;
190 unsigned long r_index, h_index, index, mask;
/*
 * NOTE(review): this body is missing many interior lines in this copy
 * (loop braces, returns, the descent on the radix bits) — comments below
 * describe the visible skeleton only.
 */
192 GET_INDEX(node, radix_index, heap_index);
/* Tree empty or too shallow for this heap index: grow it first. */
195 if (prio_tree_empty(root) ||
196 heap_index > prio_tree_maxindex(root->index_bits))
197 return prio_tree_expand(root, node, heap_index);
/* Descend from the root, examining one radix/size bit per level. */
199 cur = root->prio_tree_node;
200 mask = 1UL << (root->index_bits - 1);
203 GET_INDEX(cur, r_index, h_index);
/* Exact duplicate interval already present: caller gets 'cur'. */
205 if (r_index == radix_index && h_index == heap_index)
/*
 * Heap property: a parent's heap index must dominate its children's.
 * If the incoming node beats 'cur', swap it into cur's place and
 * continue inserting the displaced node instead.
 */
208 if (h_index < heap_index ||
209 (h_index == heap_index && r_index > radix_index)) {
210 struct prio_tree_node *tmp = node;
211 node = prio_tree_replace(root, cur, node);
/* From here on we are inserting the displaced node's indices. */
215 r_index = radix_index;
218 h_index = heap_index;
/*
 * Size part of the key: heap - radix.  Used as the secondary radix
 * index so intervals with equal radix_index still get unique keys.
 */
223 index = heap_index - radix_index;
/* Empty child slot on the chosen side: attach the node and finish. */
228 if (prio_tree_right_empty(cur)) {
229 INIT_PRIO_TREE_NODE(node);
236 if (prio_tree_left_empty(cur)) {
237 INIT_PRIO_TREE_NODE(node);
/* Radix bits exhausted: restart the mask for the size-bit level. */
248 mask = 1UL << (root->index_bits - 1);
252 /* Should not reach here */
258 * Remove a prio_tree_node @node from a radix priority search tree @root. The
259 * algorithm takes O(log n) time where 'log n' is the number of bits required
260 * to represent the maximum heap_index.
262 static void prio_tree_remove(struct prio_tree_root *root,
263 struct prio_tree_node *node)
265 struct prio_tree_node *cur;
266 unsigned long r_index, h_index_right, h_index_left;
/*
 * Sift a hole down from @node: repeatedly step to the child with the
 * larger heap index until reaching a leaf, then detach that leaf and
 * bubble it back up via prio_tree_replace().
 * NOTE(review): 'cur = node;', else-branches zeroing the h_index of an
 * empty side, and the loop braces are elided in this copy.
 */
270 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
271 if (!prio_tree_left_empty(cur))
272 GET_INDEX(cur->left, r_index, h_index_left);
278 if (!prio_tree_right_empty(cur))
279 GET_INDEX(cur->right, r_index, h_index_right);
285 /* both h_index_left and h_index_right cannot be 0 */
286 if (h_index_left >= h_index_right)
/* 'cur' is now a leaf.  If it is the root, the tree becomes empty. */
292 if (prio_tree_root(cur)) {
293 BUG_ON(root->prio_tree_node != cur);
294 INIT_PRIO_TREE_ROOT(root);
/*
 * Detach the leaf: an empty child link is encoded as a pointer back to
 * the node itself (matches the prio_tree_*_empty() convention).
 */
298 if (cur->parent->right == cur)
299 cur->parent->right = cur->parent;
301 cur->parent->left = cur->parent;
/* Walk the detached leaf back up, replacing each ancestor in turn. */
304 cur = prio_tree_replace(root, cur->parent, cur);
308 * Following functions help to enumerate all prio_tree_nodes in the tree that
309 * overlap with the input interval X [radix_index, heap_index]. The enumeration
310 * takes O(log n + m) time where 'log n' is the height of the tree (which is
311 * proportional to # of bits required to represent the maximum heap_index) and
312 * 'm' is the number of prio_tree_nodes that overlap the interval X.
/*
 * Try to move the iterator to the left child if that subtree can still
 * contain an overlapping interval; returns the child on success.
 * NOTE(review): returns, braces and the iter->value updates are elided
 * in this copy of the file.
 */
315 static struct prio_tree_node *prio_tree_left(
316 struct prio_tree_root *root, struct prio_tree_iter *iter,
317 unsigned long radix_index, unsigned long heap_index,
318 unsigned long *r_index, unsigned long *h_index)
320 if (prio_tree_left_empty(iter->cur))
/* Peek at the left child's interval. */
323 GET_INDEX(iter->cur->left, *r_index, *h_index);
/* Heap order: only descend if the child can still reach radix_index. */
325 if (radix_index <= *h_index) {
326 iter->cur = iter->cur->left;
329 if (iter->size_level)
/*
 * size_level tracks whether the descent has moved from the radix-bit
 * part of the key into the size-bit part; once both halves are
 * consumed the node must be a leaf (ULONG_MAX sentinel mask).
 */
332 if (iter->size_level) {
333 BUG_ON(!prio_tree_left_empty(iter->cur));
334 BUG_ON(!prio_tree_right_empty(iter->cur));
336 iter->mask = ULONG_MAX;
/* Radix bits exhausted: restart the mask for the size-bit level. */
338 iter->size_level = 1;
339 iter->mask = 1UL << (root->index_bits - 1);
/*
 * Try to move the iterator to the right child if that subtree can still
 * contain an overlapping interval; returns the child on success.
 * NOTE(review): 'value' declaration, returns and braces are elided in
 * this copy; mirrors prio_tree_left() above plus a radix-prefix check.
 */
348 static struct prio_tree_node *prio_tree_right(
349 struct prio_tree_root *root, struct prio_tree_iter *iter,
350 unsigned long radix_index, unsigned long heap_index,
351 unsigned long *r_index, unsigned long *h_index)
355 if (prio_tree_right_empty(iter->cur))
358 if (iter->size_level)
/* Smallest radix prefix reachable in the right subtree at this level. */
361 value = iter->value | iter->mask;
/* Query ends before anything the right subtree can hold: prune it. */
363 if (heap_index < value)
366 GET_INDEX(iter->cur->right, *r_index, *h_index);
/* Heap order: only descend if the child can still reach radix_index. */
368 if (radix_index <= *h_index) {
369 iter->cur = iter->cur->right;
373 if (iter->size_level)
/* Same size_level / mask bookkeeping as in prio_tree_left(). */
376 if (iter->size_level) {
377 BUG_ON(!prio_tree_left_empty(iter->cur));
378 BUG_ON(!prio_tree_right_empty(iter->cur));
380 iter->mask = ULONG_MAX;
382 iter->size_level = 1;
383 iter->mask = 1UL << (root->index_bits - 1);
/*
 * Move the iterator one level up, undoing the per-level mask/value/
 * size_level bookkeeping done on the way down.
 * NOTE(review): the branch bodies (mask shifts, size_level reset) are
 * elided in this copy of the file.
 */
392 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
394 iter->cur = iter->cur->parent;
/* Leaving a leaf reached with the ULONG_MAX sentinel mask. */
395 if (iter->mask == ULONG_MAX)
/* Leaving the first size-bit level: back into the radix-bit levels. */
397 else if (iter->size_level == 1)
401 if (iter->size_level)
/* Clear the radix bit that was set when we descended right. */
403 if (!iter->size_level && (iter->value & iter->mask))
404 iter->value ^= iter->mask;
/*
 * Do the closed intervals [radix_index, heap_index] and
 * [r_index, h_index] intersect?
 */
408 static inline int overlap(unsigned long radix_index, unsigned long heap_index,
409 unsigned long r_index, unsigned long h_index)
411 return heap_index >= r_index && radix_index <= h_index;
417 * Get the first prio_tree_node that overlaps with the interval [radix_index,
418 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
419 * traversal of the tree.
421 static struct prio_tree_node *prio_tree_first(struct prio_tree_root *root,
422 struct prio_tree_iter *iter, unsigned long radix_index,
423 unsigned long heap_index)
425 unsigned long r_index, h_index;
/*
 * NOTE(review): NULL returns, braces and the traversal loop structure
 * are elided in this copy of the file.
 */
427 INIT_PRIO_TREE_ITER(iter);
429 if (prio_tree_empty(root))
432 GET_INDEX(root->prio_tree_node, r_index, h_index);
/* Root's heap index bounds the whole tree: nothing can overlap. */
434 if (radix_index > h_index)
/* Start the pre-order walk at the root with the top radix bit. */
437 iter->mask = 1UL << (root->index_bits - 1);
438 iter->cur = root->prio_tree_node;
441 if (overlap(radix_index, heap_index, r_index, h_index))
/* Prefer the left subtree, then the right, at each step. */
444 if (prio_tree_left(root, iter, radix_index, heap_index,
448 if (prio_tree_right(root, iter, radix_index, heap_index,
460 * Get the next prio_tree_node that overlaps with the input interval in iter
462 static struct prio_tree_node *prio_tree_next(struct prio_tree_root *root,
463 struct prio_tree_iter *iter, unsigned long radix_index,
464 unsigned long heap_index)
466 unsigned long r_index, h_index;
/*
 * Continue the pre-order walk from iter->cur: first exhaust the left
 * spine, then try right subtrees, climbing back up when a subtree is
 * done.  NOTE(review): 'repeat:' label, returns and braces are elided
 * in this copy of the file.
 */
469 while (prio_tree_left(root, iter, radix_index,
470 heap_index, &r_index, &h_index)) {
471 if (overlap(radix_index, heap_index, r_index, h_index))
475 while (!prio_tree_right(root, iter, radix_index,
476 heap_index, &r_index, &h_index)) {
/* Climb while we are a right child (that subtree is finished). */
477 while (!prio_tree_root(iter->cur) &&
478 iter->cur->parent->right == iter->cur)
479 prio_tree_parent(iter);
/* Back at the root with nothing left to visit: enumeration done. */
481 if (prio_tree_root(iter->cur))
484 prio_tree_parent(iter);
487 if (overlap(radix_index, heap_index, r_index, h_index))
494 * Radix priority search tree for address_space->i_mmap
496 * For each vma that map a unique set of file pages i.e., unique [radix_index,
497 * heap_index] value, we have a corresponding priority search tree node. If
498 * multiple vmas have identical [radix_index, heap_index] value, then one of
499 * them is used as a tree node and others are stored in a vm_set list. The tree
500 * node points to the first vma (head) of the list using vm_set.head.
506 * L R -> H-I-J-K-M-N-O-P-Q-S
507 * ^ ^ <-- vm_set.list -->
510 * We need some way to identify whether a vma is a tree node, head of a vm_set
511 * list, or just a member of a vm_set list. We cannot use vm_flags to store
512 * such information. The reason is, in the above figure, it is possible that
513 * vm_flags' of R and H are covered by the different mmap_sems. When R is
514 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
515 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
516 * That's why some trick involving shared.vm_set.parent is used for identifying
517 * tree nodes and list head nodes.
519 * vma radix priority search tree node rules:
521 * vma->shared.vm_set.parent != NULL ==> a tree node
522 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
523 * vma->shared.vm_set.head == NULL ==> no others map the same range
525 * vma->shared.vm_set.parent == NULL
526 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
527 * vma->shared.vm_set.head == NULL ==> a list node
531 * Add a new vma known to map the same set of pages as the old vma:
532 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
533 * Note that it just happens to work correctly on i_mmap_nonlinear too.
535 void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
537 /* Leave these BUG_ONs till prio_tree patch stabilizes */
538 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
539 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
/*
 * Three cases per the vm_set rules documented above:
 * NOTE(review): the 'else' line before the final three statements is
 * elided in this copy of the file.
 */
/* @old is a plain list member: link @vma next to it. */
541 if (!old->shared.vm_set.parent)
542 list_add(&vma->shared.vm_set.list,
543 &old->shared.vm_set.list);
/* @old is a tree node with an existing vm_set: append to that list. */
544 else if (old->shared.vm_set.head)
545 list_add_tail(&vma->shared.vm_set.list,
546 &old->shared.vm_set.head->shared.vm_set.list);
/* @old is a lone tree node: @vma becomes the head of a new vm_set. */
548 INIT_LIST_HEAD(&vma->shared.vm_set.list);
549 vma->shared.vm_set.head = old;
550 old->shared.vm_set.head = vma;
/*
 * Insert @vma into the i_mmap prio tree @root.  If a vma with the same
 * [radix_index, heap_index] already owns a tree node, @vma is chained
 * onto that node's vm_set list instead of getting its own tree node.
 */
554 void vma_prio_tree_insert(struct vm_area_struct *vma,
555 struct prio_tree_root *root)
557 struct prio_tree_node *ptr;
558 struct vm_area_struct *old;
560 ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
/* prio_tree_insert() returned a pre-existing node: join its vm_set. */
561 if (ptr != &vma->shared.prio_tree_node) {
562 old = prio_tree_entry(ptr, struct vm_area_struct,
563 shared.prio_tree_node);
564 vma_prio_tree_add(vma, old);
/*
 * Remove @vma from the i_mmap prio tree @root, preserving the tree-node/
 * vm_set invariants: if @vma was a tree node with companions, promote a
 * companion into the tree; if it was a list head, pick a new head.
 * NOTE(review): returns, 'else' lines and some list_entry() member
 * arguments are elided in this copy of the file.
 */
568 void vma_prio_tree_remove(struct vm_area_struct *vma,
569 struct prio_tree_root *root)
571 struct vm_area_struct *node, *head, *new_head;
/* No vm_set companions: plain list node or lone tree node. */
573 if (!vma->shared.vm_set.head) {
574 if (!vma->shared.vm_set.parent)
575 list_del_init(&vma->shared.vm_set.list);
577 prio_tree_remove(root, &vma->shared.prio_tree_node);
579 /* Leave this BUG_ON till prio_tree patch stabilizes */
580 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
/* @vma is the tree node: replace it with the vm_set head. */
581 if (vma->shared.vm_set.parent) {
582 head = vma->shared.vm_set.head;
/* If the list has further members, the next one becomes new_head. */
583 if (!list_empty(&head->shared.vm_set.list)) {
584 new_head = list_entry(
585 head->shared.vm_set.list.next,
586 struct vm_area_struct,
588 list_del_init(&head->shared.vm_set.list);
/* Swap the promoted head into @vma's position in the tree. */
592 prio_tree_replace(root, &vma->shared.prio_tree_node,
593 &head->shared.prio_tree_node);
594 head->shared.vm_set.head = new_head;
596 new_head->shared.vm_set.head = head;
/* @vma is the vm_set list head: hand headship to the next member. */
599 node = vma->shared.vm_set.head;
600 if (!list_empty(&vma->shared.vm_set.list)) {
601 new_head = list_entry(
602 vma->shared.vm_set.list.next,
603 struct vm_area_struct,
605 list_del_init(&vma->shared.vm_set.list);
606 node->shared.vm_set.head = new_head;
607 new_head->shared.vm_set.head = node;
/* List became empty: the tree node no longer has a vm_set. */
609 node->shared.vm_set.head = NULL;
615 * Helper function to enumerate vmas that map a given file page or a set of
616 * contiguous file pages. The function returns vmas that at least map a single
617 * page in the given range of contiguous file pages.
/*
 * Iterator entry point: pass @vma = NULL to start, then the previous
 * return value to continue.  Walks both the prio tree and each tree
 * node's vm_set list of identically-indexed vmas.
 * NOTE(review): this function continues beyond the visible portion of
 * the file, and NULL-checks/returns/labels are elided in this copy.
 */
619 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
620 struct prio_tree_root *root, struct prio_tree_iter *iter,
621 pgoff_t begin, pgoff_t end)
623 struct prio_tree_node *ptr;
624 struct vm_area_struct *next;
628 * First call is with NULL vma
630 ptr = prio_tree_first(root, iter, begin, end);
632 next = prio_tree_entry(ptr, struct vm_area_struct,
633 shared.prio_tree_node);
/* Warm the cache for the vm_set walk that usually follows. */
634 prefetch(next->shared.vm_set.head);
/* Previous vma was a tree node: descend into its vm_set list first. */
640 if (vma->shared.vm_set.parent) {
641 if (vma->shared.vm_set.head) {
642 next = vma->shared.vm_set.head;
643 prefetch(next->shared.vm_set.list.next);
/* Previous vma was a list member: advance along the vm_set list. */
647 next = list_entry(vma->shared.vm_set.list.next,
648 struct vm_area_struct, shared.vm_set.list);
/* head != NULL would mean we wrapped back to the list head. */
649 if (!next->shared.vm_set.head) {
650 prefetch(next->shared.vm_set.list.next);
/* vm_set exhausted: fetch the next overlapping tree node. */
655 ptr = prio_tree_next(root, iter, begin, end);
657 next = prio_tree_entry(ptr, struct vm_area_struct,
658 shared.prio_tree_node);
659 prefetch(next->shared.vm_set.head);