/*
 * mm/prio_tree.c - priority search tree for mapping->i_mmap
 *
 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
 *
 * This file is released under the GPL v2.
 *
 * Based on the radix priority search tree proposed by Edward M. McCreight
 * SIAM Journal on Computing, vol. 14, no.2, pages 257-276, May 1985
 *
 * 02Feb2004 Initial version
 */
14 #include <linux/init.h>
15 #include <linux/module.h>
17 #include <linux/prio_tree.h>
/*
 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
 * which is useful for storing intervals, e.g., we can consider a vma as a closed
 * interval of file pages [offset_begin, offset_end], and store all vmas that
 * map a file in a PST. Then, using the PST, we can answer a stabbing query,
 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
 * given input interval X (a set of consecutive file pages), in "O(log n + m)"
 * time where 'log n' is the height of the PST, and 'm' is the number of stored
 * intervals (vmas) that overlap (map) with the input interval X (the set of
 * consecutive file pages).
 *
 * In our implementation, we store closed intervals of the form [radix_index,
 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
 * is designed for storing intervals with unique radix indices, i.e., each
 * interval has a different radix_index. However, this limitation can be easily
 * overcome by using the size, i.e., heap_index - radix_index, as part of the
 * index, so we index the tree using [(radix_index, size), heap_index].
 *
 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
 * machine, the maximum height of a PST can be 64. We can use a balanced version
 * of the priority search tree to optimize the tree height, but the balanced
 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
 */
/*
 * The following macros are used for implementing prio_tree for i_mmap
 */

/* First file page mapped by the vma: its radix index in the PST */
#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
/* Number of file pages spanned by the vma (always >= 1) */
#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* Last file page mapped by the vma; "- 1" keeps a 1-page vma from overflowing */
#define HEAP_INDEX(vma)	  ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
/*
 * Compute the PST indices of @vma: @radix = first mapped file page,
 * @heap = last mapped file page. Wrapped in do/while(0) so the macro
 * behaves as a single statement in all contexts.
 */
#define GET_INDEX_VMA(vma, radix, heap)		\
do {						\
	radix = RADIX_INDEX(vma);		\
	heap = HEAP_INDEX(vma);			\
} while (0)
/*
 * Compute the PST indices of the vma that embeds @node
 * (node is the vma's shared.prio_tree_node member).
 */
#define GET_INDEX(node, radix, heap)		\
do {						\
	struct vm_area_struct *__tmp = \
	    prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
	GET_INDEX_VMA(__tmp, radix, heap);	\
} while (0)
65 static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
67 void __init prio_tree_init(void)
71 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
72 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
73 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
77 * Maximum heap_index that can be stored in a PST with index_bits bits
79 static inline unsigned long prio_tree_maxindex(unsigned int bits)
81 return index_bits_to_maxindex[bits - 1];
/* Forward declaration: prio_tree_expand() below removes old roots while growing */
static void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
87 * Extend a priority search tree so that it can store a node with heap_index
88 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
89 * However, this function is used rarely and the common case performance is
92 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
93 struct prio_tree_node *node, unsigned long max_heap_index)
95 struct prio_tree_node *first = NULL, *prev, *last = NULL;
97 if (max_heap_index > prio_tree_maxindex(root->index_bits))
100 while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
103 if (prio_tree_empty(root))
107 first = root->prio_tree_node;
108 prio_tree_remove(root, root->prio_tree_node);
109 INIT_PRIO_TREE_NODE(first);
113 last = root->prio_tree_node;
114 prio_tree_remove(root, root->prio_tree_node);
115 INIT_PRIO_TREE_NODE(last);
121 INIT_PRIO_TREE_NODE(node);
125 first->parent = node;
129 if (!prio_tree_empty(root)) {
130 last->left = root->prio_tree_node;
131 last->left->parent = last;
134 root->prio_tree_node = node;
139 * Replace a prio_tree_node with a new node and return the old node
141 static struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
142 struct prio_tree_node *old, struct prio_tree_node *node)
144 INIT_PRIO_TREE_NODE(node);
146 if (prio_tree_root(old)) {
147 BUG_ON(root->prio_tree_node != old);
149 * We can reduce root->index_bits here. However, it is complex
150 * and does not help much to improve performance (IMO).
153 root->prio_tree_node = node;
155 node->parent = old->parent;
156 if (old->parent->left == old)
157 old->parent->left = node;
159 old->parent->right = node;
162 if (!prio_tree_left_empty(old)) {
163 node->left = old->left;
164 old->left->parent = node;
167 if (!prio_tree_right_empty(old)) {
168 node->right = old->right;
169 old->right->parent = node;
176 * Insert a prio_tree_node @node into a radix priority search tree @root. The
177 * algorithm typically takes O(log n) time where 'log n' is the number of bits
178 * required to represent the maximum heap_index. In the worst case, the algo
179 * can take O((log n)^2) - check prio_tree_expand.
181 * If a prior node with same radix_index and heap_index is already found in
182 * the tree, then returns the address of the prior node. Otherwise, inserts
183 * @node into the tree and returns @node.
185 static struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
186 struct prio_tree_node *node)
188 struct prio_tree_node *cur, *res = node;
189 unsigned long radix_index, heap_index;
190 unsigned long r_index, h_index, index, mask;
193 GET_INDEX(node, radix_index, heap_index);
195 if (prio_tree_empty(root) ||
196 heap_index > prio_tree_maxindex(root->index_bits))
197 return prio_tree_expand(root, node, heap_index);
199 cur = root->prio_tree_node;
200 mask = 1UL << (root->index_bits - 1);
203 GET_INDEX(cur, r_index, h_index);
205 if (r_index == radix_index && h_index == heap_index)
208 if (h_index < heap_index ||
209 (h_index == heap_index && r_index > radix_index)) {
210 struct prio_tree_node *tmp = node;
211 node = prio_tree_replace(root, cur, node);
215 r_index = radix_index;
218 h_index = heap_index;
223 index = heap_index - radix_index;
228 if (prio_tree_right_empty(cur)) {
229 INIT_PRIO_TREE_NODE(node);
236 if (prio_tree_left_empty(cur)) {
237 INIT_PRIO_TREE_NODE(node);
248 mask = 1UL << (BITS_PER_LONG - 1);
252 /* Should not reach here */
258 * Remove a prio_tree_node @node from a radix priority search tree @root. The
259 * algorithm takes O(log n) time where 'log n' is the number of bits required
260 * to represent the maximum heap_index.
262 static void prio_tree_remove(struct prio_tree_root *root,
263 struct prio_tree_node *node)
265 struct prio_tree_node *cur;
266 unsigned long r_index, h_index_right, h_index_left;
270 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
271 if (!prio_tree_left_empty(cur))
272 GET_INDEX(cur->left, r_index, h_index_left);
278 if (!prio_tree_right_empty(cur))
279 GET_INDEX(cur->right, r_index, h_index_right);
285 /* both h_index_left and h_index_right cannot be 0 */
286 if (h_index_left >= h_index_right)
292 if (prio_tree_root(cur)) {
293 BUG_ON(root->prio_tree_node != cur);
294 INIT_PRIO_TREE_ROOT(root);
298 if (cur->parent->right == cur)
299 cur->parent->right = cur->parent;
301 cur->parent->left = cur->parent;
304 cur = prio_tree_replace(root, cur->parent, cur);
308 * Following functions help to enumerate all prio_tree_nodes in the tree that
309 * overlap with the input interval X [radix_index, heap_index]. The enumeration
310 * takes O(log n + m) time where 'log n' is the height of the tree (which is
311 * proportional to # of bits required to represent the maximum heap_index) and
312 * 'm' is the number of prio_tree_nodes that overlap the interval X.
315 static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
316 unsigned long *r_index, unsigned long *h_index)
318 if (prio_tree_left_empty(iter->cur))
321 GET_INDEX(iter->cur->left, *r_index, *h_index);
323 if (iter->r_index <= *h_index) {
324 iter->cur = iter->cur->left;
327 if (iter->size_level)
330 if (iter->size_level) {
331 BUG_ON(!prio_tree_left_empty(iter->cur));
332 BUG_ON(!prio_tree_right_empty(iter->cur));
334 iter->mask = ULONG_MAX;
336 iter->size_level = 1;
337 iter->mask = 1UL << (BITS_PER_LONG - 1);
346 static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
347 unsigned long *r_index, unsigned long *h_index)
351 if (prio_tree_right_empty(iter->cur))
354 if (iter->size_level)
357 value = iter->value | iter->mask;
359 if (iter->h_index < value)
362 GET_INDEX(iter->cur->right, *r_index, *h_index);
364 if (iter->r_index <= *h_index) {
365 iter->cur = iter->cur->right;
369 if (iter->size_level)
372 if (iter->size_level) {
373 BUG_ON(!prio_tree_left_empty(iter->cur));
374 BUG_ON(!prio_tree_right_empty(iter->cur));
376 iter->mask = ULONG_MAX;
378 iter->size_level = 1;
379 iter->mask = 1UL << (BITS_PER_LONG - 1);
388 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
390 iter->cur = iter->cur->parent;
391 if (iter->mask == ULONG_MAX)
393 else if (iter->size_level == 1)
397 if (iter->size_level)
399 if (!iter->size_level && (iter->value & iter->mask))
400 iter->value ^= iter->mask;
404 static inline int overlap(struct prio_tree_iter *iter,
405 unsigned long r_index, unsigned long h_index)
407 return iter->h_index >= r_index && iter->r_index <= h_index;
413 * Get the first prio_tree_node that overlaps with the interval [radix_index,
414 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
415 * traversal of the tree.
417 static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
419 struct prio_tree_root *root;
420 unsigned long r_index, h_index;
422 INIT_PRIO_TREE_ITER(iter);
425 if (prio_tree_empty(root))
428 GET_INDEX(root->prio_tree_node, r_index, h_index);
430 if (iter->r_index > h_index)
433 iter->mask = 1UL << (root->index_bits - 1);
434 iter->cur = root->prio_tree_node;
437 if (overlap(iter, r_index, h_index))
440 if (prio_tree_left(iter, &r_index, &h_index))
443 if (prio_tree_right(iter, &r_index, &h_index))
454 * Get the next prio_tree_node that overlaps with the input interval in iter
456 static struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
458 unsigned long r_index, h_index;
461 while (prio_tree_left(iter, &r_index, &h_index))
462 if (overlap(iter, r_index, h_index))
465 while (!prio_tree_right(iter, &r_index, &h_index)) {
466 while (!prio_tree_root(iter->cur) &&
467 iter->cur->parent->right == iter->cur)
468 prio_tree_parent(iter);
470 if (prio_tree_root(iter->cur))
473 prio_tree_parent(iter);
476 if (overlap(iter, r_index, h_index))
/*
 * Radix priority search tree for address_space->i_mmap
 *
 * For each vma that maps a unique set of file pages, i.e., a unique
 * [radix_index, heap_index] value, we have a corresponding priority search
 * tree node. If multiple vmas have an identical [radix_index, heap_index]
 * value, then one of them is used as a tree node and the others are stored
 * in a vm_set list. The tree node points to the first vma (head) of the list
 * using vm_set.head.
 *
 * prio_tree_root
 *      |
 *      A       vm_set.head
 *     / \      /
 *    L   R -> H-I-J-K-M-N-O-P-Q-S
 *    ^   ^    <-- vm_set.list -->
 *  tree nodes
 *
 * We need some way to identify whether a vma is a tree node, head of a vm_set
 * list, or just a member of a vm_set list. We cannot use vm_flags to store
 * such information. The reason is, in the above figure, it is possible that
 * vm_flags' of R and H are covered by the different mmap_sems. When R is
 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
 * That's why some trick involving shared.vm_set.parent is used for identifying
 * tree nodes and list head nodes.
 *
 * vma radix priority search tree node rules:
 *
 * vma->shared.vm_set.parent != NULL    ==> a tree node
 *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
 *      vma->shared.vm_set.head == NULL ==> no others map the same range
 *
 * vma->shared.vm_set.parent == NULL
 *      vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
 *      vma->shared.vm_set.head == NULL ==> a list node
 */
520 * Add a new vma known to map the same set of pages as the old vma:
521 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
522 * Note that it just happens to work correctly on i_mmap_nonlinear too.
524 void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
526 /* Leave these BUG_ONs till prio_tree patch stabilizes */
527 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
528 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
530 vma->shared.vm_set.head = NULL;
531 vma->shared.vm_set.parent = NULL;
533 if (!old->shared.vm_set.parent)
534 list_add(&vma->shared.vm_set.list,
535 &old->shared.vm_set.list);
536 else if (old->shared.vm_set.head)
537 list_add_tail(&vma->shared.vm_set.list,
538 &old->shared.vm_set.head->shared.vm_set.list);
540 INIT_LIST_HEAD(&vma->shared.vm_set.list);
541 vma->shared.vm_set.head = old;
542 old->shared.vm_set.head = vma;
546 void vma_prio_tree_insert(struct vm_area_struct *vma,
547 struct prio_tree_root *root)
549 struct prio_tree_node *ptr;
550 struct vm_area_struct *old;
552 vma->shared.vm_set.head = NULL;
554 ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
555 if (ptr != &vma->shared.prio_tree_node) {
556 old = prio_tree_entry(ptr, struct vm_area_struct,
557 shared.prio_tree_node);
558 vma_prio_tree_add(vma, old);
562 void vma_prio_tree_remove(struct vm_area_struct *vma,
563 struct prio_tree_root *root)
565 struct vm_area_struct *node, *head, *new_head;
567 if (!vma->shared.vm_set.head) {
568 if (!vma->shared.vm_set.parent)
569 list_del_init(&vma->shared.vm_set.list);
571 prio_tree_remove(root, &vma->shared.prio_tree_node);
573 /* Leave this BUG_ON till prio_tree patch stabilizes */
574 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
575 if (vma->shared.vm_set.parent) {
576 head = vma->shared.vm_set.head;
577 if (!list_empty(&head->shared.vm_set.list)) {
578 new_head = list_entry(
579 head->shared.vm_set.list.next,
580 struct vm_area_struct,
582 list_del_init(&head->shared.vm_set.list);
586 prio_tree_replace(root, &vma->shared.prio_tree_node,
587 &head->shared.prio_tree_node);
588 head->shared.vm_set.head = new_head;
590 new_head->shared.vm_set.head = head;
593 node = vma->shared.vm_set.head;
594 if (!list_empty(&vma->shared.vm_set.list)) {
595 new_head = list_entry(
596 vma->shared.vm_set.list.next,
597 struct vm_area_struct,
599 list_del_init(&vma->shared.vm_set.list);
600 node->shared.vm_set.head = new_head;
601 new_head->shared.vm_set.head = node;
603 node->shared.vm_set.head = NULL;
609 * Helper function to enumerate vmas that map a given file page or a set of
610 * contiguous file pages. The function returns vmas that at least map a single
611 * page in the given range of contiguous file pages.
613 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
614 struct prio_tree_iter *iter)
616 struct prio_tree_node *ptr;
617 struct vm_area_struct *next;
621 * First call is with NULL vma
623 ptr = prio_tree_first(iter);
625 next = prio_tree_entry(ptr, struct vm_area_struct,
626 shared.prio_tree_node);
627 prefetch(next->shared.vm_set.head);
633 if (vma->shared.vm_set.parent) {
634 if (vma->shared.vm_set.head) {
635 next = vma->shared.vm_set.head;
636 prefetch(next->shared.vm_set.list.next);
640 next = list_entry(vma->shared.vm_set.list.next,
641 struct vm_area_struct, shared.vm_set.list);
642 if (!next->shared.vm_set.head) {
643 prefetch(next->shared.vm_set.list.next);
648 ptr = prio_tree_next(iter);
650 next = prio_tree_entry(ptr, struct vm_area_struct,
651 shared.prio_tree_node);
652 prefetch(next->shared.vm_set.head);