2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12 ** directory_part_size
16 ** are_leaves_removable
20 ** is_left_neighbor_in_cache
24 ** can_node_be_removed
26 ** dc_check_balance_internal
27 ** dc_check_balance_leaf
38 #include <linux/config.h>
39 #include <linux/time.h>
40 #include <linux/string.h>
41 #include <linux/reiserfs_fs.h>
42 #include <linux/buffer_head.h>
45 /* To make any changes in the tree we find a node, that contains item
46 to be changed/deleted or position in the node we insert a new item
47 to. We call this node S. To do balancing we need to decide what we
48 will shift to left/right neighbor, or to a new node, where new item
49 will be etc. To make this analysis simpler we build virtual
50 node. Virtual node is an array of items, that will replace items of
51 node S. (For instance if we are going to delete an item, virtual
52 node does not contain it). Virtual node keeps information about
53 item sizes and types, mergeability of first and last items, sizes
54 of all entries in directory item. We use this array of items when
55 calculating what we can shift to neighbors and how many nodes we
56 have to have if we do not do any shifting, if we shift to left/right
57 neighbor or to both. */
60 /* taking item number in virtual node, returns number of item, that it has in source buffer */
/* Map a virtual-node item index back to its index in the source node.
   For PASTE/CUT, or for any item left of the affected slot, the index is
   unchanged; for INSERT the visible RFALSE checks the inserted-item case,
   and otherwise the mode must be M_DELETE.
   NOTE(review): only a sampled subset of this function's lines is visible;
   the return statements for each branch are not shown here. */
61 static inline int old_item_num (int new_num, int affected_item_num, int mode)
63 if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
66 if (mode == M_INSERT) {
69 "vs-8005: for INSERT mode and item number of inserted item");
74 RFALSE( mode != M_DELETE,
75 "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
/* Build the virtual node describing S[h] as it will look after the pending
   operation (tb->tb_vn->vn_mode): size, item count, per-item virtual_item
   descriptors, and left/right mergeability flags taken from the delimiting
   keys.  Panics if the virtual-item work area (tb->vn_buf) is exhausted.
   NOTE(review): only a sampled subset of lines is visible here. */
80 static void create_virtual_node (struct tree_balance * tb, int h)
82 struct item_head * ih;
83 struct virtual_node * vn = tb->tb_vn;
85 struct buffer_head * Sh; /* this comes from tb->S[h] */
87 Sh = PATH_H_PBUFFER (tb->tb_path, h);
89 /* size of changed node */
90 vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
92 /* for internal nodes the array of virtual items is not created */
94 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
98 /* number of items in virtual node */
99 vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
101 /* first virtual item */
102 vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103 memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
104 vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
107 /* first item in the node */
108 ih = B_N_PITEM_HEAD (Sh, 0);
110 /* define the mergeability for 0-th item (if it is not being deleted) */
111 if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
114 /* go through all items that remain in the virtual node (except for the new (inserted) one) */
115 for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
117 struct virtual_item * vi = vn->vn_vi + new_num;
118 int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
121 if (is_affected && vn->vn_mode == M_INSERT)
124 /* get item number in source node */
125 j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
127 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129 vi->vi_item = B_I_PITEM (Sh, ih + j);
130 vi->vi_uarea = vn->vn_free_ptr;
132 // FIXME: there is no check, that item operation did not
133 // consume too much memory
134 vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
135 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
136 reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
137 "virtual node space consumed");
140 /* this is not being changed */
143 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
144 vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
145 vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
150 /* virtual inserted item is not defined yet */
151 if (vn->vn_mode == M_INSERT) {
152 struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
154 RFALSE( vn->vn_ins_ih == 0,
155 "vs-8040: item header of inserted item is not specified");
156 vi->vi_item_len = tb->insert_size[0];
157 vi->vi_ih = vn->vn_ins_ih;
158 vi->vi_item = vn->vn_data;
159 vi->vi_uarea = vn->vn_free_ptr;
161 op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
164 /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
168 key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
169 if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
170 vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
171 vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
173 #ifdef CONFIG_REISERFS_CHECK
174 if (op_is_left_mergeable (key, Sh->b_size) &&
175 !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
176 /* we delete last item and it could be merged with right neighbor's first item */
177 if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
178 I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
179 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
180 print_block (Sh, 0, -1, -1);
181 reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
182 key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
184 /* we can delete directory item, that has only one directory entry in it */
193 /* using virtual node check, how many items can be shifted to left
/* Compute tb->lnum[h]/tb->lbytes: how many whole items (and how many bytes
   of a possibly-split boundary item) fit into the left neighbor, given
   cur_free bytes of free space there.  For internal levels (the early
   tb->lnum[h] assignment) only key/pointer pairs are counted.
   NOTE(review): only a sampled subset of lines is visible here. */
195 static void check_left (struct tree_balance * tb, int h, int cur_free)
198 struct virtual_node * vn = tb->tb_vn;
199 struct virtual_item * vi;
202 RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
206 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
212 if (!cur_free || !vn->vn_nr_item) {
213 /* no free space or nothing to move */
219 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
220 "vs-8055: parent does not exist or invalid");
/* NOTE(review): the assertion id "vs-8055" below duplicates the one above;
   ids are normally unique — worth confirming against upstream. */
223 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
224 /* all contents of S[0] fits into L[0] */
226 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
227 "vs-8055: invalid mode or balance condition failed");
229 tb->lnum[0] = vn->vn_nr_item;
235 d_size = 0, ih_size = IH_SIZE;
237 /* first item may be merged with last item in left neighbor */
238 if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
239 d_size = -((int)IH_SIZE), ih_size = 0;
242 for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
243 d_size += vi->vi_item_len;
244 if (cur_free >= d_size) {
245 /* the item can be shifted entirely */
251 /* the item cannot be shifted entirely, try to split it */
252 /* check whether L[0] can hold ih and at least one byte of the item body */
253 if (cur_free <= ih_size) {
254 /* cannot shift even a part of the current item */
260 tb->lbytes = op_check_left (vi, cur_free, 0, 0);
261 if (tb->lbytes != -1)
262 /* count partially shifted item */
272 /* using virtual node check, how many items can be shifted to right
/* Mirror of check_left: compute tb->rnum[h]/tb->rbytes for the right
   neighbor, walking the virtual items from the last one backwards.
   NOTE(review): only a sampled subset of lines is visible here. */
274 static void check_right (struct tree_balance * tb, int h, int cur_free)
277 struct virtual_node * vn = tb->tb_vn;
278 struct virtual_item * vi;
281 RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
285 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
291 if (!cur_free || !vn->vn_nr_item) {
298 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
299 "vs-8075: parent does not exist or invalid");
301 vi = vn->vn_vi + vn->vn_nr_item - 1;
302 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
303 /* all contents of S[0] fits into R[0] */
305 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
306 "vs-8080: invalid mode or balance condition failed");
308 tb->rnum[h] = vn->vn_nr_item;
313 d_size = 0, ih_size = IH_SIZE;
315 /* last item may be merged with first item in right neighbor */
316 if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
317 d_size = -(int)IH_SIZE, ih_size = 0;
320 for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
321 d_size += vi->vi_item_len;
322 if (cur_free >= d_size) {
323 /* the item can be shifted entirely */
329 /* check whether R[0] can hold ih and at least one byte of the item body */
330 if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */
335 /* R[0] can hold the header of the item and at least one byte of its body */
336 cur_free -= ih_size; /* cur_free is still > 0 */
338 tb->rbytes = op_check_right (vi, cur_free);
339 if (tb->rbytes != -1)
340 /* count partially shifted item */
351 * from - number of items, which are shifted to left neighbor entirely
352 * to - number of item, which are shifted to right neighbor entirely
353 * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
354 * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
/* Decide how the items that stay in S[h] (after 'from' items/from_bytes go
   left and 'to' items/to_bytes go right) are distributed over S[0] and up
   to two new nodes.  Fills snum012[0..2] with item counts and snum012[3..4]
   with s1bytes/s2bytes for split boundary items; returns the number of
   nodes needed.  NOTE(review): only a sampled subset of lines is visible. */
355 static int get_num_ver (int mode, struct tree_balance * tb, int h,
356 int from, int from_bytes,
357 int to, int to_bytes,
358 short * snum012, int flow
365 struct virtual_node * vn = tb->tb_vn;
366 // struct virtual_item * vi;
368 int total_node_size, max_node_size, current_item_size;
370 int start_item, /* position of item we start filling node from */
371 end_item, /* position of item we finish filling node by */
372 start_bytes,/* number of first bytes (entries for directory) of start_item-th item
373 we do not include into node that is being filled */
374 end_bytes; /* number of last bytes (entries for directory) of end_item-th item
375 we do not include into node that is being filled */
376 int split_item_positions[2]; /* these are positions in virtual item of
377 items, that are split between S[0] and
378 S1new and S1new and S2new */
380 split_item_positions[0] = -1;
381 split_item_positions[1] = -1;
383 /* We only create additional nodes if we are in insert or paste mode
384 or we are in replace mode at the internal level. If h is 0 and
385 the mode is M_REPLACE then in fix_nodes we change the mode to
386 paste or insert before we get here in the code. */
387 RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
388 "vs-8100: insert_size < 0 in overflow");
390 max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
392 /* snum012 [0-2] - number of items, that lay
393 to S[0], first new node and second new node */
394 snum012[3] = -1; /* s1bytes */
395 snum012[4] = -1; /* s2bytes */
/* internal levels: only key/pointer pairs, no byte-splitting needed */
399 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
400 if (i == max_node_size)
402 return (i / max_node_size + 1);
408 cur_free = max_node_size;
410 // start from 'from'-th item
412 // skip its first 'start_bytes' units
413 start_bytes = ((from_bytes != -1) ? from_bytes : 0);
415 // last included item is the 'end_item'-th one
416 end_item = vn->vn_nr_item - to - 1;
417 // do not count last 'end_bytes' units of 'end_item'-th item
418 end_bytes = (to_bytes != -1) ? to_bytes : 0;
420 /* go through all items beginning from the start_item-th item and ending by
421 the end_item-th item. Do not count first 'start_bytes' units of
422 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
424 for (i = start_item; i <= end_item; i ++) {
425 struct virtual_item * vi = vn->vn_vi + i;
426 int skip_from_end = ((i == end_item) ? end_bytes : 0);
428 RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
430 /* get size of current item */
431 current_item_size = vi->vi_item_len;
433 /* do not take in calculation head part (from_bytes) of from-th item */
434 current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
436 /* do not take in calculation tail part of last item */
437 current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
439 /* if item fits into current node entirely */
440 if (total_node_size + current_item_size <= max_node_size) {
441 snum012[needed_nodes - 1] ++;
442 total_node_size += current_item_size;
447 if (current_item_size > max_node_size) {
448 /* virtual item length is longer, than max size of item in
449 a node. It is impossible for direct item */
450 RFALSE( is_direct_le_ih (vi->vi_ih),
452 "direct item length is %d. It can not be longer than %d",
453 current_item_size, max_node_size);
454 /* we will try to split it */
459 /* as we do not split items, take new node and continue */
460 needed_nodes ++; i --; total_node_size = 0;
464 // calculate number of item units which fit into node being
469 free_space = max_node_size - total_node_size - IH_SIZE;
470 units = op_check_left (vi, free_space, start_bytes, skip_from_end);
472 /* nothing fits into current node, take new node and continue */
473 needed_nodes ++, i--, total_node_size = 0;
478 /* something fits into the current node */
479 //if (snum012[3] != -1 || needed_nodes != 1)
480 // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
481 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
482 start_bytes += units;
483 snum012[needed_nodes - 1 + 3] = units;
485 if (needed_nodes > 2)
486 reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: "
487 "split_item_position is out of boundary");
488 snum012[needed_nodes - 1] ++;
489 split_item_positions[needed_nodes - 1] = i;
491 /* continue from the same item with start_bytes != -1 */
497 // sum012[4] (if it is not -1) contains number of units of which
498 // are to be in S1new, snum012[3] - to be in S0. They are supposed
499 // to be S1bytes and S2bytes correspondingly, so recalculate
500 if (snum012[4] > 0) {
502 int bytes_to_r, bytes_to_l;
505 split_item_num = split_item_positions[1];
506 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
507 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
508 bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
511 snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
513 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY)
514 reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not "
518 /* now we know S2bytes, calculate S1bytes */
519 if (snum012[3] > 0) {
521 int bytes_to_r, bytes_to_l;
524 split_item_num = split_item_positions[0];
525 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
526 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
527 bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
530 snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
537 #ifdef CONFIG_REISERFS_CHECK
538 extern struct tree_balance * cur_tb;
542 /* Set parameters for balancing.
543 * Performs write of results of analysis of balancing into structure tb,
544 * where it will later be used by the functions that actually do the balancing.
546 * tb tree_balance structure;
547 * h current level of the node;
548 * lnum number of items from S[h] that must be shifted to L[h];
549 * rnum number of items from S[h] that must be shifted to R[h];
550 * blk_num number of blocks that S[h] will be split into;
551 * s012 number of items that fall into split nodes.
552 * lbytes number of bytes which flow to the left neighbor from the item that is not
553 * shifted entirely
554 * rbytes number of bytes which flow to the right neighbor from the item that is not
555 * shifted entirely
556 * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
/* NOTE(review): only a sampled subset of this function's lines is visible;
   s012 may be NULL for levels where no split occurs (callers pass NULL). */
559 static void set_parameters (struct tree_balance * tb, int h, int lnum,
560 int rnum, int blk_num, short * s012, int lb, int rb)
565 tb->blknum[h] = blk_num;
568 { /* only for leaf level */
571 tb->s0num = * s012 ++,
572 tb->s1num = * s012 ++,
573 tb->s2num = * s012 ++;
574 tb->s1bytes = * s012 ++;
575 tb->s2bytes = * s012;
580 PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
581 PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
583 PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
584 PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
589 /* check, does node disappear if we shift tb->lnum[0] items to left
590 neighbor and tb->rnum[0] to the right one. */
/* Returns whether S[0] can be emptied entirely into its neighbors; on
   success records the shift plan via set_parameters.  A single remaining
   item may still be split between both neighbors if lbytes+rbytes cover it.
   NOTE(review): only a sampled subset of lines is visible here. */
591 static int is_leaf_removable (struct tree_balance * tb)
593 struct virtual_node * vn = tb->tb_vn;
594 int to_left, to_right;
598 /* number of items, that will be shifted to left (right) neighbor
600 to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
601 to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
602 remain_items = vn->vn_nr_item;
604 /* how many items remain in S[0] after shiftings to neighbors */
605 remain_items -= (to_left + to_right);
607 if (remain_items < 1) {
608 /* all content of node can be shifted to neighbors */
609 set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);
613 if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
614 /* S[0] is not removable */
617 /* check, whether we can divide 1 remaining item between neighbors */
619 /* get size of remaining item (in item units) */
620 size = op_unit_num (&(vn->vn_vi[to_left]));
622 if (tb->lbytes + tb->rbytes >= size) {
623 set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
631 /* check whether L, S, R can be joined in one node */
/* Returns whether the contents of L[0], S[0] and R[0] fit into a single
   node (accounting for headers saved by mergeable boundary items); on
   success records the join via set_parameters.
   NOTE(review): only a sampled subset of lines is visible here. */
632 static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
634 struct virtual_node * vn = tb->tb_vn;
636 struct buffer_head *S0;
638 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
641 if (vn->vn_nr_item) {
642 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
645 if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
648 /* there was only one item and it will be deleted */
649 struct item_head * ih;
651 RFALSE( B_NR_ITEMS (S0) != 1,
652 "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
654 ih = B_N_PITEM_HEAD (S0, 0);
655 if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
656 if (is_direntry_le_ih (ih)) {
657 /* Directory must be in correct state here: that is
658 somewhere at the left side should exist first directory
659 item. But the item being deleted can not be that first
660 one because its right neighbor is item of the same
661 directory. (But first item always gets deleted in last
662 turn). So, neighbors of deleted item can be merged, so
663 we can save ih_size */
666 /* we might check that left neighbor exists and is of the
668 RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
669 "vs-8130: first directory item can not be removed until directory is not empty");
674 if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
675 set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
676 PROC_INFO_INC( tb -> tb_sb, leaves_removable );
685 /* when we do not split item, lnum and rnum are numbers of entire items */
/* Helper macros used by the check_balance family: record either a pure
   left shift or a pure right shift in tb via set_parameters.  The first
   branch (h > 0 in the full source) balances internal nodes by moving
   to_l/to_r key-pointer pairs; the leaf branch distinguishes the
   *_SHIFT_FLOW case (a boundary item is split, so lbytes/rbytes are
   passed) from the non-flow case (only whole items move).
   NOTE(review): only a sampled subset of the macro lines is visible;
   comments are placed outside the backslash-continued bodies on purpose. */
686 #define SET_PAR_SHIFT_LEFT \
691 to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
692 (MAX_NR_KEY(Sh) + 1 - lpar);\
694 set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
698 if (lset==LEFT_SHIFT_FLOW)\
699 set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
702 set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
707 #define SET_PAR_SHIFT_RIGHT \
712 to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
714 set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
718 if (rset==RIGHT_SHIFT_FLOW)\
719 set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
722 set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
/* Release every buffer reference held by the tree_balance structure:
   the search path plus the neighbor/parent buffers (L, R, FL, FR, CFL,
   CFR) at every tree level, NULLing each slot after the drop. */
727 void free_buffers_in_tb (
728 struct tree_balance * p_s_tb
732 decrement_counters_in_path(p_s_tb->tb_path);
734 for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
735 decrement_bcount(p_s_tb->L[n_counter]);
736 p_s_tb->L[n_counter] = NULL;
737 decrement_bcount(p_s_tb->R[n_counter]);
738 p_s_tb->R[n_counter] = NULL;
739 decrement_bcount(p_s_tb->FL[n_counter]);
740 p_s_tb->FL[n_counter] = NULL;
741 decrement_bcount(p_s_tb->FR[n_counter]);
742 p_s_tb->FR[n_counter] = NULL;
743 decrement_bcount(p_s_tb->CFL[n_counter]);
744 p_s_tb->CFL[n_counter] = NULL;
745 decrement_bcount(p_s_tb->CFR[n_counter]);
746 p_s_tb->CFR[n_counter] = NULL;
751 /* Get new buffers for storing new nodes that are created while balancing.
752 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
753 * CARRY_ON - schedule didn't occur while the function worked;
754 * NO_DISK_SPACE - no disk space.
756 /* The function is NOT SCHEDULE-SAFE! */
/* NOTE(review): only a sampled subset of this function's lines is visible. */
757 static int get_empty_nodes(
758 struct tree_balance * p_s_tb,
761 struct buffer_head * p_s_new_bh,
762 * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
763 b_blocknr_t * p_n_blocknr,
764 a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
767 n_amount_needed,/* number of needed empty blocks */
769 struct super_block * p_s_sb = p_s_tb->tb_sb;
772 /* number_of_freeblk is the number of empty blocks which have been
773 acquired for use by the balancing algorithm minus the number of
774 empty blocks used in the previous levels of the analysis,
775 number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
776 after empty blocks are acquired, and the balancing analysis is
777 then restarted, amount_needed is the number needed by this level
778 (n_h) of the balancing analysis.
780 Note that for systems with many processes writing, it would be
781 more layout optimal to calculate the total number needed by all
782 levels and then to run reiserfs_new_blocks to get all of them at once. */
784 /* Initiate number_of_freeblk to the amount acquired prior to the restart of
785 the analysis or 0 if not restarted, then subtract the amount needed
786 by all of the levels of the tree below n_h. */
787 /* blknum includes S[n_h], so we subtract 1 in this calculation */
788 for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
789 n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
791 /* Allocate missing empty blocks. */
792 /* if p_s_Sh == 0 then we are getting a new root */
793 n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
794 /* Amount_needed = the amount that we need more than the amount that we have. */
795 if ( n_amount_needed > n_number_of_freeblk )
796 n_amount_needed -= n_number_of_freeblk;
797 else /* If we have enough already then there is nothing to do. */
800 /* No need to check quota - is not allocated for blocks used for formatted nodes */
801 if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
802 n_amount_needed) == NO_DISK_SPACE)
803 return NO_DISK_SPACE;
805 /* for each blocknumber we just got, get a buffer and stick it on FEB */
806 for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
807 p_n_blocknr++, n_counter++ ) {
809 RFALSE( ! *p_n_blocknr,
810 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
812 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
/* NOTE(review): "journlaled" in the message below is a typo for
   "journaled" in a runtime string; left unchanged in this pass. */
813 RFALSE (buffer_dirty (p_s_new_bh) ||
814 buffer_journaled (p_s_new_bh) ||
815 buffer_journal_dirty (p_s_new_bh),
816 "PAP-8140: journlaled or dirty buffer %b for the new block",
819 /* Put empty buffers into the array. */
820 RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum],
821 "PAP-8141: busy slot for new buffer");
823 mark_buffer_journal_new(p_s_new_bh) ;
824 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
827 if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
828 n_retval = REPEAT_SEARCH ;
834 /* Get free space of the left neighbor, which is stored in the parent
835 * node of the left neighbor. */
/* The free space is derived from the parent's child pointer (dc_size),
   not from the neighbor buffer itself; returns 0 (line not visible here)
   when the parent or FL[h] is absent. */
836 static int get_lfree (struct tree_balance * tb, int h)
838 struct buffer_head * l, * f;
841 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
845 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
847 order = B_NR_ITEMS (l);
851 return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
855 /* Get free space of the right neighbor,
856 * which is stored in the parent node of the right neighbor.
/* Mirror of get_lfree for the right neighbor (FR[h]). */
858 static int get_rfree (struct tree_balance * tb, int h)
860 struct buffer_head * r, * f;
863 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
867 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
873 return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
878 /* Check whether left neighbor is in memory. */
/* Looks up the left neighbor's block number via FL[n_h] and probes the
   buffer cache with sb_find_get_block (no disk read); the truthy return
   path (not fully visible) signals the neighbor is cached.
   NOTE(review): only a sampled subset of lines is visible here. */
879 static int is_left_neighbor_in_cache(
880 struct tree_balance * p_s_tb,
883 struct buffer_head * p_s_father, * left;
884 struct super_block * p_s_sb = p_s_tb->tb_sb;
885 b_blocknr_t n_left_neighbor_blocknr;
886 int n_left_neighbor_position;
888 if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
891 /* Calculate father of the node to be balanced. */
892 p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
894 RFALSE( ! p_s_father ||
895 ! B_IS_IN_TREE (p_s_father) ||
896 ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
897 ! buffer_uptodate (p_s_father) ||
898 ! buffer_uptodate (p_s_tb->FL[n_h]),
899 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
900 p_s_father, p_s_tb->FL[n_h]);
903 /* Get position of the pointer to the left neighbor into the left father. */
904 n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
905 p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
906 /* Get left neighbor block number. */
907 n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
908 /* Look for the left neighbor in the cache. */
909 if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) {
911 RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
912 "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
/* Direction tags passed to get_far_parent to select which far parent
   (left or right) is being computed. */
921 #define LEFT_PARENTS 'l'
922 #define RIGHT_PARENTS 'r'
/* Decrement a cpu_key in place by dispatching to the item-type-specific
   decrement_key operation (direntry, direct, indirect, stat-data). */
925 static void decrement_key (struct cpu_key * p_s_key)
927 // call item specific function for this key
928 item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
934 /* Calculate far left/right parent of the left/right neighbor of the current node, that
935 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
936 * Calculate left/right common parent of the current node and L[h]/R[h].
937 * Calculate left/right delimiting key position.
938 * Returns: PATH_INCORRECT - path in the tree is not correct;
939 SCHEDULE_OCCURRED - schedule occurred while the function worked;
940 * CARRY_ON - schedule didn't occur while the function worked;
/* NOTE(review): only a sampled subset of this function's lines is visible.
   Strategy: walk up the path to the nearest ancestor where the current
   branch is not the first/last child (the common parent), then search the
   tree back down by key to find the neighbor's parent at level n_h + 1. */
942 static int get_far_parent (struct tree_balance * p_s_tb,
944 struct buffer_head ** pp_s_father,
945 struct buffer_head ** pp_s_com_father,
948 struct buffer_head * p_s_parent;
949 INITIALIZE_PATH (s_path_to_neighbor_father);
950 struct path * p_s_path = p_s_tb->tb_path;
951 struct cpu_key s_lr_father_key;
953 n_position = INT_MAX,
954 n_first_last_position = 0,
955 n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
957 /* Starting from F[n_h] go upwards in the tree, and look for the common
958 ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
960 n_counter = n_path_offset;
962 RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
963 "PAP-8180: invalid path length");
966 for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) {
967 /* Check whether parent of the current buffer in the path is really parent in the tree. */
968 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
969 return REPEAT_SEARCH;
970 /* Check whether position in the parent is correct. */
971 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
972 return REPEAT_SEARCH;
973 /* Check whether parent at the path really points to the child. */
974 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
975 PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
976 return REPEAT_SEARCH;
977 /* Return delimiting key if position in the parent is not equal to first/last one. */
978 if ( c_lr_par == RIGHT_PARENTS )
979 n_first_last_position = B_NR_ITEMS (p_s_parent);
980 if ( n_position != n_first_last_position ) {
981 *pp_s_com_father = p_s_parent;
982 get_bh(*pp_s_com_father) ;
983 /*(*pp_s_com_father = p_s_parent)->b_count++;*/
988 /* if we are in the root of the tree, then there is no common father */
989 if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
990 /* Check whether first buffer in the path is the root of the tree. */
991 if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
992 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
993 *pp_s_father = *pp_s_com_father = NULL;
996 return REPEAT_SEARCH;
999 RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1000 "PAP-8185: (%b %z) level too small",
1001 *pp_s_com_father, *pp_s_com_father);
1003 /* Check whether the common parent is locked. */
1005 if ( buffer_locked (*pp_s_com_father) ) {
1006 __wait_on_buffer(*pp_s_com_father);
1007 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1008 decrement_bcount(*pp_s_com_father);
1009 return REPEAT_SEARCH;
1013 /* So, we got common parent of the current node and its left/right neighbor.
1014 Now we are getting the parent of the left/right neighbor. */
1016 /* Form key to get parent of the left/right neighbor. */
1017 le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
1018 (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
1021 if ( c_lr_par == LEFT_PARENTS )
1022 decrement_key(&s_lr_father_key);
1024 if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
1028 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1029 decrement_counters_in_path(&s_path_to_neighbor_father);
1030 decrement_bcount(*pp_s_com_father);
1031 return REPEAT_SEARCH;
1034 *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1036 RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
1037 "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1038 RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
1039 "PAP-8192: path length is too small");
1041 s_path_to_neighbor_father.path_length--;
1042 decrement_counters_in_path(&s_path_to_neighbor_father);
1047 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1048 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1049 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1050 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1051 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1052 * CARRY_ON - schedule didn't occur while the function worked;
/* NOTE(review): only a sampled subset of this function's lines is visible.
   For the left side: if S is not its parent's first child, FL == CFL == F;
   otherwise get_far_parent must walk the tree.  Symmetrically for the
   right side with the last-child test.  Old references are dropped with
   decrement_bcount before each slot is re-initialized. */
1054 static int get_parents (struct tree_balance * p_s_tb, int n_h)
1056 struct path * p_s_path = p_s_tb->tb_path;
1059 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1060 struct buffer_head * p_s_curf,
1063 /* Current node is the root of the tree or will be root of the tree */
1064 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1065 /* The root can not have parents.
1066 Release nodes which previously were obtained as parents of the current node neighbors. */
1067 decrement_bcount(p_s_tb->FL[n_h]);
1068 decrement_bcount(p_s_tb->CFL[n_h]);
1069 decrement_bcount(p_s_tb->FR[n_h]);
1070 decrement_bcount(p_s_tb->CFR[n_h]);
1071 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
1075 /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1076 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) {
1077 /* Current node is not the first child of its parent. */
1078 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1079 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1082 p_s_tb->lkey[n_h] = n_position - 1;
1085 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1086 Calculate current common parent of L[n_path_offset] and the current node. Note that
1087 CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1088 Calculate lkey[n_path_offset]. */
1089 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1090 &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
1094 decrement_bcount(p_s_tb->FL[n_h]);
1095 p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
1096 decrement_bcount(p_s_tb->CFL[n_h]);
1097 p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
1099 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1100 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1101 "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1103 /* Get parent FR[n_h] of R[n_h]. */
1105 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1106 if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
1107 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1108 Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1109 not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1110 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
1114 /* Current node is not the last child of its parent F[n_h]. */
1115 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1116 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1119 p_s_tb->rkey[n_h] = n_position;
1122 decrement_bcount(p_s_tb->FR[n_h]);
1123 p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */
1125 decrement_bcount(p_s_tb->CFR[n_h]);
1126 p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */
1128 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1129 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1130 "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1136 /* it is possible to remove node as result of shiftings to
1137 neighbors even when we insert or paste item. */
/* Decide whether S[h] can be emptied into its neighbors, or whether the
 * pending change even fits without shifting.
 * Returns NO_BALANCING_NEEDED when the new item/paste fits in S[h] as-is;
 * otherwise !NO_BALANCING_NEEDED (set_parameters is called in the fitting
 * case with "keep one node, shift nothing"). */
1138 static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
1140 struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
1141 int levbytes = tb->insert_size[h];
1142 struct item_head * ih;
1143 struct key * r_key = NULL;
1145 ih = B_N_PITEM_HEAD (Sh, 0);
/* Right delimiting key of S[h], taken from the common parent CFR[h]
   (guarding condition is elided in this view). */
1147 r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
/* Combined-capacity test (the enclosing "if (" is elided in this view):
   total free space in L, S and R must cover S[h]'s payload plus the change,
   corrected for item headers saved by merging boundary items at the leaf
   level and for the extra delimiting key needed at internal levels. */
1150 lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1151 /* shifting may merge items which might save space */
1152 - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
1153 - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
1154 + (( h ) ? KEY_SIZE : 0))
1156 /* node can not be removed */
1157 if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
1159 tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
1160 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1161 return NO_BALANCING_NEEDED;
1164 PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
1165 return !NO_BALANCING_NEEDED;
1170 /* Check whether current node S[h] is balanced when increasing its size by
1171 * Inserting or Pasting.
1172 * Calculate parameters for balancing for current level h.
1174 * tb tree_balance structure;
1175 * h current level of the node;
1176 * inum item number in S[h];
1177 * mode i - insert, p - paste;
1178 * Returns: 1 - schedule occurred;
1179 * 0 - balancing for higher levels needed;
1180 * -1 - no balancing for higher levels needed;
1181 * -2 - no disk space.
1183 /* ip means Inserting or Pasting */
1184 static int ip_check_balance (struct tree_balance * tb, int h)
/* NOTE(review): interior lines are elided in this view; code lines are kept
   byte-identical, only comments added. */
1186 struct virtual_node * vn = tb->tb_vn;
1187 int levbytes, /* Number of bytes that must be inserted into (value
1188 is negative if bytes are deleted) buffer which
1189 contains node being balanced. The mnemonic is
1190 that the attempted change in node space used level
1191 is levbytes bytes. */
1194 int lfree, sfree, rfree /* free space in L, S and R */;
1196 /* nver is short for number of vertixes, and lnver is the number if
1197 we shift to the left, rnver is the number if we shift to the
1198 right, and lrnver is the number if we shift in both directions.
1199 The goal is to minimize first the number of vertixes, and second,
1200 the number of vertixes whose contents are changed by shifting,
1201 and third the number of uncached vertixes whose contents are
1202 changed by shifting and must be read from disk. */
1203 int nver, lnver, rnver, lrnver;
1205 /* used at leaf level only, S0 = S[0] is the node being balanced,
1206 sInum [ I = 0,1,2 ] is the number of items that will
1207 remain in node SI after balancing. S1 and S2 are new
1208 nodes that might be created. */
1210 /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
1211 where 4th parameter is s1bytes and 5th - s2bytes
1213 short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases
1214 0,1 - do not shift and do not shift but bottle
1215 2 - shift only whole item to left
1216 3 - shift to left and bottle as much as possible
1217 4,5 - shift to right (whole items and as much as possible
1218 6,7 - shift to both directions (whole items and as much as possible)
1221 /* Sh is the node whose balance is currently being checked */
1222 struct buffer_head * Sh;
1224 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1225 levbytes = tb->insert_size[h];
1227 /* Calculate balance parameters for creating new root. */
/* If Sh is absent a new root is being made; panic guard below ensures
   this never happens at the leaf level (enclosing condition elided). */
1230 reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
1231 switch ( n_ret_value = get_empty_nodes (tb, h) ) {
1233 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1234 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1240 reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1244 if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
1247 sfree = B_FREE_SPACE (Sh);
1249 /* get free space of neighbors */
1250 rfree = get_rfree (tb, h);
1251 lfree = get_lfree (tb, h);
1253 if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
1254 /* and new item fits into node S[h] without any shifting */
1255 return NO_BALANCING_NEEDED;
1257 create_virtual_node (tb, h);
1260 determine maximal number of items we can shift to the left neighbor (in tb structure)
1261 and the maximal number of bytes that can flow to the left neighbor
1262 from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1264 check_left (tb, h, lfree);
1267 determine maximal number of items we can shift to the right neighbor (in tb structure)
1268 and the maximal number of bytes that can flow to the right neighbor
1269 from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1271 check_right (tb, h, rfree);
1274 /* all contents of internal node S[h] can be moved into its
1275 neighbors, S[h] will be removed after balancing */
1276 if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1279 /* Since we are working on internal nodes, and our internal
1280 nodes have fixed size entries, then we can balance by the
1281 number of items rather than the space they consume. In this
1282 routine we set the left node equal to the right node,
1283 allowing a difference of less than or equal to 1 child
1285 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1286 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1287 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1291 /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1293 ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
1294 tb->rnum[h] >= vn->vn_nr_item + 1),
1295 "vs-8220: tree is not balanced on internal level");
1296 RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1297 (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
1298 "vs-8225: tree is not balanced on leaf level");
1300 /* all contents of S[0] can be moved into its neighbors
1301 S[0] will be removed after balancing. */
1302 if (!h && is_leaf_removable (tb))
1306 /* why do we perform this check here rather than earlier??
1307 Answer: we can win 1 node in some cases above. Moreover we
1308 checked it above, when we checked, that S[0] is not removable
1310 if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
1312 tb->s0num = vn->vn_nr_item;
1313 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1314 return NO_BALANCING_NEEDED;
1319 int lpar, rpar, nset, lset, rset, lrset;
1321 * regular overflowing of the node
1324 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1325 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1326 nset, lset, rset, lrset - shows, whether flowing items give better packing
1329 #define NO_FLOW 0 /* do not any splitting */
1331 /* we choose one the following */
1332 #define NOTHING_SHIFT_NO_FLOW 0
1333 #define NOTHING_SHIFT_FLOW 5
1334 #define LEFT_SHIFT_NO_FLOW 10
1335 #define LEFT_SHIFT_FLOW 15
1336 #define RIGHT_SHIFT_NO_FLOW 20
1337 #define RIGHT_SHIFT_FLOW 25
1338 #define LR_SHIFT_NO_FLOW 30
1339 #define LR_SHIFT_FLOW 35
1346 /* calculate number of blocks S[h] must be split into when
1347 nothing is shifted to the neighbors,
1348 as well as number of items in each part of the split node (s012 numbers),
1349 and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1350 nset = NOTHING_SHIFT_NO_FLOW;
1351 nver = get_num_ver (vn->vn_mode, tb, h,
1352 0, -1, h?vn->vn_nr_item:0, -1,
1359 /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1360 nver1 = get_num_ver (vn->vn_mode, tb, h,
1362 snum012 + NOTHING_SHIFT_FLOW, FLOW);
/* Keep the FLOW variant only if it needs strictly fewer nodes
   (comparison line elided in this view). */
1364 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1368 /* calculate number of blocks S[h] must be split into when
1369 l_shift_num first items and l_shift_bytes of the right most
1370 liquid item to be shifted are shifted to the left neighbor,
1371 as well as number of items in each part of the splitted node (s012 numbers),
1372 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1374 lset = LEFT_SHIFT_NO_FLOW;
1375 lnver = get_num_ver (vn->vn_mode, tb, h,
1376 lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
1377 snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1382 lnver1 = get_num_ver (vn->vn_mode, tb, h,
1383 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
1384 snum012 + LEFT_SHIFT_FLOW, FLOW);
1386 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1390 /* calculate number of blocks S[h] must be split into when
1391 r_shift_num first items and r_shift_bytes of the left most
1392 liquid item to be shifted are shifted to the right neighbor,
1393 as well as number of items in each part of the splitted node (s012 numbers),
1394 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1396 rset = RIGHT_SHIFT_NO_FLOW;
1397 rnver = get_num_ver (vn->vn_mode, tb, h,
1398 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
1399 snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1404 rnver1 = get_num_ver (vn->vn_mode, tb, h,
1405 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1406 snum012 + RIGHT_SHIFT_FLOW, FLOW);
1409 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1413 /* calculate number of blocks S[h] must be split into when
1414 items are shifted in both directions,
1415 as well as number of items in each part of the splitted node (s012 numbers),
1416 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1418 lrset = LR_SHIFT_NO_FLOW;
1419 lrnver = get_num_ver (vn->vn_mode, tb, h,
1420 lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
1421 snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1426 lrnver1 = get_num_ver (vn->vn_mode, tb, h,
1427 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1428 snum012 + LR_SHIFT_FLOW, FLOW);
1429 if (lrnver > lrnver1)
1430 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1435 /* Our general shifting strategy is:
1436 1) to minimized number of new nodes;
1437 2) to minimized number of neighbors involved in shifting;
1438 3) to minimized number of disk reads; */
1440 /* we can win TWO or ONE nodes by shifting in both directions */
1441 if (lrnver < lnver && lrnver < rnver)
1444 (tb->lnum[h] != 1 ||
1446 lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
1448 if (lrset == LR_SHIFT_FLOW)
1449 set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
1450 tb->lbytes, tb->rbytes);
1452 set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
1453 tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
1458 /* if shifting doesn't lead to better packing then don't shift */
1461 set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
1466 /* now we know that for better packing shifting in only one
1467 direction either to the left or to the right is required */
1469 /* if shifting to the left is better than shifting to the right */
1476 /* if shifting to the right is better than shifting to the left */
1479 SET_PAR_SHIFT_RIGHT;
1484 /* now shifting in either direction gives the same number
1485 of nodes and we can make use of the cached neighbors */
1486 if (is_left_neighbor_in_cache (tb,h))
1492 /* shift to the right independently on whether the right neighbor in cache or not */
1493 SET_PAR_SHIFT_RIGHT;
1499 /* Check whether current node S[h] is balanced when Decreasing its size by
1500 * Deleting or Cutting for INTERNAL node of S+tree.
1501 * Calculate parameters for balancing for current level h.
1503 * tb tree_balance structure;
1504 * h current level of the node;
1505 * inum item number in S[h];
1506 * mode i - insert, p - paste;
1507 * Returns: 1 - schedule occurred;
1508 * 0 - balancing for higher levels needed;
1509 * -1 - no balancing for higher levels needed;
1510 * -2 - no disk space.
1512 * Note: Items of internal nodes have fixed size, so the balance condition for
1513 * the internal part of S+tree is as for the B-trees.
1515 static int dc_check_balance_internal (struct tree_balance * tb, int h)
/* NOTE(review): interior lines are elided in this view; code lines are kept
   byte-identical, only comments added. */
1517 struct virtual_node * vn = tb->tb_vn;
1519 /* Sh is the node whose balance is currently being checked,
1520 and Fh is its father. */
1521 struct buffer_head * Sh, * Fh;
1524 int lfree, rfree /* free space in L and R */;
1526 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1527 Fh = PATH_H_PPARENT (tb->tb_path, h);
1529 maxsize = MAX_CHILD_SIZE(Sh);
1531 /* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1532 /* new_nr_item = number of items node would have if operation is */
1533 /* performed without balancing (new_nr_item); */
1534 create_virtual_node (tb, h);
1537 { /* S[h] is the root. */
1538 if ( vn->vn_nr_item > 0 )
1540 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1541 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1543 /* new_nr_item == 0.
1544 * Current root will be deleted resulting in
1545 * decrementing the tree height. */
1546 set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
1550 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1554 /* get free space of neighbors */
1555 rfree = get_rfree (tb, h);
1556 lfree = get_lfree (tb, h);
1558 /* determine maximal number of items we can fit into neighbors */
1559 check_left (tb, h, lfree);
1560 check_right (tb, h, rfree);
1563 if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
1564 { /* Balance condition for the internal node is valid.
1565 * In this case we balance only if it leads to better packing. */
1566 if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
1567 { /* Here we join S[h] with one of its neighbors,
1568 * which is impossible with greater values of new_nr_item. */
1569 if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
1571 /* All contents of S[h] can be moved to L[h]. */
/* order_L: position of L[h] in its parent FL[h]; n counts the children
   hanging under L[h] so balancing can absorb them all (-n-1 shift). */
1575 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1576 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1577 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1581 if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
1583 /* All contents of S[h] can be moved to R[h]. */
1587 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
1588 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1589 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1594 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1596 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
/* Split the surviving items so L and R end up within one child of equal. */
1599 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1600 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1601 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1605 /* Balancing does not lead to better packing. */
1606 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1607 return NO_BALANCING_NEEDED;
1610 /* Current node contain insufficient number of items. Balancing is required. */
1611 /* Check whether we can merge S[h] with left neighbor. */
1612 if (tb->lnum[h] >= vn->vn_nr_item + 1)
1613 if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
1618 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1619 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1620 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1624 /* Check whether we can merge S[h] with right neighbor. */
1625 if (tb->rnum[h] >= vn->vn_nr_item + 1)
1630 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1631 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1632 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1636 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1637 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1641 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1642 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1643 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1647 /* For internal nodes try to borrow item from a neighbor */
1648 RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1650 /* Borrow one or two items from caching neighbor */
1651 if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
/* from_l: how many items to pull from L[h] so both nodes end up near equal. */
1655 from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1);
1656 set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
1660 set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
1666 /* Check whether current node S[h] is balanced when Decreasing its size by
1667 * Deleting or Truncating for LEAF node of S+tree.
1668 * Calculate parameters for balancing for current level h.
1670 * tb tree_balance structure;
1671 * h current level of the node;
1672 * inum item number in S[h];
1673 * mode i - insert, p - paste;
1674 * Returns: 1 - schedule occurred;
1675 * 0 - balancing for higher levels needed;
1676 * -1 - no balancing for higher levels needed;
1677 * -2 - no disk space.
1679 static int dc_check_balance_leaf (struct tree_balance * tb, int h)
/* NOTE(review): interior lines are elided in this view; code lines are kept
   byte-identical, only comments added.  Here h is expected to be 0 (leaf
   level) — TODO confirm against the caller dc_check_balance. */
1681 struct virtual_node * vn = tb->tb_vn;
1683 /* Number of bytes that must be deleted from
1684 (value is negative if bytes are deleted) buffer which
1685 contains node being balanced. The mnemonic is that the
1686 attempted change in node space used level is levbytes bytes. */
1688 /* the maximal item size */
1691 /* S0 is the node whose balance is currently being checked,
1692 and F0 is its father. */
1693 struct buffer_head * S0, * F0;
1694 int lfree, rfree /* free space in L and R */;
1696 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
1697 F0 = PATH_H_PPARENT (tb->tb_path, 0);
1699 levbytes = tb->insert_size[h];
1701 maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
1704 { /* S[0] is the root now. */
1706 RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
1707 "vs-8240: attempt to create empty buffer tree");
1709 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1710 return NO_BALANCING_NEEDED;
1713 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1716 /* get free space of neighbors */
1717 rfree = get_rfree (tb, h);
1718 lfree = get_lfree (tb, h);
1720 create_virtual_node (tb, h);
1722 /* if 3 leaves can be merge to one, set parameters and return */
1723 if (are_leaves_removable (tb, lfree, rfree))
1726 /* determine maximal number of items we can shift to the left/right neighbor
1727 and the maximal number of bytes that can flow to the left/right neighbor
1728 from the left/right most liquid item that cannot be shifted from S[0] entirely
1730 check_left (tb, h, lfree);
1731 check_right (tb, h, rfree);
1733 /* check whether we can merge S with left neighbor. */
1734 if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1735 if (is_left_neighbor_in_cache (tb,h) ||
1736 ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
1739 RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1741 /* set parameter to merge S[0] with its left neighbor */
1742 set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
1746 /* check whether we can merge S[0] with right neighbor. */
1747 if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1748 set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
1752 /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1753 if (is_leaf_removable (tb))
1756 /* Balancing is not required. */
1757 tb->s0num = vn->vn_nr_item;
1758 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1759 return NO_BALANCING_NEEDED;
1764 /* Check whether current node S[h] is balanced when Decreasing its size by
1765 * Deleting or Cutting.
1766 * Calculate parameters for balancing for current level h.
1768 * tb tree_balance structure;
1769 * h current level of the node;
1770 * inum item number in S[h];
1771 * mode d - delete, c - cut.
1772 * Returns: 1 - schedule occurred;
1773 * 0 - balancing for higher levels needed;
1774 * -1 - no balancing for higher levels needed;
1775 * -2 - no disk space.
1777 static int dc_check_balance (struct tree_balance * tb, int h)
1779 RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
/* Dispatch: internal levels vs leaf level (the selecting condition,
   presumably "if (h)", is elided in this view — TODO confirm). */
1782 return dc_check_balance_internal (tb, h);
1784 return dc_check_balance_leaf (tb, h);
1789 /* Check whether current node S[h] is balanced.
1790 * Calculate parameters for balancing for current level h.
1793 * tb tree_balance structure:
1795 * tb is a large structure that must be read about in the header file
1796 * at the same time as this procedure if the reader is to successfully
1797 * understand this procedure
1799 * h current level of the node;
1800 * inum item number in S[h];
1801 * mode i - insert, p - paste, d - delete, c - cut.
1802 * Returns: 1 - schedule occurred;
1803 * 0 - balancing for higher levels needed;
1804 * -1 - no balancing for higher levels needed;
1805 * -2 - no disk space.
1807 static int check_balance (int mode,
1808 struct tree_balance * tb,
1812 struct item_head * ins_ih,
1816 struct virtual_node * vn;
/* Place the virtual node at the start of the preallocated vn_buf and let
   its variable-size payload follow immediately after the struct. */
1818 vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1819 vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1821 vn->vn_affected_item_num = inum;
1822 vn->vn_pos_in_item = pos_in_item;
1823 vn->vn_ins_ih = ins_ih;
1826 RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
1827 "vs-8255: ins_ih can not be 0 in insert mode");
1829 if ( tb->insert_size[h] > 0 )
1830 /* Calculate balance parameters when size of node is increasing. */
1831 return ip_check_balance (tb, h);
1833 /* Calculate balance parameters when size of node is decreasing. */
1834 return dc_check_balance (tb, h);
1839 /* Check whether parent at the path is the really parent of the current node.*/
/* Returns CARRY_ON when the cached parent in the path is still the true,
 * unlocked parent; REPEAT_SEARCH when the path must be re-searched. */
1840 static int get_direct_parent(
1841 struct tree_balance * p_s_tb,
1844 struct buffer_head * p_s_bh;
1845 struct path * p_s_path = p_s_tb->tb_path;
1847 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1849 /* We are in the root or in the new root. */
1850 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1852 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1853 "PAP-8260: invalid offset in the path");
1855 if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1856 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
1857 /* Root is not changed. */
1858 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1859 PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1862 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
1865 if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
1866 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
1868 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
1869 return REPEAT_SEARCH;
1871 if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
1872 /* Parent in the path is not parent of the current node in the tree. */
1873 return REPEAT_SEARCH;
1875 if ( buffer_locked(p_s_bh) ) {
/* Sleep until the buffer is released; the fs may change meanwhile. */
1876 __wait_on_buffer(p_s_bh);
1877 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
1878 return REPEAT_SEARCH;
1881 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
1885 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1887 * need in order to balance S[n_h], and get them if necessary.
1888 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1889 * CARRY_ON - schedule didn't occur while the function worked;
1891 static int get_neighbors(
1892 struct tree_balance * p_s_tb,
1895 int n_child_position,
1896 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1897 unsigned long n_son_number;
1898 struct super_block * p_s_sb = p_s_tb->tb_sb;
1899 struct buffer_head * p_s_bh;
1902 PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
1904 if ( p_s_tb->lnum[n_h] ) {
1905 /* We need left neighbor to balance S[n_h]. */
1906 PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
1907 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1909 RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
1910 ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1911 "PAP-8270: invalid position in the parent");
/* Locate L[n_h]: either the child just left of us in the shared parent,
   or the last child of a distinct FL[n_h], then read it from disk. */
1913 n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
1914 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1915 p_s_bh = sb_bread(p_s_sb, n_son_number);
1918 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
/* Tree changed while sb_bread slept: drop the buffer and retry search. */
1919 decrement_bcount(p_s_bh);
1920 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1921 return REPEAT_SEARCH;
1924 RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1925 n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1926 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1927 p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1928 RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1930 B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
1931 "PAP-8290: invalid child size of left neighbor");
1933 decrement_bcount(p_s_tb->L[n_h]);
1934 p_s_tb->L[n_h] = p_s_bh;
1938 if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */
1939 PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
1940 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1942 RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
1943 PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
1944 "PAP-8295: invalid position in the parent");
/* Locate R[n_h]: either the child just right of us in the shared parent,
   or the first child of a distinct FR[n_h], then read it from disk. */
1946 n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
1947 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1948 p_s_bh = sb_bread(p_s_sb, n_son_number);
1951 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1952 decrement_bcount(p_s_bh);
1953 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1954 return REPEAT_SEARCH;
1956 decrement_bcount(p_s_tb->R[n_h]);
1957 p_s_tb->R[n_h] = p_s_bh;
1959 RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
1960 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
1961 B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
1962 dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
1968 #ifdef CONFIG_REISERFS_CHECK
/* Debug wrapper around kmalloc: tracks the running per-superblock total
 * in REISERFS_SB(s)->s_kmallocs and warns every time it grows by more
 * than 200000 bytes past the last reported high-water mark. */
1969 void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
1972 static size_t malloced;
1975 vp = kmalloc (size, flags);
/* Accounting is only done on success (guarding "if (vp)" elided in view). */
1977 REISERFS_SB(s)->s_kmallocs += size;
1978 if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
1979 reiserfs_warning (s,
1980 "vs-8301: reiserfs_kmalloc: allocated memory %d",
1981 REISERFS_SB(s)->s_kmallocs);
1982 malloced = REISERFS_SB(s)->s_kmallocs;
/* Debug counterpart of reiserfs_kmalloc: frees vp and subtracts size from
 * the per-superblock accounting; warns if the counter goes negative
 * (indicating a size-mismatched free). */
1988 void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
1992 REISERFS_SB(s)->s_kmallocs -= size;
1993 if (REISERFS_SB(s)->s_kmallocs < 0)
1994 reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d",
1995 REISERFS_SB(s)->s_kmallocs);
/* Worst-case byte size of the virtual node buffer for one block of this
 * filesystem: the struct itself plus either the maximum possible number of
 * virtual items, or one directory item holding the maximum possible number
 * of entries (one __u16 of entry-size bookkeeping per extra entry). */
2001 static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
2003 int max_num_of_items;
2004 int max_num_of_entries;
2005 unsigned long blocksize = sb->s_blocksize;
2007 #define MIN_NAME_LEN 1
2009 max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2010 max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2011 (DEH_SIZE + MIN_NAME_LEN);
2013 return sizeof(struct virtual_node) +
2014 max(max_num_of_items * sizeof (struct virtual_item),
2015 sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
2016 (max_num_of_entries - 1) * sizeof (__u16));
2021 /* maybe we should fail balancing we are going to perform when kmalloc
2022 fails several times. But now it will loop until kmalloc gets
/* Ensure tb->vn_buf is large enough for the virtual node of the current
 * block.  Grows the buffer when needed, first with a non-blocking
 * GFP_ATOMIC attempt, then — after releasing collected buffers — with a
 * blocking GFP_NOFS attempt.  Returns REPEAT_SEARCH when the caller must
 * re-search (allocation slept or the fs changed); CARRY_ON otherwise
 * (success-path return is elided in this view). */
2024 static int get_mem_for_virtual_node (struct tree_balance * tb)
2030 size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
2032 if (size > tb->vn_buf_size) {
2033 /* we have to allocate more memory for virtual node */
2035 /* free memory allocated before */
2036 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2037 /* this is not needed if kfree is atomic */
2041 /* virtual node requires now more memory */
2042 tb->vn_buf_size = size;
2044 /* get memory for virtual item */
2045 buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb);
2047 /* getting memory with GFP_KERNEL priority may involve
2048 balancing now (due to indirect_to_direct conversion on
2049 dcache shrinking). So, release path and collected
2051 free_buffers_in_tb (tb);
2052 buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
2054 #ifdef CONFIG_REISERFS_CHECK
2055 reiserfs_warning (tb->tb_sb,
2056 "vs-8345: get_mem_for_virtual_node: "
2057 "kmalloc failed. reiserfs kmalloced %d bytes",
2058 REISERFS_SB(tb->tb_sb)->s_kmallocs);
/* Both attempts failed: record the empty buffer so a later call retries. */
2060 tb->vn_buf_size = 0;
2064 return REPEAT_SEARCH;
2070 if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
2071 return REPEAT_SEARCH;
2077 #ifdef CONFIG_REISERFS_CHECK
/* Debug-build sanity check for a buffer taking part in tree balancing.
** Panics the filesystem on any inconsistency: bad reference count, not
** uptodate, not in the tree, wrong device, wrong block size, or a block
** number beyond the filesystem.  `descr' and `level' only label the
** panic message (e.g. "L"/"FL" and the tree level being checked). */
2078 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2079 struct buffer_head * p_s_bh,
2080 const char *descr, int level) {
/* b_count must be positive while we hold a reference */
2082 if (atomic_read (&(p_s_bh->b_count)) <= 0) {
2084 reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
2087 if ( ! buffer_uptodate (p_s_bh) ) {
2088 reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
2091 if ( ! B_IS_IN_TREE (p_s_bh) ) {
2092 reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
/* buffer must belong to this filesystem's block device */
2095 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2096 reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh);
2099 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2100 reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh);
2103 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2104 reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh);
/* !CONFIG_REISERFS_CHECK stub: the sanity check compiles to nothing
** so release builds pay no cost for it. */
2109 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2110 struct buffer_head * p_s_bh,
2111 const char *descr, int level)
/* Try to journal-prepare `bh' without waiting (third argument wait == 0).
** Callers in this file treat a zero return as "buffer still locked" and
** record the buffer in `locked' — presumably reiserfs_prepare_for_journal
** returns nonzero on success; TODO confirm against its definition. */
2115 static int clear_all_dirty_bits(struct super_block *s,
2116 struct buffer_head *bh) {
2117 return reiserfs_prepare_for_journal(s, bh, 0) ;
/* Scan every buffer collected for this balance (path buffers, left/right
** neighbors and their parents, and the free-empty-buffer list) and try to
** journal-prepare each without waiting.  If any buffer is still locked,
** sleep on it and retry the whole scan; if the filesystem changed while we
** slept the caller must repeat the search.
** Returns: CARRY_ON      - all buffers prepared, analysis still valid;
**          REPEAT_SEARCH - the tree changed under us, redo the search. */
2120 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
2122 struct buffer_head * locked;
2123 #ifdef CONFIG_REISERFS_CHECK
2124 int repeat_counter = 0;
/* Pass 1: buffers on the lookup path itself. */
2132 for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
2133 if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
2134 /* if I understand correctly, we can only be sure the last buffer
2135 ** in the path is in the tree --clm
2137 #ifdef CONFIG_REISERFS_CHECK
2138 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2139 PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2140 tb_buffer_sanity_check (p_s_tb->tb_sb,
2141 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
2143 p_s_tb->tb_path->path_length - i);
2146 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2147 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
2149 locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
/* Pass 2: per-level neighbors (L/R), their fathers (FL/FR) and the
** common fathers (CFL/CFR), but only for levels that will shift. */
2154 for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {
2156 if (p_s_tb->lnum[i] ) {
2158 if ( p_s_tb->L[i] ) {
2159 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
2160 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
2161 locked = p_s_tb->L[i];
2164 if ( !locked && p_s_tb->FL[i] ) {
2165 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
2166 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
2167 locked = p_s_tb->FL[i];
2170 if ( !locked && p_s_tb->CFL[i] ) {
2171 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
2172 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
2173 locked = p_s_tb->CFL[i];
2178 if ( !locked && (p_s_tb->rnum[i]) ) {
2180 if ( p_s_tb->R[i] ) {
2181 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
2182 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
2183 locked = p_s_tb->R[i];
2187 if ( !locked && p_s_tb->FR[i] ) {
2188 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
2189 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
2190 locked = p_s_tb->FR[i];
2193 if ( !locked && p_s_tb->CFR[i] ) {
2194 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
2195 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
2196 locked = p_s_tb->CFR[i];
2200 /* as far as I can tell, this is not required. The FEB list seems
2201 ** to be full of newly allocated nodes, which will never be locked,
2202 ** dirty, or anything else.
2203 ** To be safe, I'm putting the checks and waits in. For the moment,
2204 ** they are needed to keep the code in journal.c from complaining
2205 ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
2208 for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
2209 if ( p_s_tb->FEB[i] ) {
2210 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
2211 locked = p_s_tb->FEB[i] ;
2216 #ifdef CONFIG_REISERFS_CHECK
/* Debug build: warn periodically so a stuck buffer is visible, and bail
** out rather than spin forever.
** NOTE: the message previously said "wait_tb_buffers_until_released()",
** which is not this function's name; fixed to match. */
2218 if ( (repeat_counter % 10000) == 0) {
2219 reiserfs_warning (p_s_tb->tb_sb,
2220 "wait_tb_buffers_until_unlocked(): too many "
2221 "iterations waiting for buffer to unlock "
2224 /* Don't loop forever. Try to recover from possible error. */
2226 return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
/* Sleep on the locked buffer, then re-validate the whole analysis. */
2229 __wait_on_buffer (locked);
2230 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
2231 return REPEAT_SEARCH;
2241 /* Prepare for balancing, that is
2242 * get all necessary parents, and neighbors;
2243 * analyze what and where should be moved;
2244 * get sufficient number of new nodes;
2245 * Balancing will start only after all resources will be collected at a time.
2247 * When ported to SMP kernels, only at the last moment after all needed nodes
2248 * are collected in cache, will the resources be locked using the usual
2249 * textbook ordered lock acquisition algorithms. Note that ensuring that
2250 * this code neither write locks what it does not need to write lock nor locks out of order
2251 * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
2253 * fix is meant in the sense of render unchanging
2255 * Latency might be improved by first gathering a list of what buffers are needed
2256 * and then getting as many of them in parallel as possible? -Hans
2259 * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2260 * tb tree_balance structure;
2261 * inum item number in S[h];
2262 * pos_in_item - comment this if you can
2263 * ins_ih & ins_sd are used when inserting
2264 * Returns: REPEAT_SEARCH - schedule occurred while the function worked
2265 *                          and the search must be repeated;
2266 *          CARRY_ON      - schedule didn't occur while the function worked;
2267 *          NO_DISK_SPACE - if there is no disk space
2270 int fix_nodes (int n_op_mode,
2271 struct tree_balance * p_s_tb,
2272 struct item_head * p_s_ins_ih, // item head of item being inserted
2273 const void * data // inserted item or data to be pasted
2277 n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2280 /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2281 ** during wait_tb_buffers_run
2283 int wait_tb_buffers_run = 0 ;
2284 struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
/* statistics: count fix_nodes invocations for this superblock */
2286 ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
2288 n_pos_in_item = p_s_tb->tb_path->pos_in_item;
/* remember the fs generation so FILESYSTEM_CHANGED_TB can detect
   concurrent tree modifications later */
2291 p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
2293 /* we prepare and log the super here so it will already be in the
2294 ** transaction when do_balance needs to change it.
2295 ** This way do_balance won't have to schedule when trying to prepare
2296 ** the super for logging
2298 reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2299 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
2300 journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2301 SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
2302 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2303 return REPEAT_SEARCH;
2305 /* S[0] may be locked; that is possible during an indirect_to_direct conversion */
2306 if (buffer_locked (p_s_tbS0)) {
2307 __wait_on_buffer (p_s_tbS0);
2308 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2309 return REPEAT_SEARCH;
2312 #ifdef CONFIG_REISERFS_CHECK
2314 print_cur_tb ("fix_nodes");
2315 reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
2318 if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
2319 reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2320 "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
2323 /* Check parameters. */
2324 switch (n_op_mode) {
2326 if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
2327 reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2328 n_item_num, B_NR_ITEMS(p_s_tbS0));
2333 if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
2334 print_block (p_s_tbS0, 0, -1, -1);
2335 reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]);
2339 reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
/* allocate/refresh the virtual node buffer used by the analysis */
2343 if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
2344 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2345 return REPEAT_SEARCH;
2348 /* Starting from the leaf level; for all levels n_h of the tree. */
2349 for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
2350 if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
/* decide what (if anything) must be shifted at this level */
2354 if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
2355 n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
2356 if ( n_ret_value == NO_BALANCING_NEEDED ) {
2357 /* No balancing for higher levels needed. */
2358 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2361 if ( n_h != MAX_HEIGHT - 1 )
2362 p_s_tb->insert_size[n_h + 1] = 0;
2363 /* ok, analysis and resource gathering are complete */
2369 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2373 if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
2374 goto repeat; /* No disk space, or schedule occurred and
2375 analysis may be invalid and needs to be redone. */
2378 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
2379 /* We have a positive insert size but no nodes exist on this
2380 level, this means that we are creating a new root. */
2382 RFALSE( p_s_tb->blknum[n_h] != 1,
2383 "PAP-8350: creating new empty root");
2385 if ( n_h < MAX_HEIGHT - 1 )
2386 p_s_tb->insert_size[n_h + 1] = 0;
2389 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
2390 if ( p_s_tb->blknum[n_h] > 1 ) {
2391 /* The tree needs to be grown, so this node S[n_h]
2392 which is the root node is split into two nodes,
2393 and a new node (S[n_h+1]) will be created to
2394 become the root node. */
2396 RFALSE( n_h == MAX_HEIGHT - 1,
2397 "PAP-8355: attempt to create too high of a tree");
/* new root will hold one key and blknum[n_h] pointers */
2399 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
2402 if ( n_h < MAX_HEIGHT - 1 )
2403 p_s_tb->insert_size[n_h + 1] = 0;
/* parent exists: it grows by one (key, pointer) pair per extra node */
2406 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
/* all resources gathered; make sure none of them is locked */
2409 if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
2410 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2411 wait_tb_buffers_run = 1 ;
2412 n_ret_value = REPEAT_SEARCH ;
2418 wait_tb_buffers_run = 1 ;
2423 // fix_nodes was unable to perform its calculation due to
2424 // filesystem got changed under us, lack of free disk space or i/o
2425 // failure. If the first is the case - the search will be
2426 // repeated. For now - free all resources acquired so far except
2427 // for the new allocated nodes
2431 /* Release path buffers. */
2432 if (wait_tb_buffers_run) {
2433 pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
2435 pathrelse (p_s_tb->tb_path);
2437 /* brelse all resources collected for balancing */
2438 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2439 if (wait_tb_buffers_run) {
/* undo the journal-prepare done by wait_tb_buffers_until_unlocked */
2440 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
2441 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
2442 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
2443 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
2444 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
2445 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
2448 brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL;
2449 brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL;
2450 brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL;
2451 brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL;
2452 brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL;
2453 brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL;
2456 if (wait_tb_buffers_run) {
2457 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2458 if ( p_s_tb->FEB[i] ) {
2459 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2470 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2471 wanted to make lines shorter */
/* Release everything fix_nodes gathered for a balance: the path,
** all neighbor/father buffers, and the preallocated FEB nodes
** (freeing the unused ones back to the block allocator). */
2472 void unfix_nodes (struct tree_balance * tb)
2476 /* Release path buffers. */
2477 pathrelse_and_restore (tb->tb_sb, tb->tb_path);
2479 /* brelse all resources collected for balancing */
2480 for ( i = 0; i < MAX_HEIGHT; i++ ) {
/* first undo any journal-prepare state, then drop the references */
2481 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
2482 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
2483 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
2484 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
2485 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
2486 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
2492 brelse (tb->CFL[i]);
2493 brelse (tb->CFR[i]);
2496 /* deal with list of allocated (used and unused) nodes */
2497 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2499 b_blocknr_t blocknr = tb->FEB[i]->b_blocknr ;
2500 /* de-allocate the block that was not used by balancing and
2501 forget the buffer for it */
2502 brelse (tb->FEB[i]);
2503 reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
2506 /* release used as new nodes including a new root */
2507 brelse (tb->used[i]);
/* finally free the virtual node scratch buffer */
2512 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);