2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12 ** directory_part_size
16 ** are_leaves_removable
20 ** is_left_neighbor_in_cache
24 ** can_node_be_removed
26 ** dc_check_balance_internal
27 ** dc_check_balance_leaf
38 #include <linux/config.h>
39 #include <linux/time.h>
40 #include <linux/string.h>
41 #include <linux/reiserfs_fs.h>
42 #include <linux/buffer_head.h>
45 /* To make any changes in the tree we find a node, that contains item
46 to be changed/deleted or position in the node we insert a new item
47 to. We call this node S. To do balancing we need to decide what we
48 will shift to left/right neighbor, or to a new node, where new item
49 will be etc. To make this analysis simpler we build virtual
50 node. Virtual node is an array of items, that will replace items of
51 node S. (For instance if we are going to delete an item, virtual
52 node does not contain it). Virtual node keeps information about
53 item sizes and types, mergeability of first and last items, sizes
54 of all entries in directory item. We use this array of items when
55 calculating what we can shift to neighbors and how many nodes we
56 have to have if we do not do any shiftings, if we shift to left/right
57 neighbor or to both. */
60 /* taking item number in virtual node, returns number of item, that it has in source buffer */
/* Translate an index in the virtual node back to the item's index in
 * the source node.  For M_PASTE/M_CUT and for items before the affected
 * one the index is unchanged; the remaining cases are insert (items
 * after the inserted one were shifted) and delete.
 * NOTE(review): the return statements are not visible in this view;
 * confirm the exact offsets against the full source. */
61 static inline int old_item_num (int new_num, int affected_item_num, int mode)
/* paste/cut never renumber items; items left of the affected one keep
   their position in every mode */
63 if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
66 if (mode == M_INSERT) {
69 "vs-8005: for INSERT mode and item number of inserted item");
/* only delete can remain here */
74 RFALSE( mode != M_DELETE,
75 "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
/* Build the virtual node for level h: an array of virtual items
 * describing what S[h] will contain after the pending operation
 * (tb->tb_vn mode M_INSERT/M_DELETE/M_PASTE/M_CUT) is applied, plus
 * left/right mergeability flags used by the shift analysis below.
 * NOTE(review): several interior lines are missing from this view. */
80 static void create_virtual_node (struct tree_balance * tb, int h)
82 struct item_head * ih;
83 struct virtual_node * vn = tb->tb_vn;
85 struct buffer_head * Sh; /* this comes from tb->S[h] */
87 Sh = PATH_H_PBUFFER (tb->tb_path, h);
89 /* size of changed node */
90 vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
92 /* for internal nodes array of virtual items is not created */
94 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
98 /* number of items in virtual node */
99 vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
101 /* first virtual item */
/* virtual items live immediately after the virtual_node header in
   tb->vn_buf; vn_free_ptr tracks the next free byte of that buffer */
102 vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103 memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
104 vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
107 /* first item in the node */
108 ih = B_N_PITEM_HEAD (Sh, 0);
110 /* define the mergeability for 0-th item (if it is not being deleted) */
111 if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
114 /* go through all items that remain in the virtual node (except for the new (inserted) one) */
115 for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
117 struct virtual_item * vi = vn->vn_vi + new_num;
118 int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
121 if (is_affected && vn->vn_mode == M_INSERT)
124 /* get item number in source node */
125 j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
/* virtual item length counts the item header too */
127 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129 vi->vi_item = B_I_PITEM (Sh, ih + j);
130 vi->vi_uarea = vn->vn_free_ptr;
132 // FIXME: there is no check, that item operation did not
133 // consume too much memory
134 vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
135 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
136 reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
137 "virtual node space consumed");
140 /* this is not being changed */
/* paste/cut change only the length of the affected item */
143 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
144 vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
145 vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
150 /* virtual inserted item is not defined yet */
151 if (vn->vn_mode == M_INSERT) {
152 struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
154 RFALSE( vn->vn_ins_ih == 0,
155 "vs-8040: item header of inserted item is not specified");
156 vi->vi_item_len = tb->insert_size[0];
157 vi->vi_ih = vn->vn_ins_ih;
158 vi->vi_item = vn->vn_data;
159 vi->vi_uarea = vn->vn_free_ptr;
161 op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
164 /* set right merge flag: we take the right delimiting key and check whether it is a mergeable item */
168 key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
169 if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
170 vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
171 vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
/* debug-only sanity check: the one legal way the last item can be
   deleted while still mergeable on the right is a single one-entry
   directory item */
173 #ifdef CONFIG_REISERFS_CHECK
174 if (op_is_left_mergeable (key, Sh->b_size) &&
175 !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
176 /* we delete last item and it could be merged with right neighbor's first item */
177 if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
178 I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
179 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
180 print_block (Sh, 0, -1, -1);
181 reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
182 key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
184 /* we can delete directory item, that has only one directory entry in it */
193 /* using virtual node check, how many items can be shifted to left
/* check_left: compute tb->lnum[h] (and tb->lbytes for a partially
 * shifted boundary item) given cur_free bytes available in the left
 * neighbor L[h].  For internal levels the count is purely pointer/key
 * arithmetic; for leaves it walks the virtual items left to right.
 * NOTE(review): interior lines are missing from this view. */
195 static void check_left (struct tree_balance * tb, int h, int cur_free)
198 struct virtual_node * vn = tb->tb_vn;
199 struct virtual_item * vi;
202 RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
/* internal level: each shifted child costs one disk_child plus one key */
206 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
212 if (!cur_free || !vn->vn_nr_item) {
213 /* no free space or nothing to move */
219 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
220 "vs-8055: parent does not exist or invalid");
/* if the first item merges into L[0]'s last item, its header is saved */
223 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
224 /* all contents of S[0] fits into L[0] */
226 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
227 "vs-8055: invalid mode or balance condition failed");
229 tb->lnum[0] = vn->vn_nr_item;
235 d_size = 0, ih_size = IH_SIZE;
237 /* first item may merge with last item in left neighbor */
238 if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
239 d_size = -((int)IH_SIZE), ih_size = 0;
242 for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
243 d_size += vi->vi_item_len;
244 if (cur_free >= d_size) {
245 /* the item can be shifted entirely */
251 /* the item cannot be shifted entirely, try to split it */
252 /* check whether L[0] can hold ih and at least one byte of the item body */
253 if (cur_free <= ih_size) {
254 /* cannot shift even a part of the current item */
/* op_check_left returns -1 when nothing of the item fits */
260 tb->lbytes = op_check_left (vi, cur_free, 0, 0);
261 if (tb->lbytes != -1)
262 /* count partially shifted item */
272 /* using virtual node check, how many items can be shifted to right
/* check_right: mirror image of check_left — compute tb->rnum[h] (and
 * tb->rbytes) given cur_free bytes available in the right neighbor
 * R[h], walking the virtual items right to left.
 * NOTE(review): interior lines are missing from this view. */
274 static void check_right (struct tree_balance * tb, int h, int cur_free)
277 struct virtual_node * vn = tb->tb_vn;
278 struct virtual_item * vi;
281 RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
/* internal level: each shifted child costs one disk_child plus one key */
285 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
291 if (!cur_free || !vn->vn_nr_item) {
298 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
299 "vs-8075: parent does not exist or invalid");
/* start at the last virtual item; its header is saved if it merges
   into R[0]'s first item */
301 vi = vn->vn_vi + vn->vn_nr_item - 1;
302 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
303 /* all contents of S[0] fits into R[0] */
305 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
306 "vs-8080: invalid mode or balance condition failed");
308 tb->rnum[h] = vn->vn_nr_item;
313 d_size = 0, ih_size = IH_SIZE;
315 /* last item may merge with first item in right neighbor */
316 if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
317 d_size = -(int)IH_SIZE, ih_size = 0;
320 for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
321 d_size += vi->vi_item_len;
322 if (cur_free >= d_size) {
323 /* the item can be shifted entirely */
329 /* check whether R[0] can hold ih and at least one byte of the item body */
330 if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */
335 /* R[0] can hold the header of the item and at least one byte of its body */
336 cur_free -= ih_size; /* cur_free is still > 0 */
/* op_check_right returns -1 when nothing of the item fits */
338 tb->rbytes = op_check_right (vi, cur_free);
339 if (tb->rbytes != -1)
340 /* count partially shifted item */
351 * from - number of items, which are shifted to left neighbor entirely
352 * to - number of item, which are shifted to right neighbor entirely
353 * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
354 * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
/* get_num_ver: given a candidate split of the virtual node (items/bytes
 * going to the left and right neighbors), compute how many nodes the
 * remainder needs and fill snum012[] with per-node item counts and
 * s1bytes/s2bytes for items split across the new nodes.  Returns the
 * number of nodes needed.
 * NOTE(review): interior lines are missing from this view. */
355 static int get_num_ver (int mode, struct tree_balance * tb, int h,
356 int from, int from_bytes,
357 int to, int to_bytes,
358 short * snum012, int flow
365 struct virtual_node * vn = tb->tb_vn;
366 // struct virtual_item * vi;
368 int total_node_size, max_node_size, current_item_size;
370 int start_item, /* position of item we start filling node from */
371 end_item, /* position of item we finish filling node by */
372 start_bytes,/* number of first bytes (entries for directory) of start_item-th item
373 we do not include into node that is being filled */
374 end_bytes; /* number of last bytes (entries for directory) of end_item-th item
375 we do not include into node that is being filled */
376 int split_item_positions[2]; /* these are positions in virtual item of
377 items, that are split between S[0] and
378 S1new and S1new and S2new */
380 split_item_positions[0] = -1;
381 split_item_positions[1] = -1;
383 /* We only create additional nodes if we are in insert or paste mode
384 or we are in replace mode at the internal level. If h is 0 and
385 the mode is M_REPLACE then in fix_nodes we change the mode to
386 paste or insert before we get here in the code. */
387 RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
388 "vs-8100: insert_size < 0 in overflow");
390 max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
392 /* snum012 [0-2] - number of items, that lay
393 to S[0], first new node and second new node */
394 snum012[3] = -1; /* s1bytes */
395 snum012[4] = -1; /* s2bytes */
/* internal level: remaining children are pure pointer/key arithmetic */
399 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
400 if (i == max_node_size)
402 return (i / max_node_size + 1);
408 cur_free = max_node_size;
410 // start from 'from'-th item
412 // skip its first 'start_bytes' units
413 start_bytes = ((from_bytes != -1) ? from_bytes : 0);
415 // last included item is the 'end_item'-th one
416 end_item = vn->vn_nr_item - to - 1;
417 // do not count last 'end_bytes' units of 'end_item'-th item
418 end_bytes = (to_bytes != -1) ? to_bytes : 0;
420 /* go through all items beginning from the start_item-th item and ending by
421 the end_item-th item. Do not count first 'start_bytes' units of
422 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
424 for (i = start_item; i <= end_item; i ++) {
425 struct virtual_item * vi = vn->vn_vi + i;
426 int skip_from_end = ((i == end_item) ? end_bytes : 0);
428 RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
430 /* get size of current item */
431 current_item_size = vi->vi_item_len;
433 /* do not take in calculation head part (from_bytes) of from-th item */
434 current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
436 /* do not take in calculation tail part of last item */
437 current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
439 /* if item fits into current node entirely */
440 if (total_node_size + current_item_size <= max_node_size) {
441 snum012[needed_nodes - 1] ++;
442 total_node_size += current_item_size;
447 if (current_item_size > max_node_size) {
448 /* virtual item length is longer, than max size of item in
449 a node. It is impossible for direct item */
450 RFALSE( is_direct_le_ih (vi->vi_ih),
452 "direct item length is %d. It can not be longer than %d",
453 current_item_size, max_node_size);
454 /* we will try to split it */
459 /* as we do not split items, take new node and continue */
460 needed_nodes ++; i --; total_node_size = 0;
464 // calculate number of item units which fit into node being
469 free_space = max_node_size - total_node_size - IH_SIZE;
470 units = op_check_left (vi, free_space, start_bytes, skip_from_end);
472 /* nothing fits into current node, take new node and continue */
473 needed_nodes ++, i--, total_node_size = 0;
478 /* something fits into the current node */
479 //if (snum012[3] != -1 || needed_nodes != 1)
480 // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
481 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
482 start_bytes += units;
483 snum012[needed_nodes - 1 + 3] = units;
485 if (needed_nodes > 2)
486 reiserfs_warning ("vs-8111: get_num_ver: split_item_position is out of boundary\n");
487 snum012[needed_nodes - 1] ++;
488 split_item_positions[needed_nodes - 1] = i;
490 /* continue from the same item with start_bytes != -1 */
496 // sum012[4] (if it is not -1) contains number of units of which
497 // are to be in S1new, snum012[3] - to be in S0. They are supposed
498 // to be S1bytes and S2bytes correspondingly, so recalculate
499 if (snum012[4] > 0) {
501 int bytes_to_r, bytes_to_l;
/* units already accounted for by the neighbors and S1new must be
   subtracted from the split item's total */
504 split_item_num = split_item_positions[1];
505 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
506 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
507 bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
510 snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
512 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY)
513 reiserfs_warning ("vs-8115: get_num_ver: not directory item\n");
516 /* now we know S2bytes, calculate S1bytes */
517 if (snum012[3] > 0) {
519 int bytes_to_r, bytes_to_l;
522 split_item_num = split_item_positions[0];
523 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
524 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
525 bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
528 snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
535 #ifdef CONFIG_REISERFS_CHECK
536 extern struct tree_balance * cur_tb;
540 /* Set parameters for balancing.
541 * Performs write of results of analysis of balancing into structure tb,
542 * where it will later be used by the functions that actually do the balancing.
544 * tb tree_balance structure;
545 * h current level of the node;
546 * lnum number of items from S[h] that must be shifted to L[h];
547 * rnum number of items from S[h] that must be shifted to R[h];
548 * blk_num number of blocks that S[h] will be split into;
549 * s012 number of items that fall into split nodes.
550 * lbytes number of bytes which flow to the left neighbor from the item that is
551 * not shifted entirely
552 * rbytes number of bytes which flow to the right neighbor from the item that is
553 * not shifted entirely
554 * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
/* NOTE(review): interior lines (lnum/rnum assignments and the leaf-level
 * guard around the s012 unpacking) are missing from this view. */
557 static void set_parameters (struct tree_balance * tb, int h, int lnum,
558 int rnum, int blk_num, short * s012, int lb, int rb)
563 tb->blknum[h] = blk_num;
566 { /* only for leaf level */
/* unpack s012[0..4] into s0num/s1num/s2num/s1bytes/s2bytes */
569 tb->s0num = * s012 ++,
570 tb->s1num = * s012 ++,
571 tb->s2num = * s012 ++;
572 tb->s1bytes = * s012 ++;
573 tb->s2bytes = * s012;
/* per-filesystem balancing statistics (procfs) */
578 PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
579 PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
581 PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
582 PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
587 /* check, does node disappear if we shift tb->lnum[0] items to left
588 neighbor and tb->rnum[0] to the right one. */
/* Returns whether S[0] can be emptied by the already-computed left and
 * right shifts; on success records the decision via set_parameters.
 * NOTE(review): return statements are missing from this view. */
589 static int is_leaf_removable (struct tree_balance * tb)
591 struct virtual_node * vn = tb->tb_vn;
592 int to_left, to_right;
596 /* number of items, that will be shifted to left (right) neighbor
/* a partial shift (lbytes/rbytes != -1) means the boundary item is only
   partly moved, so it does not count as a whole shifted item */
598 to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
599 to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
600 remain_items = vn->vn_nr_item;
602 /* how many items remain in S[0] after shiftings to neighbors */
603 remain_items -= (to_left + to_right);
605 if (remain_items < 1) {
606 /* all content of node can be shifted to neighbors */
607 set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);
611 if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
612 /* S[0] is not removable */
615 /* check, whether we can divide 1 remaining item between neighbors */
617 /* get size of remaining item (in item units) */
618 size = op_unit_num (&(vn->vn_vi[to_left]));
620 if (tb->lbytes + tb->rbytes >= size) {
621 set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
629 /* check whether L, S, R can be joined in one node */
/* Returns whether the whole contents of S[0] (as described by the
 * virtual node) fit into the free space of L[0] and R[0] combined,
 * accounting for item headers saved by boundary merges.
 * NOTE(review): interior lines (ih_size setup, returns) are missing. */
630 static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
632 struct virtual_node * vn = tb->tb_vn;
634 struct buffer_head *S0;
636 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
/* mergeable boundary items save their item header on each side */
639 if (vn->vn_nr_item) {
640 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
643 if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
646 /* there was only one item and it will be deleted */
647 struct item_head * ih;
649 RFALSE( B_NR_ITEMS (S0) != 1,
650 "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
652 ih = B_N_PITEM_HEAD (S0, 0);
653 if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
654 if (is_direntry_le_ih (ih)) {
655 /* Directory must be in correct state here: that is
656 somewhere at the left side should exist first directory
657 item. But the item being deleted can not be that first
658 one because its right neighbor is item of the same
659 directory. (But first item always gets deleted in last
660 turn). So, neighbors of deleted item can be merged, so
661 we can save ih_size */
664 /* we might check that left neighbor exists and is of the
666 RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
667 "vs-8130: first directory item can not be removed until directory is not empty");
672 if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
673 set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
674 PROC_INFO_INC( tb -> tb_sb, leaves_removable );
683 /* when we do not split item, lnum and rnum are numbers of entire items */
/* SET_PAR_SHIFT_LEFT: record a left-shift decision via set_parameters.
 * Expects Sh, vn, tb, h, lpar, lnver, lset, snum012 to be in scope at the
 * expansion site.  NOTE(review): parts of the macro body are missing from
 * this view; do not add comments inside the continuation lines. */
684 #define SET_PAR_SHIFT_LEFT \
689 to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
690 (MAX_NR_KEY(Sh) + 1 - lpar);\
692 set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
696 if (lset==LEFT_SHIFT_FLOW)\
697 set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
700 set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
/* SET_PAR_SHIFT_RIGHT: mirror of SET_PAR_SHIFT_LEFT for right shifts.
 * Expects Sh, vn, tb, h, rpar, rnver, rset, snum012 in scope at the
 * expansion site.  NOTE(review): parts of the macro body are missing. */
705 #define SET_PAR_SHIFT_RIGHT \
710 to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
712 set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
716 if (rset==RIGHT_SHIFT_FLOW)\
717 set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
720 set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
/* Release every buffer reference held by the tree_balance structure:
 * the search path plus, for every level, the neighbor (L/R), neighbor
 * parent (FL/FR) and common parent (CFL/CFR) buffers.  Pointers are
 * cleared after the drop so they cannot be released twice. */
725 void free_buffers_in_tb (
726 struct tree_balance * p_s_tb
730 decrement_counters_in_path(p_s_tb->tb_path);
732 for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
733 decrement_bcount(p_s_tb->L[n_counter]);
734 p_s_tb->L[n_counter] = NULL;
735 decrement_bcount(p_s_tb->R[n_counter]);
736 p_s_tb->R[n_counter] = NULL;
737 decrement_bcount(p_s_tb->FL[n_counter]);
738 p_s_tb->FL[n_counter] = NULL;
739 decrement_bcount(p_s_tb->FR[n_counter]);
740 p_s_tb->FR[n_counter] = NULL;
741 decrement_bcount(p_s_tb->CFL[n_counter]);
742 p_s_tb->CFL[n_counter] = NULL;
743 decrement_bcount(p_s_tb->CFR[n_counter]);
744 p_s_tb->CFR[n_counter] = NULL;
749 /* Get new buffers for storing new nodes that are created while balancing.
750 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
751 * CARRY_ON - schedule didn't occur while the function worked;
752 * NO_DISK_SPACE - no disk space.
754 /* The function is NOT SCHEDULE-SAFE! */
755 static int get_empty_nodes(
756 struct tree_balance * p_s_tb,
759 struct buffer_head * p_s_new_bh,
760 * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
761 b_blocknr_t * p_n_blocknr,
762 a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
765 n_amount_needed,/* number of needed empty blocks */
767 struct super_block * p_s_sb = p_s_tb->tb_sb;
770 /* number_of_freeblk is the number of empty blocks which have been
771 acquired for use by the balancing algorithm minus the number of
772 empty blocks used in the previous levels of the analysis,
773 number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
774 after empty blocks are acquired, and the balancing analysis is
775 then restarted, amount_needed is the number needed by this level
776 (n_h) of the balancing analysis.
778 Note that for systems with many processes writing, it would be
779 more layout optimal to calculate the total number needed by all
780 levels and then to run reiserfs_new_blocks to get all of them at once. */
782 /* Initiate number_of_freeblk to the amount acquired prior to the restart of
783 the analysis or 0 if not restarted, then subtract the amount needed
784 by all of the levels of the tree below n_h. */
785 /* blknum includes S[n_h], so we subtract 1 in this calculation */
786 for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
787 n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
789 /* Allocate missing empty blocks. */
790 /* if p_s_Sh == 0 then we are getting a new root */
791 n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
792 /* Amount_needed = the amount that we need more than the amount that we have. */
793 if ( n_amount_needed > n_number_of_freeblk )
794 n_amount_needed -= n_number_of_freeblk;
795 else /* If we have enough already then there is nothing to do. */
798 if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
799 n_amount_needed) == NO_DISK_SPACE )
800 return NO_DISK_SPACE;
802 /* for each blocknumber we just got, get a buffer and stick it on FEB */
803 for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
804 p_n_blocknr++, n_counter++ ) {
806 RFALSE( ! *p_n_blocknr,
807 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
809 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
/* a freshly allocated block must be clean and outside the journal;
   (typo "journlaled" is in the on-disk message string -- left as-is) */
810 RFALSE (buffer_dirty (p_s_new_bh) ||
811 buffer_journaled (p_s_new_bh) ||
812 buffer_journal_dirty (p_s_new_bh),
813 "PAP-8140: journlaled or dirty buffer %b for the new block",
816 /* Put empty buffers into the array. */
817 RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum],
818 "PAP-8141: busy slot for new buffer");
820 mark_buffer_journal_new(p_s_new_bh) ;
821 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
/* a concurrent change of the filesystem invalidates the analysis */
824 if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
825 n_retval = REPEAT_SEARCH ;
831 /* Get free space of the left neighbor, which is stored in the parent
832 * node of the left neighbor. */
/* Free space = max child size minus the used size recorded in the
 * parent's disk_child entry for the left neighbor. */
833 static int get_lfree (struct tree_balance * tb, int h)
835 struct buffer_head * l, * f;
/* no parent or no left-neighbor parent cached -> nothing to report */
838 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
842 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
844 order = B_NR_ITEMS (l);
848 return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
852 /* Get free space of the right neighbor,
853 * which is stored in the parent node of the right neighbor.
/* Mirror of get_lfree for the right neighbor (tb->FR[h]). */
855 static int get_rfree (struct tree_balance * tb, int h)
857 struct buffer_head * r, * f;
860 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
864 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
870 return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
875 /* Check whether left neighbor is in memory. */
/* Look up the left neighbor's block number through FL[n_h] and probe the
 * buffer cache for it without reading from disk (sb_find_get_block).
 * NOTE(review): the return statements are missing from this view. */
876 static int is_left_neighbor_in_cache(
877 struct tree_balance * p_s_tb,
880 struct buffer_head * p_s_father, * left;
881 struct super_block * p_s_sb = p_s_tb->tb_sb;
882 b_blocknr_t n_left_neighbor_blocknr;
883 int n_left_neighbor_position;
885 if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
888 /* Calculate father of the node to be balanced. */
889 p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
891 RFALSE( ! p_s_father ||
892 ! B_IS_IN_TREE (p_s_father) ||
893 ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
894 ! buffer_uptodate (p_s_father) ||
895 ! buffer_uptodate (p_s_tb->FL[n_h]),
896 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
897 p_s_father, p_s_tb->FL[n_h]);
900 /* Get position of the pointer to the left neighbor into the left father. */
/* if F[h] == FL[h], the neighbor is at lkey[n_h]; otherwise it is the
   last child of FL[h] */
901 n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
902 p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
903 /* Get left neighbor block number. */
904 n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
905 /* Look for the left neighbor in the cache. */
906 if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) {
908 RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
909 "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
918 #define LEFT_PARENTS 'l'
919 #define RIGHT_PARENTS 'r'
/* Decrement a cpu key by delegating to the item-type-specific
 * decrement_key operation selected by the key's type. */
922 static void decrement_key (struct cpu_key * p_s_key)
924 // call item specific function for this key
925 item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
931 /* Calculate far left/right parent of the left/right neighbor of the current node, that
932 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
933 * Calculate left/right common parent of the current node and L[h]/R[h].
934 * Calculate left/right delimiting key position.
935 * Returns: PATH_INCORRECT - path in the tree is not correct;
936 SCHEDULE_OCCURRED - schedule occurred while the function worked;
937 * CARRY_ON - schedule didn't occur while the function worked;
/* NOTE(review): interior lines are missing from this view; the early
 * returns below are reconstructed only partially. */
939 static int get_far_parent (struct tree_balance * p_s_tb,
941 struct buffer_head ** pp_s_father,
942 struct buffer_head ** pp_s_com_father,
945 struct buffer_head * p_s_parent;
946 INITIALIZE_PATH (s_path_to_neighbor_father);
947 struct path * p_s_path = p_s_tb->tb_path;
948 struct cpu_key s_lr_father_key;
950 n_position = INT_MAX,
951 n_first_last_position = 0,
952 n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
954 /* Starting from F[n_h] go upwards in the tree, and look for the common
955 ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
957 n_counter = n_path_offset;
959 RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
960 "PAP-8180: invalid path length");
963 for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) {
964 /* Check whether parent of the current buffer in the path is really parent in the tree. */
965 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
966 return REPEAT_SEARCH;
967 /* Check whether position in the parent is correct. */
968 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
969 return REPEAT_SEARCH;
970 /* Check whether parent at the path really points to the child. */
971 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
972 PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
973 return REPEAT_SEARCH;
974 /* Return delimiting key if position in the parent is not equal to first/last one. */
975 if ( c_lr_par == RIGHT_PARENTS )
976 n_first_last_position = B_NR_ITEMS (p_s_parent);
977 if ( n_position != n_first_last_position ) {
/* found the common ancestor; take an extra reference on it */
978 *pp_s_com_father = p_s_parent;
979 get_bh(*pp_s_com_father) ;
980 /*(*pp_s_com_father = p_s_parent)->b_count++;*/
985 /* if we are in the root of the tree, then there is no common father */
986 if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
987 /* Check whether first buffer in the path is the root of the tree. */
988 if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
989 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
990 *pp_s_father = *pp_s_com_father = NULL;
993 return REPEAT_SEARCH;
996 RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
997 "PAP-8185: (%b %z) level too small",
998 *pp_s_com_father, *pp_s_com_father);
1000 /* Check whether the common parent is locked. */
1002 if ( buffer_locked (*pp_s_com_father) ) {
1003 __wait_on_buffer(*pp_s_com_father);
1004 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1005 decrement_bcount(*pp_s_com_father);
1006 return REPEAT_SEARCH;
1010 /* So, we got common parent of the current node and its left/right neighbor.
1011 Now we are getting the parent of the left/right neighbor. */
1013 /* Form key to get parent of the left/right neighbor. */
/* side effect: records the delimiting key position into lkey/rkey */
1014 le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
1015 (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
1018 if ( c_lr_par == LEFT_PARENTS )
1019 decrement_key(&s_lr_father_key);
1021 if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
1025 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1026 decrement_counters_in_path(&s_path_to_neighbor_father);
1027 decrement_bcount(*pp_s_com_father);
1028 return REPEAT_SEARCH;
/* keep the last buffer of the auxiliary path (the neighbor's parent)
   and release the rest of that path */
1031 *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1033 RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
1034 "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1035 RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
1036 "PAP-8192: path length is too small");
1038 s_path_to_neighbor_father.path_length--;
1039 decrement_counters_in_path(&s_path_to_neighbor_father);
1044 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1045 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1046 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1047 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1048 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1049 * CARRY_ON - schedule didn't occur while the function worked;
/* NOTE(review): interior lines (reference-count bumps on the common
 * case, early returns) are missing from this view. */
1051 static int get_parents (struct tree_balance * p_s_tb, int n_h)
1053 struct path * p_s_path = p_s_tb->tb_path;
1056 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1057 struct buffer_head * p_s_curf,
1060 /* Current node is the root of the tree or will be root of the tree */
1061 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1062 /* The root can not have parents.
1063 Release nodes which previously were obtained as parents of the current node neighbors. */
1064 decrement_bcount(p_s_tb->FL[n_h]);
1065 decrement_bcount(p_s_tb->CFL[n_h]);
1066 decrement_bcount(p_s_tb->FR[n_h]);
1067 decrement_bcount(p_s_tb->CFR[n_h]);
1068 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
1072 /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1073 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) {
1074 /* Current node is not the first child of its parent. */
1075 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
/* easy case: left neighbor shares this parent, so FL == CFL == F */
1076 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1079 p_s_tb->lkey[n_h] = n_position - 1;
1082 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1083 Calculate current common parent of L[n_path_offset] and the current node. Note that
1084 CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1085 Calculate lkey[n_path_offset]. */
1086 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1087 &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
/* replace any previously cached FL/CFL with the freshly found ones */
1091 decrement_bcount(p_s_tb->FL[n_h]);
1092 p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
1093 decrement_bcount(p_s_tb->CFL[n_h]);
1094 p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
1096 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1097 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1098 "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1100 /* Get parent FR[n_h] of R[n_h]. */
1102 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1103 if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
1104 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1105 Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1106 not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1107 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
1111 /* Current node is not the last child of its parent F[n_h]. */
1112 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1113 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1116 p_s_tb->rkey[n_h] = n_position;
1119 decrement_bcount(p_s_tb->FR[n_h]);
1120 p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */
1122 decrement_bcount(p_s_tb->CFR[n_h]);
1123 p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */
1125 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1126 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1127 "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
/*
 * can_node_be_removed: decide whether S[h] could disappear entirely by
 * shifting its contents to the neighbors, given the free space of L (lfree),
 * S (sfree) and R (rfree).  Returns NO_BALANCING_NEEDED when the new item
 * fits into S[h] without shifting (and records that via set_parameters),
 * otherwise !NO_BALANCING_NEEDED.
 * NOTE(review): the branch taken when the node *is* removable is elided in
 * this view; only the "cannot be removed" path is visible below.
 */
1133 /* it is possible to remove node as result of shiftings to
1134 neighbors even when we insert or paste item. */
1135 static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
1137 struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
1138 int levbytes = tb->insert_size[h];
1139 struct item_head * ih;
1140 struct key * r_key = NULL;
/* r_key: right delimiting key of S[h], used to test mergeability with R. */
1142 ih = B_N_PITEM_HEAD (Sh, 0);
1144 r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
/* Total free space vs. payload; merging items can save an IH_SIZE each at
   leaf level (h == 0), while internal level gains a KEY_SIZE. */
1147 lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1148 /* shifting may merge items which might save space */
1149 - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
1150 - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
1151 + (( h ) ? KEY_SIZE : 0))
1152 /* node can not be removed */
1154 if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
/* s0num counts items S[0] will hold; an insert adds one new item. */
1156 tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
1157 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1158 return NO_BALANCING_NEEDED;
1161 PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
1162 return !NO_BALANCING_NEEDED;
/*
 * ip_check_balance: compute balancing parameters for level h when the node is
 * GROWING (insert or paste).  Tries, in order: creating a new root, leaving
 * the node alone, removing it into its neighbors, and finally an exhaustive
 * comparison of 8 get_num_ver() scenarios (no shift / left / right / both,
 * each with and without item splitting, i.e. FLOW vs NO_FLOW) to pick the
 * plan that minimizes new nodes, then touched neighbors, then disk reads.
 * NOTE(review): many interior lines (branch bodies, closing braces) are
 * elided in this view; comments annotate only the visible statements.
 */
1167 /* Check whether current node S[h] is balanced when increasing its size by
1168 * Inserting or Pasting.
1169 * Calculate parameters for balancing for current level h.
1171 * tb tree_balance structure;
1172 * h current level of the node;
1173 * inum item number in S[h];
1174 * mode i - insert, p - paste;
1175 * Returns: 1 - schedule occurred;
1176 * 0 - balancing for higher levels needed;
1177 * -1 - no balancing for higher levels needed;
1178 * -2 - no disk space.
1180 /* ip means Inserting or Pasting */
1181 static int ip_check_balance (struct tree_balance * tb, int h)
1183 struct virtual_node * vn = tb->tb_vn;
1184 int levbytes, /* Number of bytes that must be inserted into (value
1185 is negative if bytes are deleted) buffer which
1186 contains node being balanced. The mnemonic is
1187 that the attempted change in node space used level
1188 is levbytes bytes. */
1191 int lfree, sfree, rfree /* free space in L, S and R */;
1193 /* nver is short for number of vertixes, and lnver is the number if
1194 we shift to the left, rnver is the number if we shift to the
1195 right, and lrnver is the number if we shift in both directions.
1196 The goal is to minimize first the number of vertixes, and second,
1197 the number of vertixes whose contents are changed by shifting,
1198 and third the number of uncached vertixes whose contents are
1199 changed by shifting and must be read from disk. */
1200 int nver, lnver, rnver, lrnver;
1202 /* used at leaf level only, S0 = S[0] is the node being balanced,
1203 sInum [ I = 0,1,2 ] is the number of items that will
1204 remain in node SI after balancing. S1 and S2 are new
1205 nodes that might be created. */
1207 /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
1208 where 4th parameter is s1bytes and 5th - s2bytes
1210 short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases
1211 0,1 - do not shift and do not shift but bottle
1212 2 - shift only whole item to left
1213 3 - shift to left and bottle as much as possible
1214 4,5 - shift to right (whole items and as much as possible
1215 6,7 - shift to both directions (whole items and as much as possible)
1218 /* Sh is the node whose balance is currently being checked */
1219 struct buffer_head * Sh;
1221 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1222 levbytes = tb->insert_size[h];
1224 /* Calculate balance parameters for creating new root. */
/* Sh == NULL at this level means a new root must be created; at h == 0 a
   missing S[0] is a bug, hence the panic. */
1227 reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
1228 switch ( n_ret_value = get_empty_nodes (tb, h) ) {
1230 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1231 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1237 reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1241 if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
1244 sfree = B_FREE_SPACE (Sh);
1246 /* get free space of neighbors */
1247 rfree = get_rfree (tb, h);
1248 lfree = get_lfree (tb, h);
1250 if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
1251 /* and new item fits into node S[h] without any shifting */
1252 return NO_BALANCING_NEEDED;
1254 create_virtual_node (tb, h);
1257 determine maximal number of items we can shift to the left neighbor (in tb structure)
1258 and the maximal number of bytes that can flow to the left neighbor
1259 from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1261 check_left (tb, h, lfree);
1264 determine maximal number of items we can shift to the right neighbor (in tb structure)
1265 and the maximal number of bytes that can flow to the right neighbor
1266 from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1268 check_right (tb, h, rfree);
1271 /* all contents of internal node S[h] can be moved into its
1272 neighbors, S[h] will be removed after balancing */
1273 if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1276 /* Since we are working on internal nodes, and our internal
1277 nodes have fixed size entries, then we can balance by the
1278 number of items rather than the space they consume. In this
1279 routine we set the left node equal to the right node,
1280 allowing a difference of less than or equal to 1 child
/* to_r: number of children that go right so L and R end up within one
   child of each other (same formula recurs in dc_check_balance_internal). */
1282 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1283 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1284 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1288 /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1290 ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
1291 tb->rnum[h] >= vn->vn_nr_item + 1),
1292 "vs-8220: tree is not balanced on internal level");
1293 RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1294 (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
1295 "vs-8225: tree is not balanced on leaf level");
1297 /* all contents of S[0] can be moved into its neighbors
1298 S[0] will be removed after balancing. */
1299 if (!h && is_leaf_removable (tb))
1303 /* why do we perform this check here rather than earlier??
1304 Answer: we can win 1 node in some cases above. Moreover we
1305 checked it above, when we checked, that S[0] is not removable
1307 if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
1309 tb->s0num = vn->vn_nr_item;
1310 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1311 return NO_BALANCING_NEEDED;
1316 int lpar, rpar, nset, lset, rset, lrset;
1318 * regular overflowing of the node
1321 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1322 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1323 nset, lset, rset, lrset - shows, whether flowing items give better packing
1326 #define NO_FLOW 0 /* do not any splitting */
1328 /* we choose one the following */
/* Offsets into snum012[]: each scenario owns a 5-slot window. */
1329 #define NOTHING_SHIFT_NO_FLOW 0
1330 #define NOTHING_SHIFT_FLOW 5
1331 #define LEFT_SHIFT_NO_FLOW 10
1332 #define LEFT_SHIFT_FLOW 15
1333 #define RIGHT_SHIFT_NO_FLOW 20
1334 #define RIGHT_SHIFT_FLOW 25
1335 #define LR_SHIFT_NO_FLOW 30
1336 #define LR_SHIFT_FLOW 35
1343 /* calculate number of blocks S[h] must be split into when
1344 nothing is shifted to the neighbors,
1345 as well as number of items in each part of the split node (s012 numbers),
1346 and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1347 nset = NOTHING_SHIFT_NO_FLOW;
1348 nver = get_num_ver (vn->vn_mode, tb, h,
1349 0, -1, h?vn->vn_nr_item:0, -1,
1356 /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1357 nver1 = get_num_ver (vn->vn_mode, tb, h,
1359 snum012 + NOTHING_SHIFT_FLOW, FLOW);
/* Keep the FLOW variant only when it needs strictly fewer nodes. */
1361 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1365 /* calculate number of blocks S[h] must be split into when
1366 l_shift_num first items and l_shift_bytes of the right most
1367 liquid item to be shifted are shifted to the left neighbor,
1368 as well as number of items in each part of the splitted node (s012 numbers),
1369 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1371 lset = LEFT_SHIFT_NO_FLOW;
1372 lnver = get_num_ver (vn->vn_mode, tb, h,
1373 lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
1374 snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1379 lnver1 = get_num_ver (vn->vn_mode, tb, h,
1380 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
1381 snum012 + LEFT_SHIFT_FLOW, FLOW);
1383 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1387 /* calculate number of blocks S[h] must be split into when
1388 r_shift_num first items and r_shift_bytes of the left most
1389 liquid item to be shifted are shifted to the right neighbor,
1390 as well as number of items in each part of the splitted node (s012 numbers),
1391 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1393 rset = RIGHT_SHIFT_NO_FLOW;
1394 rnver = get_num_ver (vn->vn_mode, tb, h,
1395 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
1396 snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1401 rnver1 = get_num_ver (vn->vn_mode, tb, h,
1402 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1403 snum012 + RIGHT_SHIFT_FLOW, FLOW);
1406 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1410 /* calculate number of blocks S[h] must be split into when
1411 items are shifted in both directions,
1412 as well as number of items in each part of the splitted node (s012 numbers),
1413 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1415 lrset = LR_SHIFT_NO_FLOW;
1416 lrnver = get_num_ver (vn->vn_mode, tb, h,
1417 lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
1418 snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1423 lrnver1 = get_num_ver (vn->vn_mode, tb, h,
1424 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1425 snum012 + LR_SHIFT_FLOW, FLOW);
1426 if (lrnver > lrnver1)
1427 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1432 /* Our general shifting strategy is:
1433 1) to minimized number of new nodes;
1434 2) to minimized number of neighbors involved in shifting;
1435 3) to minimized number of disk reads; */
1437 /* we can win TWO or ONE nodes by shifting in both directions */
1438 if (lrnver < lnver && lrnver < rnver)
1441 (tb->lnum[h] != 1 ||
1443 lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
1445 if (lrset == LR_SHIFT_FLOW)
1446 set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
1447 tb->lbytes, tb->rbytes);
1449 set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
1450 tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
1455 /* if shifting doesn't lead to better packing then don't shift */
1458 set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
1463 /* now we know that for better packing shifting in only one
1464 direction either to the left or to the right is required */
1466 /* if shifting to the left is better than shifting to the right */
1473 /* if shifting to the right is better than shifting to the left */
1476 SET_PAR_SHIFT_RIGHT;
1481 /* now shifting in either direction gives the same number
1482 of nodes and we can make use of the cached neighbors */
1483 if (is_left_neighbor_in_cache (tb,h))
1489 /* shift to the right independently on whether the right neighbor in cache or not */
1490 SET_PAR_SHIFT_RIGHT;
/*
 * dc_check_balance_internal: balancing parameters for an INTERNAL node that
 * is SHRINKING (delete/cut).  Handles: root shrink/removal, the
 * "still-balanced" case (only repack when it joins a neighbor), full merge
 * into L, R or both, and borrowing items from a neighbor when the node has
 * too few keys.  All decisions are by item count, since internal entries
 * have fixed size (B-tree style balance condition).
 * NOTE(review): interior lines are elided in this view; comments annotate
 * only the visible statements.
 */
1496 /* Check whether current node S[h] is balanced when Decreasing its size by
1497 * Deleting or Cutting for INTERNAL node of S+tree.
1498 * Calculate parameters for balancing for current level h.
1500 * tb tree_balance structure;
1501 * h current level of the node;
1502 * inum item number in S[h];
1503 * mode i - insert, p - paste;
1504 * Returns: 1 - schedule occurred;
1505 * 0 - balancing for higher levels needed;
1506 * -1 - no balancing for higher levels needed;
1507 * -2 - no disk space.
1509 * Note: Items of internal nodes have fixed size, so the balance condition for
1510 * the internal part of S+tree is as for the B-trees.
1512 static int dc_check_balance_internal (struct tree_balance * tb, int h)
1514 struct virtual_node * vn = tb->tb_vn;
1516 /* Sh is the node whose balance is currently being checked,
1517 and Fh is its father. */
1518 struct buffer_head * Sh, * Fh;
1521 int lfree, rfree /* free space in L and R */;
1523 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1524 Fh = PATH_H_PPARENT (tb->tb_path, h);
1526 maxsize = MAX_CHILD_SIZE(Sh);
1528 /* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1529 /* new_nr_item = number of items node would have if operation is */
1530 /* performed without balancing (new_nr_item); */
1531 create_virtual_node (tb, h);
1534 { /* S[h] is the root. */
1535 if ( vn->vn_nr_item > 0 )
1537 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1538 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1540 /* new_nr_item == 0.
1541 * Current root will be deleted resulting in
1542 * decrementing the tree height. */
1543 set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
1547 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1551 /* get free space of neighbors */
1552 rfree = get_rfree (tb, h);
1553 lfree = get_lfree (tb, h);
1555 /* determine maximal number of items we can fit into neighbors */
1556 check_left (tb, h, lfree);
1557 check_right (tb, h, rfree);
1560 if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
1561 { /* Balance condition for the internal node is valid.
1562 * In this case we balance only if it leads to better packing. */
1563 if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
1564 { /* Here we join S[h] with one of its neighbors,
1565 * which is impossible with greater values of new_nr_item. */
1566 if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
1568 /* All contents of S[h] can be moved to L[h]. */
/* order_L: position of S[h]'s pointer in FL[h]; wraps to the last slot
   when S[h] is its parent's first child. */
1572 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1573 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1574 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1578 if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
1580 /* All contents of S[h] can be moved to R[h]. */
1584 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
1585 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1586 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1591 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1593 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1596 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1597 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1598 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1602 /* Balancing does not lead to better packing. */
1603 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1604 return NO_BALANCING_NEEDED;
1607 /* Current node contain insufficient number of items. Balancing is required. */
1608 /* Check whether we can merge S[h] with left neighbor. */
/* Prefer the cached left neighbor, or the left when R is unusable. */
1609 if (tb->lnum[h] >= vn->vn_nr_item + 1)
1610 if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
1615 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1616 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1617 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1621 /* Check whether we can merge S[h] with right neighbor. */
1622 if (tb->rnum[h] >= vn->vn_nr_item + 1)
1627 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1628 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1629 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1633 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1634 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1638 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1639 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1640 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1644 /* For internal nodes try to borrow item from a neighbor */
1645 RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1647 /* Borrow one or two items from caching neighbor */
1648 if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
/* from_l: items to take from L so both nodes end up near half-full. */
1652 from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1);
1653 set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
1657 set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
/*
 * dc_check_balance_leaf: balancing parameters for the LEAF (h == 0) when the
 * node is SHRINKING (delete/truncate).  Order of attempts: root shortcut,
 * merge three leaves into one, merge S[0] into L or R, remove S[0] into both
 * neighbors, else leave the node as is.
 * NOTE(review): interior lines are elided in this view; the function is
 * called with h == 0 (it reads S0/F0 at level 0 but indexes tb arrays by h).
 */
1663 /* Check whether current node S[h] is balanced when Decreasing its size by
1664 * Deleting or Truncating for LEAF node of S+tree.
1665 * Calculate parameters for balancing for current level h.
1667 * tb tree_balance structure;
1668 * h current level of the node;
1669 * inum item number in S[h];
1670 * mode i - insert, p - paste;
1671 * Returns: 1 - schedule occurred;
1672 * 0 - balancing for higher levels needed;
1673 * -1 - no balancing for higher levels needed;
1674 * -2 - no disk space.
1676 static int dc_check_balance_leaf (struct tree_balance * tb, int h)
1678 struct virtual_node * vn = tb->tb_vn;
1680 /* Number of bytes that must be deleted from
1681 (value is negative if bytes are deleted) buffer which
1682 contains node being balanced. The mnemonic is that the
1683 attempted change in node space used level is levbytes bytes. */
1685 /* the maximal item size */
1688 /* S0 is the node whose balance is currently being checked,
1689 and F0 is its father. */
1690 struct buffer_head * S0, * F0;
1691 int lfree, rfree /* free space in L and R */;
1693 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
1694 F0 = PATH_H_PPARENT (tb->tb_path, 0);
1696 levbytes = tb->insert_size[h];
1698 maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
1701 { /* S[0] is the root now. */
1703 RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
1704 "vs-8240: attempt to create empty buffer tree");
1706 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1707 return NO_BALANCING_NEEDED;
1710 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1713 /* get free space of neighbors */
1714 rfree = get_rfree (tb, h);
1715 lfree = get_lfree (tb, h);
1717 create_virtual_node (tb, h);
1719 /* if 3 leaves can be merge to one, set parameters and return */
1720 if (are_leaves_removable (tb, lfree, rfree))
1723 /* determine maximal number of items we can shift to the left/right neighbor
1724 and the maximal number of bytes that can flow to the left/right neighbor
1725 from the left/right most liquid item that cannot be shifted from S[0] entirely
1727 check_left (tb, h, lfree);
1728 check_right (tb, h, rfree);
1730 /* check whether we can merge S with left neighbor. */
/* lbytes == -1 means the boundary item need not be split: a clean merge. */
1731 if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1732 if (is_left_neighbor_in_cache (tb,h) ||
1733 ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
1736 RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1738 /* set parameter to merge S[0] with its left neighbor */
1739 set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
1743 /* check whether we can merge S[0] with right neighbor. */
1744 if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1745 set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
1749 /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1750 if (is_leaf_removable (tb))
1753 /* Balancing is not required. */
1754 tb->s0num = vn->vn_nr_item;
1755 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1756 return NO_BALANCING_NEEDED;
/*
 * dc_check_balance: dispatcher for the shrinking (delete/cut) case —
 * internal levels go to dc_check_balance_internal(), the leaf level to
 * dc_check_balance_leaf().
 * NOTE(review): the selecting condition between the two returns is elided
 * in this view — presumably a test on h; confirm against the full source.
 */
1761 /* Check whether current node S[h] is balanced when Decreasing its size by
1762 * Deleting or Cutting.
1763 * Calculate parameters for balancing for current level h.
1765 * tb tree_balance structure;
1766 * h current level of the node;
1767 * inum item number in S[h];
1768 * mode d - delete, c - cut.
1769 * Returns: 1 - schedule occurred;
1770 * 0 - balancing for higher levels needed;
1771 * -1 - no balancing for higher levels needed;
1772 * -2 - no disk space.
1774 static int dc_check_balance (struct tree_balance * tb, int h)
1776 RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
1779 return dc_check_balance_internal (tb, h);
1781 return dc_check_balance_leaf (tb, h);
/*
 * check_balance: top-level entry — builds the virtual node descriptor in
 * tb->vn_buf, then dispatches to ip_check_balance() when the node grows
 * (insert_size > 0) or dc_check_balance() when it shrinks.
 * NOTE(review): assignments of vn->vn_mode and the h parameter handling are
 * elided in this view.
 */
1786 /* Check whether current node S[h] is balanced.
1787 * Calculate parameters for balancing for current level h.
1790 * tb tree_balance structure:
1792 * tb is a large structure that must be read about in the header file
1793 * at the same time as this procedure if the reader is to successfully
1794 * understand this procedure
1796 * h current level of the node;
1797 * inum item number in S[h];
1798 * mode i - insert, p - paste, d - delete, c - cut.
1799 * Returns: 1 - schedule occurred;
1800 * 0 - balancing for higher levels needed;
1801 * -1 - no balancing for higher levels needed;
1802 * -2 - no disk space.
1804 static int check_balance (int mode,
1805 struct tree_balance * tb,
1809 struct item_head * ins_ih,
1813 struct virtual_node * vn;
/* The virtual node lives at the start of vn_buf; its variable-length item
   array begins immediately after the struct. */
1815 vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1816 vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1818 vn->vn_affected_item_num = inum;
1819 vn->vn_pos_in_item = pos_in_item;
1820 vn->vn_ins_ih = ins_ih;
1823 RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
1824 "vs-8255: ins_ih can not be 0 in insert mode");
1826 if ( tb->insert_size[h] > 0 )
1827 /* Calculate balance parameters when size of node is increasing. */
1828 return ip_check_balance (tb, h);
1830 /* Calculate balance parameters when size of node is decreasing. */
1831 return dc_check_balance (tb, h);
/*
 * get_direct_parent: verify that the buffer recorded in the path really is
 * the parent of the current node (the tree may have changed under us).
 * Returns CARRY_ON when the parent is valid and unlocked, REPEAT_SEARCH
 * when the path must be rebuilt.
 */
1836 /* Check whether parent at the path is the really parent of the current node.*/
1837 static int get_direct_parent(
1838 struct tree_balance * p_s_tb,
1841 struct buffer_head * p_s_bh;
1842 struct path * p_s_path = p_s_tb->tb_path;
1844 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1846 /* We are in the root or in the new root. */
1847 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1849 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1850 "PAP-8260: invalid offset in the path");
1852 if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1853 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
1854 /* Root is not changed. */
/* A root has no parent: record a NULL parent at position 0 in the path. */
1855 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1856 PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1859 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
1862 if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
1863 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
1865 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
1866 return REPEAT_SEARCH;
/* The child pointer at our recorded position must still point at us. */
1868 if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
1869 /* Parent in the path is not parent of the current node in the tree. */
1870 return REPEAT_SEARCH;
1872 if ( buffer_locked(p_s_bh) ) {
/* Sleeping here may let the tree change: re-validate afterwards. */
1873 __wait_on_buffer(p_s_bh);
1874 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
1875 return REPEAT_SEARCH;
1878 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
/*
 * get_neighbors: read into memory the left (L[n_h]) and/or right (R[n_h])
 * neighbor buffers that the computed lnum/rnum say are needed for balancing.
 * sb_bread() may sleep, so after each read the filesystem generation is
 * re-checked and REPEAT_SEARCH returned if it changed.
 * NOTE(review): some lines (return of CARRY_ON, brace closings) are elided
 * in this view.
 */
1882 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1884 * need in order to balance S[n_h], and get them if necessary.
1885 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1886 * CARRY_ON - schedule didn't occur while the function worked;
1888 static int get_neighbors(
1889 struct tree_balance * p_s_tb,
1892 int n_child_position,
1893 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1894 unsigned long n_son_number;
1895 struct super_block * p_s_sb = p_s_tb->tb_sb;
1896 struct buffer_head * p_s_bh;
1899 PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
1901 if ( p_s_tb->lnum[n_h] ) {
1902 /* We need left neighbor to balance S[n_h]. */
1903 PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
1904 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1906 RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
1907 ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1908 "PAP-8270: invalid position in the parent");
/* If the parent in the path is FL itself, the neighbor is at lkey;
   otherwise FL is a different node and its last child is the neighbor. */
1910 n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
1911 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1912 p_s_bh = sb_bread(p_s_sb, n_son_number);
1915 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
/* sb_bread slept and the tree changed: drop the buffer and restart. */
1916 decrement_bcount(p_s_bh);
1917 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1918 return REPEAT_SEARCH;
1921 RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1922 n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1923 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1924 p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1925 RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1927 B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
1928 "PAP-8290: invalid child size of left neighbor");
1930 decrement_bcount(p_s_tb->L[n_h]);
1931 p_s_tb->L[n_h] = p_s_bh;
1935 if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */
1936 PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
1937 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1939 RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
1940 PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
1941 "PAP-8295: invalid position in the parent");
1943 n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
1944 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1945 p_s_bh = sb_bread(p_s_sb, n_son_number);
1948 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1949 decrement_bcount(p_s_bh);
1950 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1951 return REPEAT_SEARCH;
1953 decrement_bcount(p_s_tb->R[n_h]);
1954 p_s_tb->R[n_h] = p_s_bh;
1956 RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
1957 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
1958 B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
1959 dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
#ifdef CONFIG_REISERFS_CHECK
/*
 * reiserfs_kmalloc (debug build only): kmalloc wrapper that tracks the total
 * bytes allocated per superblock in s_kmallocs and warns every time usage
 * grows by ~200KB past the last reported high-water mark.
 * NOTE(review): the NULL-check on vp and the return statement are elided in
 * this view — s_kmallocs accounting presumably happens only on success.
 */
1966 void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
1969 static size_t malloced;
1972 vp = kmalloc (size, flags);
1974 REISERFS_SB(s)->s_kmallocs += size;
1975 if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
1976 reiserfs_warning ("vs-8301: reiserfs_kmalloc: allocated memory %d\n", REISERFS_SB(s)->s_kmallocs);
1977 malloced = REISERFS_SB(s)->s_kmallocs;
1980 /*printk ("malloc : size %d, allocated %d\n", size, REISERFS_SB(s)->s_kmallocs);*/
/*
 * reiserfs_kfree (debug build only): kfree counterpart of reiserfs_kmalloc —
 * decrements the per-superblock byte counter and warns if it underflows
 * (i.e. more bytes freed than were accounted as allocated).
 */
1984 void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
1988 REISERFS_SB(s)->s_kmallocs -= size;
1989 if (REISERFS_SB(s)->s_kmallocs < 0)
1990 reiserfs_warning ("vs-8302: reiserfs_kfree: allocated memory %d\n", REISERFS_SB(s)->s_kmallocs);
/*
 * get_virtual_node_size: worst-case byte size of the virtual-node buffer for
 * a block of this superblock's blocksize.  The buffer must hold either the
 * maximum possible number of (minimum-sized) items, or a single directory
 * item with the maximum possible number of (1-char-name) entries — whichever
 * is larger — plus the fixed virtual_node header.
 */
1996 static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
1998 int max_num_of_items;
1999 int max_num_of_entries;
2000 unsigned long blocksize = sb->s_blocksize;
2002 #define MIN_NAME_LEN 1
2004 max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2005 max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2006 (DEH_SIZE + MIN_NAME_LEN);
/* direntry_uarea already contains one entry slot, hence the "- 1". */
2008 return sizeof(struct virtual_node) +
2009 max(max_num_of_items * sizeof (struct virtual_item),
2010 sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
2011 (max_num_of_entries - 1) * sizeof (__u16));
/*
 * get_mem_for_virtual_node: ensure tb->vn_buf is large enough for the
 * virtual node of the leaf at the end of the path, (re)allocating when
 * needed.  First tries an atomic, no-warn allocation; if that fails it
 * releases held buffers (to avoid deadlocking against memory reclaim that
 * may re-enter balancing) and retries with GFP_NOFS, then asks the caller
 * to restart via REPEAT_SEARCH.
 * NOTE(review): the success-path return and some brace closings are elided
 * in this view.
 */
2016 /* maybe we should fail balancing we are going to perform when kmalloc
2017 fails several times. But now it will loop until kmalloc gets
2019 static int get_mem_for_virtual_node (struct tree_balance * tb)
2025 size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
2027 if (size > tb->vn_buf_size) {
2028 /* we have to allocate more memory for virtual node */
2030 /* free memory allocated before */
2031 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2032 /* this is not needed if kfree is atomic */
2036 /* virtual node requires now more memory */
2037 tb->vn_buf_size = size;
2039 /* get memory for virtual item */
/* __GFP_NOWARN: an atomic failure here is expected and handled below. */
2040 buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb);
2042 /* getting memory with GFP_KERNEL priority may involve
2043 balancing now (due to indirect_to_direct conversion on
2044 dcache shrinking). So, release path and collected
2046 free_buffers_in_tb (tb);
2047 buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
2049 #ifdef CONFIG_REISERFS_CHECK
2050 reiserfs_warning ("vs-8345: get_mem_for_virtual_node: "
2051 "kmalloc failed. reiserfs kmalloced %d bytes\n",
2052 REISERFS_SB(tb->tb_sb)->s_kmallocs);
/* Both attempts failed: reset the recorded size so a later call retries. */
2054 tb->vn_buf_size = 0;
2058 return REPEAT_SEARCH;
2064 if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
2065 return REPEAT_SEARCH;
#ifdef CONFIG_REISERFS_CHECK
/*
 * tb_buffer_sanity_check (debug build): panic if a buffer involved in
 * balancing is in any obviously-broken state — bad refcount, not uptodate,
 * not in the tree, wrong device, wrong blocksize, or block number beyond
 * the filesystem.  The non-debug variant below is an empty stub.
 * NOTE(review): the NULL-guard around these checks is elided in this view.
 */
2072 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2073 struct buffer_head * p_s_bh,
2074 const char *descr, int level) {
2076 if (atomic_read (&(p_s_bh->b_count)) <= 0) {
2078 reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
2081 if ( ! buffer_uptodate (p_s_bh) ) {
2082 reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
2085 if ( ! B_IS_IN_TREE (p_s_bh) ) {
2086 reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
2089 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2090 reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh);
2093 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2094 reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh);
2097 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2098 reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh);
/* Non-debug build: checks compile away to an empty function. */
2103 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2104 struct buffer_head * p_s_bh,
2105 const char *descr, int level)
/* Try to take 'bh' into the current transaction without blocking.
** Returns non-zero on success, 0 if the buffer could not be prepared
** (e.g. it is locked) — the caller then waits on it and retries. */
static int clear_all_dirty_bits(struct super_block *s,
				struct buffer_head *bh)
{
    return reiserfs_prepare_for_journal(s, bh, 0) ;
}
2114 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
2116 struct buffer_head * locked;
2117 #ifdef CONFIG_REISERFS_CHECK
2118 int repeat_counter = 0;
2126 for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
2127 if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
2128 /* if I understand correctly, we can only be sure the last buffer
2129 ** in the path is in the tree --clm
2131 #ifdef CONFIG_REISERFS_CHECK
2132 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2133 PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2134 tb_buffer_sanity_check (p_s_tb->tb_sb,
2135 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
2137 p_s_tb->tb_path->path_length - i);
2140 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2141 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
2143 locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
2148 for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {
2150 if (p_s_tb->lnum[i] ) {
2152 if ( p_s_tb->L[i] ) {
2153 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
2154 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
2155 locked = p_s_tb->L[i];
2158 if ( !locked && p_s_tb->FL[i] ) {
2159 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
2160 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
2161 locked = p_s_tb->FL[i];
2164 if ( !locked && p_s_tb->CFL[i] ) {
2165 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
2166 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
2167 locked = p_s_tb->CFL[i];
2172 if ( !locked && (p_s_tb->rnum[i]) ) {
2174 if ( p_s_tb->R[i] ) {
2175 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
2176 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
2177 locked = p_s_tb->R[i];
2181 if ( !locked && p_s_tb->FR[i] ) {
2182 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
2183 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
2184 locked = p_s_tb->FR[i];
2187 if ( !locked && p_s_tb->CFR[i] ) {
2188 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
2189 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
2190 locked = p_s_tb->CFR[i];
2194 /* as far as I can tell, this is not required. The FEB list seems
2195 ** to be full of newly allocated nodes, which will never be locked,
2196 ** dirty, or anything else.
2197 ** To be safe, I'm putting in the checks and waits in. For the moment,
2198 ** they are needed to keep the code in journal.c from complaining
2199 ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
2202 for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
2203 if ( p_s_tb->FEB[i] ) {
2204 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
2205 locked = p_s_tb->FEB[i] ;
2210 #ifdef CONFIG_REISERFS_CHECK
2212 if ( (repeat_counter % 10000) == 0) {
2213 reiserfs_warning ("wait_tb_buffers_until_released(): too many iterations waiting for buffer to unlock (%b)\n", locked);
2215 /* Don't loop forever. Try to recover from possible error. */
2217 return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
2220 __wait_on_buffer (locked);
2221 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
2222 return REPEAT_SEARCH;
2232 /* Prepare for balancing, that is
2233 * get all necessary parents, and neighbors;
2234 * analyze what and where should be moved;
2235 * get sufficient number of new nodes;
2236 * Balancing will start only after all resources will be collected at a time.
2238 * When ported to SMP kernels, only at the last moment after all needed nodes
2239 * are collected in cache, will the resources be locked using the usual
2240 * textbook ordered lock acquisition algorithms. Note that ensuring that
2241 * this code neither write locks what it does not need to write lock nor locks out of order
2242 * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
2244 * fix is meant in the sense of render unchanging
2246 * Latency might be improved by first gathering a list of what buffers are needed
2247 * and then getting as many of them in parallel as possible? -Hans
2250 * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2251 * tb tree_balance structure;
2252 * inum item number in S[h];
2253 * pos_in_item - comment this if you can
2254 * ins_ih & ins_sd are used when inserting
2255 * Returns: 1 - schedule occurred while the function worked;
2256 * 0 - schedule didn't occur while the function worked;
2257 * -1 - if no_disk_space
2261 int fix_nodes (int n_op_mode,
2262 struct tree_balance * p_s_tb,
2263 struct item_head * p_s_ins_ih, // item head of item being inserted
2264 const void * data // inserted item or data to be pasted
2268 n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2271 /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2272 ** during wait_tb_buffers_run
2274 int wait_tb_buffers_run = 0 ;
2275 struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
2277 ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
2279 n_pos_in_item = p_s_tb->tb_path->pos_in_item;
2282 p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
2284 /* we prepare and log the super here so it will already be in the
2285 ** transaction when do_balance needs to change it.
2286 ** This way do_balance won't have to schedule when trying to prepare
2287 ** the super for logging
2289 reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2290 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
2291 journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2292 SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
2293 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2294 return REPEAT_SEARCH;
2296 /* if it possible in indirect_to_direct conversion */
2297 if (buffer_locked (p_s_tbS0)) {
2298 __wait_on_buffer (p_s_tbS0);
2299 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2300 return REPEAT_SEARCH;
2303 #ifdef CONFIG_REISERFS_CHECK
2305 print_cur_tb ("fix_nodes");
2306 reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
2309 if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
2310 reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2311 "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
2314 /* Check parameters. */
2315 switch (n_op_mode) {
2317 if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
2318 reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2319 n_item_num, B_NR_ITEMS(p_s_tbS0));
2324 if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
2325 print_block (p_s_tbS0, 0, -1, -1);
2326 printk("mode = %c insert_size = %d\n", n_op_mode, p_s_tb->insert_size[0]);
2327 reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d)", n_item_num);
2331 reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
2335 if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
2336 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2337 return REPEAT_SEARCH;
2340 /* Starting from the leaf level; for all levels n_h of the tree. */
2341 for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
2342 if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
2346 if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
2347 n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
2348 if ( n_ret_value == NO_BALANCING_NEEDED ) {
2349 /* No balancing for higher levels needed. */
2350 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2353 if ( n_h != MAX_HEIGHT - 1 )
2354 p_s_tb->insert_size[n_h + 1] = 0;
2355 /* ok, analysis and resource gathering are complete */
2361 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2365 if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
2366 goto repeat; /* No disk space, or schedule occurred and
2367 analysis may be invalid and needs to be redone. */
2370 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
2371 /* We have a positive insert size but no nodes exist on this
2372 level, this means that we are creating a new root. */
2374 RFALSE( p_s_tb->blknum[n_h] != 1,
2375 "PAP-8350: creating new empty root");
2377 if ( n_h < MAX_HEIGHT - 1 )
2378 p_s_tb->insert_size[n_h + 1] = 0;
2381 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
2382 if ( p_s_tb->blknum[n_h] > 1 ) {
2383 /* The tree needs to be grown, so this node S[n_h]
2384 which is the root node is split into two nodes,
2385 and a new node (S[n_h+1]) will be created to
2386 become the root node. */
2388 RFALSE( n_h == MAX_HEIGHT - 1,
2389 "PAP-8355: attempt to create too high of a tree");
2391 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
2394 if ( n_h < MAX_HEIGHT - 1 )
2395 p_s_tb->insert_size[n_h + 1] = 0;
2398 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
2401 if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
2402 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2403 wait_tb_buffers_run = 1 ;
2404 n_ret_value = REPEAT_SEARCH ;
2410 wait_tb_buffers_run = 1 ;
2415 // fix_nodes was unable to perform its calculation due to
2416 // filesystem got changed under us, lack of free disk space or i/o
2417 // failure. If the first is the case - the search will be
2418 // repeated. For now - free all resources acquired so far except
2419 // for the new allocated nodes
2423 /* Release path buffers. */
2424 if (wait_tb_buffers_run) {
2425 pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
2427 pathrelse (p_s_tb->tb_path);
2429 /* brelse all resources collected for balancing */
2430 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2431 if (wait_tb_buffers_run) {
2432 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
2433 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
2434 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
2435 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
2436 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
2437 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
2440 brelse (p_s_tb->L[i]);p_s_tb->L[i] = 0;
2441 brelse (p_s_tb->R[i]);p_s_tb->R[i] = 0;
2442 brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = 0;
2443 brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = 0;
2444 brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = 0;
2445 brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = 0;
2448 if (wait_tb_buffers_run) {
2449 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2450 if ( p_s_tb->FEB[i] ) {
2451 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2462 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2463 wanted to make lines shorter */
2464 void unfix_nodes (struct tree_balance * tb)
2468 /* Release path buffers. */
2469 pathrelse_and_restore (tb->tb_sb, tb->tb_path);
2471 /* brelse all resources collected for balancing */
2472 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2473 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
2474 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
2475 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
2476 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
2477 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
2478 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
2484 brelse (tb->CFL[i]);
2485 brelse (tb->CFR[i]);
2488 /* deal with list of allocated (used and unused) nodes */
2489 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2491 b_blocknr_t blocknr = tb->FEB[i]->b_blocknr ;
2492 /* de-allocated block which was not used by balancing and
2493 bforget about buffer for it */
2494 brelse (tb->FEB[i]);
2495 reiserfs_free_block (tb->transaction_handle, blocknr);
2498 /* release used as new nodes including a new root */
2499 brelse (tb->used[i]);
2504 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);