--- zzzz-none-000/linux-3.10.107/fs/btrfs/ctree.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/btrfs/ctree.c 2021-02-04 17:41:59.000000000 +0000 @@ -39,9 +39,8 @@ struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -81,13 +80,6 @@ { int i; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks - * in the right order. If any of the locks in the path are not - * currently blocking, it is going to complain. So, make really - * really sure by forcing the path to blocking before we clear - * the path blocking. - */ if (held) { btrfs_set_lock_blocking_rw(held, held_rw); if (held_rw == BTRFS_WRITE_LOCK) @@ -96,7 +88,6 @@ held_rw = BTRFS_READ_LOCK_BLOCKING; } btrfs_set_path_blocking(p); -#endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) { @@ -108,10 +99,8 @@ } } -#ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) btrfs_clear_lock_blocking_rw(held, held_rw); -#endif } /* this also releases the path */ @@ -224,10 +213,19 @@ */ static void add_root_to_dirty_list(struct btrfs_root *root) { + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); } spin_unlock(&root->fs_info->trans_lock); } @@ -247,9 +245,10 @@ int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -257,9 +256,8 @@ else btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -274,15 +272,14 @@ else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -356,44 +353,14 @@ } /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. + * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -404,19 +371,16 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - u64 seq; - tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); + elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); - return seq; + return elem->seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -476,6 +440,8 @@ * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -485,7 +451,9 @@ struct rb_node *parent = NULL; struct tree_mod_elem *cur; - BUG_ON(!tm || !tm->seq); + BUG_ON(!tm); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -500,10 +468,8 @@ new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - kfree(tm); + else return -EEXIST; - } } rb_link_node(&tm->node, parent, new); @@ -526,11 +492,7 @@ return 1; tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list when no blocker exists. - */ + if (list_empty(&(fs_info)->tree_mod_seq_list)) { tree_mod_log_write_unlock(fs_info); return 1; } @@ -538,43 +500,28 @@ return 0; } -/* - * This allocates memory and gets a tree modification sequence number. - * - * Returns <0 on error. - * Returns >0 (the added sequence number) on success. - */ -static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, - struct tree_mod_elem **tm_ret) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { - struct tree_mod_elem *tm; - - /* - * once we switch from spin locks to something different, we should - * honor the flags parameter here. - */ - tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); - if (!tm) - return -ENOMEM; - - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; - return tm->seq; + return 1; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - int ret; struct tree_mod_elem *tm; - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - return ret; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -584,39 +531,37 @@ tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { + struct tree_mod_elem *tm; int ret; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; - tree_mod_log_write_unlock(fs_info); - return ret; -} + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - int slot, enum mod_log_op op) -{ - return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); -} + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); -static noinline int -tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op) -{ - return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); + return ret; } static noinline int @@ -624,27 +569,24 @@ struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; - /* - * When we override something during the move, we log these removals. - * This can only happen when we move towards the beginning of the - * buffer, i.e. dst_slot < src_slot. - */ - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING); - BUG_ON(ret < 0); - } + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -652,28 +594,70 @@ tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + ret = __tree_mod_log_insert(fs_info, tm); -out: + if (ret) + goto free_tms; tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -682,18 +666,38 @@ struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -701,9 +705,29 @@ tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - ret = __tree_mod_log_insert(fs_info, tm); -out: + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + return ret; } @@ -773,34 +797,75 @@ return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { - tree_mod_log_write_unlock(fs_info); - return; + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } } + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - ret = tree_mod_log_insert_key_locked(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; } tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -819,21 +884,57 @@ { int ret; - ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, - atomic ? GFP_ATOMIC : GFP_NOFS); + ret = tree_mod_log_insert_key(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - if (tree_mod_dont_log(fs_info, eb)) - return; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; - __tree_mod_log_free_eb(fs_info, eb); + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -859,14 +960,14 @@ * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -910,7 +1011,7 @@ return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -930,14 +1031,14 @@ if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -945,9 +1046,9 @@ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { @@ -964,14 +1065,14 @@ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); *last_ref = 1; } return 0; @@ -1008,9 +1109,10 @@ btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -1027,9 +1129,9 @@ } else parent_start = 0; - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1046,8 +1148,7 @@ else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1056,8 +1157,13 @@ return ret; } - if (root->ref_cows) - btrfs_reloc_cow_block(trans, root, buf, cow); + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -1083,14 +1189,19 @@ WARN_ON(trans->transid != btrfs_header_generation(parent)); tree_mod_log_insert_key(root->fs_info, parent, parent_slot, - MOD_LOG_KEY_REPLACE); + MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1116,7 +1227,7 @@ int looped = 0; if (!time_seq) - return 0; + return NULL; /* * the very last operation that's logged for a root is the replacement @@ -1127,7 +1238,7 @@ tm = tree_mod_log_search_oldest(fs_info, root_logical, time_seq); if (!looped && !tm) - return 0; + return NULL; /* * if there are no tree operation for the oldest root, we simply * return it. this should only happen if that (old) root is at @@ -1240,8 +1351,8 @@ * is freed (its refcount is decremented). */ static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq) +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct extent_buffer *eb, u64 time_seq) { struct extent_buffer *eb_rewin; struct tree_mod_elem *tm; @@ -1256,11 +1367,17 @@ if (!tm) return eb; + btrfs_set_path_blocking(path); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); - BUG_ON(!eb_rewin); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } btrfs_set_header_bytenr(eb_rewin, eb->start); btrfs_set_header_backref_rev(eb_rewin, btrfs_header_backref_rev(eb)); @@ -1268,11 +1385,15 @@ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); } else { eb_rewin = btrfs_clone_extent_buffer(eb); - BUG_ON(!eb_rewin); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } } - extent_buffer_get(eb_rewin); - btrfs_tree_read_unlock(eb); + btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); extent_buffer_get(eb_rewin); @@ -1301,7 +1422,6 @@ struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; - u32 blocksize; eb_root = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); @@ -1320,13 +1440,12 @@ if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - blocksize = btrfs_level_size(root, old_root->level); - old = read_tree_block(root, logical, blocksize, 0); - if (!old || !extent_buffer_uptodate(old)) { - free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); - WARN_ON(1); + old = read_tree_block(root, logical, 0); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -1334,10 +1453,11 @@ } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); } else { + btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); + btrfs_tree_read_unlock_blocking(eb_root); free_extent_buffer(eb_root); } @@ -1382,6 +1502,9 @@ struct btrfs_root *root, struct extent_buffer *buf) { + if (btrfs_test_is_dummy_root(root)) + return 0; + /* ensure we can see the force_cow */ smp_rmb(); @@ -1400,7 +1523,7 @@ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -1420,16 +1543,15 @@ if (trans->transaction != root->fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) + trans->transid, root->fs_info->running_transaction->transid); if (trans->transid != root->fs_info->generation) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); + trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { + trans->dirty = true; *cow_ret = buf; return 0; } @@ -1525,15 +1647,15 @@ WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); - end_slot = parent_nritems; + blocksize = root->nodesize; + end_slot = parent_nritems - 1; - if (parent_nritems == 1) + if (parent_nritems <= 1) return 0; btrfs_set_lock_blocking(parent); - for (i = start_slot; i < end_slot; i++) { + for (i = start_slot; i <= end_slot; i++) { int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1550,7 +1672,7 @@ other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } - if (!close && i < end_slot - 2) { + if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } @@ -1559,16 +1681,17 @@ continue; } - cur = btrfs_find_tree_block(root, blocknr, blocksize); + cur = btrfs_find_tree_block(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + cur = read_tree_block(root, blocknr, gen); + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1746,10 +1869,10 @@ BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -1805,7 +1928,7 @@ child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -1826,7 +1949,7 @@ path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1880,7 +2003,7 @@ if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1908,7 +2031,7 @@ */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -1924,7 +2047,7 @@ BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2141,8 +2264,8 @@ node = path->nodes[level]; search = btrfs_node_blockptr(node, slot); - blocksize = btrfs_level_size(root, level - 1); - eb = btrfs_find_tree_block(root, search, blocksize); + blocksize = root->nodesize; + eb = btrfs_find_tree_block(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2172,7 +2295,7 @@ if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); + readahead_tree_block(root, search); nread += blocksize; } nscan++; @@ -2181,12 +2304,8 @@ } } -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) { int slot; int nritems; @@ -2195,21 +2314,18 @@ u64 gen; u64 block1 = 0; u64 block2 = 0; - int ret = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) - return 0; + return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); + eb = btrfs_find_tree_block(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2222,33 +2338,16 @@ if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); + eb = btrfs_find_tree_block(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(path); - - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; + if (block1) + readahead_tree_block(root, block1); + if (block2) + readahead_tree_block(root, block2); } @@ -2350,47 +2449,38 @@ { u64 blocknr; u64 gen; - u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); - tmp = btrfs_find_tree_block(root, blocknr, blocksize); + tmp = btrfs_find_tree_block(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - /* - * we found an up to date block without - * sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; - } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - free_extent_buffer(tmp); - btrfs_set_path_blocking(p); + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } - /* now we're allowed to do a blocking uptodate check */ - tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { - *eb_ret = tmp; - return 0; - } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } /* @@ -2410,8 +2500,8 @@ btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); - if (tmp) { + tmp = read_tree_block(root, blocknr, 0); + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now @@ -2451,11 +2541,8 @@ goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = split_node(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2475,11 +2562,8 @@ goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = balance_level(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2502,6 +2586,75 @@ return ret; } +static void key_search_validate(struct extent_buffer *b, + struct btrfs_key *key, + int level) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, key); + + if (level == 0) + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_leaf, items[0].key), + sizeof(disk_key))); + else + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_node, ptrs[0].key), + sizeof(disk_key))); +#endif +} + +static int key_search(struct extent_buffer *b, struct btrfs_key *key, + int level, int *prev_cmp, int *slot) +{ + if (*prev_cmp != 0) { + *prev_cmp = bin_search(b, key, level, slot); + return *prev_cmp; + } + + key_search_validate(b, key, level); + *slot = 0; + + return 0; +} + +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + ASSERT(path); + ASSERT(found_key); + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2530,10 +2683,12 @@ int write_lock_level = 0; u8 lowest_level = 0; int min_write_lock_level; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); if (ins_len < 0) { lowest_unlock = 2; @@ -2560,6 +2715,7 @@ min_write_lock_level = write_lock_level; again: + prev_cmp = -1; /* * we try very hard to do read locks on the root */ @@ -2570,9 +2726,13 @@ * the commit roots are read only * so we always do read locks */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -2614,10 +2774,10 @@ * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (!should_cow_block(trans, root, b)) { + trans->dirty = true; goto cow_done; - - btrfs_set_path_blocking(p); + } /* * must have write locks on this node and the @@ -2632,6 +2792,7 @@ goto again; } + btrfs_set_path_blocking(p); err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -2641,8 +2802,6 @@ } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; btrfs_clear_path_blocking(p, NULL, 0); @@ -2652,15 +2811,21 @@ * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. */ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; - ret = bin_search(b, key, level, &slot); + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } + + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2686,7 +2851,7 @@ * which means we must have a write lock * on the parent */ - if (slot == 0 && cow && + if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); @@ -2723,7 +2888,7 @@ } p->locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); @@ -2795,6 +2960,7 @@ int level; int lowest_unlock = 1; u8 lowest_level = 0; + int prev_cmp = -1; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2822,7 +2988,12 @@ */ btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + /* + * Since we can unwind eb's we want to do a real search every + * time. + */ + prev_cmp = -1; + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2849,14 +3020,18 @@ } level = btrfs_header_level(b); - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } - b = tree_mod_log_rewind(root->fs_info, b, time_seq); + b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } p->locks[level] = BTRFS_READ_LOCK; p->nodes[level] = b; } else { @@ -2929,7 +3104,9 @@ if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) @@ -2957,7 +3134,8 @@ * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2968,7 +3146,7 @@ if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + tree_mod_log_set_node_key(fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -2982,7 +3160,8 @@ * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -3004,7 +3183,7 @@ btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* @@ -3050,8 +3229,12 @@ } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3121,8 +3304,12 @@ (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3146,7 +3333,7 @@ */ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int log_removal) + struct btrfs_path *path, int level) { u64 lower_gen; struct extent_buffer *lower; @@ -3163,9 +3350,8 @@ else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0); + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3179,13 +3365,11 @@ btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -3197,7 +3381,7 @@ btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c, log_removal); + tree_mod_log_set_root_pointer(root, c, 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3206,7 +3390,7 @@ add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; return 0; } @@ -3244,7 +3428,7 @@ } if (level) { ret = tree_mod_log_insert_key(root->fs_info, lower, slot, - MOD_LOG_KEY_ADD); + MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3281,14 +3465,14 @@ /* * trying to split the root, lets make a new one * - * tree mod log: We pass 0 as log_removal parameter to + * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the * elements below with tree_mod_log_eb_copy. We're holding a * tree lock on the buffer, which is why we cannot race with * other tree_mod_log users. */ - ret = insert_new_root(trans, root, path, level + 1, 0); + ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } else { @@ -3305,9 +3489,8 @@ mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, level, c->start, 0); + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -3320,13 +3503,17 @@ btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); + btrfs_header_fsid(), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), + btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3371,8 +3558,8 @@ if (!nr) return 0; btrfs_init_map_token(&token); - start_item = btrfs_item_nr(l, start); - end_item = btrfs_item_nr(l, end); + start_item = btrfs_item_nr(start); + end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + btrfs_token_item_size(l, start_item, &token); data_len = data_len - btrfs_token_item_offset(l, end_item, &token); @@ -3393,8 +3580,8 @@ int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } @@ -3440,7 +3627,7 @@ slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] > i) @@ -3504,7 +3691,7 @@ btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space -= btrfs_token_item_size(right, item, &token); btrfs_set_token_item_offset(right, item, push_space, &token); } @@ -3515,7 +3702,7 @@ if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root, left); + clean_tree_block(trans, root->fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3527,7 +3714,7 @@ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); + clean_tree_block(trans, root->fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3602,6 +3789,19 @@ if (left_nritems == 0) goto out_unlock; + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. */ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + return __push_leaf_right(trans, root, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: @@ -3646,7 +3846,7 @@ nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -3673,8 +3873,7 @@ ret = 1; goto out; } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ copy_extent_buffer(left, right, @@ -3697,7 +3896,7 @@ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(left, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(left, item, &token); btrfs_set_token_item_offset(left, item, @@ -3728,7 +3927,7 @@ btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); + item = btrfs_item_nr(i); push_space = push_space - btrfs_token_item_size(right, item, &token); @@ -3739,10 +3938,10 @@ if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3869,7 +4068,7 @@ btrfs_item_end_nr(l, mid); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); + struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; ioff = btrfs_token_item_offset(right, item, &token); @@ -3919,14 +4118,17 @@ int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3946,7 +4148,7 @@ /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3976,6 +4178,7 @@ int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -3989,14 +4192,19 @@ return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + if (data_size && path->nodes[1]) { + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } @@ -4008,7 +4216,7 @@ } if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1, 1); + ret = insert_new_root(trans, root, path, 1); if (ret) return ret; } @@ -4050,7 +4258,7 @@ data_size > BTRFS_LEAF_DATA_SIZE(root)) { if (data_size && !tried_avoid_double) goto push_for_double; - split = 2 ; + split = 2; } } } @@ -4061,13 +4269,12 @@ else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->leafsize); + root_add_used(root, root->nodesize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -4075,12 +4282,11 @@ btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); + write_extent_buffer(right, fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), + write_extent_buffer(right, fs_info->chunk_tree_uuid, + btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); if (split == 0) { @@ -4102,7 +4308,7 @@ path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4158,13 +4364,15 @@ path->search_for_split = 1; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. return now */ @@ -4212,7 +4420,7 @@ btrfs_set_path_blocking(path); - item = btrfs_item_nr(leaf, path->slots[0]); + item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -4235,7 +4443,7 @@ btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(leaf, slot); + new_item = btrfs_item_nr(slot); btrfs_set_item_offset(leaf, new_item, orig_offset); btrfs_set_item_size(leaf, new_item, item_size - split_offset); @@ -4374,7 +4582,7 @@ /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4406,8 +4614,7 @@ ptr = btrfs_item_ptr_offset(leaf, slot); memmove_extent_buffer(leaf, ptr, (unsigned long)fi, - offsetof(struct btrfs_file_extent_item, - disk_bytenr)); + BTRFS_FILE_EXTENT_INLINE_DATA_START); } } @@ -4419,10 +4626,10 @@ btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); @@ -4433,7 +4640,7 @@ } /* - * make the item pointed to by the path bigger, data_size is the new size. + * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size) @@ -4465,7 +4672,7 @@ BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); } @@ -4476,7 +4683,7 @@ /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, @@ -4490,7 +4697,7 @@ data_end = old_data; old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); + item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); @@ -4518,6 +4725,12 @@ int slot; struct btrfs_map_token token; + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + fixup_low_keys(root->fs_info, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -4528,7 +4741,7 @@ if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -4538,7 +4751,7 @@ if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -4549,7 +4762,7 @@ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr( i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff - total_data, &token); @@ -4570,7 +4783,7 @@ for (i = 0; i < nr; i++) { btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); + item = btrfs_item_nr(slot + i); btrfs_set_token_item_offset(leaf, item, data_end - data_size[i], &token); data_end -= data_size[i]; @@ -4578,12 +4791,6 @@ } btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -4678,7 +4885,7 @@ (nritems - slot - 1)); } else if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); + MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } @@ -4692,7 +4899,7 @@ struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(root->fs_info, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4736,8 +4943,8 @@ { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; @@ -4765,7 +4972,7 @@ for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); + item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(leaf, item, &token); btrfs_set_token_item_offset(leaf, item, ioff + dsize, &token); @@ -4785,7 +4992,7 @@ btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); + clean_tree_block(trans, root->fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -4794,7 +5001,7 @@ struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -4858,14 +5065,18 @@ btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - if (key.offset > 0) + if (key.offset > 0) { key.offset--; - else if (key.type > 0) + } else if (key.type > 0) { key.type--; - else if (key.objectid > 0) + key.offset = (u64)-1; + } else if (key.objectid > 0) { key.objectid--; - else + key.type = (u8)-1; + key.offset = (u64)-1; + } else { return 1; + } btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -4873,7 +5084,17 @@ return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } @@ -4901,7 +5122,6 @@ * was nothing in the tree that matched the search criteria. */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans) { @@ -4912,8 +5132,9 @@ u32 nritems; int level; int ret = 1; + int keep_locks = path->keep_locks; - WARN_ON(!path->keep_locks); + path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -4946,10 +5167,8 @@ * If it is too old, old, skip to the next one. */ while (slot < nritems) { - u64 blockptr; u64 gen; - blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { slot++; @@ -4979,7 +5198,6 @@ path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -4994,9 +5212,12 @@ btrfs_clear_path_blocking(path, NULL, 0); } out: - if (ret == 0) + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); + } return ret; } @@ -5115,7 +5336,6 @@ { int ret; int cmp; - struct btrfs_trans_handle *trans = NULL; struct btrfs_path *left_path = NULL; struct btrfs_path *right_path = NULL; struct btrfs_key left_key; @@ -5131,9 +5351,8 @@ int advance_right; u64 left_blockptr; u64 right_blockptr; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; + u64 left_gen; + u64 right_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -5146,7 +5365,7 @@ goto out; } - tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); if (!tmp_buf) { ret = -ENOMEM; goto out; @@ -5157,21 +5376,6 @@ right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Strategy: Go to the first items of both trees. Then do * @@ -5208,6 +5412,7 @@ * the right if possible or go up and right. */ + down_read(&left_root->fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5217,6 +5422,7 @@ right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5235,67 +5441,6 @@ advance_left = advance_right = 0; while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, @@ -5365,19 +5510,20 @@ goto out; advance_right = ADVANCE; } else { + enum btrfs_compare_tree_result result; + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); - if (ret) { - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_CHANGED, - ctx); - if (ret < 0) - goto out; - } + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; advance_left = ADVANCE; advance_right = ADVANCE; } @@ -5394,7 +5540,14 @@ right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. @@ -5417,14 +5570,6 @@ btrfs_free_path(left_path); btrfs_free_path(right_path); kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - return ret; } @@ -5563,6 +5708,24 @@ ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { @@ -5710,4 +5873,47 @@ break; } return 1; +} + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; }