--- zzzz-none-000/linux-3.10.107/fs/btrfs/extent-tree.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/btrfs/extent-tree.c 2021-02-04 17:41:59.000000000 +0000 @@ -24,17 +24,18 @@ #include #include #include -#include "compat.h" +#include #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" +#include "qgroup.h" #undef SCRAMBLE_DELAYED_REFS @@ -73,11 +74,12 @@ RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, struct btrfs_delayed_extent_op *extra_op); @@ -102,7 +104,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 num_bytes, int reserve, + int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -112,7 +115,8 @@ block_group_cache_done(struct btrfs_block_group_cache *cache) { smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; } static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -120,7 +124,7 @@ return (cache->flags & bits) == bits; } -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) { atomic_inc(&cache->count); } @@ -310,12 +314,6 @@ struct btrfs_caching_control *ctl; spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. */ if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; @@ -333,6 +331,27 @@ kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -388,7 +407,8 @@ u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = 0; + int ret = -ENOMEM; + bool wakeup = true; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -401,6 +421,15 @@ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -417,8 +446,9 @@ again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); +next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -439,10 +469,12 @@ if (ret) break; - if (need_resched()) { - caching_ctl->progress = last; + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; @@ -458,6 +490,17 @@ continue; } + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + if (wakeup) + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + if (key.objectid < block_group->key.objectid) { path->slots[0]++; continue; @@ -474,13 +517,14 @@ key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + - fs_info->tree_root->leafsize; + fs_info->tree_root->nodesize; else last = key.objectid + key.offset; if (total_found > (1024 * 1024 * 2)) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -490,21 +534,41 @@ total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); - caching_ctl->progress = (u64)-1; - spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); + } +#endif + + caching_ctl->progress = (u64)-1; err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -529,7 +593,8 @@ caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -570,6 +635,7 @@ spin_unlock(&cache->lock); if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -577,15 +643,35 @@ cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; } else { if (load_cache_only) { cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); @@ -603,6 +689,7 @@ cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } spin_unlock(&cache->lock); wake_up(&caching_ctl->wait); @@ -613,14 +700,14 @@ return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -686,8 +773,8 @@ rcu_read_unlock(); } -/* simple helper to search for an existing extent at a given offset */ -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) { int ret; struct btrfs_key key; @@ -702,12 +789,6 @@ key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); - if (ret > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == start && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; - } btrfs_free_path(path); return ret; } @@ -741,7 +822,7 @@ * different */ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { - offset = root->leafsize; + offset = root->nodesize; metadata = 0; } @@ -749,31 +830,34 @@ if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } -again: + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; - btrfs_release_path(path); - goto again; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->nodesize) + ret = 0; + } } if (ret == 0) { @@ -824,14 +908,16 @@ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -1041,11 +1127,11 @@ __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -1238,7 +1324,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1274,6 +1360,7 @@ if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1292,8 +1379,7 @@ return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, +static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; @@ -1509,6 +1595,7 @@ ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -1519,9 +1606,8 @@ if (ret && !insert) { err = -ENOENT; goto out; - } else if (ret) { + } else if (WARN_ON(ret)) { err = -EIO; - WARN_ON(1); goto out; } @@ -1731,7 +1817,8 @@ struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1775,6 +1862,7 @@ else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1806,7 +1894,7 @@ if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); update_inline_extent_backref(root, path, iref, - refs_to_add, extent_op); + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, @@ -1839,30 +1927,99 @@ struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { update_inline_extent_backref(root, path, iref, - -refs_to_drop, NULL); + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); + + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) { int ret; u64 discarded_bytes = 0; @@ -1879,14 +2036,16 @@ for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, + &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ @@ -1897,7 +2056,7 @@ */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) @@ -1913,7 +2072,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1925,29 +2084,32 @@ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - int err = 0; path = btrfs_alloc_path(); if (!path) @@ -1956,19 +2118,20 @@ path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret == 0) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - if (ret != -EAGAIN) { - err = ret; - goto out; - } - + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); btrfs_set_extent_refs(leaf, item, refs + refs_to_add); @@ -1980,7 +2143,6 @@ path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, @@ -1989,7 +2151,7 @@ btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, @@ -2010,10 +2172,11 @@ ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) @@ -2023,14 +2186,12 @@ ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, extent_op); @@ -2103,15 +2264,28 @@ } if (ret > 0) { if (metadata) { - btrfs_release_path(path); - metadata = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == node->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == node->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; - key.offset = node->num_bytes; - key.type = BTRFS_EXTENT_ITEM_KEY; - goto again; + key.objectid = node->bytenr; + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; } - err = -EIO; - goto out; } leaf = path->nodes[0]; @@ -2153,10 +2327,11 @@ SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; ins.objectid = node->bytenr; if (skinny_metadata) { @@ -2176,12 +2351,13 @@ &extent_op->key, ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, ref->level, 0, 1, extent_op); } else { BUG(); @@ -2198,8 +2374,12 @@ { int ret = 0; - if (trans->aborted) + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); return 0; + } if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; @@ -2211,6 +2391,8 @@ */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); + trace_run_delayed_ref_head(node, head, node->action); + if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, node->num_bytes, 1); @@ -2220,6 +2402,11 @@ node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2236,68 +2423,65 @@ return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + + if (list_empty(&head->ref_list)) + return NULL; + /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + list_for_each_entry(ref, &head->ref_list, list) { + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; + + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2317,7 +2501,13 @@ * and then it being added back again before the drop can * finish. If we merged anything we need to re-loop so we can * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). */ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2329,17 +2519,15 @@ if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); - btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); + count++; continue; } @@ -2354,6 +2542,8 @@ locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2366,26 +2556,57 @@ } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); if (ret) { - btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + /* + * Need to reset must_insert_reserved if + * there was an error so the abort stuff + * can cleanup the reserved space + * properly. + */ + if (must_insert_reserved) + locked_ref->must_insert_reserved = 1; spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); btrfs_delayed_ref_unlock(locked_ref); return ret; } + continue; + } - goto next; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (!list_empty(&locked_ref->ref_list) || + locked_ref->extent_op) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + list_del(&ref->list); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2402,20 +2623,18 @@ default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2426,16 +2645,39 @@ * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ + spin_unlock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2481,49 +2723,169 @@ } #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csumm'ed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) { - struct qgroup_update *qgroup_update; + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; +} + +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; - if (list_empty(&trans->qgroup_ref_list) != - !trans->delayed_ref_elem.seq) { - /* list without seq or seq without list */ - btrfs_err(fs_info, - "qgroup accounting update error, list is%s empty, seq is %#x.%x", - list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->nodesize; + num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; + num_bytes <<= 1; } - if (!trans->delayed_ref_elem.seq) - return 0; + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} - while (!list_empty(&trans->qgroup_ref_list)) { - qgroup_update = list_first_entry(&trans->qgroup_ref_list, - struct qgroup_update, list); - list_del(&qgroup_update->list); - if (!ret) - ret = btrfs_qgroup_account_ref( - trans, fs_info, qgroup_update->node, - qgroup_update->extent_op); - kfree(qgroup_update); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; } - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; - return ret; + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); } -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) { - int val = atomic_read(&delayed_refs->ref_seq); + struct async_delayed_refs *async; + int ret; - if (val < seq || val >= seq + count) - return 1; + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } return 0; } @@ -2542,13 +2904,10 @@ { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; - int run_most = 0; - int loops; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2557,131 +2916,40 @@ if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } - - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->num_entries < 16348) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } + if (count == 0) + count = atomic_read(&delayed_refs->num_entries) * 2; again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + trans->can_flush_pending_bgs = false; + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } - node = rb_first(&delayed_refs->root); - if (!node) + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2695,21 +2963,18 @@ btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); + trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -2747,15 +3012,15 @@ struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2772,40 +3037,31 @@ btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; - - data_ref = btrfs_delayed_node_to_data_ref(ref); + spin_lock(&head->lock); + list_for_each_entry(ref, &head->ref_list, list) { + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - node = rb_prev(node); - if (node) { - int seq = ref->seq; + data_ref = btrfs_delayed_node_to_data_ref(ref); - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -2919,7 +3175,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -2932,13 +3188,17 @@ int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); + + + if (btrfs_test_is_dummy_root(root)) + return 0; ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -2954,7 +3214,7 @@ for (i = 0; i < nritems; i++) { if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); @@ -2969,15 +3229,14 @@ key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); + num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - for_cow); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -2988,15 +3247,15 @@ } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3010,21 +3269,19 @@ struct extent_buffer *leaf; ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret < 0) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto fail; - BUG_ON(ret); /* Corruption */ + } leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); fail: - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - return 0; + btrfs_release_path(path); + return ret; } @@ -3033,7 +3290,19 @@ struct btrfs_block_group_cache *cache) { struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { @@ -3054,7 +3323,7 @@ struct inode *inode = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - int num_pages = 0; + u64 num_pages = 0; int retries = 0; int ret = 0; @@ -3069,6 +3338,8 @@ return 0; } + if (trans->aborted) + return 0; again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3104,6 +3375,20 @@ */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } WARN_ON(ret); if (i_size_read(inode) > 0) { @@ -3112,8 +3397,7 @@ if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, path, - inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) goto out_put; } @@ -3133,28 +3417,47 @@ spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3170,125 +3473,292 @@ return ret; } -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; } - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * this can happen if something re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); } - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->dirty) - break; - cache = next_block_group(root, cache); + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } } - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); - err = write_one_cache_group(trans, root, path, cache); - if (err) /* File system offline */ - goto out; + if (ret) + break; - btrfs_put_block_group(cache); + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); } + mutex_unlock(&trans->transaction->cache_write_mutex); - while (1) { + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; } + spin_unlock(&cur_trans->dirty_bgs_lock); + } - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ + list_del_init(&cache->dirty_list); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); } - err = btrfs_write_out_cache(root, trans, cache, path); + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3304,6 +3774,23 @@ return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3311,6 +3798,7 @@ struct btrfs_space_info *found; int i; int factor; + int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3325,7 +3813,8 @@ found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3334,6 +3823,12 @@ if (!found) return -ENOMEM; + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + if (ret) { + kfree(found); + return ret; + } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); @@ -3348,15 +3843,27 @@ found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -3411,15 +3918,10 @@ */ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - u64 num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3437,38 +3939,35 @@ spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; - return extended_to_chunk(flags | tmp); + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -3498,25 +3997,22 @@ return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; + int ret = 0; + int need_commit = 2; + int have_pinned_space; /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { - alloc_chunk = 0; - committed = 1; + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); } data_sinfo = fs_info->data_sinfo; @@ -3537,13 +4033,23 @@ * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full && alloc_chunk) { + if (!data_sinfo->full) { u64 alloc_target; data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3555,8 +4061,10 @@ if (ret < 0) { if (ret != -ENOSPC) return ret; - else + else { + have_pinned_space = 1; goto commit_trans; + } } if (!data_sinfo) @@ -3566,27 +4074,52 @@ } /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. */ - if (data_sinfo->bytes_pinned < bytes) - committed = 1; + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ commit_trans: - if (!committed && + if (need_commit && !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; + need_commit--; + + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); + btrfs_wait_ordered_roots(fs_info, -1); + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; + if (have_pinned_space >= 0 || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. + */ + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + goto again; + } else { + btrfs_end_transaction(trans, root); + } } + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3594,28 +4127,81 @@ data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); - return 0; + return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -3671,7 +4257,7 @@ return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -3685,24 +4271,43 @@ else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -3713,7 +4318,21 @@ u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -3742,8 +4361,12 @@ if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; spin_unlock(&space_info->lock); - return 0; + return ret; } if (!should_alloc_chunk(extent_root, space_info, force)) { @@ -3814,6 +4437,25 @@ space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->can_flush_pending_bgs && + trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -3826,7 +4468,6 @@ u64 space_size; u64 avail; u64 used; - u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -3860,39 +4501,30 @@ BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; - to_add = space_info->total_bytes; - /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) - to_add >>= 3; + avail >>= 3; else - to_add >>= 1; - - /* - * Limit the overcommit to the amount of free space we could possibly - * allocate for chunks. - */ - to_add = min(avail, to_add); + avail >>= 1; - if (used + bytes < space_info->total_bytes + to_add) + if (used + bytes < space_info->total_bytes + avail) return 1; return 0; } static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) + unsigned long nr_pages, int nr_items) { struct super_block *sb = root->fs_info->sb; - int started; - /* If we can not start writeback, just sync all the delalloc file. */ - started = try_to_writeback_inodes_sb_nr(sb, nr_pages, - WB_REASON_FS_FREE_SPACE); - if (!started) { + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { /* * We needn't worry the filesystem going from r/w to r/o though * we don't acquire ->s_umount mutex, because the filesystem @@ -3900,12 +4532,26 @@ * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_inodes(root, 0); + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_roots(root->fs_info, nr_items); } } +static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) +{ + u64 bytes; + int nr; + + bytes = btrfs_calc_trans_metadata_size(root, 1); + nr = (int)div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM (256 * 1024) + /* * shrink metadata reservation for delalloc */ @@ -3918,35 +4564,51 @@ u64 delalloc_bytes; u64 max_reclaim; long time_left; - unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; - int loops = 0; + unsigned long nr_pages; + int loops; + int items; enum btrfs_reserve_flush_enum flush; + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(root, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0); + if (wait_ordered) + btrfs_wait_ordered_roots(root->fs_info, items); return; } + loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - btrfs_writeback_inodes_sb_nr(root, nr_pages); + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before * we do anything. */ - wait_event(root->fs_info->async_submit_wait, - !atomic_read(&root->fs_info->async_delalloc_pages)); + max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); + if (!max_reclaim) + goto skip_async; + + if (max_reclaim <= nr_pages) + max_reclaim = 0; + else + max_reclaim -= nr_pages; + wait_event(root->fs_info->async_submit_wait, + atomic_read(&root->fs_info->async_delalloc_pages) <= + (int)max_reclaim); +skip_async: if (!trans) flush = BTRFS_RESERVE_FLUSH_ALL; else @@ -3960,13 +4622,12 @@ loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_roots(root->fs_info, items); } else { time_left = schedule_timeout_killable(1); if (time_left) break; } - smp_mb(); delalloc_bytes = percpu_counter_sum_positive( &root->fs_info->delalloc_bytes); } @@ -3997,12 +4658,9 @@ goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { - spin_unlock(&space_info->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -4011,15 +4669,13 @@ if (space_info != delayed_rsv->space_info) return -ENOSPC; - spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (space_info->bytes_pinned + delayed_rsv->size < bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); @@ -4049,16 +4705,11 @@ switch (state) { case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) { - u64 bytes = btrfs_calc_trans_metadata_size(root, 1); - - nr = (int)div64_u64(num_bytes, bytes); - if (!nr) - nr = 1; - nr *= 2; - } else { + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(root, num_bytes) * 2; + else nr = -1; - } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -4069,7 +4720,7 @@ break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, + shrink_delalloc(root, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4095,6 +4746,117 @@ return ret; } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, + int flush_state) +{ + u64 used; + + spin_lock(&space_info->lock); + /* + * We run out of space and have not got any free space via flush_space, + * so don't bother doing async reclaim. + */ + if (flush_state > COMMIT_TRANS && space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info, + flush_state)) + return; + } while (flush_state < COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -4202,8 +4964,19 @@ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!root->fs_info->log_root_recovering && + need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4241,6 +5014,10 @@ !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } + if (ret == -ENOSPC) + trace_btrfs_space_reservation(root->fs_info, + "space_info:enospc", + space_info->flags, orig_bytes, 1); if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4256,10 +5033,9 @@ { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -4298,6 +5074,31 @@ spin_unlock(&block_rsv->lock); } +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -4337,7 +5138,6 @@ space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); - space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -4388,6 +5188,11 @@ kfree(rsv); } +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -4466,7 +5271,7 @@ u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -4500,12 +5305,12 @@ num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); + num_bytes += div_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div_u64(meta_used, 3); - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4538,7 +5343,6 @@ sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); - sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4601,6 +5405,24 @@ trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -4646,15 +5468,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int items, - u64 *qgroup_reserved) + u64 *qgroup_reserved, + bool use_global_rsv) { u64 num_bytes; int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * root->leafsize; - ret = btrfs_qgroup_reserve(root, num_bytes); + num_bytes = 3 * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -4668,10 +5492,12 @@ BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -4681,26 +5507,30 @@ u64 qgroup_reserved) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); } /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -4743,30 +5573,18 @@ int reserve) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - int num_csums; - int old_csums; + u64 old_csums, num_csums; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && BTRFS_I(inode)->csum_bytes == 0) return 0; - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); if (reserve) BTRFS_I(inode)->csum_bytes += num_bytes; else BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -4813,7 +5631,11 @@ num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) @@ -4836,17 +5658,15 @@ spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->leafsize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -4863,7 +5683,7 @@ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -4871,7 +5691,7 @@ out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -4950,7 +5770,7 @@ num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); @@ -4958,53 +5778,59 @@ if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - dropped * root->leafsize); - } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per root over-reserve method. + * o add to the inodes->delalloc_bytes * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the len of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. This is * called in the case that we don't need the metadata AND data reservations @@ -5013,15 +5839,17 @@ * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5031,14 +5859,14 @@ int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); + spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); @@ -5069,7 +5897,6 @@ cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -5095,6 +5922,32 @@ bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -5141,6 +5994,8 @@ set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + if (reserved) + trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -5190,11 +6045,86 @@ return ret; } +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space, or by somebody who is * freeing space that was never actually used on disk. For example if you @@ -5213,7 +6143,7 @@ * succeeds. */ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; @@ -5232,13 +6162,18 @@ num_bytes, 0); space_info->bytes_may_use -= num_bytes; } + + if (delalloc) + cache->delalloc_bytes += num_bytes; } } else { if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -5253,7 +6188,7 @@ struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -5272,11 +6207,39 @@ else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = 2 * 1024 * 1024; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = 64 * 1024; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -5284,7 +6247,10 @@ struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -5293,8 +6259,14 @@ start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -5307,12 +6279,28 @@ } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -5342,24 +6330,26 @@ struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; - if (trans->aborted) - return 0; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; - while (1) { + while (!trans->aborted) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, @@ -5367,15 +6357,65 @@ clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. + */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(root, + block_group->key.objectid, + block_group->key.offset, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "Discard failed while removing blockgroup: errno=%d %s\n", + ret, errstr); + } + } + return 0; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, struct btrfs_delayed_extent_op *extent_op) @@ -5394,6 +6434,9 @@ int num_to_del = 1; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -5444,7 +6487,7 @@ BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5479,6 +6522,7 @@ if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5488,7 +6532,7 @@ if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); @@ -5499,16 +6543,14 @@ } extent_slot = path->slots[0]; } - } else if (ret == -ENOENT) { + } else if (WARN_ON(ret == -ENOENT)) { btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + bytenr, parent, root_objectid, owner_objectid, + owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5537,7 +6579,7 @@ -1, 1); if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { @@ -5564,7 +6606,7 @@ refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { btrfs_err(info, "trying to drop %d refs but we only have %Lu " - "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5587,16 +6629,18 @@ if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); + extent_data_ref_count(path, iref)); if (iref) { BUG_ON(path->slots[0] != extent_slot); } else { @@ -5606,6 +6650,7 @@ } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -5622,12 +6667,14 @@ } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } } + btrfs_release_path(path); + out: btrfs_free_path(path); return ret; @@ -5644,24 +6691,16 @@ { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -5683,19 +6722,19 @@ * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -5706,6 +6745,9 @@ btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } @@ -5715,7 +6757,7 @@ struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; + int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -5723,49 +6765,64 @@ buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, buf->start, buf->len); + pin = 0; } out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + /* * Deleting the buffer, clear the corrupt flag since it doesn't matter * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_test_is_dummy_root(root)) + return 0; + + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -5779,25 +6836,17 @@ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } -static u64 stripe_align(struct btrfs_root *root, - struct btrfs_block_group_cache *cache, - u64 val, u64 num_bytes) -{ - u64 ret = ALIGN(val, root->stripesize); - return ret; -} - /* * when we wait for progress in the block group caching, its because * our allocation attempt failed at least once. So, we must sleep @@ -5808,8 +6857,11 @@ * for our min num_bytes. Another option is to have it go ahead * and look in the rbtree for a free extent of a given size, but this * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. */ -static noinline int +static noinline void wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { @@ -5817,28 +6869,29 @@ caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return; wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); - return 0; } static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; + int ret = 0; caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, block_group_cache_done(cache)); - + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; put_caching_control(caching_ctl); - return 0; + return ret; } int __get_raid_index(u64 flags) @@ -5859,11 +6912,29 @@ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -5871,40 +6942,107 @@ LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: - * ins->objectid == block start + * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks + * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 flags) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 max_extent_size = 0; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -5917,40 +7055,50 @@ } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5973,6 +7121,7 @@ up_read(&space_info->groups_sem); } else { index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -5981,14 +7130,15 @@ } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { u64 offset; int cached; - used_block_group = block_group; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; /* @@ -6015,12 +7165,14 @@ have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; if (unlikely(block_group->ro)) goto loop; @@ -6028,42 +7180,45 @@ * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - } - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, num_bytes, used_block_group->key.objectid); + last_ptr, + num_bytes, + used_block_group->key.objectid, + &max_extent_size); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { - btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } -refill_cluster: - BUG_ON(used_block_group != block_group); +release_cluster: /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6080,8 +7235,10 @@ * succeeding in the unclustered * allocation. */ if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); goto unclustered_alloc; } @@ -6091,6 +7248,10 @@ */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; @@ -6101,18 +7262,20 @@ block_group->full_stripe_len); /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - search_start, num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this * cluster */ offset = btrfs_alloc_from_cluster(block_group, - last_ptr, num_bytes, - search_start); + last_ptr, + num_bytes, + search_start, + &max_extent_size); if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); @@ -6143,17 +7306,32 @@ } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < num_bytes + empty_cluster + empty_size) { + if (block_group->free_space_ctl->free_space > + max_extent_size) + max_extent_size = + block_group->free_space_ctl->free_space; spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size); + num_bytes, empty_size, + &max_extent_size); /* * If we didn't find a chunk, and we haven't failed on this * block group before, and this block group is in the middle of @@ -6170,30 +7348,27 @@ failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: - search_start = stripe_align(root, used_block_group, - offset, num_bytes); + search_start = ALIGN(offset, root->stripesize); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, - alloc_type); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6203,20 +7378,20 @@ trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -6233,22 +7408,71 @@ */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any unached bgs and we've alrelady done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. */ - if (ret < 0 && ret != -ENOSPC) { + if (ret < 0 && ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans, root); + if (ret) goto out; - } } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -6257,10 +7481,20 @@ } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); + ins->offset = max_extent_size; + } return ret; } @@ -6271,20 +7505,16 @@ int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", - (unsigned long long)info->flags, - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_readonly), + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_reserved, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_readonly); + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) @@ -6294,13 +7524,12 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved, - cache->ro ? "[readonly]" : ""); + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } @@ -6309,25 +7538,24 @@ up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data) + struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, flags); + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags, delalloc); if (ret == -ENOSPC) { - if (!final_tried) { - num_bytes = num_bytes >> 1; + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) @@ -6338,20 +7566,18 @@ sinfo = __find_space_info(root->fs_info, flags); btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", - (unsigned long long)flags, - (unsigned long long)num_bytes); + flags, num_bytes); if (sinfo) dump_space_info(sinfo, num_bytes, 1); } } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); - return ret; } static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -6359,7 +7585,7 @@ cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { btrfs_err(root->fs_info, "Unable to find block group for %llu", - (unsigned long long)start); + start); return -ENOSPC; } @@ -6369,8 +7595,9 @@ if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -6379,15 +7606,15 @@ } int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) + u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0); + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); } int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1); + return __btrfs_free_reserved_extent(root, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -6451,13 +7678,13 @@ btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); return ret; } @@ -6475,6 +7702,7 @@ struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6482,14 +7710,19 @@ size += sizeof(*block_info); path = btrfs_alloc_path(); - if (!path) + if (!path) { + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->nodesize); return -ENOMEM; + } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { btrfs_free_path(path); + btrfs_free_and_pin_reserved_extent(root, ins->objectid, + root->nodesize); return ret; } @@ -6503,6 +7736,7 @@ if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -6524,20 +7758,23 @@ btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(root, ins->objectid, root->leafsize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } + + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); return ret; } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -6546,7 +7783,8 @@ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -6562,90 +7800,66 @@ { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - if (ret) - goto out; - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); if (ret) - goto out; + return ret; } + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); + RESERVE_ALLOC_NO_ACCOUNT, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); -out: btrfs_put_block_group(block_group); return ret; } static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXENT bit to differentiate dirty pages. */ - if (root->log_transid % 2 == 0) + if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } else { + buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -6683,7 +7897,7 @@ /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -6713,13 +7927,10 @@ /* * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns the tree buffer or NULL. + * returns the tree buffer or an ERR_PTR on error. */ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) @@ -6727,25 +7938,35 @@ struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; + u32 blocksize = root->nodesize; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); + if (btrfs_test_is_dummy_root(root)) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } + block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, &ins, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } + ret = btrfs_reserve_extent(root, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); + if (ret) + goto out_unuse; - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6755,9 +7976,11 @@ BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; extent_op = btrfs_alloc_delayed_extent_op(); - BUG_ON(!extent_op); /* -ENOMEM */ + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -6772,13 +7995,24 @@ extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); - BUG_ON(ret); /* -ENOMEM */ + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + extent_op); + if (ret) + goto out_free_delayed; } return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } struct walk_control { @@ -6826,7 +8060,7 @@ eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); + blocksize = root->nodesize; for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) @@ -6873,16 +8107,253 @@ continue; } reada: - ret = readahead_tree_block(root, bytenr, blocksize, - generation); - if (ret) - break; + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; } /* + * These may not be seen by the usual inc/dec ref code so we have to + * add them here. + */ +static int record_one_subtree_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_qgroup_extent_record *qrecord; + struct btrfs_delayed_ref_root *delayed_refs; + + qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); + if (!qrecord) + return -ENOMEM; + + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + kfree(qrecord); + spin_unlock(&delayed_refs->lock); + + return 0; +} + +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + /* We can be called directly from walk_up_proc() */ + if (!root->fs_info->quota_enabled) + return 0; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_gen); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = record_one_subtree_extent(trans, root, child_bytenr, + root->nodesize); + if (ret) + goto out; + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + +/* * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates @@ -6936,9 +8407,9 @@ /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, @@ -6985,6 +8456,7 @@ int level = wc->level; int reada = 0; int ret = 0; + bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -7000,13 +8472,15 @@ } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr, blocksize); + next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); reada = 1; } btrfs_tree_lock(next); @@ -7015,19 +8489,19 @@ ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); - if (ret < 0) { - btrfs_tree_unlock(next); - return ret; - } + if (ret < 0) + goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { btrfs_err(root->fs_info, "Missing references."); - BUG(); + ret = -EIO; + goto out_unlock; } *lookup_info = 0; if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + need_account = true; if (level == 1 && (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) goto skip; @@ -7061,8 +8535,10 @@ if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); - if (!next || !extent_buffer_uptodate(next)) { + next = read_tree_block(root, bytenr, generation); + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -7071,7 +8547,12 @@ } level--; - BUG_ON(level != btrfs_header_level(next)); + ASSERT(level == btrfs_header_level(next)); + if (level != btrfs_header_level(next)) { + btrfs_err(root->fs_info, "mismatched level"); + ret = -EIO; + goto out_unlock; + } path->nodes[level] = next; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; @@ -7086,19 +8567,43 @@ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { parent = path->nodes[level]->start; } else { - BUG_ON(root->root_key.objectid != + ASSERT(root->root_key.objectid == btrfs_header_owner(path->nodes[level])); + if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, + "mismatched block owner"); + ret = -EIO; + goto out_unlock; + } parent = 0; } + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + btrfs_err_rl(root->fs_info, + "Error " + "%d accounting shared subtree. Quota " + "is out of sync, rescan required.", + ret); + } + } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); /* -ENOMEM */ + root->root_key.objectid, level - 1, 0); + if (ret) + goto out_unlock; } + + *lookup_info = 1; + ret = 1; + +out_unlock: btrfs_tree_unlock(next); free_extent_buffer(next); - *lookup_info = 1; - return 1; + + return ret; } /* @@ -7171,12 +8676,18 @@ if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + btrfs_err_rl(root->fs_info, + "error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.", + ret); + } } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -7185,7 +8696,7 @@ btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root, eb); + clean_tree_block(trans, root->fs_info, eb); } if (eb == root->node) { @@ -7302,6 +8813,8 @@ int level; bool root_dropped = false; + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -7389,11 +8902,6 @@ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc && btrfs_fs_closing(root->fs_info)) { - pr_debug("btrfs: drop snapshot early exit\n"); - err = -EAGAIN; - goto out_end_trans; - } ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { @@ -7421,7 +8929,8 @@ } BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root)) { + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); @@ -7432,6 +8941,12 @@ } btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("BTRFS: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -7452,8 +8967,8 @@ } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, tree_root, ret); err = ret; @@ -7469,12 +8984,12 @@ } } - if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + btrfs_add_dropped_root(trans, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } root_dropped = true; out_end_trans: @@ -7490,10 +9005,10 @@ * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ - if (root_dropped == false) + if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -7581,13 +9096,7 @@ if (stripped) return extended_to_chunk(stripped); - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + num_devices = root->fs_info->fs_devices->rw_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | @@ -7623,14 +9132,13 @@ return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 min_allocable_bytes; int ret = -ENOSPC; - /* * We need some metadata space and system metadata space for * allocating chunks in some corner cases until we force to set @@ -7647,6 +9155,7 @@ spin_lock(&cache->lock); if (cache->ro) { + cache->ro++; ret = 0; goto out; } @@ -7658,7 +9167,8 @@ sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - cache->ro = 1; + cache->ro++; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } out: @@ -7667,7 +9177,7 @@ return ret; } -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -7675,21 +9185,49 @@ u64 alloc_flags; int ret; - BUG_ON(cache->ro); - +again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) { ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; if (ret < 0) goto out; } - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -7697,8 +9235,16 @@ CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); return ret; } @@ -7713,15 +9259,20 @@ /* * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. + * space_info. takes mirrors into account. */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; u64 free_bytes = 0; int factor; - list_for_each_entry(block_group, groups_list, list) { + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { @@ -7742,32 +9293,12 @@ spin_unlock(&block_group->lock); } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - spin_unlock(&sinfo->lock); return free_bytes; } -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -7777,10 +9308,13 @@ spin_lock(&sinfo->lock); spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -7797,6 +9331,7 @@ struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -7880,7 +9415,14 @@ min_free <<= 1; } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); + min_free = div64_u64(min_free, dev_min); + } + + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; } mutex_lock(&root->fs_info->chunk_mutex); @@ -7893,7 +9435,7 @@ */ if (device->total_bytes > device->bytes_used + min_free && !device->is_tgtdev_for_dev_replace) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7905,6 +9447,7 @@ } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -7987,14 +9530,24 @@ struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8002,6 +9555,7 @@ cache_node); rb_erase(&block_group->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); spin_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -8015,7 +9569,8 @@ * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. */ - if (block_group->cached == BTRFS_CACHE_NO) + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); @@ -8035,20 +9590,31 @@ release_global_block_rsv(info); - while(!list_empty(&info->space_info)) { + while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (space_info->bytes_pinned > 0 || + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); + space_info->bytes_may_use > 0)) { dump_space_info(space_info, 0, 0); } } list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8057,10 +9623,75 @@ struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); + + if (first) { + struct raid_kobject *rkobj; + int ret; + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + goto out_err; + } + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); +} + +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); + + return cache; } int btrfs_read_block_groups(struct btrfs_root *root) @@ -8075,11 +9706,16 @@ struct extent_buffer *leaf; int need_clear = 0; u64 cache_gen; + u64 feature; + int mixed; + + feature = btrfs_super_incompat_flags(info->super_copy); + mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); root = info->extent_root; key.objectid = 0; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -8098,26 +9734,16 @@ break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8130,24 +9756,26 @@ * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); + if (!mixed && + ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->key.objectid); + ret = -EINVAL; + goto error; + } key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8161,8 +9789,7 @@ * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8202,6 +9829,7 @@ spin_lock(&info->block_group_cache_lock); rb_erase(&cache->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&info->block_group_cache_lock); btrfs_put_block_group(cache); goto error; @@ -8215,8 +9843,18 @@ __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_ro(cache, 1); + if (btrfs_chunk_readonly(root, cache->key.objectid)) { + inc_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -8231,10 +9869,14 @@ * avoid allocating from un-mirrored block group if there are * mirrored block groups. */ - list_for_each_entry(cache, &space_info->block_groups[3], list) - set_block_group_ro(cache, 1); - list_for_each_entry(cache, &space_info->block_groups[4], list) - set_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + inc_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + inc_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -8252,13 +9894,12 @@ struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, - new_bg_list) { - list_del_init(&block_group->new_bg_list); - + trans->can_flush_pending_bgs = false; + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { if (ret) - continue; + goto next; spin_lock(&block_group->lock); memcpy(&item, &block_group->item, sizeof(item)); @@ -8269,7 +9910,14 @@ sizeof(item)); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); } + trans->can_flush_pending_bgs = can_flush_pending_bgs; } int btrfs_make_block_group(struct btrfs_trans_handle *trans, @@ -8283,40 +9931,17 @@ extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8326,8 +9951,7 @@ * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } @@ -8336,6 +9960,27 @@ free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -8343,6 +9988,10 @@ return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { @@ -8350,6 +9999,7 @@ spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&cache->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return ret; @@ -8362,7 +10012,7 @@ __link_block_group(cache->space_info, cache); - list_add_tail(&cache->new_bg_list, &trans->new_bgs); + list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -8385,7 +10035,8 @@ } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -8393,9 +10044,12 @@ struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -8439,7 +10093,38 @@ goto out; } + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free spache cache IO is done before remove the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); if (ret) { @@ -8480,6 +10165,7 @@ spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); if (root->fs_info->first_logical_byte == block_group->key.objectid) root->fs_info->first_logical_byte = (u64)-1; @@ -8491,24 +10177,142 @@ * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); memcpy(&key, &block_group->key, sizeof(key)); - btrfs_clear_space_info_full(root->fs_info); + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -8525,6 +10329,215 @@ return ret; } +struct btrfs_trans_handle * +btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, + const u64 chunk_offset) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = (struct map_lookup *)em->bdev; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, + num_items, 1); +} + +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + int trimming; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro || + list_is_singular(&block_group->list)) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = inc_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->key.objectid); + if (IS_ERR(trans)) { + btrfs_dec_block_group_ro(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(root, block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(root, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. + */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } +end_trans: + btrfs_end_transaction(trans, root); +next: + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -8568,16 +10581,99 @@ return unpin_extent_range(root, start, end, false); } -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) { - return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. */ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; } int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -8606,8 +10702,15 @@ if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); - if (!ret) - wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } } ret = btrfs_trim_block_group(cache, &group_trimmed, @@ -8625,6 +10728,54 @@ cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; return ret; } + +/* + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). + */ +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) +{ + if (atomic_read(&root->will_be_snapshoted)) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (atomic_read(&root->will_be_snapshoted)) { + btrfs_end_write_no_snapshoting(root); + return 0; + } + return 1; +}