--- zzzz-none-000/linux-3.10.107/fs/btrfs/file.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/btrfs/file.c 2021-02-04 17:41:59.000000000 +0000 @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -39,8 +39,8 @@ #include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "compat.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -273,11 +273,7 @@ defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - if (need_resched()) { - spin_unlock(&fs_info->defrag_inodes_lock); - cond_resched(); - spin_lock(&fs_info->defrag_inodes_lock); - } + cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } @@ -299,7 +295,7 @@ /* get the inode */ key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -309,13 +305,9 @@ ret = PTR_ERR(inode_root); goto cleanup; } - if (btrfs_root_refs(&inode_root->root_item) == 0) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { @@ -374,7 +366,7 @@ u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - while(1) { + while (1) { /* Pause the auto defragger. 
*/ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) @@ -430,13 +422,8 @@ struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -457,11 +444,11 @@ write_bytes -= copied; total_copied += copied; - /* Return to btrfs_file_aio_write to fault page */ + /* Return to btrfs_file_write_iter to fault page */ if (unlikely(copied == 0)) break; - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + if (copied < PAGE_CACHE_SIZE - offset) { offset += copied; } else { pg++; @@ -480,11 +467,12 @@ for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -596,36 +584,42 @@ clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + 
split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -634,26 +628,44 @@ split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len = max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } - ret = 
add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -679,7 +691,10 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -697,15 +712,18 @@ int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -720,6 +738,7 @@ path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -731,13 +750,22 @@ ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid > ino || - key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY) { + ASSERT(del_nr == 0); + path->slots[0]++; + goto next_slot; + } + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= 
end) break; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -753,12 +781,25 @@ btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { - WARN_ON(1); - extent_end = search_start; + /* can't happen */ + BUG(); } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -778,7 +819,10 @@ */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -811,7 +855,7 @@ disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -821,11 +865,14 @@ * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -844,7 +891,10 @@ */ if (start > key.offset && end >= extent_end) { 
BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); @@ -863,6 +913,7 @@ * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -882,7 +933,7 @@ disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -914,14 +965,52 @@ } if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are moved off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if replace_extent != 0). + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); } + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. 
+ */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } + + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -936,7 +1025,7 @@ if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1041,7 +1130,7 @@ ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1075,7 +1164,7 @@ trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1123,7 +1212,7 @@ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1150,7 +1239,7 @@ del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, 
orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1167,7 +1256,7 @@ del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1202,7 +1291,8 @@ * on error we return an unlocked page and the error value * on success we return a locked page and 0 */ -static int prepare_uptodate_page(struct page *page, u64 pos, +static int prepare_uptodate_page(struct inode *inode, + struct page *page, u64 pos, bool force_uptodate) { int ret = 0; @@ -1217,35 +1307,29 @@ unlock_page(page); return -EIO; } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return -EAGAIN; + } } return 0; } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * this just gets pages into the page cache and locks them down. */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; + int faili; - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - -again: for (i = 0; i < num_pages; i++) { +again: pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); if (!pages[i]) { @@ -1255,66 +1339,141 @@ } if (i == 0) - err = prepare_uptodate_page(pages[i], pos, + err = 
prepare_uptodate_page(inode, pages[i], pos, force_uptodate); - if (i == num_pages - 1) - err = prepare_uptodate_page(pages[i], + if (!err && i == num_pages - 1) + err = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); if (err) { page_cache_release(pages[i]); + if (err == -EAGAIN) { + err = 0; + goto again; + } faili = i - 1; goto fail; } wait_on_page_writeback(pages[i]); } - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { - btrfs_put_ordered_extent(ordered); + ordered->file_offset <= last_pos) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) 
{ unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - goto again; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; + + return ret; +} + +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (!ret) + return -ENOSPC; + + lockstart = round_down(pos, root->sectorsize); + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); } - return err; + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + if (ret <= 0) { + ret = 0; + 
btrfs_end_write_no_snapshoting(root); + } else { + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; } static noinline ssize_t __btrfs_buffered_write(struct file *file, @@ -1324,30 +1483,33 @@ struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - unsigned long first_index; + struct extent_state *cached_state = NULL; + u64 release_bytes = 0; + u64 lockstart; + u64 lockend; size_t num_written = 0; int nrptrs; int ret = 0; + bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; - nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1362,23 +1524,65 @@ break; } - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); - if (ret) + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. 
+ */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + goto reserve_metadata; + } + + ret = btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, pos, + write_bytes); + else + btrfs_end_write_no_snapshoting(root); + break; + } + + release_bytes = reserve_bytes; + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); + if (ret) + break; + + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; } copied = btrfs_copy_from_user(pos, num_pages, @@ -1396,9 +1600,8 @@ dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = (copied + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_CACHE_SIZE); } /* @@ -1409,26 +1612,54 @@ * managed to copy. 
*/ if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); + if (only_release_metadata) { + btrfs_delalloc_release_metadata(inode, + release_bytes); + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, + release_bytes); + } } - if (copied > 0) { + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; + + if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); - btrfs_drop_pages(pages, num_pages); - break; - } + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; + } + + release_bytes = 0; + if (only_release_metadata) + btrfs_end_write_no_snapshoting(root); + + if (only_release_metadata && copied > 0) { + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; } btrfs_drop_pages(pages, num_pages); @@ -1436,7 +1667,7 @@ cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1445,41 +1676,53 @@ kfree(pages); + if (release_bytes) { + if (only_release_metadata) { + btrfs_end_write_no_snapshoting(root); + btrfs_delalloc_release_metadata(inode, release_bytes); + } else { + 
btrfs_delalloc_release_space(inode, pos, release_bytes); + } + } + return num_written ? num_written : ret; } static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + struct iov_iter *from, + loff_t pos) { struct file *file = iocb->ki_filp; - struct iov_iter i; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, - count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); + written_buffered = __btrfs_buffered_write(file, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. 
+ */ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1504,42 +1747,29 @@ inode_inc_iversion(inode); } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; - ssize_t err = 0; - size_t count, ocount; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + ssize_t err; + loff_t pos; + size_t count; mutex_lock(&inode->i_mutex); - - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) { + err = generic_write_checks(iocb, from); + if (err <= 0) { mutex_unlock(&inode->i_mutex); - goto out; - } - - if (count == 0) { - mutex_unlock(&inode->i_mutex); - goto out; + return err; } - err = file_remove_suid(file); + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1565,9 +1795,13 @@ */ update_time_for_write(inode); + pos = iocb->ki_pos; + count = iov_iter_count(from); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - 
err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1577,17 +1811,12 @@ if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + if (iocb->ki_flags & IOCB_DIRECT) { + num_written = __btrfs_direct_write(iocb, from, pos); } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); + num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); @@ -1597,10 +1826,12 @@ * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; - if (num_written > 0 || num_written == -EIOCBQUEUED) { + spin_unlock(&BTRFS_I(inode)->lock); + if (num_written > 0) { err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) + if (err < 0) num_written = err; } @@ -1613,6 +1844,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { + if (filp->private_data) + btrfs_ioctl_trans_end(filp); /* * ordered_data_close is set by settattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to @@ -1620,29 +1853,22 @@ * application were using truncate to replace a file in place. 
*/ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. - */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - } - if (filp->private_data) - btrfs_ioctl_trans_end(filp); return 0; } +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = btrfs_fdatawrite_range(inode, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + + return ret; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -1656,13 +1882,20 @@ */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct dentry *dentry = file_dentry(file); + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; + u64 len; + /* + * The range length can be represented by u64, we have to do the typecasts + * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() + */ + len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); /* @@ -1671,26 +1904,65 @@ * multi-task, and make the performance up. 
See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ - atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); + ret = start_ordered_ops(inode, start, end); if (ret) return ret; mutex_lock(&inode->i_mutex); - - /* - * We flush the dirty pages again to avoid some dirty pages in the - * range being left. - */ atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (full_sync) - btrfs_wait_ordered_range(inode, start, end - start + 1); + /* + * We might have had more pages made dirty after calling + * start_ordered_ops and before acquiring the inode's i_mutex. + */ + if (full_sync) { + /* + * For a full sync, we need to make sure any ordered operations + * start and finish before we start logging the inode, so that + * all extents are persisted and the respective file extent + * items are in the fs/subvol btree. + */ + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Start any new ordered operations before starting to log the + * inode. We will wait for them to finish in btrfs_sync_log(). + * + * Right before acquiring the inode's mutex, we might have new + * writes dirtying pages, which won't immediately start the + * respective ordered operations - that is done through the + * fill_delalloc callbacks invoked from the writepage and + * writepages address space operations. So make sure we start + * all ordered operations before starting to log our inode. Not + * doing this means that while logging the inode, writeback + * could start and invoke writepage/writepages, which would call + * the fill_delalloc callbacks (cow_file_range, + * submit_compressed_extents). 
These callbacks add first an + * extent map to the modified list of extents and then create + * the respective ordered operation, which means in + * tree-log.c:btrfs_log_inode() we might capture all existing + * ordered operations (with btrfs_get_logged_extents()) before + * the fill_delalloc callback adds its ordered operation, and by + * the time we visit the modified list of extent maps (with + * btrfs_log_changed_extents()), we see and process the extent + * map they created. We then use the extent map to construct a + * file extent item for logging without waiting for the + * respective ordered operation to finish - this file extent + * item points to a disk location that might not have yet been + * written to, containing random data - so after a crash a log + * replay will make our inode have file extent items that point + * to disk locations containing invalid data, as we returned + * success to userspace without waiting for the respective + * ordered operation to finish, because it wasn't captured by + * btrfs_get_logged_extents(). + */ + ret = start_ordered_ops(inode, start, end); + } + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; + } atomic_inc(&root->log_batch); /* @@ -1723,8 +1995,10 @@ */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -1742,17 +2016,31 @@ if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). 
If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; + + btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { - mutex_unlock(&inode->i_mutex); - goto out; + /* Fallthrough and commit/free transaction. */ + ret = 1; } /* we've logged all the items and now have a consistent @@ -1767,28 +2055,41 @@ */ mutex_unlock(&inode->i_mutex); + /* + * If any of the ordered extents had an error, just return it to user + * space, so that the application knows some writes didn't succeed and + * can take proper action (retry for e.g.). Blindly committing the + * transaction in this case, would fool userspace that everything was + * successful. And we also want to make sure our log doesn't contain + * file extent items pointing to extents that weren't fully written to - + * just like in the non fast fsync path, where we check for the ordered + * operation's error flag before writing to the log tree and return -EIO + * if any of them had this flag set (btrfs_wait_ordered_range) - + * therefore we need to check for errors in the ordered operations, + * which are indicated by ctx.io_err. + */ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + if (ret != BTRFS_NO_LOG_SYNC) { - if (ret > 0) { - /* - * If we didn't already wait for ordered extents we need - * to do that now. 
- */ - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - start + 1); - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) { + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); + if (!ret) { ret = btrfs_end_transaction(trans, root); - } else { - if (!full_sync) - btrfs_wait_ordered_range(inode, start, - end - - start + 1); - ret = btrfs_commit_transaction(trans, root); + goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; + } + } + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_end_transaction(trans, root); } @@ -1798,8 +2099,8 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1855,11 +2156,13 @@ struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -1881,12 +2184,11 @@ goto out; } - if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; - path->slots[0]++; key.offset = offset; - btrfs_set_item_key_safe(root, path, &key); + btrfs_set_item_key_safe(root->fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -1941,6 +2243,37 @@ return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, 
return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1948,20 +2281,44 @@ struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + unsigned int rsv_count; + bool same_page; + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; - btrfs_wait_ordered_range(inode, offset, len); + ret = btrfs_wait_ordered_range(inode, offset, len); + if (ret) + return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + 
len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. @@ -1971,14 +2328,18 @@ * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + } else { + ret = 0; + } + goto out_only_mutex; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -1986,18 +2347,46 @@ } } - /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + truncated_page = true; + ret 
= btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } if (lockend < lockstart) { - mutex_unlock(&inode->i_mutex); - return 0; + ret = 0; + goto out_only_mutex; } while (1) { @@ -2015,11 +2404,9 @@ * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2028,8 +2415,12 @@ btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); - btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } path = btrfs_alloc_path(); @@ -2049,9 +2440,10 @@ /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 
2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2062,19 +2454,24 @@ BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2088,7 +2485,7 @@ btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; @@ -2099,6 +2496,14 @@ rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2107,10 +2512,30 @@ } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. 
+ */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: @@ -2122,6 +2547,7 @@ trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out_free: @@ -2130,24 +2556,84 @@ out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. 
+ */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2163,36 +2649,51 @@ return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; - if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); - if (ret) - goto out_reserve_fail; - } - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. 
+ */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); if (ret) goto out; + } else if (offset + len > inode->i_size) { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; } + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2215,8 +2716,10 @@ * we can't wait on the range with the transaction * running or with the extent lock held */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); + ret = btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + if (ret) + goto out; } else { if (ordered) btrfs_put_ordered_extent(ordered); @@ -2224,10 +2727,10 @@ } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2240,161 +2743,144 @@ last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); if (ret < 0) { free_extent_map(em); break; } - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - /* - * We didn't need to 
allocate any more space, but we - * still extended the size of the file so we need to - * update i_size. - */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. 
+ */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, alloc_end - alloc_start); -out_reserve_fail: /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - u64 lockstart = *offset; - u64 lockend = i_size_read(inode); - u64 start = *offset; - u64 orig_start = *offset; - u64 len = i_size_read(inode); - u64 last_end = 0; + u64 lockstart; + u64 lockend; + u64 start; + u64 len; int ret = 0; - lockend = max_t(u64, root->sectorsize, lockend); + if (inode->i_size == 0) + return -ENXIO; + + /* + * *offset can be negative, in this case we start finding DATA/HOLE from + * the very start of the file. + */ + start = max_t(loff_t, 0, *offset); + + lockstart = round_down(start, root->sectorsize); + lockend = round_up(i_size_read(inode), root->sectorsize); if (lockend <= lockstart) lockend = lockstart + root->sectorsize; - lockend--; len = lockend - lockstart + 1; - len = max_t(u64, len, root->sectorsize); - if (inode->i_size == 0) - return -ENXIO; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); - /* - * Delalloc is such a pain. If we have a hole and we have pending - * delalloc for a portion of the hole we will get back a hole that - * exists for the entire range since it hasn't been actually written - * yet. So to take care of this case we need to look for an extent just - * before the position we want in case there is outstanding delalloc - * going on here. 
- */ - if (whence == SEEK_HOLE && start != 0) { - if (start <= root->sectorsize) - em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, - root->sectorsize, 0); - else - em = btrfs_get_extent_fiemap(inode, NULL, 0, - start - root->sectorsize, - root->sectorsize, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - last_end = em->start + em->len; - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - free_extent_map(em); - } - - while (1) { + while (start < inode->i_size) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); + em = NULL; break; } - if (em->block_start == EXTENT_MAP_HOLE) { - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - if (last_end <= orig_start) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (whence == SEEK_HOLE) { - *offset = start; - free_extent_map(em); - break; - } - } else { - if (whence == SEEK_DATA) { - if (em->block_start == EXTENT_MAP_DELALLOC) { - if (start >= inode->i_size) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (!test_bit(EXTENT_FLAG_PREALLOC, - &em->flags)) { - *offset = start; - free_extent_map(em); - break; - } - } - } + if (whence == SEEK_HOLE && + (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; + else if (whence == SEEK_DATA && + (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + break; start = em->start + em->len; - last_end = em->start + em->len; - - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - free_extent_map(em); - ret = -ENXIO; - break; - } free_extent_map(em); + em = NULL; cond_resched(); } - if (!ret) - *offset = min(*offset, inode->i_size); -out: + free_extent_map(em); + if (!ret) { + if (whence == SEEK_DATA && start >= inode->i_size) + ret = -ENXIO; + else + *offset = min_t(loff_t, start, 
inode->i_size); + } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); return ret; @@ -2425,20 +2911,7 @@ } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); return offset; @@ -2446,11 +2919,9 @@ const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, @@ -2479,3 +2950,29 @@ return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. 
+ */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +}