--- zzzz-none-000/linux-3.10.107/fs/ext4/extents.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/ext4/extents.c 2021-02-04 17:41:59.000000000 +0000 @@ -37,9 +37,9 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include <linux/fiemap.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -51,8 +51,8 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ @@ -74,8 +74,7 @@ { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -89,8 +88,7 @@ { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -99,14 +97,14 @@ static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags); static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); @@ -144,6 +142,7 @@ { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -161,6 +160,8 @@ struct inode *inode, struct ext4_ext_path *path) { int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ @@ -289,6 +290,20 @@ return size; } +static inline int +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path **ppath, ext4_lblk_t lblk, + int nofail) +{ + struct ext4_ext_path *path = *ppath; + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | + (nofail ?
EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); +} + /* * Calculate the number of metadata blocks needed * to allocate @blocks @@ -428,10 +443,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth) + int depth, ext4_fsblk_t pblk) { const char *error_msg; - int max = 0; + int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; @@ -466,48 +481,156 @@ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; + err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - - return -EIO; + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + return err; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } -static int __ext4_ext_check_block(const char *function, unsigned int line, - struct inode *inode, - struct ext4_extent_header *eh, - int depth, - struct buffer_head *bh) +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) { - int ret; + struct buffer_head *bh; + int err; - if (buffer_verified(bh)) - return 0; - ret = ext4_ext_check(inode, eh, depth); - if (ret) - return ret; + bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); - return ret; + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + } -#define ext4_ext_check_block(inode, eh, depth, bh) \ - __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) +#define 
read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -522,7 +645,7 @@ } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -548,7 +671,7 @@ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -579,7 +702,7 @@ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -594,9 +717,11 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) { - int depth = path->p_depth; - int i; + int depth, i; + if (!path) + return; + depth = path->p_depth; for (i = 0; i <= depth; i++, path++) if (path->p_bh) { brelse(path->p_bh); @@ -704,7 +829,7 @@ ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -740,24 +865,38 @@ } struct ext4_ext_path * -ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - short int depth, i, ppos = 0, alloc = 0; + struct ext4_ext_path *path = orig_path ? 
*orig_path : NULL; + short int depth, i, ppos = 0; int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); + if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { + EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", + depth); + ret = -EFSCORRUPTED; + goto err; + } - /* account possible depth increase */ + if (path) { + ext4_ext_drop_refs(path); + if (depth > path[0].p_maxdepth) { + kfree(path); + *orig_path = path = NULL; + } + } if (!path) { + /* account possible depth increase */ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); - if (!path) + if (unlikely(!path)) return ERR_PTR(-ENOMEM); - alloc = 1; + path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -773,36 +912,24 @@ path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) { - ret = -ENOMEM; + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; } - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - ret = bh_submit_read(bh); - if (ret < 0) { - put_bh(bh); - goto err; - } - } + eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); - ret = -EIO; + ret = -EFSCORRUPTED; goto err; } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - ret = ext4_ext_check_block(inode, eh, i, bh); - if (ret < 0) - goto err; } path[ppos].p_depth = i; @@ -821,8 +948,9 @@ err: ext4_ext_drop_refs(path); - if (alloc) - kfree(path); + kfree(path); + if (orig_path) + *orig_path = NULL; return ERR_PTR(ret); } @@ -846,7 +974,7 @@ EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; + return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) @@ -855,7 +983,7 @@ "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; + return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { @@ -879,7 +1007,7 @@ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } ix->ei_block = cpu_to_le32(logical); @@ -888,7 +1016,7 @@ if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); @@ -929,7 +1057,7 @@ * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); - return -EIO; + return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; @@ -973,10 +1101,10 @@ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; @@ -999,7 +1127,7 @@ EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ @@ -1038,7 +1166,7 @@ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } if (k) @@ 
-1078,7 +1206,7 @@ EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ @@ -1149,20 +1277,28 @@ * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_extent *newext) + unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, NULL, - newext, &err, flags); + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); if (newblock == 0) return err; - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); @@ -1223,10 +1359,12 @@ * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, + unsigned int mb_flags, + unsigned int gb_flags, + struct ext4_ext_path **ppath, struct ext4_extent *newext) { + struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1245,28 +1383,26 @@ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1304,7 +1440,7 @@ if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1323,7 +1459,7 @@ EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1334,7 +1470,7 @@ EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); - return -EIO; + return -EFSCORRUPTED; } } return 0; @@ -1344,7 +1480,7 @@ EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; @@ -1374,7 +1510,7 @@ if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1393,7 +1529,7 @@ EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1401,7 +1537,7 @@ EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } } goto found_extent; @@ -1411,7 +1547,7 @@ EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { @@ -1437,29 +1573,21 @@ ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_block(inode, eh, - path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); @@ -1477,7 +1605,7 @@ * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1557,7 +1685,7 @@ if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); - return -EIO; + return -EFSCORRUPTED; } if (depth == 0) { @@ -1603,22 +1731,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - unsigned short ext1_ee_len, ext2_ee_len, max_len; + unsigned short ext1_ee_len, ext2_ee_len; - /* - * Make sure that both extents are initialized. We don't merge - * uninitialized extents so that we can be sure that end_io code has - * the extent that was written properly split out and conversion to - * initialized is trivial. - */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; - if (ext4_ext_is_uninitialized(ex1)) - max_len = EXT_UNINIT_MAX_LEN; - else - max_len = EXT_INIT_MAX_LEN; - ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1631,7 +1748,12 @@ * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. 
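
The check that follows is easier to read with the on-disk encoding in mind: ee_len is 16 bits wide and its top bit doubles as the unwritten flag, so an initialized extent can span up to 32768 blocks while an unwritten one stops at 32767. A minimal user-space sketch of that encoding (constants and helper logic mirrored from ext4_extents.h; the toy_* names and the dropped little-endian conversion are simplifications):

#include <stdint.h>
#include <stdio.h>

#define EXT_INIT_MAX_LEN	(1U << 15)		/* 32768 */
#define EXT_UNWRITTEN_MAX_LEN	(EXT_INIT_MAX_LEN - 1)	/* 32767 */

struct toy_extent {
	uint16_t ee_len;	/* on disk this is a __le16 */
};

/* values strictly above EXT_INIT_MAX_LEN mean "unwritten" */
static int toy_ext_is_unwritten(const struct toy_extent *ex)
{
	return ex->ee_len > EXT_INIT_MAX_LEN;
}

/* strip the flag bit to recover the real block count */
static unsigned toy_ext_actual_len(const struct toy_extent *ex)
{
	return ex->ee_len <= EXT_INIT_MAX_LEN ?
		ex->ee_len : ex->ee_len - EXT_INIT_MAX_LEN;
}

/* set the top bit while keeping the length bits intact */
static void toy_ext_mark_unwritten(struct toy_extent *ex)
{
	ex->ee_len |= EXT_INIT_MAX_LEN;
}

int main(void)
{
	struct toy_extent ex = { .ee_len = 100 };

	toy_ext_mark_unwritten(&ex);
	printf("unwritten=%d len=%u\n",
	       toy_ext_is_unwritten(&ex), toy_ext_actual_len(&ex));
	return 0;
}

A merged initialized length of 32769 would decode as a 1-block unwritten extent, which is why the merge is refused rather than allowed to set that bit.
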
*/ - if (ext1_ee_len + ext2_ee_len > max_len) + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1656,8 +1778,7 @@ { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; - int uninitialized = 0; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1667,12 +1788,11 @@ if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1722,6 +1842,7 @@ sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); + path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + @@ -1730,8 +1851,7 @@ brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | - EXT4_FREE_BLOCKS_RESERVE); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* @@ -1817,37 +1937,39 @@ * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) + struct ext4_ext_path **ppath, + struct ext4_extent *newext, int gb_flags) { + struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - unsigned uninitialized = 0; - int flags = 0; + int mb_flags = 0, unwritten; + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EIO; + return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because - * ext4_ext_find_extent() can return either extent on the + * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. 
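
The probe only picks a candidate; the candidate still has to pass ext4_can_extents_be_merged(). A stand-alone restatement of that predicate (extent field reads flattened into plain parameters; the AGGRESSIVE_TEST and pending-unwritten-I/O special cases are left out, so treat it as a sketch rather than the full kernel logic):

#include <stdint.h>

#define EXT_INIT_MAX_LEN	(1U << 15)
#define EXT_UNWRITTEN_MAX_LEN	(EXT_INIT_MAX_LEN - 1)

/* two extents merge only if state, logical range and physical range line up */
static int toy_can_merge(uint32_t lblk1, uint64_t pblk1, unsigned len1, int unwrit1,
			 uint32_t lblk2, uint64_t pblk2, unsigned len2, int unwrit2)
{
	if (unwrit1 != unwrit2)
		return 0;		/* never mix written and unwritten data */
	if (lblk1 + len1 != lblk2)
		return 0;		/* must be logically contiguous */
	if (len1 + len2 > (unwrit1 ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN))
		return 0;		/* merged length must still fit in ee_len */
	return pblk1 + len1 == pblk2;	/* ...and physically contiguous */
}
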
*/ @@ -1867,29 +1989,21 @@ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - - /* - * ext4_can_extents_be_merged should have checked - * that either both extents are uninitialized, or - * both aren't. Thus we need to check only one of - * them here. - */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1901,10 +2015,10 @@ ext_debug("prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, @@ -1912,20 +2026,13 @@ if (err) return err; - /* - * ext4_can_extents_be_merged should have checked - * that either both extents are uninitialized, or - * both aren't. Thus we need to check only one of - * them here. - */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1945,7 +2052,7 @@ if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -1964,9 +2071,10 @@ * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
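
Before that happens, ext4_ext_create_new_leaf() walks up from the leaf looking for a level that still has a free index slot, and only grows the tree in depth when every level is full. A toy version of that decision, with per-level fill counts passed in directly (in the kernel they come from path[i].p_hdr, and toy_pick_split_level() is an invented name):

/*
 * Return the level whose subtree should be split (it still has a free
 * index slot), or -1 when the tree is full and must grow in depth.
 * Level 'depth' is the leaf, level 0 the root, as in the extent path.
 */
static int toy_pick_split_level(const unsigned *entries, const unsigned *max,
				int depth)
{
	int i = depth;

	while (i >= 0) {
		if (entries[i] < max[i])
			return i;	/* EXT_HAS_FREE_INDEX() equivalent */
		i--;
	}
	return -1;			/* grow in depth, then retry */
}
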
*/ - if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) - flags = EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags |= EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -1984,7 +2092,7 @@ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -1995,7 +2103,7 @@ "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -2006,7 +2114,7 @@ "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -2016,7 +2124,7 @@ "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -2032,7 +2140,7 @@ merge: /* try to merge extents */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); @@ -2044,10 +2152,8 @@ err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - if (npath) { - ext4_ext_drop_refs(npath); - kfree(npath); - } + ext4_ext_drop_refs(npath); + kfree(npath); return err; } @@ -2069,13 +2175,7 @@ /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - if (path && ext_depth(inode) != depth) { - /* depth was changed. 
we have to realloc path */ - kfree(path); - path = NULL; - } - - path = ext4_ext_find_extent(inode, block, path); + path = ext4_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2087,12 +2187,11 @@ if (unlikely(path[depth].p_hdr == NULL)) { up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); flags = 0; exists = 0; @@ -2138,7 +2237,7 @@ es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } @@ -2150,13 +2249,14 @@ next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; - flags |= FIEMAP_EXTENT_DELALLOC; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); } up_read(&EXT4_I(inode)->i_data_sem); if (unlikely(es.es_len == 0)) { EXT4_ERROR_INODE(inode, "es.es_len == 0"); - err = -EIO; + err = -EFSCORRUPTED; break; } @@ -2179,7 +2279,7 @@ "next extent == %u, next " "delalloc extent = %u", next, next_del); - err = -EIO; + err = -EFSCORRUPTED; break; } } @@ -2201,11 +2301,8 @@ block = es.es_lblk + es.es_len; } - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - + ext4_ext_drop_refs(path); + kfree(path); return err; } @@ -2219,16 +2316,16 @@ ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; + ext4_lblk_t len; ext4_lblk_t lblock; struct ext4_extent *ex; + struct extent_status es; ex = path[depth].p_ext; if (ex == NULL) { - /* - * there is no extent yet, so gap is [0;-] and we - * don't cache it - */ + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2237,9 +2334,6 @@ block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2253,15 +2347,19 @@ block); BUG_ON(next == lblock); len = next - lblock; - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } - ext_debug(" -> %u:%lu\n", lblock, len); + ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + if (es.es_len) { + /* There's delayed extent containing lblock? */ + if (es.es_lblk <= lblock) + return; + len = min(es.es_lblk - lblock, len); + } + ext_debug(" -> %u:%u\n", lblock, len); + ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } /* @@ -2280,7 +2378,7 @@ leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) @@ -2353,17 +2451,15 @@ } /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents? 
* - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. */ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; @@ -2374,7 +2470,7 @@ depth = ext_depth(inode); - if (chunk) + if (extents <= 1) index = depth * 2; else index = depth * 3; @@ -2382,27 +2478,31 @@ return index; } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - ext4_fsblk_t *partial_cluster, + long long *partial_cluster, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = ext4_ext_get_actual_len(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = 0; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; - else if (ext4_should_journal_data(inode)) - flags |= EXT4_FREE_BLOCKS_FORGET; + int flags = get_default_free_blocks_flags(inode); /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we make a note * that we tried freeing the cluster, and check to see if we * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of the ext4_truncate() operation. + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; @@ -2413,7 +2513,8 @@ * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if (*partial_cluster > 0 && + *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2439,59 +2540,72 @@ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; + long long first_cluster; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, pblk); + /* + * Usually we want to free partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). 
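
All of this bookkeeping rides on the bigalloc block/cluster mapping, where a cluster is 2^s_cluster_bits blocks. The three conversions used throughout this file, in minimal form (power-of-two cluster ratio assumed, as ext4 guarantees; the toy_* names are illustrative):

#include <stdint.h>

struct toy_sbi {
	unsigned cluster_bits;	/* log2 of blocks per cluster */
};

/* EXT4_B2C(): which cluster does this physical block live in? */
static uint64_t toy_b2c(const struct toy_sbi *sbi, uint64_t pblk)
{
	return pblk >> sbi->cluster_bits;
}

/* EXT4_C2B(): first physical block of a cluster */
static uint64_t toy_c2b(const struct toy_sbi *sbi, uint64_t cluster)
{
	return cluster << sbi->cluster_bits;
}

/* EXT4_PBLK_COFF(): offset of a block within its cluster (0 = aligned) */
static unsigned toy_pblk_coff(const struct toy_sbi *sbi, uint64_t pblk)
{
	return pblk & ((1ULL << sbi->cluster_bits) - 1);
}

A nonzero EXT4_PBLK_COFF() at the start of a freed range is exactly what forces the partial-cluster tracking: the head of that cluster may still belong to a neighboring extent.
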
+ */ + if (*partial_cluster < 0 && + *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* * If the block range to be freed didn't start at the * beginning of a cluster, and we removed the entire - * extent, save the partial cluster here, since we - * might need to delete if we determine that the - * truncate operation has removed all of the blocks in - * the cluster. + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete if we determine that the truncate or punch hole + * operation has removed all of the blocks in the cluster. + * If that cluster is used by another extent, preserve its + * negative value so it isn't freed later on. + * + * If the whole extent wasn't freed, we've reached the + * start of the truncated/punched region and have finished + * removing blocks. If there's a partial cluster here it's + * shared with the remainder of the extent and is no longer + * a candidate for removal. */ - if (EXT4_PBLK_COFF(sbi, pblk) && - (ee_len == num)) - *partial_cluster = EXT4_B2C(sbi, pblk); - else + if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { + first_cluster = (long long) EXT4_B2C(sbi, pblk); + if (first_cluster != -*partial_cluster) + *partial_cluster = first_cluster; + } else { *partial_cluster = 0; - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); - - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } + } + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. 
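
The sign convention packs two states into one variable: a positive value is a candidate cluster that may be freed later, and a negative value is the same cluster number flagged as still in use. A sketch of the two transitions (the helper names are hypothetical; the kernel performs these assignments inline):

#include <stdint.h>

/* cluster is shared with an extent we are keeping: never free it */
static void toy_note_cluster_in_use(long long *partial, uint64_t cluster)
{
	*partial = -(long long) cluster;
}

/*
 * cluster may become free once all of its blocks are removed: remember
 * it, unless the same cluster was already flagged as in use above
 */
static void toy_note_cluster_candidate(long long *partial, uint64_t cluster)
{
	if (*partial != -(long long) cluster)
		*partial = (long long) cluster;
}
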
* @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + struct ext4_ext_path *path, + long long *partial_cluster, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2502,8 +2616,9 @@ unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; + ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf to %u\n", start, end); @@ -2512,47 +2627,28 @@ eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - /* - * If we're starting with an extent other than the last one in the - * node, we need to see if it shares a cluster with the extent to - * the right (towards the end of the file). If its leftmost cluster - * is this extent's rightmost cluster and it is not cluster aligned, - * we'll mark it as a partial that is not to be deallocated. - */ - - if (ex != EXT_LAST_EXTENT(eh)) { - ext4_fsblk_t current_pblk, right_pblk; - long long current_cluster, right_cluster; - - current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - current_cluster = (long long)EXT4_B2C(sbi, current_pblk); - right_pblk = ext4_ext_pblock(ex + 1); - right_cluster = (long long)EXT4_B2C(sbi, right_pblk); - if (current_cluster == right_cluster && - EXT4_PBLK_COFF(sbi, right_pblk)) - *partial_cluster = -right_cluster; - } - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2563,6 +2659,18 @@ /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); + *partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2573,7 +2681,7 @@ "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); - err = -EIO; + err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ @@ -2614,11 +2722,11 @@ ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. 
*/ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2638,8 +2746,7 @@ sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else - *partial_cluster = 0; + } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -2656,21 +2763,20 @@ err = ext4_ext_correct_indexes(handle, inode, path); /* - * If there is still a entry in the leaf node, check to see if - * it references the partial cluster. This is the only place - * where it could; if it doesn't, we can free the cluster. - */ - if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && - (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != - *partial_cluster)) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If it is shared with the current extent + * we zero partial_cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. + */ + if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } *partial_cluster = 0; } @@ -2707,10 +2813,10 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - ext4_fsblk_t partial_cluster = 0; + long long partial_cluster = 0; handle_t *handle; int i = 0, err = 0; @@ -2722,7 +2828,7 @@ return PTR_ERR(handle); again: - trace_ext4_ext_remove_space(inode, start, depth); + trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that @@ -2733,10 +2839,11 @@ */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; - /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL); + /* find extent for or closest extent to this block */ + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2749,12 +2856,13 @@ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split @@ -2762,13 +2870,18 @@ * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ - if (end >= ee_block && - end < ee_block + ext4_ext_get_actual_len(ex) - 1) { - int split_flag = 0; - - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. 
+ */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } /* * Split the extent in two so that 'end' is the last @@ -2776,13 +2889,28 @@ * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_METADATA_NOFAIL); - + err = ext4_force_split_extent_at(handle, inode, &path, + end + 1, 1); if (err < 0) goto out; + + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + /* + * If there's an extent to the right its first cluster + * contains the immediate right boundary of the + * truncated/punched region. Set partial_cluster to + * its negative value so it won't be freed if shared + * with the current extent. The end < ee_block case + * is handled in ext4_ext_rm_leaf(). + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + &ex); + if (err) + goto out; + if (pblk) + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); } } /* @@ -2802,12 +2930,12 @@ ext4_journal_stop(handle); return -ENOMEM; } - path[0].p_depth = depth; + path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { - err = -EIO; + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { + err = -EFSCORRUPTED; goto out; } } @@ -2853,19 +2981,19 @@ ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above. */ + cond_resched(); if (WARN_ON(i + 1 > depth)) { - err = -EIO; - break; - } - if (ext4_ext_check_block(inode, ext_block_hdr(bh), - depth - i - 1, bh)) { - err = -EIO; + err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; @@ -2890,22 +3018,21 @@ } } - trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, - path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); - /* If we still have something in the partial cluster and we have removed + /* + * If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial - * cluster as well. */ - if (partial_cluster && path->p_hdr->eh_entries == 0) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - + * cluster as well. (This code will only run when there are no leaves + * to the immediate left of the truncated/punched region.) 
+ */ + if (partial_cluster > 0 && err == 0) { + /* don't zero partial_cluster since it's not used afterwards */ ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(EXT4_SB(sb), partial_cluster), - EXT4_SB(sb)->s_cluster_ratio, flags); - partial_cluster = 0; + EXT4_C2B(sbi, partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); } /* TODO: flexible tree reduction should be here */ @@ -2925,10 +3052,9 @@ out: ext4_ext_drop_refs(path); kfree(path); - if (err == -EAGAIN) { - path = NULL; + path = NULL; + if (err == -EAGAIN) goto again; - } ext4_journal_stop(handle); return err; @@ -2943,7 +3069,7 @@ * possible initialization would be here */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST @@ -2970,7 +3096,7 @@ */ void ext4_ext_release(struct super_block *sb) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS @@ -2985,6 +3111,23 @@ #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { @@ -2995,6 +3138,9 @@ ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, ex); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; @@ -3010,7 +3156,7 @@ * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -3025,11 +3171,12 @@ */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3052,10 +3199,10 @@ newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_uninitialized(ex) && + BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2)); + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3067,8 +3214,8 @@ * then we just change the state of the extent, and splitting * is not needed. 
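
Seen from the callers, a mapping request can overlap an extent in three ways, which is what produces the "no split / two extents / three extents" cases described in this file. A compact classifier for a map already known to overlap the extent (a simplification of logic the kernel handles inline; names are illustrative):

#include <stdint.h>

enum toy_split_case {
	TOY_SPLIT_NONE,		/* whole extent covered: just flip its state */
	TOY_SPLIT_TWO,		/* split at one end: two extents result */
	TOY_SPLIT_THREE,	/* split in the middle: three extents result */
};

static enum toy_split_case
toy_classify_split(uint32_t ee_block, unsigned ee_len,
		   uint32_t m_lblk, unsigned m_len)
{
	uint32_t ee_end = ee_block + ee_len;	/* first block past the extent */
	uint32_t m_end = m_lblk + m_len;

	if (m_lblk <= ee_block && m_end >= ee_end)
		return TOY_SPLIT_NONE;
	if (m_lblk == ee_block || m_end == ee_end)
		return TOY_SPLIT_TWO;
	return TOY_SPLIT_THREE;
}
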
*/ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); @@ -3082,8 +3229,8 @@ /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -3097,10 +3244,10 @@ ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3137,7 +3284,7 @@ goto fix_extent_len; /* update extent status tree */ - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); goto out; } else if (err) @@ -3149,7 +3296,7 @@ fix_extent_len: ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -3157,7 +3304,7 @@ * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * - * It may result in splitting the extent into multiple extents (upto three) + * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent @@ -3166,16 +3313,17 @@ */ static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; int allocated = map->m_len; @@ -3183,17 +3331,17 @@ ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; @@ -3204,23 +3352,27 @@ * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. 
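
This is why the entry points in this patch switch from struct ext4_ext_path *path to struct ext4_ext_path **ppath: after a split the old path array may be stale or too small, and ext4_find_extent() must be able to free it and refresh the caller's pointer in place. The ownership pattern, reduced to user-space C (toy_find() stands in for ext4_find_extent() with an orig_path argument):

#include <stdlib.h>

struct toy_path {
	int p_maxdepth;		/* capacity, as path[0].p_maxdepth above */
};

static struct toy_path *toy_find(int depth, struct toy_path **orig)
{
	struct toy_path *path = orig ? *orig : NULL;

	if (path && depth > path[0].p_maxdepth) {
		free(path);		/* old array cannot hold the new depth */
		*orig = path = NULL;
	}
	if (!path) {
		path = calloc(depth + 2, sizeof(*path));	/* + room to grow */
		if (!path)
			return NULL;	/* ERR_PTR(-ENOMEM) in the kernel */
		path[0].p_maxdepth = depth + 1;
	}
	if (orig)
		*orig = path;		/* caller's slot always stays current */
	return path;
}

Because the caller's pointer is updated through *orig even when the array is freed, a caller can no longer be left holding freed memory, which is the bug class the **ppath conversion removes.
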
*/ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; - uninitialized = ext4_ext_is_uninitialized(ex); + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EFSCORRUPTED; + } + unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (uninitialized) { - split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT2); + EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; @@ -3233,16 +3385,16 @@ /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -3254,9 +3406,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3288,12 +3441,12 @@ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) @@ -3329,7 +3482,7 @@ * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 
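
Those four conditions translate into a small predicate. A stand-alone restatement for the left-neighbor case (extent field reads replaced by parameters; C1-C4 labelled as in the code below):

#include <stdint.h>

#define EXT_INIT_MAX_LEN	(1U << 15)

static int toy_can_transfer_left(int prev_unwritten,
				 uint32_t prev_lblk, uint64_t prev_pblk,
				 unsigned prev_len,
				 uint32_t ee_block, uint64_t ee_pblk,
				 unsigned map_len)
{
	return !prev_unwritten &&			/* C1: initialized */
	       prev_lblk + prev_len == ee_block &&	/* C2: logically adjacent */
	       prev_pblk + prev_len == ee_pblk &&	/* C3: physically adjacent */
	       prev_len < EXT_INIT_MAX_LEN - map_len;	/* C4: result still fits */
}
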
*/ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3344,7 +3497,7 @@ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); @@ -3375,7 +3528,7 @@ * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3390,7 +3543,7 @@ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); @@ -3412,7 +3565,7 @@ WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; @@ -3420,6 +3573,9 @@ max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); + if (ext4_encrypted_inode(inode)) + max_zeroout = 0; + /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); @@ -3480,55 +3636,57 @@ } } - allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, flags); - if (allocated < 0) - err = allocated; - + err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, + flags); + if (err > 0) + err = 0; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. 
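
From user space, the unwritten extents this path manages usually come from fallocate(): blocks are reserved and read back as zeroes, and a later write converts only the covered range, which is what forces the splits described here. A quick Linux-only demonstration (the file path is illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/unwritten-demo", O_CREAT | O_RDWR | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	/* reserve 1 MiB as unwritten extents: allocated, but reads see zeroes */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		perror("fallocate");
	/* writing inside the range triggers the split-and-convert machinery */
	if (pwrite(fd, "data", 4, 4096) != 4)
		perror("pwrite");
	close(fd);
	return 0;
}

Afterwards, filefrag -v should show the written block carved out of the surrounding unwritten extent.
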
The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC from occurring when the + * IO completes, we need to split the unwritten extent before the IO is + * submitted. The unwritten extent passed in will be split + * into (at most) three unwritten extents. After the IO completes, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of the unwritten extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3543,19 +3701,25 @@ ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); }
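/*
 * A self-contained sketch of the split_flag selection performed by
 * ext4_split_convert_extents() above. The flag values are copied from the
 * definitions near the top of this file; pick_split_flags(), want_unwritten
 * and past_eof are hypothetical stand-ins for the
 * EXT4_GET_BLOCKS_CONVERT_UNWRITTEN test and for the
 * ee_block + ee_len > eof_block comparison.
 */
#include <stdio.h>

#define EXT4_EXT_MAY_ZEROOUT	0x1
#define EXT4_EXT_MARK_UNWRIT2	0x4
#define EXT4_EXT_DATA_VALID1	0x8
#define EXT4_EXT_DATA_VALID2	0x10

static int pick_split_flags(int want_unwritten, int past_eof)
{
	int split_flag = 0;

	if (want_unwritten) {
		/* initialized -> unwritten: the first half keeps valid data */
		split_flag |= EXT4_EXT_DATA_VALID1;
	} else {
		/* unwritten -> initialized: zeroout is only safe inside EOF */
		if (!past_eof)
			split_flag |= EXT4_EXT_MAY_ZEROOUT;
		split_flag |= EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2;
	}
	return split_flag;
}

int main(void)
{
	printf("to-unwritten: 0x%x\n", pick_split_flags(1, 0));		  /* 0x8 */
	printf("to-initialized, inside EOF: 0x%x\n", pick_split_flags(0, 0)); /* 0x15 */
	printf("to-initialized, past EOF: 0x%x\n", pick_split_flags(0, 1));   /* 0x14 */
	return 0;
}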
static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path **ppath) { + struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; @@ -3584,16 +3748,13 @@ inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } @@ -3786,34 +3947,109 @@ } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + struct ext4_ext_path *path = *ppath; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support for an + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical " + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EFSCORRUPTED; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); + if (err) + return err; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + return allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, + struct ext4_ext_path **ppath, int flags,
unsigned int allocated, ext4_fsblk_t newblock) { + struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* - * When writing into uninitialized space, we should not fail to + * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submit the IO, split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, ppath, + flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -3826,14 +4062,12 @@ else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, - path); + ppath); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, @@ -3841,6 +4075,7 @@ } else err = ret; map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; @@ -3851,7 +4086,7 @@ * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } @@ -3870,7 +4105,7 @@ } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -3927,10 +4162,6 @@ map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? 
err : allocated; } @@ -4069,13 +4300,14 @@ ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; int set_unwritten = 0; + bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4087,14 +4319,14 @@ /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; - * this is why assert can't be put in ext4_ext_find_extent() + * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); - err = -EIO; + err = -EFSCORRUPTED; goto out2; } @@ -4104,8 +4336,9 @@ ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -4120,24 +4353,30 @@ ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = convert_initialized_extent( + handle, inode, map, &path, + flags, allocated, newblock); + goto out2; + } else if (!ext4_ext_is_unwritten(ex)) goto out; - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, + ret = ext4_ext_handle_unwritten_extents( + handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; else allocated = ret; - goto out3; + goto out2; } } - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -4147,27 +4386,25 @@ * put just found gap into cache to speed up * subsequent requests */ - if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* * Okay, we need to do block allocation. */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned - * by ext4_ext_find_extent() implies a cluster we can use. + * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4188,22 +4425,22 @@ get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. 
For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -4236,6 +4473,10 @@ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4251,21 +4492,19 @@ /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + if (flags & EXT4_GET_BLOCKS_PRE_IO) set_unwritten = 1; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; } err = 0; @@ -4273,7 +4512,7 @@ err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (!err) - err = ext4_ext_insert_extent(handle, inode, path, + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (!err && set_unwritten) { @@ -4291,8 +4530,8 @@ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), fb_flags); + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), fb_flags); goto out2; } @@ -4314,19 +4553,7 @@ */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { - if (reserved_clusters) { - /* - * We have clusters reserved for this range. - * But since we are not doing actual allocation - * and are simply using blocks from previously - * allocated cluster, we should release the - * reservation and not claim quota. - */ - ext4_da_update_reserve_space(inode, - reserved_clusters, 0); - } - } else { + if (!map_from_cluster) { BUG_ON(allocated_clusters < reserved_clusters); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -4392,9 +4619,9 @@ /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. 
*/ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -4406,14 +4633,11 @@ map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - -out3: - trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); + ext4_ext_drop_refs(path); + kfree(path); + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); return err ? err : allocated; } @@ -4451,34 +4675,249 @@ ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, loff_t new_size, + int flags, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + int depth = 0; + struct ext4_map_blocks map; + unsigned int credits; + loff_t epos; + + map.m_lblk = offset; + map.m_len = len; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + /* + * We can only call ext_depth() on extent based inodes + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + depth = ext_depth(inode); + else + depth = -1; + +retry: + while (ret >= 0 && len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth >= 0 && depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? 
ret2 : ret; +} - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int credits; + int partial_begin, partial_end; + loff_t start, end; + ext4_lblk_t lblk; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; } + /* - * Update only when preallocation was requested beyond - * the file size. + * Round up offset. This is not fallocate, we need to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extents */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); - } else { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + } + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* Preallocate the range including the unaligned edges */ + if (partial_begin || partial_end) { + ret = ext4_alloc_file_blocks(file, + round_down(offset, 1 << blkbits) >> blkbits, + (round_up((offset + len), 1 << blkbits) - + round_down(offset, 1 << blkbits)) >> blkbits, + new_size, flags, mode); + if (ret) + goto out_dio; + + } + + /* Zero range excluding the unaligned edges */ + if (max_blocks > 0) { + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + + /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. + * Prevent page faults from reinstantiating pages we have + * released from page cache.
*/ - if (new_size > i_size_read(inode)) + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ + truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + goto out_dio; + } + if (!partial_begin && !partial_end) + goto out_dio; + + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + if (new_size) { + ext4_update_inode_size(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4491,111 +4930,99 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + unsigned int blkbits = inode->i_blkbits; + + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + * + * XXX It's not clear why zero range isn't working, but we'll + * leave it disabled for encrypted inodes for now. This is a + * bug we should fix.... 
+ */ + if (ext4_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) + return -EOPNOTSUPP; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); + return ext4_punch_hole(inode, offset, len); ret = ext4_convert_inline_data(inode); if (ret) return ret; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + + if (mode & FALLOC_FL_INSERT_RANGE) + return ext4_insert_range(inode, offset, len); + + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. 
+ * We only support preallocation for extent-based files */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + ext4_inode_resume_unlocked_dio(inode); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4608,10 +5035,9 @@ * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) { - handle_t *handle; unsigned int max_blocks; int ret = 0; int ret2 = 0; @@ -4626,16 +5052,32 @@ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - map.m_lblk); /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); + * This is somewhat ugly but the idea is clear: when a transaction is + * reserved, everything goes into it. Otherwise we'd rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); @@ -4646,10 +5088,13 @@ inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret <= 0 || ret2 ) + if (credits) + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) break; } + if (!credits) + ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; }
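/*
 * A toy user-space model of the conversion loop above, assuming nothing
 * about the journal: with a reserved handle every chunk goes into one
 * transaction, otherwise each chunk gets its own small transaction.
 * convert_chunk() is a hypothetical stand-in for ext4_map_blocks().
 */
#include <stdio.h>

/* hypothetical stand-in: converts at most 8 blocks per call */
static int convert_chunk(unsigned int lblk, unsigned int len)
{
	return len < 8 ? (int)len : 8;
}

int main(void)
{
	unsigned int lblk = 0, max_blocks = 20;
	int ret = 0, txns = 0;

	while (ret >= 0 && ret < (int)max_blocks) {
		lblk += ret;
		max_blocks -= ret;
		txns++;			/* one small transaction per chunk */
		ret = convert_chunk(lblk, max_blocks);
	}
	printf("converted 20 blocks in %d transactions\n", txns); /* 3 */
	return 0;
}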
@@ -4746,12 +5191,19 @@ if (ext4_has_inline_data(inode)) { int has_inline = 1; - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, + start, len); if (has_inline) return error; } + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4779,6 +5231,754 @@ error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if we need to extend journal credits: + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells + * whether it is a right or left shift operation. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + enum SHIFT_DIRECTION SHIFT) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = false; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EFSCORRUPTED; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = true; + + while (ex_start <= ex_last) { + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left.
*/ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); + ex_last--; + } + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lie in the range from @start to the last allocated + * block for the @inode are shifted either towards left or right (depending + * upon @SHIFT) by @shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop, *iterator, ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) + goto out; + + stop = le32_to_cpu(extent->ee_block); + + /* + * In case of a left shift, don't start shifting extents until we make + * sure the hole is big enough to accommodate the shift. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, &path, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ext4_ext_drop_refs(path); + kfree(path); + return -EINVAL; + } + } + + /* + * In case of left shift, iterator points to start and it is increased + * till we reach stop. In case of right shift, iterator points to stop + * and it is decreased till we reach start. + */ + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; + + /* + * It's safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if an extent with block 0 is reached, iterator + * becomes NULL to indicate the end of the loop.
+ */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, &path, + EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) *iterator); + return -EFSCORRUPTED; + } + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { + /* Hole, move to the next extent */ + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + *iterator = ext4_ext_next_allocated_block(path); + continue; + } + } + + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + if (le32_to_cpu(extent->ee_block) > 0) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else + /* Beginning is reached, end of the loop */ + iterator = NULL; + /* Update path extent in case we need to stop */ + while (le32_to_cpu(extent->ee_block) < start) + extent++; + path[depth].p_ext = extent; + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, SHIFT); + if (ret) + break; + } +out: + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} +
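/*
 * A standalone sketch of the hole check that guards a left shift above,
 * with the same arithmetic: the shift distance must fit into the gap
 * between the end of the preceding extent and @start. The block numbers
 * below are made up for illustration.
 */
#include <stdio.h>

static int left_shift_ok(unsigned int start, unsigned int shift,
			 unsigned int ex_start, unsigned int ex_end)
{
	if ((start == ex_start && shift > ex_start) ||
	    shift > start - ex_end)
		return 0;	/* would collide with the preceding extent */
	return 1;
}

int main(void)
{
	/* extent [100, 150) precedes start == 200: at most 50 blocks of room */
	printf("shift 50: %s\n", left_shift_ok(200, 50, 100, 150) ? "ok" : "EINVAL");
	printf("shift 51: %s\n", left_shift_ok(200, 51, 100, 150) ? "ok" : "EINVAL");
	return 0;
}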
+ */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_mmap; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start, SHIFT_LEFT); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards right to create a hole in the @inode. Inode size is increased + * by len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets. 
+/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards the right to create a hole in the @inode. Inode size is increased + * by @len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets. */ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_insert_range(inode, offset, len); + + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + /* Check for wrap through zero */ + if (inode->i_size + len > inode->i_sb->s_maxbytes) { + ret = -EFBIG; + goto out_mutex; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_mmap; + } + + /* Expand file to avoid data loss if there is error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, offset_lblk, NULL, 0); + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If offset_lblk is not the starting block of the extent, split + * the extent at @offset_lblk + */ + if ((offset_lblk > ee_start_lblk) && + (offset_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + ret = ext4_split_extent_at(handle, inode, &path, + offset_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + ext4_ext_drop_refs(path); + kfree(path); + if (ret < 0) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + } else { + ext4_ext_drop_refs(path); + kfree(path); + } + + ret = ext4_es_remove_extent(inode, offset_lblk, + EXT_MAX_BLOCKS - offset_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + /* + * If offset_lblk lies in a hole at the start of the file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, + len_lblk, SHIFT_RIGHT); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + ext4_journal_stop(handle); +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} +
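/*
 * The matching user-space sketch for the insert path: same block-size
 * alignment rules, but the offset must also lie strictly inside i_size,
 * and i_size grows by the inserted length. File name and sizes are again
 * assumptions for illustration.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* make a 4096-byte hole at offset 8192; tail data shifts right */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 8192, 4096))
		perror("fallocate(FALLOC_FL_INSERT_RANGE)");
	close(fd);
	return 0;
}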
+/** + * ext4_swap_extents - Swap extents between two inodes + * + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what it promises: "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by the caller. + * Locking: + * i_mutex is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + + *erp = ext4_es_remove_extent(inode1, lblk1, count); + if (unlikely(*erp)) + return 0; + *erp = ext4_es_remove_extent(inode2, lblk2, count); + if (unlikely(*erp)) + return 0; + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + path1 = NULL; + finish: + count = 0; + goto repeat; + } + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + path2 = NULL; + goto finish; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have something to swap? */
+ if (unlikely(!ex2 || !ex1)) + goto finish; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e2_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + goto finish; + /* Move to the rightmost boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + goto repeat; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1, 0); + if (unlikely(*erp)) + goto finish; + } + if (e2_blk < lblk2) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2, 0); + if (unlikely(*erp)) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must be revalidated. */ + if (split) + goto repeat; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1 + len, 0); + if (unlikely(*erp)) + goto finish; + } + if (len != e2_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2 + len, 0); + if (*erp) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must be revalidated. */ + if (split) + goto repeat; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (unlikely(*erp)) + goto finish; + + /* Both extents are fully inside boundaries. Swap them now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scary, huh? The second inode already points to the new + * blocks, and it was successfully dirtied. But luckily an error + * can happen only due to a journal error, so the full transaction + * will be aborted anyway. + */ + if (unlikely(*erp)) + goto finish; + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + + repeat: + ext4_ext_drop_refs(path1); + kfree(path1); + ext4_ext_drop_refs(path2); + kfree(path2); + path1 = path2 = NULL; + } + return replaced_count; +}
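/*
 * A toy in-memory model of the swap loop above: once both files have been
 * split so that the current extents start at the same logical offset with
 * equal length, the swap itself is a plain exchange of the physical start
 * blocks. The struct and values are made up for illustration.
 */
#include <stdio.h>

struct toy_extent {
	unsigned int lblk;		/* logical start block */
	unsigned long long pblk;	/* physical start block */
	unsigned int len;		/* length in blocks */
};

static void swap_aligned(struct toy_extent *e1, struct toy_extent *e2)
{
	unsigned long long tmp = e1->pblk;

	/* logical positions stay put; the physical mappings move */
	e1->pblk = e2->pblk;
	e2->pblk = tmp;
}

int main(void)
{
	struct toy_extent a = { 0, 1000, 16 };
	struct toy_extent b = { 0, 5000, 16 };

	swap_aligned(&a, &b);
	printf("a now maps to %llu, b to %llu\n", a.pblk, b.pblk);
	return 0;
}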