--- zzzz-none-000/linux-3.10.107/fs/btrfs/send.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/btrfs/send.c 2021-02-04 17:41:59.000000000 +0000 @@ -24,11 +24,12 @@ #include #include #include -#include #include +#include #include "send.h" #include "backref.h" +#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -50,15 +51,18 @@ struct { char *start; char *end; - char *prepared; char *buf; - int buf_len; - int reversed:1; - int virtual_mem:1; + unsigned short buf_len:15; + unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -87,8 +91,6 @@ u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ - struct vfsmount *mnt; - struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; @@ -110,6 +112,8 @@ int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_rdev; + u64 cur_inode_last_extent; u64 send_progress; @@ -120,8 +124,132 @@ struct list_head name_cache_list; int name_cache_size; - struct file *cur_inode_filp; + struct file_ra_state ra; + char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. + * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + bool is_orphan; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; + bool orphanized; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; }; struct name_cache_entry { @@ -145,6 +273,20 @@ char name[]; }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + static void fs_path_reset(struct fs_path *p) { if (p->reversed) { @@ -158,7 +300,7 @@ } } -static struct fs_path *fs_path_alloc(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc(void) { struct fs_path *p; @@ -166,18 +308,17 @@ if (!p) return NULL; p->reversed = 0; - p->virtual_mem = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); return p; } -static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc_reversed(void) { struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return NULL; p->reversed = 1; @@ -185,16 +326,12 @@ return p; } -static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) +static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p->buf != p->inline_buf) { - if (p->virtual_mem) - vfree(p->buf); - else - kfree(p->buf); - } + if (p->buf != p->inline_buf) + kfree(p->buf); kfree(p); } @@ -214,42 +351,33 @@ if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; - len = PAGE_ALIGN(len); + /* + * First time the inline_buf does not suffice + */ if (p->buf == p->inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - p->virtual_mem = 1; - } - memcpy(tmp_buf, p->buf, p->buf_len); - p->buf = tmp_buf; - p->buf_len = len; + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); } else { - if (p->virtual_mem) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - vfree(p->buf); - } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - kfree(p->buf); - p->virtual_mem = 1; - } - } - p->buf = tmp_buf; - p->buf_len = len; + tmp_buf = krealloc(p->buf, len, GFP_NOFS); } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; @@ -262,7 +390,8 @@ return 0; } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) { int ret; int new_len; @@ -278,11 +407,11 @@ if (p->start != p->end) *--p->start = '/'; p->start -= name_len; - p->prepared = p->start; + *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; - p->prepared = p->end; + *prepared = p->end; p->end += name_len; *p->end = 0; } @@ -294,12 +423,12 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, name_len); + ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, name, name_len); - p->prepared = NULL; + memcpy(prepared, name, name_len); out: return ret; @@ -308,12 +437,12 @@ static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, p2->end - p2->start); + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, p2->start, p2->end - p2->start); - p->prepared = NULL; + memcpy(prepared, p2->start, p2->end - p2->start); out: return ret; @@ -324,28 +453,18 @@ unsigned long off, int len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, len); + ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) goto out; - read_extent_buffer(eb, p->prepared, off, len); - p->prepared = NULL; + read_extent_buffer(eb, prepared, off, len); out: return ret; } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ - BUG_ON(p->reversed); - while (p->start != p->end && *p->end != '/') - p->end--; - *p->end = 0; -} -#endif - static int fs_path_copy(struct fs_path *p, struct fs_path *from) { int ret; @@ -384,6 +503,7 @@ return NULL; path->search_commit_root = 1; path->skip_locking = 1; + path->need_commit_sem = 1; return path; } @@ -397,7 +517,8 @@ set_fs(KERNEL_DS); while (pos < len) { - ret = vfs_write(filp, (char *)buf + pos, len - pos, off); + ret = vfs_write(filp, (__force const char __user *)buf + pos, + len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; @@ -436,30 +557,15 @@ return 0; } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ - return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ - __le16 tmp = cpu_to_le16(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ - __le32 tmp = cpu_to_le32(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ - __le64 tmp = cpu_to_le64(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} +TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) @@ -475,17 +581,6 @@ return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, - struct timespec *ts) -{ - struct btrfs_timespec bts; - bts.sec = cpu_to_le64(ts->tv_sec); - bts.nsec = cpu_to_le32(ts->tv_nsec); - return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif - static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts) @@ -533,12 +628,6 @@ if (ret < 0) \ goto tlv_put_failure; \ } while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ - do { \ - ret = tlv_put_timespec(sctx, attrtype, ts); \ - if (ret < 0) \ - goto tlv_put_failure; \ - } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ @@ -564,10 +653,8 @@ { struct btrfs_cmd_header *hdr; - if (!sctx->send_buf) { - WARN_ON(1); + if (WARN_ON(!sctx->send_buf)) return -EINVAL; - } BUG_ON(sctx->send_size); @@ -588,7 +675,7 @@ hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -696,29 +783,22 @@ /* * Helper function to retrieve some fields from an inode item. */ -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) { int ret; struct btrfs_inode_item *ii; struct btrfs_key key; - struct btrfs_path *path; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; if (ret) { - ret = -ENOENT; - goto out; + if (ret > 0) + ret = -ENOENT; + return ret; } ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -736,7 +816,22 @@ if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); btrfs_free_path(path); return ret; } @@ -753,8 +848,7 @@ * * path must point to the INODE_REF or INODE_EXTREF when called. */ -static int iterate_inode_ref(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { @@ -777,13 +871,13 @@ unsigned long elem_size; unsigned long ptr; - p = fs_path_alloc_reversed(sctx); + p = fs_path_alloc_reversed(); if (!p) return -ENOMEM; tmp_path = alloc_path_for_send(); if (!tmp_path) { - fs_path_free(sctx, p); + fs_path_free(p); return -ENOMEM; } @@ -791,7 +885,7 @@ if (found_key->type == BTRFS_INODE_REF_KEY) { ptr = (unsigned long)btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); total = btrfs_item_size(eb, item); elem_size = sizeof(*iref); } else { @@ -858,7 +952,7 @@ out: btrfs_free_path(tmp_path); - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -874,8 +968,7 @@ * * path must point to the dir item when called. */ -static int iterate_dir_item(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { @@ -885,9 +978,7 @@ struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; int buf_len; - int buf_virtual = 0; u32 name_len; u32 data_len; u32 cur; @@ -897,7 +988,13 @@ int num; u8 type; - buf_len = PAGE_SIZE; + /* + * Start with a small buffer (1 page). If later we end up needing more + * space, which can happen for xattrs on a fs with a leaf size greater + * then the page size, attempt to increase the buffer. Typically xattr + * values are small. + */ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -906,7 +1003,7 @@ eb = path->nodes[0]; slot = path->slots[0]; - item = btrfs_item_nr(eb, slot); + item = btrfs_item_nr(slot); di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); cur = 0; len = 0; @@ -919,30 +1016,45 @@ type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > PATH_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + } + if (name_len + data_len > buf_len) { - buf_len = PAGE_ALIGN(name_len + data_len); - if (buf_virtual) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { vfree(buf); + buf = NULL; } else { - buf2 = krealloc(buf, buf_len, GFP_NOFS); - if (!buf2) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } + char *tmp = krealloc(buf, buf_len, + GFP_NOFS | __GFP_NOWARN); + + if (!tmp) kfree(buf); - buf_virtual = 1; + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; } } - - buf = buf2; - buf2 = NULL; } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -965,10 +1077,7 @@ } out: - if (buf_virtual) - vfree(buf); - else - kfree(buf); + kvfree(buf); return ret; } @@ -990,7 +1099,7 @@ * Retrieve the first path of an inode. If an inode has more then one * ref/hardlink, this is ignored. */ -static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, +static int get_inode_path(struct btrfs_root *root, u64 ino, struct fs_path *path) { int ret; @@ -1022,8 +1131,8 @@ goto out; } - ret = iterate_inode_ref(sctx, root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); if (ret < 0) goto out; ret = 0; @@ -1036,6 +1145,7 @@ struct backref_ctx { struct send_ctx *sctx; + struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1049,6 +1159,9 @@ /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1106,12 +1219,13 @@ * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, - NULL); + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1186,12 +1300,17 @@ if (!tmp_path) return -ENOMEM; + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); if (!backref_ctx) { ret = -ENOMEM; goto out; } + backref_ctx->path = tmp_path; + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1219,8 +1338,10 @@ } logical = disk_byte + btrfs_file_extent_offset(eb, fi); + down_read(&sctx->send_root->fs_info->commit_root_sem); ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(tmp_path); if (ret < 0) @@ -1246,6 +1367,19 @@ backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1262,8 +1396,6 @@ extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - - extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); @@ -1274,9 +1406,9 @@ if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; - printk(KERN_ERR "btrfs: ERROR did not find backref in " + btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " - "disk_byte=%llu found extent=%llu\n", + "disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1314,8 +1446,7 @@ return ret; } -static int read_symlink(struct send_ctx *sctx, - struct btrfs_root *root, +static int read_symlink(struct btrfs_root *root, u64 ino, struct fs_path *dest) { @@ -1362,7 +1493,7 @@ BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -1391,13 +1522,9 @@ return -ENOMEM; while (1) { - len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); - if (len >= sizeof(tmp)) { - /* should really not happen */ - ret = -EOVERFLOW; - goto out; - } + ASSERT(len < sizeof(tmp)); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1580,8 +1707,7 @@ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry. */ -static int get_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, u64 ino, +static int get_first_ref(struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) { int ret; @@ -1612,7 +1738,7 @@ goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -1634,10 +1760,12 @@ goto out; btrfs_release_path(path); - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } *dir = parent_dir; @@ -1646,21 +1774,19 @@ return ret; } -static int is_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, +static int is_first_ref(struct btrfs_root *root, u64 ino, u64 dir, const char *name, int name_len) { int ret; struct fs_path *tmp_name; u64 tmp_dir; - u64 tmp_dir_gen; - tmp_name = fs_path_alloc(sctx); + tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; @@ -1672,7 +1798,7 @@ ret = !memcmp(tmp_name->start, name, name_len); out: - fs_path_free(sctx, tmp_name); + fs_path_free(tmp_name); return ret; } @@ -1691,6 +1817,7 @@ u64 *who_ino, u64 *who_gen) { int ret = 0; + u64 gen; u64 other_inode = 0; u8 other_type = 0; @@ -1701,6 +1828,24 @@ if (ret <= 0) goto out; + /* + * If we have a parent root we need to verify that the parent dir was + * not delted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + */ + if (sctx->parent_root) { + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode, &other_type); if (ret < 0 && ret != -ENOENT) @@ -1776,8 +1921,15 @@ goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. + */ + if ((ow_inode < sctx->send_progress) || + (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && + gen == sctx->cur_inode_gen)) ret = 1; else ret = 0; @@ -1801,11 +1953,11 @@ if (!sctx->parent_root) goto out; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) return -ENOMEM; - ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); if (ret < 0) goto out; @@ -1813,7 +1965,7 @@ name->start, fs_path_len(name)); out: - fs_path_free(sctx, name); + fs_path_free(name); return ret; } @@ -1860,13 +2012,20 @@ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); - BUG_ON(!nce_head); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; - if (list_empty(nce_head)) { + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } @@ -1945,7 +2104,6 @@ { int ret; int nce_ret; - struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; /* @@ -1971,10 +2129,6 @@ } } - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in @@ -1997,11 +2151,11 @@ * send_root or parent_root for ref lookup. */ if (ino < sctx->send_progress) - ret = get_first_ref(sctx, sctx->send_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); else - ret = get_first_ref(sctx, sctx->parent_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); if (ret < 0) goto out; @@ -2050,7 +2204,6 @@ name_cache_clean_unused(sctx); out: - btrfs_free_path(path); return ret; } @@ -2088,7 +2241,7 @@ u64 parent_gen = 0; int stop = 0; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; @@ -2098,14 +2251,35 @@ fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, &parent_gen, name); + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + if (ret < 0) goto out; - if (ret) - stop = 1; ret = fs_path_add_path(dest, name); if (ret < 0) @@ -2116,84 +2290,13 @@ } out: - fs_path_free(sctx, name); + fs_path_free(name); if (!ret) fs_path_unreverse(dest); return ret; } /* - * Called for regular files when sending extents data. Opens a struct file - * to read from the file. - */ -static int open_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - struct btrfs_key key; - struct path path; - struct inode *inode; - struct dentry *dentry; - struct file *filp; - int new = 0; - - if (sctx->cur_inode_filp) - goto out; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root, - &new); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto out; - } - - dentry = d_obtain_alias(inode); - inode = NULL; - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out; - } - - path.mnt = sctx->mnt; - path.dentry = dentry; - filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred()); - dput(dentry); - dentry = NULL; - if (IS_ERR(filp)) { - ret = PTR_ERR(filp); - goto out; - } - sctx->cur_inode_filp = filp; - -out: - /* - * no xxxput required here as every vfs op - * does it by itself on failure - */ - return ret; -} - -/* - * Closes the struct file that was created in open_cur_inode_file - */ -static int close_cur_inode_file(struct send_ctx *sctx) -{ - int ret = 0; - - if (!sctx->cur_inode_filp) - goto out; - - ret = filp_close(sctx->cur_inode_filp, NULL); - sctx->cur_inode_filp = NULL; - -out: - return ret; -} - -/* * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace */ static int send_subvol_begin(struct send_ctx *sctx) @@ -2208,7 +2311,7 @@ char *name = NULL; int namelen; - path = alloc_path_for_send(); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2254,15 +2357,25 @@ } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - sctx->send_root->root_item.ctransid); + le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - sctx->parent_root->root_item.ctransid); + le64_to_cpu(sctx->parent_root->root_item.ctransid)); } ret = send_cmd(sctx); @@ -2281,7 +2394,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2299,7 +2412,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2310,7 +2423,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2328,7 +2441,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2339,7 +2452,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2358,7 +2471,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2374,7 +2487,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2403,19 +2516,16 @@ if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, - btrfs_inode_atime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, - btrfs_inode_mtime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, - btrfs_inode_ctime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); btrfs_free_path(path); return ret; } @@ -2436,14 +2546,20 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, - NULL, &rdev); - if (ret < 0) - goto out; + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; @@ -2458,7 +2574,7 @@ } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; @@ -2477,7 +2593,7 @@ if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, ino, p); + ret = read_symlink(sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); @@ -2494,7 +2610,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2523,17 +2639,26 @@ key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - if (!ret) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; } - if (ret || found_key.objectid != key.objectid || + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; @@ -2548,8 +2673,7 @@ goto out; } - key.offset = found_key.offset + 1; - btrfs_release_path(path); + path->slots[0]++; } out: @@ -2601,11 +2725,10 @@ * everything mixed. So we first record all refs and later process them. * This function is a helper to record one ref. */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; - char *tmp; ref = kmalloc(sizeof(*ref), GFP_NOFS); if (!ref) @@ -2615,32 +2738,42 @@ ref->dir_gen = dir_gen; ref->full_path = path; - tmp = strrchr(ref->full_path->start, '/'); - if (!tmp) { - ref->name_len = ref->full_path->end - ref->full_path->start; - ref->name = ref->full_path->start; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; + ref->dir_path = ref->full_path->start; + if (ref->name == ref->full_path->start) ref->dir_path_len = 0; - ref->dir_path = ref->full_path->start; - } else { - tmp++; - ref->name_len = ref->full_path->end - tmp; - ref->name = tmp; - ref->dir_path = ref->full_path->start; + else ref->dir_path_len = ref->full_path->end - ref->full_path->start - 1 - ref->name_len; - } list_add_tail(&ref->list, head); return 0; } -static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = kmalloc(sizeof(*ref), GFP_NOFS); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + new->full_path = NULL; + INIT_LIST_HEAD(&new->list); + list_add_tail(&new->list, list); + return 0; +} + +static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(sctx, cur->full_path); + fs_path_free(cur->full_path); list_del(&cur->list); kfree(cur); } @@ -2648,8 +2781,8 @@ static void free_recorded_refs(struct send_ctx *sctx) { - __free_recorded_refs(sctx, &sctx->new_refs); - __free_recorded_refs(sctx, &sctx->deleted_refs); + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); } /* @@ -2663,7 +2796,7 @@ int ret; struct fs_path *orphan; - orphan = fs_path_alloc(sctx); + orphan = fs_path_alloc(); if (!orphan) return -ENOMEM; @@ -2674,16 +2807,82 @@ ret = send_rename(sctx, path, orphan); out: - fs_path_free(sctx, orphan); + fs_path_free(orphan); return ret; } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) { int ret = 0; struct btrfs_root *root = sctx->parent_root; @@ -2706,31 +2905,52 @@ key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (!ret) { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; } - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) break; - } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + if (loc.objectid > send_progress) { ret = 0; goto out; } - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } ret = 1; @@ -2740,22 +2960,580 @@ return ret; } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + + return entry != NULL; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + dm->rmdir_ino = 0; + dm->orphanized = orphanized; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs, + const bool is_orphan) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry = NULL, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + + pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = ino; + pm->gen = ino_gen; + pm->is_orphan = is_orphan; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + struct fs_path *name = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; + int ret; + + name = fs_path_alloc(); + from_path = fs_path_alloc(); + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } + + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + fs_path_reset(name); + to_path = name; + name = NULL; + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(name); + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + int ret = 0; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + path_after = fs_path_alloc(); + path_before = fs_path_alloc(); + if (!path_after || !path_before) { + ret = -ENOMEM; + goto out; + } + + /* + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). + */ + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); + break; + } + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret < 0 && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + break; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + break; + } + ino = parent_ino_after; + } + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } + + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct ulist *check_dirs = NULL; - struct ulist_iterator uit; - struct ulist_node *un; + struct list_head check_dirs; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; + u64 last_dir_ino_rm = 0; + bool can_rename = true; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -2764,19 +3542,14 @@ * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + INIT_LIST_HEAD(&check_dirs); - valid_path = fs_path_alloc(sctx); + valid_path = fs_path_alloc(); if (!valid_path) { ret = -ENOMEM; goto out; } - check_dirs = ulist_alloc(GFP_NOFS); - if (!check_dirs) { - ret = -ENOMEM; - goto out; - } - /* * First, check if the first ref of the current inode was overwritten * before. If yes, we know that the current inode was already orphanized @@ -2862,16 +3635,33 @@ if (ret < 0) goto out; if (ret) { - ret = is_first_ref(sctx, sctx->parent_root, - ow_inode, cur->dir, cur->name, - cur->name_len); + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); if (ret < 0) goto out; if (ret) { + struct name_cache_entry *nce; + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -2879,12 +3669,33 @@ } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan) { + if (is_orphan && can_rename) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -2892,7 +3703,7 @@ ret = fs_path_copy(valid_path, cur->full_path); if (ret < 0) goto out; - } else { + } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. For moved @@ -2900,10 +3711,10 @@ * ref. The deleted ref is ignored later. */ ret = send_rename(sctx, valid_path, - cur->full_path); - if (ret < 0) - goto out; - ret = fs_path_copy(valid_path, cur->full_path); + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -2913,8 +3724,7 @@ goto out; } } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2926,7 +3736,8 @@ * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -2942,8 +3753,7 @@ } list_for_each_entry(cur, &sctx->deleted_refs, list) { - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2954,8 +3764,7 @@ */ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, list); - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { @@ -2975,12 +3784,10 @@ if (ret < 0) goto out; } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } - /* * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref @@ -3002,38 +3809,40 @@ * deletion and if it's finally possible to perform the rmdir now. * We also update the inode stats of the parent dirs here. */ - ULIST_ITER_INIT(&uit); - while ((un = ulist_next(check_dirs, &uit))) { + list_for_each_entry(cur, &check_dirs, list) { /* * In case we had refs into dirs that were not processed yet, * we don't need to do the utime and rmdir logic for these dirs. * The dir will be processed later. */ - if (un->val > sctx->cur_ino) + if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, un->val, un->aux); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { /* TODO delayed utimes */ - ret = send_utimes(sctx, un->val, un->aux); + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, un->val, sctx->cur_ino); + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = get_cur_path(sctx, un->val, un->aux, - valid_path); + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); if (ret < 0) goto out; ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; + last_dir_ino_rm = cur->dir; } } } @@ -3041,26 +3850,25 @@ ret = 0; out: + __free_recorded_refs(&check_dirs); free_recorded_refs(sctx); - ulist_free(check_dirs); - fs_path_free(sctx, valid_path); + fs_path_free(valid_path); return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -3072,53 +3880,39 @@ if (ret < 0) goto out; - ret = record_ref(&sctx->new_refs, dir, gen, p); + ret = __record_ref(refs, dir, gen, p); out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { - int ret = 0; struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(sctx); - if (!p) - return -ENOMEM; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: - if (ret) - fs_path_free(sctx, p); - return ret; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3131,8 +3925,8 @@ { int ret; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3143,6 +3937,8 @@ struct find_ref_ctx { u64 dir; + u64 dir_gen; + struct btrfs_root *root; struct fs_path *name; int found_idx; }; @@ -3152,29 +3948,42 @@ void *ctx_) { struct find_ref_ctx *ctx = ctx_; + u64 dir_gen; + int ret; if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + /* + * To avoid doing extra lookups we'll only do this if everything + * else matches. + */ + ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + if (dir_gen != ctx->dir_gen) + return 0; ctx->found_idx = num; return 1; } return 0; } -static int find_iref(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_iref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, - u64 dir, struct fs_path *name) + u64 dir, u64 dir_gen, struct fs_path *name) { int ret; struct find_ref_ctx ctx; ctx.dir = dir; ctx.name = name; + ctx.dir_gen = dir_gen; ctx.found_idx = -1; + ctx.root = root; - ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); if (ret < 0) return ret; @@ -3188,11 +3997,17 @@ struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, name); + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, dir, dir_gen, name); if (ret == -ENOENT) ret = __record_new_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3205,11 +4020,17 @@ struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, name); + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, + dir, dir_gen, name); if (ret == -ENOENT) ret = __record_deleted_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3222,11 +4043,11 @@ { int ret = 0; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, __record_changed_new_ref, sctx); if (ret < 0) goto out; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); if (ret < 0) goto out; @@ -3251,6 +4072,7 @@ struct extent_buffer *eb; int slot; iterate_inode_ref_t cb; + int pending_move = 0; path = alloc_path_for_send(); if (!path) @@ -3263,21 +4085,31 @@ root = sctx->parent_root; cb = __record_deleted_ref; } else { - BUG(); + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; } key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) - break; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -3285,17 +4117,17 @@ found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, - sctx); - btrfs_release_path(path); + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); if (ret < 0) goto out; - key.offset = found_key.offset + 1; + path->slots[0]++; } btrfs_release_path(path); - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. */ + ASSERT(pending_move == 0); out: btrfs_free_path(path); @@ -3354,7 +4186,7 @@ struct fs_path *p; posix_acl_xattr_header dummy_acl; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3381,7 +4213,7 @@ ret = send_set_xattr(sctx, p, name, name_len, data, data_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3394,7 +4226,7 @@ struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3405,7 +4237,7 @@ ret = send_remove_xattr(sctx, p, name, name_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3413,8 +4245,8 @@ { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); return ret; } @@ -3423,8 +4255,8 @@ { int ret; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); return ret; } @@ -3448,17 +4280,15 @@ strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmalloc(data_len, GFP_NOFS); + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); if (!ctx->found_data) return -ENOMEM; - memcpy(ctx->found_data, data, data_len); return 1; } return 0; } -static int find_xattr(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_xattr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, @@ -3473,7 +4303,7 @@ ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); if (ret < 0) return ret; @@ -3499,9 +4329,9 @@ char *found_data = NULL; int found_data_len = 0; - ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, name, name_len, &found_data, - &found_data_len); + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3527,8 +4357,8 @@ int ret; struct send_ctx *sctx = ctx; - ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - name, name_len, NULL, NULL); + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3542,11 +4372,11 @@ { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + ret = iterate_dir_item(sctx->send_root, sctx->left_path, sctx->cmp_key, __process_changed_new_xattr, sctx); if (ret < 0) goto out; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, sctx->cmp_key, __process_changed_deleted_xattr, sctx); out: @@ -3572,32 +4402,37 @@ key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; } - ret = iterate_dir_item(sctx, root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -3605,6 +4440,79 @@ return ret; } +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct page *page; + char *addr; + struct btrfs_key key; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t last_index; + unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + ssize_t ret = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (offset + len > i_size_read(inode)) { + if (offset > i_size_read(inode)) + len = 0; + else + len = offset - i_size_read(inode); + } + if (len == 0) + goto out; + + last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + + while (index <= last_index) { + unsigned cur_len = min_t(unsigned, len, + PAGE_CACHE_SIZE - pg_offset); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + break; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + + addr = kmap(page); + memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + kunmap(page); + unlock_page(page); + page_cache_release(page); + index++; + pg_offset = 0; + len -= cur_len; + ret += cur_len; + } +out: + iput(inode); + return ret; +} + /* * Read some bytes from the current inode/file and send a write command to * user space. @@ -3613,35 +4521,20 @@ { int ret = 0; struct fs_path *p; - loff_t pos = offset; - int num_read = 0; - mm_segment_t old_fs; + ssize_t num_read = 0; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; - /* - * vfs normally only accepts user space buffers for security reasons. - * we only read from the file and also only provide the read_buf buffer - * to vfs. As this buffer does not come from a user space call, it's - * ok to temporary allow kernel space buffers. - */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); - ret = open_cur_inode_file(sctx); - if (ret < 0) - goto out; - - ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); - if (ret < 0) - goto out; - num_read = ret; - if (!num_read) + num_read = fill_read_buf(sctx, offset, len); + if (num_read <= 0) { + if (num_read < 0) + ret = num_read; goto out; + } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -3659,8 +4552,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); - set_fs(old_fs); + fs_path_free(p); if (ret < 0) return ret; return num_read; @@ -3682,7 +4574,7 @@ clone_root->root->objectid, clone_root->ino, clone_root->offset); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3705,16 +4597,28 @@ goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root->root, - clone_root->ino, p); + ret = get_inode_path(clone_root->root, clone_root->ino, p); } if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root->root->root_item.ctransid); + le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); @@ -3723,7 +4627,7 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3736,7 +4640,7 @@ int ret = 0; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3756,7 +4660,205 @@ tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); + return ret; +} + +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); return ret; } @@ -3768,16 +4870,16 @@ int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. Make @@ -3795,23 +4897,16 @@ goto out; } - if (clone_root) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + if (clone_root && IS_ALIGNED(offset + len, bs)) { + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + ret = send_extent_data(sctx, offset, len); } out: return ret; @@ -3898,7 +4993,8 @@ btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { - ret = 0; + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3909,22 +5005,23 @@ while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); - right_offset = btrfs_file_extent_offset(eb, ei); - right_gen = btrfs_file_extent_generation(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { ret = 0; goto out; } + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + /* * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ if (found_key.offset + right_len <= ekey->offset) { - ret = 0; + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3985,12 +5082,107 @@ return ret; } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - int ret = 0; struct clone_root *found_clone = NULL; + int ret = 0; if (S_ISLNK(sctx->cur_inode_mode)) return 0; @@ -4001,7 +5193,33 @@ goto out; if (ret) { ret = 0; - goto out; + goto out_hole; + } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } } } @@ -4011,7 +5229,10 @@ goto out; ret = send_write_or_clone(sctx, path, key, found_clone); - + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); out: return ret; } @@ -4034,17 +5255,25 @@ key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -4057,8 +5286,7 @@ if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -4066,7 +5294,9 @@ return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) { int ret = 0; @@ -4078,17 +5308,11 @@ if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, pending_move); if (ret < 0) goto out; - /* - * We have processed the refs and thus need to advance send_progress. - * Now, calls to get_cur_xxx will take the updated refs of the current - * inode into account. - */ - sctx->send_progress = sctx->cur_ino + 1; - + *refs_processed = 1; out: return ret; } @@ -4104,11 +5328,29 @@ u64 right_gid; int need_chmod = 0; int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; - ret = process_recorded_refs_if_needed(sctx, at_end); + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); if (ret < 0) goto out; + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4137,6 +5379,21 @@ } if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) @@ -4157,12 +5414,25 @@ } /* - * Need to send that every time, no matter if it actually changed - * between the two trees as we have done changes to the inode before. + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. */ - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + } out: return ret; @@ -4178,12 +5448,9 @@ u64 left_gen = 0; u64 right_gen = 0; - ret = close_cur_inode_file(sctx); - if (ret < 0) - goto out; - sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -4232,6 +5499,8 @@ sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -4276,6 +5545,8 @@ sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; @@ -4390,6 +5661,64 @@ return ret; } +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, + NULL, NULL); + if (ret) + return ret; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + /* * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. @@ -4405,6 +5734,23 @@ int ret = 0; struct send_ctx *sctx = ctx; + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { + return 0; + } + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + sctx->left_path = left_path; sctx->right_path = right_path; sctx->cmp_key = key; @@ -4435,57 +5781,21 @@ static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *eb; int slot; - u64 start_ctransid; - u64 ctransid; path = alloc_path_for_send(); if (!path) return -ENOMEM; - spin_lock(&send_root->root_item_lock); - start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - /* - * Make sure the tree has not changed after re-joining. We detect this - * by comparing start_ctransid and ctransid. They should always match. - */ - spin_lock(&send_root->root_item_lock); - ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - - if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "btrfs: the root that you're trying to " - "send was modified in between. This is " - "probably a bug.\n"); - ret = -EIO; - goto out; - } - ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -4493,19 +5803,6 @@ goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4533,12 +5830,6 @@ out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -4571,15 +5862,70 @@ } out: - if (!ret) - ret = close_cur_inode_file(sctx); - else - close_cur_inode_file(sctx); - free_recorded_refs(sctx); return ret; } +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx->send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx->send_root); +} + +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. + */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; @@ -4591,6 +5937,9 @@ struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4599,38 +5948,26 @@ fs_info = send_root->fs_info; /* + * The subvolume must remain read-only during send, protect against + * making it RW. This also protects against deletion. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* * This is done when we lookup the root, it should already be complete * by the time we get here. */ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); /* - * If we just created this root we need to make sure that the orphan - * cleanup has been done and committed since we search the commit root, - * so check its commit root transid with our otransid and if they match - * commit the transaction to make sure everything is updated. - */ - down_read(&send_root->fs_info->extent_commit_sem); - if (btrfs_header_generation(send_root->commit_root) == - btrfs_root_otransid(&send_root->root_item)) { - struct btrfs_trans_handle *trans; - - up_read(&send_root->fs_info->extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(&send_root->fs_info->extent_commit_sem); + * Userspace tools do the checks and warn the user if it's + * not RO. + */ + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; } arg = memdup_user(arg_, sizeof(*arg)); @@ -4671,9 +6008,16 @@ goto out; } - sctx->mnt = mnt_file->f_path.mnt; - sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -4689,6 +6033,10 @@ goto out; } + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); if (!sctx->clone_roots) { @@ -4716,16 +6064,29 @@ key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!clone_root) { - ret = -EINVAL; - goto out; - } if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } + spin_lock(&clone_root->root_item_lock); + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + clone_root->send_in_progress++; + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; } vfree(clone_sources_tmp); clone_sources_tmp = NULL; @@ -4735,11 +6096,28 @@ key.objectid = arg->parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!sctx->parent_root) { - ret = -EINVAL; + if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = PTR_ERR(sctx->parent_root); + goto out; + } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; goto out; } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -4753,8 +6131,15 @@ sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; + + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); + current->journal_info = NULL; if (ret < 0) goto out; @@ -4768,6 +6153,58 @@ } out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + kfree(arg); vfree(clone_sources_tmp);