--- zzzz-none-000/linux-3.10.107/fs/btrfs/dev-replace.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/btrfs/dev-replace.c 2021-02-04 17:41:59.000000000 +0000 @@ -26,7 +26,6 @@ #include #include #include -#include "compat.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -37,8 +36,8 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" -static u64 btrfs_get_seconds_since_1970(void); static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); static void btrfs_dev_replace_update_device_in_mapping_tree( @@ -104,7 +103,8 @@ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { - pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); goto no_valid_dev_replace_entry_found; } @@ -147,14 +147,20 @@ if (!dev_replace->srcdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - (unsigned long long)src_devid); + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", - (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { if (dev_replace->srcdev) { @@ -162,8 +168,12 @@ dev_replace->srcdev->total_bytes; dev_replace->tgtdev->disk_total_bytes = dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; } dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; btrfs_init_dev_replace_tgtdev_for_resume(fs_info, @@ -173,8 +183,7 @@ } out: - if (path) - btrfs_free_path(path); + btrfs_free_path(path); return ret; } @@ -212,7 +221,7 @@ } ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for dev_replace item!\n", + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); goto out; } @@ -232,7 +241,7 @@ */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); goto out; } @@ -245,7 +254,7 @@ ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - pr_warn("btrfs: insert dev_replace item failed %d!\n", + btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); goto out; } @@ -296,13 +305,6 @@ dev_replace->cursor_left_last_write_of_item; } -static u64 btrfs_get_seconds_since_1970(void) -{ - struct timespec t = CURRENT_TIME_SEC; - - return t.tv_sec; -} - int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args) { @@ -313,11 +315,6 @@ struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - if (btrfs_fs_incompat(fs_info, RAID56)) { - pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); - return -EINVAL; - } - switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: @@ -330,29 +327,33 @@ args->start.tgtdev_name[0] == '\0') return -EINVAL; + /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, - &tgt_device); - if (ret) { - pr_err("btrfs: target device %s is invalid!\n", - args->start.tgtdev_name); - mutex_unlock(&fs_info->volume_mutex); - return -EINVAL; - } - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, &src_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) { - ret = -EINVAL; - goto leave_no_lock; + mutex_unlock(&fs_info->volume_mutex); + return ret; } - if (tgt_device->total_bytes < src_device->total_bytes) { - pr_err("btrfs: target device is smaller than source device!\n"); - ret = -EINVAL; - goto leave_no_lock; + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + src_device, &tgt_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) + return ret; + + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); } btrfs_dev_replace_lock(dev_replace); @@ -374,23 +375,19 @@ WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; - /* * from now on, the writes to the srcdev are all duplicated to * go to the tgtdev as well (refer to btrfs_map_block()). */ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; - dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->time_started = get_seconds(); dev_replace->cursor_left = 0; dev_replace->committed_cursor_left = 0; dev_replace->cursor_left_last_write_of_item = 0; @@ -400,7 +397,11 @@ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -415,24 +416,47 @@ /* the disk copy procedure reuses the scrub code */ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, - src_device->total_bytes, + btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; btrfs_dev_replace_unlock(dev_replace); -leave_no_lock: - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + wake_up(&fs_info->replace_wait); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -460,22 +484,16 @@ src_device = dev_replace->srcdev; btrfs_dev_replace_unlock(dev_replace); - /* replace old device with new one in mapping tree */ - if (!scrub_ret) - btrfs_dev_replace_update_device_in_mapping_tree(fs_info, - src_device, - tgt_device); - /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_roots(root->fs_info, -1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -485,35 +503,44 @@ ret = btrfs_commit_transaction(trans, root); WARN_ON(ret); + mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - if (scrub_ret) { - printk_in_rcu(KERN_ERR - "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } - printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, @@ -521,24 +548,31 @@ tgt_device->is_tgtdev_for_dev_replace = 0; tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; - tgt_device->bytes_used = src_device->bytes_used; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; + tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + fs_info->fs_devices->rw_devices++; + + btrfs_dev_replace_unlock(dev_replace); + + btrfs_rm_dev_replace_blocked(fs_info); + + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); + + btrfs_rm_dev_replace_unblocked(fs_info); - btrfs_rm_dev_replace_srcdev(fs_info, src_device); - if (src_device->bdev) { - /* zero out the old super */ - btrfs_scratch_superblock(src_device); - } /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -546,8 +580,13 @@ * superblock is scratched out so that it is no longer marked to * belong to this filesystem. */ - btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); + + /* replace the sysfs entry */ + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -608,6 +647,7 @@ struct btrfs_ioctl_dev_replace_args *args) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *srcdev; btrfs_dev_replace_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for @@ -630,8 +670,9 @@ break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(dev_replace->srcdev->total_bytes, 1000)); + srcdev = dev_replace->srcdev; + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -653,6 +694,9 @@ u64 result; int ret; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -671,7 +715,7 @@ break; } dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; btrfs_dev_replace_unlock(dev_replace); btrfs_scrub_cancel(fs_info); @@ -706,9 +750,9 @@ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; - dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - pr_info("btrfs: suspending dev_replace for unmount\n"); + btrfs_info(fs_info, "suspending dev_replace for unmount"); break; } @@ -737,8 +781,9 @@ break; } if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { - pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" - "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); btrfs_dev_replace_unlock(dev_replace); return 0; } @@ -747,7 +792,7 @@ WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_RET(task); + return PTR_ERR_OR_ZERO(task); } static int btrfs_dev_replace_kthread(void *data) @@ -762,16 +807,16 @@ btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; kfree(status_args); - do_div(progress, 10); - printk_in_rcu(KERN_INFO - "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", - dev_replace->srcdev->missing ? "" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "", - (unsigned int)progress); + progress = div_u64(progress, 10); + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", + dev_replace->srcdev->missing ? "" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); @@ -786,7 +831,7 @@ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, - dev_replace->srcdev->total_bytes, + btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); @@ -863,3 +908,31 @@ mutex_unlock(&dev_replace->lock_management_lock); } } + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) +{ + percpu_counter_sub(&fs_info->bio_counter, amount); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + } +}