--- zzzz-none-000/linux-3.10.107/fs/super.c 2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/fs/super.c 2021-02-04 17:41:59.000000000 +0000
@@ -22,7 +22,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
@@ -37,8 +36,8 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_writers",
@@ -53,11 +52,15 @@
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long super_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
 {
 	struct super_block *sb;
-	int fs_objects = 0;
-	int total_objects;
+	long fs_objects = 0;
+	long total_objects;
+	long freed = 0;
+	long dentries;
+	long inodes;
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -65,79 +68,106 @@
 	 * Deadlock avoidance. We may hold various FS locks, and we don't want
 	 * to recurse into the FS that called us in clear_inode() and friends..
 	 */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
 
-	if (!grab_super_passive(sb))
-		return -1;
+	if (!trylock_super(sb))
+		return SHRINK_STOP;
 
-	if (sb->s_op && sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb);
+	if (sb->s_op->nr_cached_objects)
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-	total_objects = sb->s_nr_dentry_unused +
-			sb->s_nr_inodes_unused + fs_objects + 1;
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;
 
-	if (sc->nr_to_scan) {
-		int dentries;
-		int inodes;
-
-		/* proportion the scan between the caches */
-		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
-							total_objects;
-		inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
-							total_objects;
-		if (fs_objects)
-			fs_objects = (sc->nr_to_scan * fs_objects) /
-							total_objects;
-		/*
-		 * prune the dcache first as the icache is pinned by it, then
-		 * prune the icache, followed by the filesystem specific caches
-		 */
-		prune_dcache_sb(sb, dentries);
-		prune_icache_sb(sb, inodes);
+	/* proportion the scan between the caches */
+	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
-		if (fs_objects && sb->s_op->free_cached_objects) {
-			sb->s_op->free_cached_objects(sb, fs_objects);
-			fs_objects = sb->s_op->nr_cached_objects(sb);
-		}
-		total_objects = sb->s_nr_dentry_unused +
-				sb->s_nr_inodes_unused + fs_objects;
+	/*
+	 * prune the dcache first as the icache is pinned by it, then
+	 * prune the icache, followed by the filesystem specific caches
+	 *
+	 * Ensure that we always scan at least one object - memcg kmem
+	 * accounting uses this to fully empty the caches.
+	 */
+	sc->nr_to_scan = dentries + 1;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes + 1;
+	freed += prune_icache_sb(sb, sc);
+
+	if (fs_objects) {
+		sc->nr_to_scan = fs_objects + 1;
+		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
-	drop_super(sb);
-	return total_objects;
+	up_read(&sb->s_umount);
+	return freed;
 }
 
-static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+static unsigned long super_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
 {
-	int err;
-	int i;
+	struct super_block *sb;
+	long total_objects = 0;
 
-	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		err = percpu_counter_init(&s->s_writers.counter[i], 0);
-		if (err < 0)
-			goto err_out;
-		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
-	}
-	init_waitqueue_head(&s->s_writers.wait);
-	init_waitqueue_head(&s->s_writers.wait_unfrozen);
-	return 0;
-err_out:
-	while (--i >= 0)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
-	return err;
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Don't call trylock_super as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Call to super_cache_count with shrinker_rwsem held
+	 * ensures the safety of call to list_lru_shrink_count() and
+	 * s_op->nr_cached_objects().
+	 */
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+
+	total_objects = vfs_pressure_ratio(total_objects);
+	return total_objects;
 }
 
-static void destroy_sb_writers(struct super_block *s)
+static void destroy_super_work(struct work_struct *work)
 {
+	struct super_block *s = container_of(work, struct super_block,
+							destroy_work);
 	int i;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
+		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
+	kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+	struct super_block *s = container_of(head, struct super_block, rcu);
+	INIT_WORK(&s->destroy_work, destroy_super_work);
+	schedule_work(&s->destroy_work);
+}
+
+/**
+ * destroy_super - frees a superblock
+ * @s: superblock to free
+ *
+ * Frees a superblock.
+ */
+static void destroy_super(struct super_block *s)
+{
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
+	security_sb_free(s);
+	WARN_ON(!list_empty(&s->s_mounts));
+	kfree(s->s_subtype);
+	kfree(s->s_options);
+	call_rcu(&s->rcu, destroy_super_rcu);
 }
 
 /**
@@ -152,86 +182,75 @@
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
 	static const struct super_operations default_op;
+	int i;
 
-	if (s) {
-		if (security_sb_alloc(s)) {
-			/*
-			 * We cannot call security_sb_free() without
-			 * security_sb_alloc() succeeding. So bail out manually
-			 */
-			kfree(s);
-			s = NULL;
-			goto out;
-		}
-		if (init_sb_writers(s, type))
-			goto err_out;
-		s->s_flags = flags;
-		s->s_bdi = &default_backing_dev_info;
-		INIT_HLIST_NODE(&s->s_instances);
-		INIT_HLIST_BL_HEAD(&s->s_anon);
-		INIT_LIST_HEAD(&s->s_inodes);
-		INIT_LIST_HEAD(&s->s_dentry_lru);
-		INIT_LIST_HEAD(&s->s_inode_lru);
-		spin_lock_init(&s->s_inode_lru_lock);
-		INIT_LIST_HEAD(&s->s_mounts);
-		init_rwsem(&s->s_umount);
-		lockdep_set_class(&s->s_umount, &type->s_umount_key);
-		/*
-		 * sget() can have s_umount recursion.
-		 *
-		 * When it cannot find a suitable sb, it allocates a new
-		 * one (this one), and tries again to find a suitable old
-		 * one.
-		 *
-		 * In case that succeeds, it will acquire the s_umount
-		 * lock of the old one. Since these are clearly distrinct
-		 * locks, and this object isn't exposed yet, there's no
-		 * risk of deadlocks.
-		 *
-		 * Annotate this by putting this lock in a different
-		 * subclass.
-		 */
-		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-		s->s_count = 1;
-		atomic_set(&s->s_active, 1);
-		mutex_init(&s->s_vfs_rename_mutex);
-		lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
-		mutex_init(&s->s_dquot.dqio_mutex);
-		mutex_init(&s->s_dquot.dqonoff_mutex);
-		init_rwsem(&s->s_dquot.dqptr_sem);
-		s->s_maxbytes = MAX_NON_LFS;
-		s->s_op = &default_op;
-		s->s_time_gran = 1000000000;
-		s->cleancache_poolid = -1;
-
-		s->s_shrink.seeks = DEFAULT_SEEKS;
-		s->s_shrink.shrink = prune_super;
-		s->s_shrink.batch = 1024;
+	if (!s)
+		return NULL;
+
+	INIT_LIST_HEAD(&s->s_mounts);
+
+	if (security_sb_alloc(s))
+		goto fail;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+					sb_writers_name[i],
+					&type->s_writers_key[i]))
+			goto fail;
 	}
-out:
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_bdi = &noop_backing_dev_info;
+	s->s_flags = flags;
+	INIT_HLIST_NODE(&s->s_instances);
+	INIT_HLIST_BL_HEAD(&s->s_anon);
+	mutex_init(&s->s_sync_lock);
+	INIT_LIST_HEAD(&s->s_inodes);
+	spin_lock_init(&s->s_inode_list_lock);
+
+	if (list_lru_init_memcg(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init_memcg(&s->s_inode_lru))
+		goto fail;
+
+	init_rwsem(&s->s_umount);
+	lockdep_set_class(&s->s_umount, &type->s_umount_key);
+	/*
+	 * sget() can have s_umount recursion.
+	 *
+	 * When it cannot find a suitable sb, it allocates a new
+	 * one (this one), and tries again to find a suitable old
+	 * one.
+	 *
+	 * In case that succeeds, it will acquire the s_umount
+	 * lock of the old one. Since these are clearly distrinct
+	 * locks, and this object isn't exposed yet, there's no
+	 * risk of deadlocks.
+	 *
+	 * Annotate this by putting this lock in a different
+	 * subclass.
+	 */
+	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+	s->s_count = 1;
+	atomic_set(&s->s_active, 1);
+	mutex_init(&s->s_vfs_rename_mutex);
+	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+	mutex_init(&s->s_dquot.dqio_mutex);
+	mutex_init(&s->s_dquot.dqonoff_mutex);
+	s->s_maxbytes = MAX_NON_LFS;
+	s->s_op = &default_op;
+	s->s_time_gran = 1000000000;
+	s->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+	s->s_shrink.seeks = DEFAULT_SEEKS;
+	s->s_shrink.scan_objects = super_cache_scan;
+	s->s_shrink.count_objects = super_cache_count;
+	s->s_shrink.batch = 1024;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	return s;
-err_out:
-	security_sb_free(s);
-	destroy_sb_writers(s);
-	kfree(s);
-	s = NULL;
-	goto out;
-}
-/**
- * destroy_super - frees a superblock
- * @s: superblock to free
- *
- * Frees a superblock.
- */
-static inline void destroy_super(struct super_block *s)
-{
-	destroy_sb_writers(s);
-	security_sb_free(s);
-	WARN_ON(!list_empty(&s->s_mounts));
-	kfree(s->s_subtype);
-	kfree(s->s_options);
-	kfree(s);
+fail:
+	destroy_super(s);
+	return NULL;
 }
 
 /* Superblock refcounting */
@@ -278,10 +297,17 @@
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		cleancache_invalidate_fs(s);
+		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
-		/* caches are now gone, we can safely kill the shrinker now */
-		unregister_shrinker(&s->s_shrink);
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -337,35 +363,31 @@
 }
 
 /*
- * grab_super_passive - acquire a passive reference
+ * trylock_super - try to grab ->s_umount shared
  * @sb: reference we are trying to grab
  *
- * Tries to acquire a passive reference. This is used in places where we
+ * Try to prevent fs shutdown. This is used in places where we
  * cannot take an active reference but we need to ensure that the
- * superblock does not go away while we are working on it. It returns
- * false if a reference was not gained, and returns true with the s_umount
- * lock held in read mode if a reference is gained. On successful return,
- * the caller must drop the s_umount lock and the passive reference when
- * done.
+ * filesystem is not shut down while we are working on it. It returns
+ * false if we cannot acquire s_umount or if we lose the race and
+ * filesystem already got into shutdown, and returns true with the s_umount
+ * lock held in read mode in case of success. On successful return,
+ * the caller must drop the s_umount lock when done.
+ *
+ * Note that unlike get_super() et.al. this one does *not* bump ->s_count.
+ * The reason why it's safe is that we are OK with doing trylock instead
+ * of down_read(). There's a couple of places that are OK with that, but
+ * it's very much not a general-purpose interface.
 */
-bool grab_super_passive(struct super_block *sb)
+bool trylock_super(struct super_block *sb)
 {
-	spin_lock(&sb_lock);
-	if (hlist_unhashed(&sb->s_instances)) {
-		spin_unlock(&sb_lock);
-		return false;
-	}
-
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root && (sb->s_flags & MS_BORN))
+		if (!hlist_unhashed(&sb->s_instances) &&
+		    sb->s_root && (sb->s_flags & MS_BORN))
 			return true;
 		up_read(&sb->s_umount);
 	}
 
-	put_super(sb);
 	return false;
 }
 
@@ -392,10 +414,16 @@
 		sync_filesystem(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
-		fsnotify_unmount_inodes(&sb->s_inodes);
+		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();
 
 		evict_inodes(sb);
 
+		if (sb->s_dio_done_wq) {
+			destroy_workqueue(sb->s_dio_done_wq);
+			sb->s_dio_done_wq = NULL;
+		}
+
 		if (sop->put_super)
 			sop->put_super(sb);
 
@@ -696,13 +724,22 @@
 		return -EACCES;
 #endif
 
-	if (flags & MS_RDONLY)
-		acct_auto_close(sb);
-	shrink_dcache_sb(sb);
-	sync_filesystem(sb);
-
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+	if (remount_ro) {
+		if (!hlist_empty(&sb->s_pins)) {
+			up_write(&sb->s_umount);
+			group_pin_kill(&sb->s_pins);
+			down_write(&sb->s_umount);
+			if (!sb->s_root)
+				return 0;
+			if (sb->s_writers.frozen != SB_UNFROZEN)
+				return -EBUSY;
+			remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+		}
+	}
+	shrink_dcache_sb(sb);
+
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
@@ -797,7 +834,10 @@
 
 static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
-static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
 
 int get_anon_bdev(dev_t *p)
 {
@@ -818,7 +858,7 @@
 	else if (error)
 		return -EAGAIN;
 
-	if (dev == (1 << MINORBITS)) {
+	if (dev >= (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
 		if (unnamed_dev_start > dev)
@@ -844,10 +884,7 @@
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	int error = get_anon_bdev(&s->s_dev);
-	if (!error)
-		s->s_bdi = &noop_backing_dev_info;
-	return error;
+	return get_anon_bdev(&s->s_dev);
 }
 
 EXPORT_SYMBOL(set_anon_super);
@@ -1092,7 +1129,6 @@
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
-	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(sb, flags, secdata);
@@ -1126,72 +1162,46 @@
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-	percpu_counter_dec(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure s_writers are updated before we wake up waiters in
-	 * freeze_super().
-	 */
-	smp_mb();
-	if (waitqueue_active(&sb->s_writers.wait))
-		wake_up(&sb->s_writers.wait);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+	percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
 
-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
-				unsigned long ip)
-{
-	int i;
-
-	if (!trylock) {
-		for (i = 0; i < level - 1; i++)
-			if (lock_is_held(&sb->s_writers.lock_map[i])) {
-				trylock = true;
-				break;
-			}
-	}
-	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
 */
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
-retry:
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		if (!wait)
-			return 0;
-		wait_event(sb->s_writers.wait_unfrozen,
-			   sb->s_writers.frozen < level);
-	}
+	bool force_trylock = false;
+	int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
-	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
-	percpu_counter_inc(&sb->s_writers.counter[level-1]);
 	/*
-	 * Make sure counter is updated before we check for frozen.
-	 * freeze_super() first sets frozen and then checks the counter.
+	 * We want lockdep to tell us about possible deadlocks with freezing
+	 * but it's it bit tricky to properly instrument it. Getting a freeze
+	 * protection works as getting a read lock but there are subtle
+	 * problems. XFS for example gets freeze protection on internal level
+	 * twice in some cases, which is OK only because we already hold a
	 * freeze protection also on higher level. Due to these cases we have
+	 * to use wait == F (trylock mode) which must not fail.
 	 */
-	smp_mb();
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		__sb_end_write(sb, level);
-		goto retry;
+	if (wait) {
+		int i;
+
+		for (i = 0; i < level - 1; i++)
+			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
+				force_trylock = true;
+				break;
+			}
 	}
-	return 1;
+#endif
+	if (wait && !force_trylock)
+		percpu_down_read(sb->s_writers.rw_sem + level-1);
+	else
+		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
+	WARN_ON(force_trylock & !ret);
+	return ret;
 }
 EXPORT_SYMBOL(__sb_start_write);
 
@@ -1201,37 +1211,33 @@
  * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
*/
static void sb_wait_write(struct super_block *sb, int level)
 {
-	s64 writers;
-
+	percpu_down_write(sb->s_writers.rw_sem + level-1);
 	/*
-	 * We just cycle-through lockdep here so that it does not complain
-	 * about returning with lock to userspace
+	 * We are going to return to userspace and forget about this lock, the
+	 * ownership goes to the caller of thaw_super() which does unlock.
+	 *
+	 * FIXME: we should do this before return from freeze_super() after we
+	 * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
+	 * should re-acquire these locks before s_op->unfreeze_fs(sb). However
+	 * this leads to lockdep false-positives, so currently we do the early
+	 * release right after acquire.
 	 */
-	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-	do {
-		DEFINE_WAIT(wait);
+	percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
 
-		/*
-		 * We use a barrier in prepare_to_wait() to separate setting
-		 * of frozen and checking of the counter
-		 */
-		prepare_to_wait(&sb->s_writers.wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+	int level;
 
-		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-		if (writers)
-			schedule();
+	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
 
-		finish_wait(&sb->s_writers.wait, &wait);
-	} while (writers);
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_up_write(sb->s_writers.rw_sem + level);
 }
 
 /**
@@ -1290,20 +1296,14 @@
 		return 0;
 	}
 
-	/* From now on, no new normal writers can start */
 	sb->s_writers.frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
 	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
 	up_write(&sb->s_umount);
-
 	sb_wait_write(sb, SB_FREEZE_WRITE);
+	down_write(&sb->s_umount);
 
 	/* Now we go and block page faults... */
-	down_write(&sb->s_umount);
 	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-	smp_wmb();
-
 	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
 	/* All writers are done so after syncing there won't be dirty data */
@@ -1311,7 +1311,6 @@
 
 	/* Now wait for internal filesystem counter */
 	sb->s_writers.frozen = SB_FREEZE_FS;
-	smp_wmb();
 	sb_wait_write(sb, SB_FREEZE_FS);
 
 	if (sb->s_op->freeze_fs) {
@@ -1320,7 +1319,7 @@
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_writers.frozen = SB_UNFROZEN;
-			smp_wmb();
+			sb_freeze_unlock(sb);
 			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
@@ -1352,8 +1351,10 @@
 		return -EINVAL;
 	}
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_writers.frozen = SB_UNFROZEN;
 		goto out;
+	}
 
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
@@ -1365,12 +1366,11 @@
 		}
 	}
 
-out:
 	sb->s_writers.frozen = SB_UNFROZEN;
-	smp_wmb();
+	sb_freeze_unlock(sb);
+out:
 	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
-
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
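Editor's note (not part of the patch): the hunk that replaces prune_super() with super_cache_scan()/super_cache_count() splits sc->nr_to_scan between the dentry, inode and filesystem-private caches in proportion to their object counts, using the kernel's mult_frac() macro. The standalone C sketch below reimplements that arithmetic in userspace with made-up cache sizes, purely to illustrate the proportional split; it is not code from the patch.

/*
 * Illustration only: how super_cache_scan() apportions one shrink batch.
 * mult_frac() is rewritten here as a function so this builds as plain C;
 * the kernel macro computes (x / d) * n + ((x % d) * n) / d to limit
 * intermediate overflow compared with a naive x * n / d.
 */
#include <stdio.h>

static long mult_frac(long x, long numer, long denom)
{
	long quot = x / denom;
	long rem = x % denom;

	return quot * numer + (rem * numer) / denom;
}

int main(void)
{
	long nr_to_scan = 128;                       /* batch from the shrinker core (hypothetical) */
	long dentries = 6000, inodes = 3000, fs_objects = 1000;  /* hypothetical LRU counts */
	long total = dentries + inodes + fs_objects + 1;

	/* proportion the scan between the caches, as the new code does */
	printf("dentries to scan: %ld\n", mult_frac(nr_to_scan, dentries, total));
	printf("inodes to scan:   %ld\n", mult_frac(nr_to_scan, inodes, total));
	printf("fs objects:       %ld\n", mult_frac(nr_to_scan, fs_objects, total));
	return 0;
}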