--- zzzz-none-000/linux-3.10.107/block/blk-cgroup.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/block/blk-cgroup.c 2021-02-04 17:41:59.000000000 +0000 @@ -9,48 +9,45 @@ * * Copyright (C) 2009 Vivek Goyal * Nauman Rafique + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente + * Arianna Avanzini */ #include #include #include #include #include +#include #include #include #include #include -#include "blk-cgroup.h" +#include +#include #include "blk.h" #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); -static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); +static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. 
- */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ - (p_blkg)->q, false))) +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) @@ -71,20 +68,15 @@ if (!blkg) return; - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkg_policy_data *pd = blkg->pd[i]; - - if (!pd) - continue; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - if (pol && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); - kfree(pd); - } - - blk_exit_rl(&blkg->rl); + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -107,10 +99,14 @@ if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - blkg->refcnt = 1; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -127,17 +123,13 @@ continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; - - /* invoke per-policy init */ - if (pol->pd_init_fn) - pol->pd_init_fn(blkg); } return blkg; @@ -147,26 +139,11 @@ return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -184,66 +161,66 @@ return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) { - ret = -EINVAL; + if (!css_tryget_online(&blkcg->css)) { + ret = -ENODEV; goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_NOWAIT); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; - /* link parent and insert */ + /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - blkg = ERR_PTR(-EINVAL); - goto err_put_css; + ret = -ENODEV; + goto err_put_congested; } blkg_get(blkg->parent); } + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_init_fn) + pol->pd_init_fn(blkg->pd[i]); + } + + /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { @@ -254,7 +231,7 @@ struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -267,6 +244,8 @@ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -301,7 +280,7 @@ * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -325,11 +304,11 @@ return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -343,8 +322,14 @@ struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); + } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -356,7 +341,7 @@ * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ - if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) + if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* @@ -386,38 +371,32 @@ spin_unlock(&blkcg->lock); } - /* - * root blkg is destroyed. Just clear the pointer since - * root_rl does not take reference on root blkg. - */ q->root_blkg = NULL; q->root_rl.blkg = NULL; } -static void blkg_rcu_free(struct rcu_head *rcu_head) +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. 
+ * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +void __blkg_release_rcu(struct rcu_head *rcu_head) { - blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); -} + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); -void __blkg_release(struct blkcg_gq *blkg) -{ /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); - /* - * A group is freed in rcu manner. But having an rcu lock does not - * mean that one can access all the fields of blkg and assume these - * are valid. For example, don't try to follow throtl_data and - * request queue links. - * - * Having a reference to blkg under an rcu allows acess to only - * values local to groups like group stats and group rate limits - */ - call_rcu(&blkg->rcu_head, blkg_rcu_free); + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); } -EXPORT_SYMBOL_GPL(__blkg_release); +EXPORT_SYMBOL_GPL(__blkg_release_rcu); /* * The next function used by blk_queue_for_each_rl(). It's a bit tricky @@ -454,10 +433,10 @@ return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; @@ -470,12 +449,14 @@ * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -484,13 +465,14 @@ return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -579,9 +561,10 @@ for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -618,33 +601,122 @@ } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. 
+ */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg + * + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. 
*/ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - u64 sum; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); - - sum = blkg_stat_read((void *)pd + off); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -654,39 +726,43 @@ /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; - struct blkg_rwstat sum; + struct cgroup_subsys_state *pos_css; + struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); - - sum = blkg_rwstat_read((void *)pd + off); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -702,29 +778,34 @@ * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. 
This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -733,7 +814,7 @@ if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -755,7 +836,7 @@ ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -776,8 +857,56 @@ } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -786,18 +915,18 @@ /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). 
*/ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -817,40 +946,91 @@ } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); + int i; + + mutex_lock(&blkcg_pol_mutex); + + list_del(&blkcg->all_blkcgs_node); + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) - kfree(blkcg); + kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { - static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; + struct cgroup_subsys_state *ret; + int i; - if (!parent) { + mutex_lock(&blkcg_pol_mutex); + + if (!parent_css) { blkcg = &blkcg_root; - goto done; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_alloc_fn) + continue; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; + cpd->plid = i; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; - blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); +free_blkcg: + kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + return ret; } /** @@ -865,9 +1045,43 @@ */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. 
+ */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); - return blk_throtl_init(q); + if (IS_ERR(blkg)) + return PTR_ERR(blkg); + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -887,13 +1101,6 @@ if (!q->root_blkg) return; - /* - * @q could be exiting and already have destroyed all blkgs as - * indicated by NULL root_blkg. If so, don't confuse policies. - */ - if (!q->root_blkg) - return; - blk_throtl_drain(q); } @@ -918,14 +1125,15 @@ * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -937,25 +1145,45 @@ return ret; } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .subsys_id = blkio_subsys_id, - .base_cftypes = blkcg_files, - .module = THIS_MODULE, - + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", +#ifdef CONFIG_MEMCG /* - * blkio subsystem is utterly broken in terms of hierarchy support. - * It treats all cgroups equally regardless of where they're - * located in the hierarchy - all cgroups are treated as if they're - * right below the root. Fix it and remove the following. + * This ensures that, if available, memcg is automatically enabled + * together on the default hierarchy so that the owner cgroup can + * be retrieved from writeback pages. 
*/ - .broken_hierarchy = true, + .depends_on = 1 << memory_cgrp_id, +#endif }; -EXPORT_SYMBOL_GPL(blkio_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -976,98 +1204,54 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; - int cnt = 0, ret; - bool preloaded; + struct blkg_policy_data *pd_prealloc = NULL; + struct blkcg_gq *blkg; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - - spin_unlock_irq(q->queue_lock); - - /* allocate policy_data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... 
this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1093,21 +1277,16 @@ __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1126,11 +1305,10 @@ */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1139,18 +1317,53 @@ if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) + goto err_free_cpds; + + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; + cpd->plid = pol->plid; + pol->cpd_init_fn(cpd); + } + } + + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); - ret = 0; -out_unlock: + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1163,18 +1376,34 @@ */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - 
mutex_lock(&blkcg_pol_mutex); + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); - /* unregister and update blkgs */ + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); + + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } blkcg_policy[pol->plid] = NULL; -out_unlock: + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
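
The patch above converts blkcg policies from the old pd_size / pd_init_fn(blkg) / pd_exit_fn scheme to explicit allocation callbacks (pd_alloc_fn/pd_free_fn plus the optional per-blkcg cpd_* hooks) and serializes registration with blkcg_pol_register_mutex. As a closing illustration only, and not part of either tree in this diff, here is a minimal sketch of a policy written against that callback API; every example_* identifier and the per-group budget field are invented, and the <linux/blk-cgroup.h> header path is inferred from the include changes at the top of the patch.

/*
 * Illustrative sketch only -- not part of the patch above.  A minimal
 * blkcg policy wired to the callback API this diff introduces; all
 * example_* names and the "budget" field are made up.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blk-cgroup.h>

struct example_group {
	struct blkg_policy_data pd;	/* embedded; recovered via container_of() */
	u64 budget;			/* hypothetical per-group state */
};

static inline struct example_group *pd_to_eg(struct blkg_policy_data *pd)
{
	return container_of(pd, struct example_group, pd);
}

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
{
	struct example_group *eg = kzalloc_node(sizeof(*eg), gfp, node);

	/* returning NULL makes blkg_alloc()/blkcg_activate_policy() back off */
	return eg ? &eg->pd : NULL;
}

static void example_pd_init(struct blkg_policy_data *pd)
{
	pd_to_eg(pd)->budget = 0;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(pd_to_eg(pd));
}

static struct blkcg_policy example_policy = {
	.pd_alloc_fn	= example_pd_alloc,
	.pd_init_fn	= example_pd_init,
	.pd_free_fn	= example_pd_free,
	/* .dfl_cftypes / .legacy_cftypes would hook up interface files */
};

static int __init example_init(void)
{
	/* takes blkcg_pol_register_mutex, assigns pol->plid, adds cftypes */
	return blkcg_policy_register(&example_policy);
}

static void __exit example_exit(void)
{
	blkcg_policy_unregister(&example_policy);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Embedding struct blkg_policy_data in the policy's own per-group structure and converting with container_of() mirrors how the in-tree users handle the pd pointers passed to these callbacks.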
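
The new blkg_print_stat_bytes()/blkg_print_stat_ios() helpers (and their _recursive variants) are meant to be plugged straight into a policy's cftypes, with cftype->private set to the blkcg_policy as their kernel-doc above states. The following sketch reuses the hypothetical example_policy from the previous block to show that wiring; the file names are made up, and on the legacy hierarchy they would surface under the controller's "blkio." prefix.

/* Illustrative sketch -- cftypes wired to the stat helpers added above. */
static struct cftype example_legacy_files[] = {
	{
		.name = "example_service_bytes",		/* hypothetical file name */
		.private = (unsigned long)&example_policy,	/* read back via seq_cft(sf)->private */
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "example_serviced_recursive",
		.private = (unsigned long)&example_policy,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{ }	/* terminate */
};

The array would be assigned to example_policy.legacy_cftypes (or .dfl_cftypes for the unified hierarchy) before blkcg_policy_register(), which then adds the files via cgroup_add_legacy_cftypes()/cgroup_add_dfl_cftypes() as shown in the registration hunk.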
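
blkg_conf_prep() now takes a writable buffer, parses the leading "MAJ:MIN" itself, and returns the remainder through ctx->body, with the RCU read lock and queue lock held until blkg_conf_finish(). One more illustrative sketch, not taken from the patch, of a write handler driving that pair: the kernfs_open_file/of_css() based write signature is an assumption (no cftype write handler appears in this diff), and the parsed value lands in the made-up example_group from the first sketch.

/* Illustrative sketch -- a per-device config writer using blkg_conf_prep(). */
static ssize_t example_conf_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	u64 v;
	int ret;

	/* parses "MAJ:MIN ...", returns with rcu_read_lock + queue_lock held */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) == 1) {
		struct example_group *eg =
			pd_to_eg(blkg_to_pd(ctx.blkg, &example_policy));

		eg->budget = v;
		ret = 0;
	}

	blkg_conf_finish(&ctx);		/* drops the locks taken above */
	return ret ?: nbytes;
}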