--- zzzz-none-000/linux-3.10.107/fs/ceph/snap.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/ceph/snap.c 2021-02-04 17:41:59.000000000 +0000 @@ -70,13 +70,11 @@ * safe. we do need to protect against concurrent empty list * additions, however. */ - if (atomic_read(&realm->nref) == 0) { + if (atomic_inc_return(&realm->nref) == 1) { spin_lock(&mdsc->snap_empty_lock); list_del_init(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); } - - atomic_inc(&realm->nref); } static void __insert_snap_realm(struct rb_root *root, @@ -116,7 +114,7 @@ if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 0); /* tree does not take a ref */ + atomic_set(&realm->nref, 1); /* for caller */ realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); @@ -134,8 +132,8 @@ * * caller must hold snap_rwsem for write. */ -struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, - u64 ino) +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) { struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; @@ -154,6 +152,16 @@ return NULL; } +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); @@ -273,7 +281,6 @@ } realm->parent_ino = parentino; realm->parent = parent; - ceph_get_snap_realm(mdsc, parent); list_add(&realm->child_item, &parent->children); return 1; } @@ -288,6 +295,9 @@ return 0; } + +struct ceph_snap_context *ceph_empty_snapc; + /* * build the snap context for a given realm. */ @@ -365,8 +375,7 @@ realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) - ceph_put_snap_context(realm->cached_context); + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; return 0; @@ -420,6 +429,14 @@ return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -439,6 +456,7 @@ { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -451,6 +469,9 @@ used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -465,71 +486,95 @@ writes in progress now were started before the previous cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); - - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; - - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; - - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; } + } - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? "" : "no_flush"); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + old_snapc = NULL; + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + +update_snapc: + if (ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + kfree(capsnap); + ceph_put_snap_context(old_snapc); } /* @@ -590,15 +635,13 @@ if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); list_for_each_entry(child, &realm->children, child_item) { dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", @@ -619,12 +662,14 @@ * Caller must hold snap_rwsem for write. */ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, - void *p, void *e, bool deletion) + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) { struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *first_realm = NULL; int invalidate = 0; int err = -ENOMEM; LIST_HEAD(dirty_realms); @@ -678,6 +723,8 @@ /* queue realm for cap_snap creation */ list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; invalidate = 1; } else if (!realm->cached_context) { @@ -692,13 +739,18 @@ dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, realm, invalidate, p, e); - if (p < e) - goto more; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate) + if (invalidate && p >= e) rebuild_snap_realms(realm); + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. @@ -709,12 +761,21 @@ queue_realm_cap_snaps(realm); } + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + __cleanup_empty_realms(mdsc); return 0; bad: err = -EINVAL; fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); pr_err("update_snap_trace error %d\n", err); return err; } @@ -832,7 +893,6 @@ if (IS_ERR(realm)) goto out; } - ceph_get_snap_realm(mdsc, realm); dout("splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { @@ -893,7 +953,7 @@ /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = - ceph_lookup_snap_realm(mdsc, + __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; @@ -906,7 +966,7 @@ * snap, we can avoid queueing cap_snaps. */ ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY); + op == CEPH_SNAP_OP_DESTROY, NULL); if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -928,5 +988,16 @@ return; } +int __init ceph_snap_init(void) +{ + ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!ceph_empty_snapc) + return -ENOMEM; + ceph_empty_snapc->seq = 1; + return 0; +} - +void ceph_snap_exit(void) +{ + ceph_put_snap_context(ceph_empty_snapc); +}