--- zzzz-none-000/linux-3.10.107/fs/nfs/pnfs.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/nfs/pnfs.c 2021-02-04 17:41:59.000000000 +0000 @@ -33,6 +33,9 @@ #include "internal.h" #include "pnfs.h" #include "iostat.h" +#include "nfs4trace.h" +#include "delegation.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -49,6 +52,10 @@ */ static LIST_HEAD(pnfs_modules_tbl); +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync); + /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * find_pnfs_driver_locked(u32 id) @@ -237,6 +244,8 @@ struct inode *inode = lo->plh_inode; if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (!list_empty(&lo->plh_segs)) + WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); @@ -336,6 +345,61 @@ rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } +/* Return true if layoutreturn is needed */ +static bool +pnfs_layout_need_return(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *s; + + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return false; + + list_for_each_entry(s, &lo->plh_segs, pls_list) + if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) + return false; + + return true; +} + +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + +static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, + struct pnfs_layout_hdr *lo, struct inode *inode) +{ + lo = lseg->pls_layout; + inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + if (pnfs_layout_need_return(lo, lseg)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode; + bool send; + + stateid = lo->plh_stateid; + iomode = lo->plh_return_iomode; + send = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } + } else + spin_unlock(&inode->i_lock); +} + void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { @@ -348,9 +412,22 @@ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + + /* Handle the case where refcount != 1 */ + if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) + return; + lo = lseg->pls_layout; inode = lo->plh_inode; + /* Do we need a layoutreturn? 
*/ + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -360,7 +437,47 @@ } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static inline u64 +static void pnfs_free_lseg_async_work(struct work_struct *work) +{ + struct pnfs_layout_segment *lseg; + struct pnfs_layout_hdr *lo; + + lseg = container_of(work, struct pnfs_layout_segment, pls_work); + lo = lseg->pls_layout; + + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); +} + +static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) +{ + INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); + schedule_work(&lseg->pls_work); +} + +void +pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); + pnfs_free_lseg_async(lseg); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); + +static u64 end_offset(u64 start, u64 len) { u64 end; @@ -376,9 +493,9 @@ * start2 end2 * [----------------) */ -static inline int -lo_seg_contained(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -395,9 +512,9 @@ * start2 end2 * [----------------) */ -static inline int -lo_seg_intersecting(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -409,12 +526,12 @@ } static bool -should_free_lseg(struct pnfs_layout_range *lseg_range, - struct pnfs_layout_range *recall_range) +should_free_lseg(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) { return (recall_range->iomode == IOMODE_ANY || lseg_range->iomode == recall_range->iomode) && - lo_seg_intersecting(lseg_range, recall_range); + pnfs_lseg_range_intersecting(lseg_range, recall_range); } static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, @@ -504,6 +621,7 @@ pnfs_get_layout_hdr(lo); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -576,6 +694,9 @@ dprintk("%s freeing layout for inode %lu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; + + pnfs_layoutcommit_inode(inode, false); + spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ @@ -661,7 +782,7 @@ */ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) { - return (s32)s1 - (s32)s2 > 0; + return (s32)(s1 - s2) > 0; } /* update lo->plh_stateid with new if is more recent */ @@ -700,27 +821,27 @@ /* 
lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) { return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - (list_empty(&lo->plh_segs) && - (atomic_read(&lo->plh_outstanding) > lget)); + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, 1)) { + if (pnfs_layoutgets_blocked(lo)) { status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; - } else if (list_empty(&lo->plh_segs)) { + } else if (list_empty(&lo->plh_segs) || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { int seq; do { @@ -750,27 +871,42 @@ struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg; + loff_t i_size; dprintk("--> %s\n", __func__); - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; + /* + * Synchronously retrieve layout information from server and + * store in lseg. If we race with a concurrent seqid morphing + * op, then re-send the LAYOUTGET. + */ + do { + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; - lgp->args.minlength = PAGE_CACHE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + } while (lseg == ERR_PTR(-EAGAIN)); - /* Synchronously retrieve layout information from server and - * store in lseg. 
- */ - lseg = nfs4_proc_layoutget(lgp, gfp_flags); if (IS_ERR(lseg)) { switch (PTR_ERR(lseg)) { case -ENOMEM: @@ -781,7 +917,9 @@ pnfs_layout_io_set_failed(lo, range->iomode); } return NULL; - } + } else + pnfs_layout_clear_fail_bit(lo, + pnfs_iomode_to_fail_bit(range->iomode)); return lseg; } @@ -801,6 +939,48 @@ } } +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); +} + +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync) +{ + struct inode *ino = lo->plh_inode; + struct nfs4_layoutreturn *lrp; + int status = 0; + + lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + spin_lock(&ino->i_lock); + pnfs_clear_layoutreturn_waitbit(lo); + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->args.range.iomode = iomode; + lrp->args.range.offset = 0; + lrp->args.range.length = NFS4_MAX_UINT64; + lrp->args.layout = lo; + lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; + + status = nfs4_proc_layoutreturn(lrp, sync); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -815,9 +995,9 @@ struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); - struct nfs4_layoutreturn *lrp; nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -834,34 +1014,31 @@ empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); + } + /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } - lo->plh_block_lgets++; + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); - if (unlikely(lrp == NULL)) { - status = -ENOMEM; - spin_lock(&ino->i_lock); - lo->plh_block_lgets--; - spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); - goto out; - } - - lrp->args.stateid = stateid; - lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; - lrp->args.inode = ino; - lrp->args.layout = lo; - lrp->clp = NFS_SERVER(ino)->nfs_client; - - status = nfs4_proc_layoutreturn(lrp); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -897,32 +1074,61 @@ bool pnfs_roc(struct inode *ino) { + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_open_context *ctx; + struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *tmp; + nfs4_stateid stateid; 
LIST_HEAD(tmp_list); - bool found = false; + bool found = false, layoutreturn = false, roc = false; spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) - goto out_nolayout; + lo = nfsi->layout; + if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_noroc; + + /* no roc if we hold a delegation */ + if (nfs4_check_delegation(ino, FMODE_READ)) + goto out_noroc; + + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + /* Don't return layout if there is open file state */ + if (state != NULL && state->state != 0) + goto out_noroc; + } + + stateid = lo->plh_stateid; + /* always send layoutreturn if being marked so */ + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + + pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + /* If we are sending layoutreturn, invalidate all valid lsegs */ + if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { mark_lseg_invalid(lseg, &tmp_list); found = true; } - if (!found) - goto out_nolayout; - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); - return true; + /* ROC in two conditions: + * 1. there are ROC lsegs + * 2. we don't send layoutreturn + */ + if (found && !layoutreturn) { + /* lo ref dropped in pnfs_roc_release() */ + pnfs_get_layout_hdr(lo); + roc = true; + } -out_nolayout: +out_noroc: spin_unlock(&ino->i_lock); - return false; + pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); + if (layoutreturn) + pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + return roc; } void pnfs_roc_release(struct inode *ino) @@ -931,7 +1137,7 @@ spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); if (atomic_dec_and_test(&lo->plh_refcount)) { pnfs_detach_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -949,23 +1155,16 @@ if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); + trace_nfs4_layoutreturn_on_close(ino, 0); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg; u32 current_seqid; - bool found = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -973,9 +1172,25 @@ * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: spin_unlock(&ino->i_lock); - return found; +} + +bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + bool sleep = false; + + /* we might not have grabbed lo reference. 
so need to check under + * i_lock */ + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + sleep = true; + } + spin_unlock(&ino->i_lock); + return sleep; } /* @@ -984,8 +1199,8 @@ * are seen first. */ static s64 -cmp_layout(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { s64 d; @@ -1003,16 +1218,41 @@ return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); } -static void -pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) +static bool +pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { - struct pnfs_layout_segment *lp; + return pnfs_lseg_range_cmp(l1, l2) > 0; +} + +static bool +pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old) +{ + return false; +} + +void +pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *, + const struct pnfs_layout_range *), + bool (*do_merge)(struct pnfs_layout_segment *, + struct pnfs_layout_segment *), + struct list_head *free_me) +{ + struct pnfs_layout_segment *lp, *tmp; dprintk("%s:Begin\n", __func__); - list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) + list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0) + continue; + if (do_merge(lseg, lp)) { + mark_lseg_invalid(lp, free_me); + continue; + } + if (is_after(&lseg->pls_range, &lp->pls_range)) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1034,6 +1274,24 @@ dprintk("%s:Return\n", __func__); } +EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg); + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld->add_lseg != NULL) + ld->add_lseg(lo, lseg, free_me); + else + pnfs_generic_layout_insert_lseg(lo, lseg, + pnfs_lseg_range_is_after, + pnfs_lseg_no_merge, + free_me); +} static struct pnfs_layout_hdr * alloc_init_layout_hdr(struct inode *ino, @@ -1050,7 +1308,7 @@ INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; - lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + lo->plh_lc_cred = get_rpccred(ctx->cred); return lo; } @@ -1091,21 +1349,21 @@ * READ READ true * READ RW true */ -static int -is_matching_lseg(struct pnfs_layout_range *ls_range, - struct pnfs_layout_range *range) +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, + const struct pnfs_layout_range *range) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || - !lo_seg_intersecting(ls_range, range)) + !pnfs_lseg_range_intersecting(ls_range, range)) return 0; /* range1 covers only the first byte in the range */ range1 = *range; range1.length = 1; - return lo_seg_contained(ls_range, &range1); + return pnfs_lseg_range_contained(ls_range, &range1); } /* @@ -1121,12 +1379,11 @@ list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && - is_matching_lseg(&lseg->pls_range, 
range)) { + !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && + pnfs_lseg_range_match(&lseg->pls_range, range)) { ret = pnfs_get_lseg(lseg); break; } - if (lseg->pls_range.offset > range->offset) - break; } dprintk("%s:Return lseg %p ref %d\n", @@ -1209,6 +1466,37 @@ return ret; } +/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) +{ + if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) + return 1; + return nfs_wait_bit_killable(key, mode); +} + +static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + if (!pnfs_should_retry_layoutget(lo)) + return false; + /* + * send layoutcommit as it can hold up layoutreturn due to lseg + * reference + */ + pnfs_layoutcommit_inode(lo->plh_inode, false); + return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, + pnfs_layoutget_retry_bit_wait, + TASK_UNINTERRUPTIBLE); +} + +static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) +{ + unsigned long *bitlock = &lo->plh_flags; + + clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); +} + /* * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. @@ -1236,9 +1524,15 @@ if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; + if (iomode == IOMODE_READ && i_size_read(ino) == 0) + goto out; + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) goto out; +lookup_again: + nfs4_client_recover_expired_lease(clp); + first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { @@ -1253,27 +1547,61 @@ } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode)) + if (pnfs_layout_io_test_failed(lo, iomode) && + !pnfs_should_retry_layoutget(lo)) goto out_unlock; - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) - goto out_unlock; + first = list_empty(&lo->plh_segs); + if (first) { + /* The first layoutget for the file. Need to serialize per + * RFC 5661 Errata 3208. + */ + if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, + &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto lookup_again; + } + } else { + /* Check to see if the layout for the given range + * already exists + */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + } + + /* + * Because we free lsegs before sending LAYOUTRETURN, we need to wait + * for LAYOUTRETURN even if first is true. + */ + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + dprintk("%s wait for layoutreturn\n", __func__); + if (pnfs_prepare_to_retry_layoutget(lo)) { + if (first) + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); + goto lookup_again; + } + goto out_put_layout_hdr; + } - if (pnfs_layoutgets_blocked(lo, 0)) + if (pnfs_layoutgets_blocked(lo)) goto out_unlock; atomic_inc(&lo->plh_outstanding); - - first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); - if (first) { + if (list_empty(&lo->plh_layouts)) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. 
*/ spin_lock(&clp->cl_lock); - list_add_tail(&lo->plh_layouts, &server->layouts); + if (list_empty(&lo->plh_layouts)) + list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -1286,8 +1614,11 @@ arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: + if (first) + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -1305,6 +1636,26 @@ } EXPORT_SYMBOL_GPL(pnfs_update_layout); +static bool +pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) +{ + switch (range->iomode) { + case IOMODE_READ: + case IOMODE_RW: + break; + default: + return false; + } + if (range->offset == NFS4_MAX_UINT64) + return false; + if (range->length == 0) + return false; + if (range->length != NFS4_MAX_UINT64 && + range->length > NFS4_MAX_UINT64 - range->offset) + return false; + return true; +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { @@ -1312,7 +1663,11 @@ struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; - int status = 0; + LIST_HEAD(free_me); + int status = -EINVAL; + + if (!pnfs_sanity_check_layout_range(&res->range)) + goto out; /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); @@ -1326,32 +1681,45 @@ goto out; } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - dprintk("%s forget reply due to recall\n", __func__); + if (pnfs_layoutgets_blocked(lo)) { + dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, 1) || - pnfs_layout_stateid_blocked(lo, &res->stateid)) { - dprintk("%s forget reply due to state\n", __func__); - goto out_forget_reply; + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + /* existing state ID, make sure the sequence number matches. */ + if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to sequence\n", __func__); + status = -EAGAIN; + goto out_forget_reply; + } + pnfs_set_layout_stateid(lo, &res->stateid, false); + } else { + /* + * We got an entirely new state ID. Mark all segments for the + * inode invalid, and don't bother validating the stateid + * sequence number. + */ + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + + nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); + lo->plh_barrier = be32_to_cpu(res->stateid.seqid); } - /* Done processing layoutget. 
Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - init_lseg(lo, lseg); - lseg->pls_range = res->range; pnfs_get_lseg(lseg); - pnfs_layout_insert_lseg(lo, lseg); + pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (res->return_on_close) { + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); - set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); - } spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); return lseg; out: return ERR_PTR(status); @@ -1363,29 +1731,80 @@ goto out; } +static void +pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *return_range) +{ + struct pnfs_layout_segment *lseg, *next; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) + return; + + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (should_free_lseg(&lseg->pls_range, return_range)) { + dprintk("%s: marking lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, + lseg->pls_range.length); + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + mark_lseg_invalid(lseg, tmp_list); + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); + } +} + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; + int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(free_me); + + spin_lock(&inode->i_lock); + /* set failure bit so that pnfs path will be retried later */ + pnfs_layout_set_fail_bit(lo, iomode); + if (lo->plh_return_iomode == 0) + lo->plh_return_iomode = range.iomode; + else if (lo->plh_return_iomode != range.iomode) + lo->plh_return_iomode = IOMODE_ANY; + /* + * mark all matching lsegs so that we are sure to have no live + * segments at hand when sending layoutreturn. See pnfs_put_lseg() + * for how it works. 
+ */ + pnfs_mark_matching_lsegs_return(lo, &free_me, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); +} +EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); + void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; - WARN_ON_ONCE(pgio->pg_lseg != NULL); + if (pgio->pg_lseg == NULL) { + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_read_mds(pgio); - return; + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + rd_size, + IOMODE_READ, + GFP_KERNEL); } - - if (pgio->pg_dreq == NULL) - rd_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - rd_size, - IOMODE_READ, - GFP_KERNEL); /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -1397,19 +1816,13 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_write_mds(pgio); - return; - } - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - wb_size, - IOMODE_RW, - GFP_NOFS); + if (pgio->pg_lseg == NULL) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + wb_size, + IOMODE_RW, + GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); @@ -1417,93 +1830,81 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) +pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc) { - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_read(pgio, inode, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); + if (desc->pg_lseg) { + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } } +EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, - int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) { - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} + unsigned int size; + u64 seg_end, req_start, seg_left; -bool -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) -{ - if (pgio->pg_lseg == NULL) - return nfs_generic_pg_test(pgio, prev, req); + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; /* - * Test if a nfs_page is fully contained in the pnfs_layout_range. 
- * Note that this test makes several assumptions: - * - that the previous nfs_page in the struct nfs_pageio_descriptor - * is known to lie within the range. - * - that the nfs_page being tested is known to be contiguous with the - * previous nfs_page. - * - Layout ranges are page aligned, so we only have to test the - * start offset of the request. + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. * * Please also note that 'end_offset' is actually the offset of the * first byte that lies outside the pnfs_layout_range. FIXME? * */ - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length); + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start >= seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) { + /* reference the new lseg */ + if (pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + if (pgio->pg_ops->pg_init) + pgio->pg_ops->pg_init(pgio, req); + return 0; + } + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); -int pnfs_write_done_resend_to_mds(struct inode *inode, - struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq) +int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); - pgio.pg_dreq = dreq; - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - - nfs_list_remove_request(req); - if (!nfs_pageio_add_request(&pgio, req)) - nfs_list_add_request(req, &failed); - } - nfs_pageio_complete(&pgio); - - if (!list_empty(&failed)) { - /* For some reason our attempt to resend pages. Mark the - * overall send request as having failed, and let - * nfs_writeback_release_full deal with the error. 
- */ - list_move(&failed, head); - return -EIO; - } - return 0; + nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, + hdr->completion_ops); + set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); + return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & @@ -1511,49 +1912,47 @@ pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) - data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr); } /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - - if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(data); - hdr->mds_ops->rpc_call_done(&data->task, data); - } else - pnfs_ld_handle_write_error(data); - hdr->mds_ops->rpc_release(data); + if (likely(!hdr->pnfs_error)) { + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); + hdr->mds_ops->rpc_call_done(&hdr->task, hdr); + } + trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); + if (unlikely(hdr->pnfs_error)) + pnfs_ld_handle_write_error(hdr); + hdr->mds_ops->rpc_release(hdr); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_write_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_write_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } - nfs_writedata_release(data); + nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) { - struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); @@ -1561,8 +1960,8 @@ hdr->mds_ops = call_ops; dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, - inode->i_ino, wdata->args.count, wdata->args.offset, how); - trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + inode->i_ino, hdr->args.count, hdr->args.offset, how); + trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); @@ -1570,146 +1969,108 @@ } static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) { - struct nfs_write_data *data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; - desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status 
trypnfs; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); - } - pnfs_put_lseg(lseg); + trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, hdr); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_writehdr_free(hdr); + nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header *whdr; + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); - if (!whdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); + if (!hdr) { + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } - hdr = &whdr->header; nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); - if (ret != 0) { - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - } else - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); + ret = nfs_generic_pgio(desc, hdr); + if (!ret) + pnfs_do_write(desc, hdr, desc->pg_ioflags); + return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); -int pnfs_read_done_resend_to_mds(struct inode *inode, - struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops, - struct nfs_direct_req *dreq) +int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, compl_ops); - pgio.pg_dreq = dreq; - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - - nfs_list_remove_request(req); - if (!nfs_pageio_add_request(&pgio, req)) - nfs_list_add_request(req, &failed); - } - nfs_pageio_complete(&pgio); - - if (!list_empty(&failed)) { - list_move(&failed, head); - return -EIO; - } - return 0; + nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) - data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, - &hdr->pages, - hdr->completion_ops, - hdr->dreq); + hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr); } /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; - if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(data); - hdr->mds_ops->rpc_call_done(&data->task, data); - } else - pnfs_ld_handle_read_error(data); - hdr->mds_ops->rpc_release(data); + __nfs4_read_done_cb(hdr); 
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr); + } + trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); + if (unlikely(hdr->pnfs_error)) + pnfs_ld_handle_read_error(hdr); + hdr->mds_ops->rpc_release(hdr); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_read_data *data) + struct nfs_pgio_header *hdr) { - struct nfs_pgio_header *hdr = data->header; + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_read_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } - nfs_readdata_release(data); + nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } /* * Call the appropriate parallel I/O subsystem read function. */ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { - struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; @@ -1717,74 +2078,77 @@ hdr->mds_ops = call_ops; dprintk("%s: Reading ino:%lu %u@%llu\n", - __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + __func__, inode->i_ino, hdr->args.count, hdr->args.offset); - trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_READ); dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } +/* Resend all requests through pnfs. */ +int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +{ + struct nfs_pageio_descriptor pgio; + + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); +} +EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); + static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_read_data *data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; + int err = 0; - desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); - } - pnfs_put_lseg(lseg); + trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); + if (trypnfs == PNFS_TRY_AGAIN) + err = pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_through_mds(desc, hdr); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_readhdr_free(hdr); + nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); - if (!rhdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - ret = -ENOMEM; - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - return ret; + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 
+ if (!hdr) { + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + return -ENOMEM; } - hdr = &rhdr->header; nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); - if (ret != 0) { - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - } else - pnfs_do_multiple_reads(desc, &hdr->rpc_list); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); + ret = nfs_generic_pgio(desc, hdr); + if (!ret) + pnfs_do_read(desc, hdr); return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + /* * There can be multiple RW segments. */ @@ -1802,7 +2166,6 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) { struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(inode)->flags; /* Matched by references in pnfs_set_layoutcommit */ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1810,9 +2173,7 @@ pnfs_put_lseg(lseg); } - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + pnfs_clear_layoutcommitting(inode); } void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1822,29 +2183,27 @@ EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct nfs_pgio_header *hdr = wdata->header; - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = wdata->mds_offset + wdata->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + nfsi->layout->plh_lwb = end_pos; mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); + pnfs_get_lseg(lseg); } - if (end_pos > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ @@ -1873,50 +2232,44 @@ int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; - int status = 0; + int status; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + if (!pnfs_layoutcommit_outstanding(inode)) return 0; - /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - status = -ENOMEM; - goto out; - } - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) - goto out_free; + dprintk("--> %s inode %lu\n", 
__func__, inode->i_ino); + status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { - if (!sync) { - status = -EAGAIN; - goto out_free; - } - status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, - nfs_wait_bit_killable, TASK_KILLABLE); + if (!sync) + goto out; + status = wait_on_bit_lock_action(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); if (status) - goto out_free; + goto out; } - INIT_LIST_HEAD(&data->lseg_list); + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; spin_lock(&inode->i_lock); - if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); - spin_unlock(&inode->i_lock); - wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); - goto out_free; - } + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + INIT_LIST_HEAD(&data->lseg_list); pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -1929,16 +2282,40 @@ data->args.lastbytewritten = end_pos - 1; data->res.server = NFS_SERVER(inode); + if (ld->prepare_layoutcommit) { + status = ld->prepare_layoutcommit(&data->args); + if (status) { + put_rpccred(data->cred); + spin_lock(&inode->i_lock); + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + goto out_unlock; + } + } + + status = nfs4_proc_layoutcommit(data, sync); out: if (status) mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; -out_free: +out_unlock: + spin_unlock(&inode->i_lock); kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); goto out; } +EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); + +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { @@ -1951,3 +2328,67 @@ } return thp; } + +#if IS_ENABLED(CONFIG_NFS_V4_2) +int +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs42_layoutstat_data *data; + struct pnfs_layout_hdr *hdr; + int status = 0; + + if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) + goto out; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) + goto out; + + if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) + goto out; + + spin_lock(&inode->i_lock); + if (!NFS_I(inode)->layout) { + spin_unlock(&inode->i_lock); + goto out; + } + hdr = NFS_I(inode)->layout; + pnfs_get_layout_hdr(hdr); + spin_unlock(&inode->i_lock); + + data = kzalloc(sizeof(*data), gfp_flags); + if (!data) { + status = -ENOMEM; + goto out_put; + } + + data->args.fh = NFS_FH(inode); + data->args.inode = inode; + nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); + status = ld->prepare_layoutstats(&data->args); + if (status) + goto out_free; + + status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); + +out: + dprintk("%s returns %d\n", __func__, status); + return status; + +out_free: + kfree(data); +out_put: + pnfs_put_layout_hdr(hdr); + 
smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); + smp_mb__after_atomic(); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); +#endif + +unsigned int layoutstats_timer; +module_param(layoutstats_timer, uint, 0644); +EXPORT_SYMBOL_GPL(layoutstats_timer);
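
The pnfs_seqid_is_newer() hunk above replaces "(s32)s1 - (s32)s2 > 0" with "(s32)(s1 - s2) > 0". The new form does the subtraction in unsigned arithmetic, which is defined modulo 2^32, and only reinterprets the result as signed, so the "newer" test avoids signed-overflow undefined behaviour and keeps working when the layout stateid sequence number wraps past 0xffffffff (the reinterpretation assumes two's complement, as the kernel does throughout). A minimal userspace sketch of the same comparison; the harness below is illustrative only and is not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "newer" test, mirroring pnfs_seqid_is_newer(). */
static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	/* 1 is two steps past 0xffffffff modulo 2^32, so it counts as newer. */
	printf("%d\n", seqid_is_newer(1, 0xffffffffu));	/* prints 1 */
	printf("%d\n", seqid_is_newer(0xffffffffu, 1));	/* prints 0 */
	printf("%d\n", seqid_is_newer(42, 42));		/* prints 0: equal is not newer */
	return 0;
}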