--- zzzz-none-000/linux-3.10.107/fs/xfs/xfs_buf_item.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/xfs/xfs_buf_item.c 2021-02-04 17:41:59.000000000 +0000 @@ -17,17 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_log.h" kmem_zone_t *xfs_buf_item_zone; @@ -39,6 +40,14 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) +{ + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); +} + /* * This returns the number of log iovecs needed to log the * given buf log item. @@ -49,25 +58,27 @@ * * If the XFS_BLI_STALE flag has been set, then log nothing. */ -STATIC uint +STATIC void xfs_buf_item_size_segment( struct xfs_buf_log_item *bip, - struct xfs_buf_log_format *blfp) + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { struct xfs_buf *bp = bip->bli_buf; - uint nvecs; int next_bit; int last_bit; last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (last_bit == -1) - return 0; + return; /* * initial count for a dirty buffer is 2 vectors - the format structure * and the first dirty region. */ - nvecs = 2; + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; while (last_bit != -1) { /* @@ -87,18 +98,17 @@ break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + XFS_BLF_CHUNK)) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else { last_bit++; } + *nbytes += XFS_BLF_CHUNK; } - - return nvecs; } /* @@ -118,12 +128,13 @@ * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log * format structures. */ -STATIC uint +STATIC void xfs_buf_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - uint nvecs; int i; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -135,11 +146,26 @@ */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - return bip->bli_format_count; + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; } ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + /* * the vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a @@ -149,30 +175,54 @@ * count for the extra buf log format structure that will need to be * written. */ - nvecs = 0; for (i = 0; i < bip->bli_format_count; i++) { - nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); } - trace_xfs_buf_item_size(bip); - return nvecs; } -static struct xfs_log_iovec * +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -182,24 +232,19 @@ * the actual size of the dirty bitmap rather than the size of the in * memory structure. */ - base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + - (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -209,13 +254,13 @@ */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } + /* * Fill in an iovec for each set of contiguous chunks. */ - last_bit = first_bit; nbits = 1; for (;;) { @@ -228,42 +273,22 @@ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -272,9 +297,6 @@ nbits++; } } -out: - blfp->blf_size = nvecs; - return vecp; } /* @@ -286,10 +308,11 @@ STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -303,21 +326,39 @@ /* * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer * this state if the inode buffer allocation has not yet been committed * to the log as setting the XFS_BLI_INODE_BUF flag will prevent * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } @@ -344,6 +385,7 @@ ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); @@ -450,13 +492,21 @@ xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } } +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -485,6 +535,14 @@ trace_xfs_buf_item_push(bip); + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { + xfs_warn(bp->b_target->bt_mount, +"Failing async write on buffer block 0x%llx. Retrying async write.", + (long long)bp->b_bn); + } + if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -516,8 +574,9 @@ { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted, clean, i; - uint hold; + bool clean; + bool aborted; + int flags; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -528,23 +587,21 @@ * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - - /* Clear the per transaction state. */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { + if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -561,26 +618,41 @@ * be the only reference to the buf item, so we free it anyway * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. */ - clean = 1; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = 0; - break; + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } } } - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { - if (atomic_dec_and_test(&bip->bli_refcount)) { + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); } - } else - atomic_dec(&bip->bli_refcount); + } - if (!hold) + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); } @@ -653,7 +725,7 @@ bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), KM_SLEEP); if (!bip->bli_formats) - return ENOMEM; + return -ENOMEM; return 0; } @@ -674,13 +746,13 @@ * buffer (see xfs_buf_attach_iodone() below), then put the * buf log item at the front. */ -void +int xfs_buf_item_init( - xfs_buf_t *bp, - xfs_mount_t *mp) + struct xfs_buf *bp, + struct xfs_mount *mp) { - xfs_log_item_t *lip = bp->b_fspriv; - xfs_buf_log_item_t *bip; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_buf_log_item *bip; int chunks; int map_size; int error; @@ -694,12 +766,11 @@ */ ASSERT(bp->b_target->bt_mount == mp); if (lip != NULL && lip->li_type == XFS_LI_BUF) - return; + return 0; bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; - xfs_buf_hold(bp); /* * chunks is the number of XFS_BLF_CHUNK size pieces the buffer @@ -712,6 +783,11 @@ */ error = xfs_buf_item_get_format(bip, bp->b_map_count); ASSERT(error == 0); + if (error) { /* to stop gcc throwing set-but-unused warnings */ + kmem_zone_free(xfs_buf_item_zone, bip); + return error; + } + for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), @@ -724,20 +800,6 @@ bip->bli_formats[i].blf_map_size = map_size; } -#ifdef XFS_TRANS_DEBUG - /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. - */ - bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); - bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); -#endif - /* * Put the buf item into the list of items attached to the * buffer at the front. @@ -745,6 +807,8 @@ if (bp->b_fspriv) bip->bli_item.li_bio_list = bp->b_fspriv; bp->b_fspriv = bip; + xfs_buf_hold(bp); + return 0; } @@ -752,9 +816,8 @@ * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ -void +static void xfs_buf_item_log_segment( - struct xfs_buf_log_item *bip, uint first, uint last, uint *map) @@ -846,12 +909,6 @@ struct xfs_buf *bp = bip->bli_buf; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * walk each buffer segment and mark them dirty appropriately. */ start = 0; @@ -868,7 +925,7 @@ if (end > last) end = last; - xfs_buf_item_log_segment(bip, first, end, + xfs_buf_item_log_segment(first, end, &bip->bli_formats[i].blf_data_map[0]); start += bp->b_maps[i].bm_len; @@ -877,7 +934,7 @@ /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -891,11 +948,6 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - kmem_free(bip->bli_logged); -#endif /* XFS_TRANS_DEBUG */ - xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -911,11 +963,11 @@ xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bip = bp->b_fspriv; bp->b_fspriv = bip->bli_item.li_bio_list; if (bp->b_fspriv == NULL) bp->b_iodone = NULL; @@ -1006,7 +1058,7 @@ static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!xfs_buf_geterror(bp))) + if (likely(!bp->b_error)) goto do_callbacks; /* @@ -1035,7 +1087,7 @@ * a way to shut the filesystem down if the writes keep failing. * * In practice we'll shut the filesystem down soon as non-transient - * erorrs tend to affect the whole device and a failing log write + * errors tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { @@ -1045,9 +1097,10 @@ xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!XFS_BUF_ISSTALE(bp)) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; - xfs_buf_iorequest(bp); + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; + xfs_buf_submit(bp); } else { xfs_buf_relse(bp); } @@ -1068,7 +1121,7 @@ xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /*