--- zzzz-none-000/linux-3.10.107/fs/xfs/xfs_log.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/xfs/xfs_log.c 2021-02-04 17:41:59.000000000 +0000 @@ -17,25 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" -#include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" #include "xfs_cksum.h" +#include "xfs_sysfs.h" +#include "xfs_sb.h" kmem_zone_t *xfs_log_ticket_zone; @@ -111,7 +109,7 @@ STATIC void xlog_verify_dest_ptr( struct xlog *log, - char *ptr); + void *ptr); STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -257,7 +255,8 @@ struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) { list_add_tail(&tic->t_queue, &head->waiters); @@ -269,7 +268,7 @@ __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&head->lock); - XFS_STATS_INC(xs_sleep_logspace); + XFS_STATS_INC(log->l_mp, xs_sleep_logspace); trace_xfs_log_grant_sleep(log, tic); schedule(); @@ -284,7 +283,7 @@ return 0; shutdown: list_del_init(&tic->t_queue); - return XFS_ERROR(EIO); + return -EIO; } /* @@ -378,9 +377,9 @@ int error = 0; if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); + return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); /* * This is a new transaction on the ticket, so we need to change the @@ -447,15 +446,15 @@ ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); + return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, KM_SLEEP | KM_MAYFAIL); if (!tic) - return XFS_ERROR(ENOMEM); + return -ENOMEM; tic->t_trans_type = t_type; *ticp = tic; @@ -514,7 +513,7 @@ struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -527,14 +526,11 @@ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -542,7 +538,6 @@ * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -554,6 +549,7 @@ ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -591,7 +587,7 @@ { if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return EIO; + return -EIO; } return 0; @@ -614,23 +610,70 @@ xfs_daddr_t blk_offset, int num_bblks) { - int error; + int error = 0; + int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); if (IS_ERR(mp->m_log)) { - error = -PTR_ERR(mp->m_log); + error = PTR_ERR(mp->m_log); goto out; } /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = -EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = -EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = -EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, "Log size out of supported range."); + xfs_crit(mp, +"Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); + } + + /* * Initialize the AIL now we have a log. */ error = xfs_trans_ail_init(mp); @@ -657,10 +700,16 @@ if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); + xlog_recover_cancel(mp->m_log); goto out_destroy_ail; } } + error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + "log"); + if (error) + goto out_destroy_ail; + /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; @@ -692,18 +741,35 @@ * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp) +xfs_log_mount_finish( + struct xfs_mount *mp) { int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); - } else { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; } + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + + return error; +} + +/* + * The mount has failed. Cancel the recovery if it hasn't completed and destroy + * the log. + */ +int +xfs_log_mount_cancel( + struct xfs_mount *mp) +{ + int error; + + error = xlog_recover_cancel(mp->m_log); + xfs_log_unmount(mp); return error; } @@ -720,7 +786,7 @@ * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now since that particular field isn't - * currently architecture converted and "nUmount" is a bit foo. + * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ @@ -901,6 +967,9 @@ xfs_log_quiesce(mp); xfs_trans_ail_destroy(mp); + + xfs_sysfs_del(&mp->m_log->l_kobj); + xlog_dealloc_log(mp->m_log); } @@ -954,25 +1023,32 @@ } /* - * Determine if we have a transaction that has gone to disk - * that needs to be covered. To begin the transition to the idle state - * firstly the log needs to be idle (no AIL and nothing in the iclogs). - * If we are then in a state where covering is needed, the caller is informed - * that dummy transactions are required to move the log into the idle state. - * - * Because this is called as part of the sync process, we should also indicate - * that dummy transactions should be issued in anything but the covered or - * idle states. This ensures that the log tail is accurately reflected in - * the log at the end of the sync, hence if a crash occurrs avoids replay - * of transactions where the metadata is already on disk. + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. + * + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. */ int xfs_log_need_covered(xfs_mount_t *mp) { - int needed = 0; struct xlog *log = mp->m_log; + int needed = 0; + + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) + return 0; - if (!xfs_fs_writable(mp)) + if (!xlog_cil_empty(log)) return 0; spin_lock(&log->l_icloglock); @@ -983,14 +1059,17 @@ break; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: - if (!xfs_ail_min_lsn(log->l_ailp) && - xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else - log->l_covered_state = XLOG_STATE_COVER_DONE2; - } - /* FALLTHRU */ + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; default: needed = 1; break; @@ -1022,6 +1101,7 @@ tail_lsn = lip->li_lsn; else tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } @@ -1080,11 +1160,13 @@ * In this case we just want to return the size of the * log as the amount of space left. */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); + xfs_alert(log->l_mp, + " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); xfs_alert(log->l_mp, - "xlog_space_left: head behind tail\n" - " tail_cycle = %d, tail_bytes = %d\n" - " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, head_cycle, head_bytes); + " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } @@ -1108,7 +1190,7 @@ /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); @@ -1126,11 +1208,14 @@ /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1223,9 +1308,20 @@ struct xfs_mount *mp = log->l_mp; /* dgc: errors ignored - not fatal and nowhere to report them */ - if (xfs_log_need_covered(mp)) - xfs_fs_log_dummy(mp); - else + if (xfs_log_need_covered(mp)) { + /* + * Dump a transaction into the log that contains no real change. + * This is needed to stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty + * state back up into the VFS and then periodic inode flushing + * will prevent log covering from making progress. Hence we + * synchronously log the superblock instead to ensure the + * superblock is immediately unpinned and can be written back. + */ + xfs_sync_sb(mp, true); + } else xfs_log_force(mp, 0); /* start pushing all the metadata that is currently dirty */ @@ -1253,7 +1349,7 @@ xlog_in_core_t *iclog, *prev_iclog=NULL; xfs_buf_t *bp; int i; - int error = ENOMEM; + int error = -ENOMEM; uint log2_size = 0; log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); @@ -1280,7 +1376,7 @@ xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head); - error = EFSCORRUPTED; + error = -EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { @@ -1309,12 +1405,28 @@ xlog_get_iclog_buffer_size(mp, log); - error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); + /* + * Use a NULL block for the extra log buffer used during splits so that + * it will trigger errors if we ever try to do IO on it without first + * having set it up properly. + */ + error = -ENOMEM; + bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1343,11 +1455,16 @@ if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1367,7 +1484,6 @@ iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1393,7 +1509,7 @@ out_free_log: kmem_free(log); out: - return ERR_PTR(-error); + return ERR_PTR(error); } /* xlog_alloc_log */ @@ -1503,7 +1619,7 @@ int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -1556,8 +1672,13 @@ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; + int xheads; - for (i = 1; i < log->l_iclog_heads; i++) { + xheads = size / XLOG_HEADER_CYCLE_SIZE; + if (size % XLOG_HEADER_CYCLE_SIZE) + xheads++; + + for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, sizeof(struct xlog_rec_ext_header)); } @@ -1576,6 +1697,12 @@ * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1583,19 +1710,21 @@ { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { - xfs_buf_ioerror(bp, EIO); + xfs_buf_ioerror(bp, -EIO); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); return 0; } @@ -1639,7 +1768,7 @@ int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); int size; - XFS_STATS_INC(xs_log_writes); + XFS_STATS_INC(log->l_mp, xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ @@ -1676,7 +1805,7 @@ bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); - XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { @@ -1792,14 +1921,28 @@ xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; il_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); @@ -1910,30 +2053,28 @@ "SWAPEXT" }; - xfs_warn(mp, - "xlog_write: reservation summary:\n" - " trans type = %s (%u)\n" - " unit res = %d bytes\n" - " current res = %d bytes\n" - " total reg = %u bytes (o/flow = %u bytes)\n" - " ophdrs = %u (ophdr space = %u bytes)\n" - " ophdr + reg = %u bytes\n" - " num regions = %u\n", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, " trans type = %s (%u)", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), - ticket->t_trans_type, - ticket->t_unit_res, - ticket->t_curr_res, - ticket->t_res_arr_sum, ticket->t_res_o_flow, - ticket->t_res_num_ophdrs, ophdr_spc, - ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, - ticket->t_res_num); + ticket->t_trans_type); + xfs_warn(mp, " unit res = %d bytes", + ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", + ticket->t_curr_res); + xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", + ticket->t_res_arr_sum, ticket->t_res_o_flow); + xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", + ticket->t_res_num_ophdrs, ophdr_spc); + xfs_warn(mp, " ophdr + reg = %u bytes", + ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); + xfs_warn(mp, " num regions = %u", + ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); @@ -1941,7 +2082,7 @@ xfs_alert_tag(mp, XFS_PTAG_LOGRES, "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); } /* @@ -1963,6 +2104,10 @@ headers++; for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + headers += lv->lv_niovecs; for (i = 0; i < lv->lv_niovecs; i++) { @@ -2040,7 +2185,7 @@ * Set up the parameters of the region copy into the log. This has * to handle region write split across multiple log buffers - this * state is kept external to this function so that this code can - * can be written in an obvious, self documenting manner. + * be written in an obvious, self documenting manner. */ static int xlog_write_setup_copy( @@ -2216,7 +2361,7 @@ index = 0; lv = log_vector; vecp = lv->lv_iovecp; - while (lv && index < lv->lv_niovecs) { + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2236,13 +2381,22 @@ * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ - while (lv && index < lv->lv_niovecs) { - struct xfs_log_iovec *reg = &vecp[index]; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; int start_rec_copy; int copy_len; int copy_off; + bool ordered = false; + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + + reg = &vecp[index]; ASSERT(reg->i_len % sizeof(__int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); @@ -2255,7 +2409,7 @@ ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); if (!ophdr) - return XFS_ERROR(EIO); + return -EIO; xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); @@ -2268,11 +2422,20 @@ &partial_copy_len); xlog_verify_dest_ptr(log, ptr); - /* copy region */ + /* + * Copy region. + * + * Unmount records just log an opheader, so can have + * empty payloads with no data region to copy. Hence we + * only copy the payload if the vector says it has data + * to copy. + */ ASSERT(copy_len >= 0); - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); - + if (copy_len > 0) { + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, + copy_len); + } copy_len += start_rec_copy + sizeof(xlog_op_header_t); record_cnt++; data_cnt += contwr ? copy_len : 0; @@ -2302,12 +2465,13 @@ break; if (++index == lv->lv_niovecs) { +next_lv: lv = lv->lv_next; index = 0; if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0) { + if (record_cnt == 0 && ordered == false) { if (!lv) return 0; break; @@ -2753,12 +2917,12 @@ spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } iclog = log->l_iclog; if (iclog->ic_state != XLOG_STATE_ACTIVE) { - XFS_STATS_INC(xs_log_noiclogs); + XFS_STATS_INC(log->l_mp, xs_log_noiclogs); /* Wait for log writes to have flushed */ xlog_wait(&log->l_flush_wait, &log->l_icloglock); @@ -2941,7 +3105,7 @@ int sync = 0; /* do we sync? */ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; ASSERT(atomic_read(&iclog->ic_refcnt) > 0); if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) @@ -2949,7 +3113,7 @@ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); @@ -3010,11 +3174,19 @@ } if (log->l_curr_block >= log->l_logBBsize) { + /* + * Rewind the current block before the cycle is bumped to make + * sure that the combined LSN never transiently moves forward + * when the log wraps to the next cycle. This is to support the + * unlocked sample of these fields from xlog_valid_lsn(). Most + * other cases should acquire l_icloglock. + */ + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + smp_wmb(); log->l_curr_cycle++; if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) log->l_curr_cycle++; - log->l_curr_block -= log->l_logBBsize; - ASSERT(log->l_curr_block >= 0); } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; @@ -3057,7 +3229,7 @@ struct xlog_in_core *iclog; xfs_lsn_t lsn; - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); xlog_cil_force(log); @@ -3066,7 +3238,7 @@ iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } /* If the head iclog is not active nor dirty, we just attach @@ -3104,7 +3276,7 @@ spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; @@ -3140,9 +3312,9 @@ */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're @@ -3150,7 +3322,7 @@ * and the memory read should be atomic. */ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; } else { @@ -3207,7 +3379,7 @@ ASSERT(lsn != 0); - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) @@ -3218,7 +3390,7 @@ iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } do { @@ -3256,7 +3428,7 @@ (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); @@ -3269,7 +3441,7 @@ xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; spin_lock(&log->l_icloglock); @@ -3284,9 +3456,9 @@ */ if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); - return XFS_ERROR(EIO); + return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're @@ -3294,7 +3466,7 @@ * and the memory read should be atomic. */ if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); + return -EIO; if (log_flushed) *log_flushed = 1; @@ -3377,24 +3549,17 @@ } /* - * Allocate and initialise a new log ticket. + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. */ -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int cnt, - char client, - bool permanent, - xfs_km_flags_t alloc_flags) -{ - struct xlog_ticket *tic; - uint num_headers; - int iclog_space; - - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3469,20 +3634,43 @@ unit_bytes += log->l_iclog_hsize; /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ - unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + unit_bytes += 2 * mp->m_sb.sb_logsunit; } else { /* BB roundoff */ - unit_bytes += 2*BBSIZE; + unit_bytes += 2 * BBSIZE; } + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + atomic_set(&tic->t_ref, 1); tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); - tic->t_unit_res = unit_bytes; - tic->t_curr_res = unit_bytes; + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); @@ -3513,7 +3701,7 @@ void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3616,9 +3804,8 @@ xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3626,11 +3813,9 @@ /* check validity of iclog pointers */ spin_lock(&log->l_icloglock); icptr = log->l_iclog; - for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == NULL) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); - icptr = icptr->ic_next; - } + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + if (icptr != log->l_iclog) xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); spin_unlock(&log->l_icloglock); @@ -3639,9 +3824,9 @@ if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3649,20 +3834,19 @@ /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3680,13 +3864,13 @@ (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3732,18 +3916,17 @@ * This is called from xfs_force_shutdown, when we're forcibly * shutting down the filesystem, typically because of an IO error. * Our main objectives here are to make sure that: - * a. the filesystem gets marked 'SHUTDOWN' for all interested + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested * parties to find out, 'atomically'. - * b. those who're sleeping on log reservations, pinned objects and + * c. those who're sleeping on log reservations, pinned objects and * other resources get woken up, and be told the bad news. - * c. nothing new gets queued up after (a) and (b) are done. - * d. if !logerror, flush the iclogs to disk, then seal them off - * for business. - * - * Note: for delayed logging the !logerror case needs to flush the regions - * held in memory out to the iclogs before flushing them to disk. This needs - * to be done before the log is marked as shutdown, otherwise the flush to the - * iclogs will fail. + * d. nothing new gets queued up after (b) and (c) are done. + * + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. */ int xfs_log_force_umount( @@ -3775,16 +3958,16 @@ ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } - retval = 0; /* - * Flush the in memory commit item list before marking the log as - * being shut down. We need to do it in this order to ensure all the - * completed transactions are flushed to disk with the xfs_log_force() - * call below. + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. */ if (!logerror) - xlog_cil_force(log); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); /* * mark the filesystem and the as in a shutdown state and wake @@ -3796,18 +3979,11 @@ XFS_BUF_DONE(mp->m_sb_bp); /* - * This flag is sort of redundant because of the mount flag, but - * it's good to maintain the separation between the log and the rest - * of XFS. + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. */ log->l_flags |= XLOG_IO_ERROR; - - /* - * If we hit a log error, we want to mark all the iclogs IOERROR - * while we're still holding the loglock. - */ - if (logerror) - retval = xlog_state_ioerror(log); + retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); /* @@ -3820,23 +3996,13 @@ xlog_grant_head_wake_all(&log->l_reserve_head); xlog_grant_head_wake_all(&log->l_write_head); - if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(!logerror); - /* - * Force the incore logs to disk before shutting the - * log down completely. - */ - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); - - spin_lock(&log->l_icloglock); - retval = xlog_state_ioerror(log); - spin_unlock(&log->l_icloglock); - } /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG @@ -3874,3 +4040,45 @@ return 1; } +/* + * Verify that an LSN stamped into a piece of metadata is valid. This is + * intended for use in read verifiers on v5 superblocks. + */ +bool +xfs_log_check_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn) +{ + struct xlog *log = mp->m_log; + bool valid; + + /* + * norecovery mode skips mount-time log processing and unconditionally + * resets the in-core LSN. We can't validate in this mode, but + * modifications are not allowed anyways so just return true. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return true; + + /* + * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is + * handled by recovery and thus safe to ignore here. + */ + if (lsn == NULLCOMMITLSN) + return true; + + valid = xlog_valid_lsn(mp->m_log, lsn); + + /* warn the user about what's gone wrong before verifier failure */ + if (!valid) { + spin_lock(&log->l_icloglock); + xfs_warn(mp, +"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). " +"Please unmount and run xfs_repair (>= v4.3) to resolve.", + CYCLE_LSN(lsn), BLOCK_LSN(lsn), + log->l_curr_cycle, log->l_curr_block); + spin_unlock(&log->l_icloglock); + } + + return valid; +}