--- zzzz-none-000/linux-3.10.107/fs/xfs/xfs_inode.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/xfs/xfs_inode.c 2021-02-04 17:41:59.000000000 +0000 @@ -19,36 +19,36 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_attr.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_utils.h" #include "xfs_quota.h" #include "xfs_filestream.h" -#include "xfs_vnodeops.h" #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" -kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; /* @@ -58,9 +58,8 @@ #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); /* * helper function to extract extent size hint from inode @@ -77,69 +76,75 @@ } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. - * - * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. + * + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) { - xfs_iunlock(ip, lock_mode); + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } /* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. + * + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. + * + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order mean that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). */ void xfs_ilock( @@ -155,15 +160,22 @@ */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -196,9 +208,11 @@ */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { if (!mrtryupdate(&ip->i_iolock)) @@ -207,21 +221,35 @@ if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -249,9 +277,11 @@ */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -259,6 +289,11 @@ else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -276,11 +311,14 @@ xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -299,6 +337,12 @@ return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -310,621 +354,258 @@ } #endif -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} - #ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + /* - * Make sure that the extents in the given memory buffer - * are valid. + * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when + * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined + * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build + * errors and warnings. */ -STATIC void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) +#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP) +static bool +xfs_lockdep_subclass_ok( + int subclass) { - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; - - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } + return subclass < MAX_LOCKDEP_SUBCLASSES; } -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ +#else +#define xfs_lockdep_subclass_ok(subclass) (true) +#endif /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This can be called for any type of inode lock combination, including + * parent locking. Care must be taken to ensure we don't overrun the subclass + * storage fields in the class mask we build. */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) { - int i; - int j; - xfs_dinode_t *dip; + int class = 0; - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | + XFS_ILOCK_RTSUM))); + ASSERT(xfs_lockdep_subclass_ok(subclass)); - for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", - bp); - ASSERT(dip->di_next_unlinked); - } + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); + ASSERT(xfs_lockdep_subclass_ok(subclass + + XFS_IOLOCK_PARENT_VAL)); + class += subclass << XFS_IOLOCK_SHIFT; + if (lock_mode & XFS_IOLOCK_PARENT) + class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } -} -#endif -static void -xfs_inode_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - int i; - int ni; - - /* - * Validate the magic number and version of every inode in the buffer - */ - ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - xfs_buf_ioerror(bp, EFSCORRUPTED); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - } + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); + class += subclass << XFS_MMAPLOCK_SHIFT; } - xfs_inobp_check(mp, bp); -} + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { + ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); + class += subclass << XFS_ILOCK_SHIFT; + } -static void -xfs_inode_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); -} - -static void -xfs_inode_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); + return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; } -const struct xfs_buf_ops xfs_inode_buf_ops = { - .verify_read = xfs_inode_buf_read_verify, - .verify_write = xfs_inode_buf_write_verify, -}; - - /* - * This routine is called to map an inode to the buffer containing the on-disk - * version of the inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dipp parameter it returns a - * pointer to the on-disk inode within that buffer. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. * - * If a non-zero error is returned, then the contents of bpp and dipp are - * undefined. + * xfs_lock_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ -int -xfs_imap_to_bp( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) { - struct xfs_buf *bp; - int error; - - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, - &xfs_inode_buf_ops); - if (error) { - if (error == EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; - } + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; - if (error == EFSCORRUPTED && - (iget_flags & XFS_IGET_UNTRUSTED)) - return XFS_ERROR(EINVAL); + /* + * Currently supports between 2 and 5 inodes with exclusive locking. We + * support an arbitrary depth of locking here, but absolute limits on + * inodes depend on the the type of locking and the limits placed by + * lockdep annotations in xfs_lock_inumorder. These are all checked by + * the asserts. + */ + ASSERT(ips && inodes >= 2 && inodes <= 5); + ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | + XFS_ILOCK_EXCL)); + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED))); + ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || + inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || + inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || + inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - return error; - } + if (lock_mode & XFS_IOLOCK_EXCL) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); + } else if (lock_mode & XFS_MMAPLOCK_EXCL) + ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); - *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); - return 0; -} + try_lock = 0; + i = 0; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); -/* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). - */ -STATIC int -xfs_iformat( - xfs_inode_t *ip, - xfs_dinode_t *dip) -{ - xfs_attr_shortform_t *atp; - int size; - int error = 0; - xfs_fsize_t di_size; + if (i && (ips[i] == ips[i - 1])) /* Already locked */ + continue; - if (unlikely(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks))) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents)), - (unsigned long long) - be64_to_cpu(dip->di_nblocks)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { - xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - dip->di_forkoff); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && - !ip->i_mount->m_rtdev_targp)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, has realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + /* + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. + */ + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + try_lock++; + } + } - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; } - ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); - break; - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; + + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { /* - * no local regular files yet + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. */ - if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - di_size = be64_to_cpu(dip->di_size); - if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + xfs_iunlock(ips[j], lock_mode); } - break; - default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (error) { - return error; - } - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad attr fork size %Ld).", - (unsigned long long) ip->i_ino, - (long long) size); - XFS_CORRUPTION_ERROR("xfs_iformat(8)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif } - - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - error = XFS_ERROR(EFSCORRUPTED); - break; - } - if (error) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); + i = 0; + try_lock = 0; + goto again; } - return error; -} -/* - * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. - */ -STATIC int -xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) -{ - xfs_ifork_t *ifp; - int real_size; - - /* - * If the size is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", - (unsigned long long) ip->i_ino, size, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; - return 0; +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif } /* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. - */ -STATIC int -xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); - - /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; - if (size) { - dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); - for (i = 0; i < nex; i++, dp++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = get_unaligned_be64(&dp->l0); - ep->l1 = get_unaligned_be64(&dp->l1); - } - XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. + */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; } -/* - * The file has too many extents to fit into - * the inode, so they are in B-tree format. - * Allocate a buffer for the root of the B-tree - * and copy the root into it. The i_extents - * field will remain NULL until all of the - * extents are read in (when they are needed). - */ -STATIC int -xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_bmdr_block_t *dfp; - xfs_ifork_t *ifp; - /* REFERENCED */ - int nrecs; - int size; - - ifp = XFS_IFORK_PTR(ip, whichfork); - dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(mp, dfp); - nrecs = be16_to_cpu(dfp->bb_numrecs); - - /* - * blow out if -- fork has less extents than can fit in - * fork (fork shouldn't be a btree format), root btree - * block has more records than can fit into the fork, - * or the number of extents is greater than the number of - * blocks. - */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || - XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - mp, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); - ASSERT(ifp->if_broot != NULL); - /* - * Copy and convert from the on-disk structure - * to the in-memory structure. - */ - xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; - return 0; -} +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); -STATIC void -xfs_dinode_from_disk( - xfs_icdinode_t *to, - xfs_dinode_t *from) -{ - to->di_magic = be16_to_cpu(from->di_magic); - to->di_mode = be16_to_cpu(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = be16_to_cpu(from->di_onlink); - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); - to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = be16_to_cpu(from->di_flushiter); - to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); - to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); - to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); - to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); - to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); - to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); - to->di_size = be64_to_cpu(from->di_size); - to->di_nblocks = be64_to_cpu(from->di_nblocks); - to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = be32_to_cpu(from->di_dmevmask); - to->di_dmstate = be16_to_cpu(from->di_dmstate); - to->di_flags = be16_to_cpu(from->di_flags); - to->di_gen = be32_to_cpu(from->di_gen); - - if (to->di_version == 3) { - to->di_changecount = be64_to_cpu(from->di_changecount); - to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); - to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_ino = be64_to_cpu(from->di_ino); - to->di_lsn = be64_to_cpu(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - } -} + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); -void -xfs_dinode_to_disk( - xfs_dinode_t *to, - xfs_icdinode_t *from) -{ - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = cpu_to_be16(from->di_flushiter); - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - } + finish_wait(wq, &wait.wait); } STATIC uint @@ -985,210 +666,49 @@ (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } -static bool -xfs_dinode_verify( - struct xfs_mount *mp, - struct xfs_inode *ip, - struct xfs_dinode *dip) -{ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) - return false; - - /* only version 3 or greater inodes are extensively verified here */ - if (dip->di_version < 3) - return true; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc))) - return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) - return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) - return false; - return true; -} - -void -xfs_dinode_calc_crc( - struct xfs_mount *mp, - struct xfs_dinode *dip) -{ - __uint32_t crc; - - if (dip->di_version < 3) - return; - - ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc)); - dip->di_crc = xfs_end_cksum(crc); -} - /* - * Read the disk inode attributes into the in-core inode structure. + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. */ int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) { - xfs_buf_t *bp; - xfs_dinode_t *dip; - int error; - - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; - - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; - - /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld failed", - __func__, ip->i_ino); - - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); - error = XFS_ERROR(EFSCORRUPTED); - goto out_brelse; - } - - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_dinode_from_disk(&ip->i_d, dip); - error = xfs_iformat(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - ip->i_d.di_magic = be16_to_cpu(dip->di_magic); - ip->i_d.di_version = dip->di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + xfs_ino_t inum; + int error; - if (dip->di_version == 3) { - ip->i_d.di_ino = be64_to_cpu(dip->di_ino); - uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); - } + trace_xfs_lookup(dp, name); - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - ip->i_d.di_mode = 0; - } + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (ip->i_d.di_version == 1) { - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - xfs_set_projid(ip, 0); - } + xfs_ilock(dp, XFS_IOLOCK_SHARED); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + if (error) + goto out_unlock; - ip->i_delayed_blks = 0; + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return 0; - /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + *ipp = NULL; return error; } /* - * Read in extents from a btree-format inode. - * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. - */ -int -xfs_iread_extents( - xfs_trans_t *tp, - xfs_inode_t *ip, - int whichfork) -{ - int error; - xfs_ifork_t *ifp; - xfs_extnum_t nextents; - - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - ifp = XFS_IFORK_PTR(ip, whichfork); - - /* - * We know that the size is valid (it's checked in iformat_btree) - */ - ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; - xfs_iext_add(ifp, 0, nextents); - error = xfs_bmap_read_extents(tp, ip, whichfork); - if (error) { - xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; - return error; - } - xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); - return 0; -} - -/* * Allocate an inode on disk and return a copy of its in-core version. * The in-core inode is locked exclusively. Set mode, nlink, and rdev * appropriately within the inode. The uid and gid for the inode are @@ -1236,8 +756,7 @@ xfs_inode_t *ip; uint flags; int error; - timespec_t tv; - int filestreams = 0; + struct timespec tv; /* * Call the space management code to pick @@ -1264,36 +783,23 @@ return error; ASSERT(ip != NULL); + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); - ip->i_d.di_uid = current_fsuid(); - ip->i_d.di_gid = current_fsgid(); + ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); + ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - /* - * If the superblock version is up to where we support new format - * inodes and this is currently an old format inode, then change - * the inode version number now. This way we only do the conversion - * here rather than here and in the flush/logging code. - */ - if (xfs_sb_version_hasnlink(&mp->m_sb) && - ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - /* - * We've already zeroed the old link count, the projid field, - * and the pad field. - */ - } - - /* - * Project ids won't be stored on disk if we are using a version 1 inode. - */ - if ((prid != 0) && (ip->i_d.di_version == 1)) - xfs_bump_ino_vers2(tp, ip); - if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { @@ -1308,7 +814,7 @@ */ if ((irix_sgid_inherit) && (ip->i_d.di_mode & S_ISGID) && - (!in_group_p((gid_t)ip->i_d.di_gid))) { + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { ip->i_d.di_mode &= ~S_ISGID; } @@ -1316,7 +822,7 @@ ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - nanotime(&tv); + tv = current_fs_time(mp->m_super); ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; ip->i_d.di_atime = ip->i_d.di_mtime; @@ -1332,7 +838,7 @@ if (ip->i_d.di_version == 3) { ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); ip->i_d.di_crc = 0; ip->i_d.di_changecount = 1; ip->i_d.di_lsn = 0; @@ -1354,13 +860,6 @@ flags |= XFS_ILOG_DEV; break; case S_IFREG: - /* - * we can't set up filestreams until after the VFS inode - * is set up properly. - */ - if (pip && xfs_inode_is_filestream(pip)) - filestreams = 1; - /* fall through */ case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; @@ -1372,6 +871,8 @@ di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; @@ -1392,8 +893,6 @@ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; @@ -1423,23 +922,605 @@ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); - /* now we have set up the vfs inode we can associate the filestream */ - if (filestreams) { - error = xfs_filestream_associate(pip, ip); - if (error < 0) - return -error; - if (!error) - xfs_iflags_set(ip, XFS_IFILESTREAM); + *ipp = ip; + return 0; +} + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + int code; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return -ENOSPC; + } + + /* + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (ialloc_context) { + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) + *committed = 1; + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = tp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; } *ipp = ip; + *tpp = tp; + return 0; } /* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_mkdir; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_create; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(mp); + error = xfs_trans_reserve(tp, tres, resblks, 0); + } + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) + goto out_trans_cancel; + + + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + goto out_trans_cancel; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) + goto out_trans_cancel; + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != -ENOSPC); + goto out_trans_cancel; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + return error; +} + +int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) + goto out_trans_cancel; + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) + goto out_trans_cancel; + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_cancel: + xfs_trans_cancel(tp); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) + goto error_return; + + xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = -EXDEV; + goto error_return; + } + + if (!resblks) { + error = xfs_dir_canenter(tp, tdp, target_name); + if (error) + goto error_return; + } + + xfs_bmap_init(&free_list, &first_block); + + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto error_return; + } + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto error_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto error_return; + } + + return xfs_trans_commit(tp); + + error_return: + xfs_trans_cancel(tp); + std_return: + return error; +} + +/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1469,7 +1550,6 @@ { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1527,32 +1607,7 @@ if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. - */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1578,6 +1633,317 @@ goto out; } +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (ip->i_delayed_blks > 0) { + error = filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != -EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive_truncate + * + * Called to perform a truncate when an inode becomes unlinked. + */ +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. + */ + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. + */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == -ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can do except + * to try to keep going. Make sure it's not a silent error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + xfs_bmap_cancel(&free_list); + } + error = xfs_trans_commit(tp); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +void +xfs_inactive( + xfs_inode_t *ip) +{ + struct xfs_mount *mp; + int error; + int truncate = 0; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return; + } + + mp = ip->i_mount; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; + } + + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return; + + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. If also blows away the in-core attribute fork. + */ + if (XFS_IFORK_Q(ip)) { + error = xfs_attr_inactive(ip); + if (error) + return; + } + + ASSERT(!ip->i_afp); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); + + /* + * Free the inode. + */ + error = xfs_inactive_ifree(ip); + if (error) + return; + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +} + /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1836,41 +2202,48 @@ } /* - * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; + + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. + */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } - for (j = 0; j < nbufs; j++, inum += ninodes) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1887,7 +2260,7 @@ XBF_UNMAPPED); if (!bp) - return ENOMEM; + return -ENOMEM; /* * This buffer may not have been correctly initialised as we @@ -1932,7 +2305,7 @@ * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -1992,6 +2365,7 @@ iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2028,10 +2402,7 @@ xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; - xfs_dinode_t *dip; - xfs_buf_t *ibp; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2044,14 +2415,13 @@ * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + error = xfs_difree(tp, ip->i_ino, flist, &xic); + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; @@ -2063,301 +2433,15 @@ * by reincarnations of this inode. */ ip->i_d.di_gen++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) - return error; - - /* - * Clear the on-disk di_mode. This is to prevent xfs_bulkstat - * from picking up this inode when it is reclaimed (its incore state - * initialzed but not flushed to disk yet). The in-core di_mode is - * already cleared and a corresponding transaction logged. - * The hack here just synchronizes the in-core to on-disk - * di_mode value in advance before the actual inode sync to disk. - * This is OK because the inode is already unlinked and would never - * change its di_mode again for this inode generation. - * This is a temporary hack that would require a proper fix - * in the future. - */ - dip->di_mode = 0; - - if (delete) { - error = xfs_ifree_cluster(ip, tp, first_ino); - } + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we adding records one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. - */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) -{ - struct xfs_mount *mp = ip->i_mount; - int cur_max; - xfs_ifork_t *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; - - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - ifp->if_broot_bytes = (int)new_size; - return; - } - - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), - KM_SLEEP | KM_NOFS); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); - memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); - return; - } - - /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. - */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - XFS_BMBT_BLOCK_LEN(ip->i_mount)); - } else { - new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; - } - - /* - * Only copy the records and pointers if there are any. - */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); - - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); - } - kmem_free(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); - return; -} - - -/* - * This is called when the amount of space needed for if_data - * is increased or decreased. The change in size is indicated by - * the number of bytes that need to be added or deleted in the - * byte_diff parameter. - * - * If the amount of space needed has decreased below the size of the - * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer - * to what is needed. - * - * ip -- the inode whose if_data area is changing - * byte_diff -- the change in the number of bytes, positive or negative, - * requested for the if_data array. - */ -void -xfs_idata_realloc( - xfs_inode_t *ip, - int byte_diff, - int whichfork) -{ - xfs_ifork_t *ifp; - int new_size; - int real_size; - - if (byte_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - new_size = (int)ifp->if_bytes + byte_diff; - ASSERT(new_size >= 0); - - if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data); - } - ifp->if_u1.if_data = NULL; - real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. - */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; - } else { - /* - * Stuck with malloc/realloc. - * For inline data, the underlying buffer must be - * a multiple of 4 bytes in size so that it can be - * logged and stay on word boundaries. We enforce - * that here. - */ - real_size = roundup(new_size, 4); - if (ifp->if_u1.if_data == NULL) { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - /* - * Only do the realloc if the underlying size - * is really changing. - */ - if (ifp->if_real_bytes != real_size) { - ifp->if_u1.if_data = - kmem_realloc(ifp->if_u1.if_data, - real_size, - ifp->if_real_bytes, - KM_SLEEP | KM_NOFS); - } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); - } - } - ifp->if_real_bytes = real_size; - ifp->if_bytes = new_size; - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); -} - -void -xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) -{ - xfs_ifork_t *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); - ifp->if_broot = NULL; - } - - /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { - ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_real_bytes = 0; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); - xfs_iext_destroy(ifp); - } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); - ASSERT(ifp->if_real_bytes == 0); - if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } -} - -/* * This is called to unpin an inode. The caller must have the inode locked * in at least shared mode so that the buffer cannot be subsequently pinned * once someone is waiting for it to be unpinned. @@ -2401,162 +2485,694 @@ } /* - * xfs_iextents_copy() - * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. - * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. + * + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. + * + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. Therefore we must drop the link counts before we remove the + * directory entry. + * + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. */ int -xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *dp, - int whichfork) +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) { - int copied; - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + uint resblks; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(ifp->if_bytes > 0); + trace_xfs_remove(dp, name); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); - ASSERT(nrecs > 0); - - /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. - */ - copied = 0; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - start_block = xfs_bmbt_get_startblock(ep); - if (isnullstartblock(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ - continue; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + else + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != -ENOSPC); + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = -ENOTEMPTY; + goto out_trans_cancel; } + if (!xfs_dir_isempty(ip)) { + error = -ENOTEMPTY; + goto out_trans_cancel; + } + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + goto out_trans_cancel; - /* Translate to on disk format */ - put_unaligned(cpu_to_be64(ep->l0), &dp->l0); - put_unaligned(cpu_to_be64(ep->l1), &dp->l1); - dp++; - copied++; + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } - ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto std_return; - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); + if (is_dir && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp); + std_return: + return error; } /* - * Each of the following cases stores data into the same region - * of the on-disk inode, so only one of them can be valid at - * any given time. While it is possible to have conflicting formats - * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is - * in EXTENTS format, this can only happen when the fork has - * changed formats after being modified but before being flushed. - * In these cases, the format always takes precedence, because the - * format indicates the current state of the fork. + * Enter all inodes for a rename transaction into a sorted array. */ -/*ARGSUSED*/ +#define __XFS_SORT_INODES 5 STATIC void -xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) +xfs_sort_for_rename( + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - char *cp; - xfs_ifork_t *ifp; - xfs_mount_t *mp; - static const short brootflag[2] = - { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; - static const short dataflag[2] = - { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; - static const short extflag[2] = - { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + int i, j; + + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); - if (!iip) - return; - ifp = XFS_IFORK_PTR(ip, whichfork); /* - * This can happen if we gave up in iformat in an error path, - * for the attribute fork. + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. */ - if (!ifp) { - ASSERT(whichfork == XFS_ATTR_FORK); - return; + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 5 elements to sort, so this is adequate.) + */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + struct xfs_inode *temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } } - cp = XFS_DFORK_PTR(dip, whichfork); - mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_fields & dataflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); +} + +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp); + return error; + } + + return xfs_trans_commit(tp); +} + +/* + * xfs_cross_rename() + * + * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall + */ +STATIC int +xfs_cross_rename( + struct xfs_trans *tp, + struct xfs_inode *dp1, + struct xfs_name *name1, + struct xfs_inode *ip1, + struct xfs_inode *dp2, + struct xfs_name *name2, + struct xfs_inode *ip2, + struct xfs_bmap_free *free_list, + xfs_fsblock_t *first_block, + int spaceres) +{ + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, + ip2->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, + ip1->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp1); + if (error) + goto out_trans_abort; + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } - break; - case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_fields & extflag[whichfork])); - if ((iip->ili_fields & extflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); - (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, - whichfork); + if (S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp2); + if (error) + goto out_trans_abort; + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; } - break; + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp); + return error; +} + +/* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* + * xfs_rename + */ +int +xfs_rename( + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) +{ + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int spaceres; + int error; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; + + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, + inodes, &num_inodes); + + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == -ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) + goto out_trans_cancel; + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) + goto out_trans_cancel; + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + if (!new_parent) + xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + else + xfs_lock_two_inodes(src_dp, target_dp, + XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); - case XFS_DINODE_FMT_BTREE: - if ((iip->ili_fields & brootflag[whichfork]) && - (ifp->if_broot_bytes > 0)) { - ASSERT(ifp->if_broot != NULL); - ASSERT(ifp->if_broot_bytes <= - (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ(ip))); - xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, - (xfs_bmdr_block_t *)cp, - XFS_DFORK_SIZE(dip, mp, whichfork)); + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = -EXDEV; + goto out_trans_cancel; + } + + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + goto out_trans_cancel; } - break; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error) + goto out_bmap_cancel; - case XFS_DINODE_FMT_DEV: - if (iip->ili_fields & XFS_ILOG_DEV) { - ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto out_bmap_cancel; } - break; + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_bmap_cancel; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - case XFS_DINODE_FMT_UUID: - if (iip->ili_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(XFS_DFORK_DPTR(dip), - &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_bmap_cancel; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_bmap_cancel; } - break; + } /* target_ip != NULL */ - default: - ASSERT(0); - break; + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != -EEXIST); + if (error) + goto out_bmap_cancel; } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto out_bmap_cancel; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, + &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_bmap_cancel; + + /* + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. + */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_bmap_cancel; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_bmap_cancel; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + if (wip) + IRELE(wip); + return error; } STATIC int @@ -2578,13 +3194,13 @@ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ @@ -2657,8 +3273,8 @@ } if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); } out_free: @@ -2695,8 +3311,8 @@ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioerror(bp, EIO); - xfs_buf_ioend(bp, 0); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); } else { xfs_buf_stale(bp); xfs_buf_relse(bp); @@ -2709,7 +3325,7 @@ xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); - return XFS_ERROR(EFSCORRUPTED); + return -EFSCORRUPTED; } /* @@ -2731,7 +3347,7 @@ struct xfs_dinode *dip; int error; - XFS_STATS_INC(xs_iflush_count); + XFS_STATS_INC(mp, xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -2764,7 +3380,7 @@ * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); + error = -EIO; goto abort_out; } @@ -2779,7 +3395,7 @@ */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error == EAGAIN) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } @@ -2816,7 +3432,7 @@ xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: - error = XFS_ERROR(EFSCORRUPTED); + error = -EFSCORRUPTED; abort_out: /* * Unlocks the flush lock @@ -2825,7 +3441,6 @@ return error; } - STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -2840,9 +3455,10 @@ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { @@ -2898,12 +3514,18 @@ __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. This is redundant as we now - * log every change and hence this can't happen. Still, it doesn't hurt. + * Inode item log recovery for v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk @@ -2917,40 +3539,9 @@ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - /* - * If this is really an old format inode and the superblock version - * has not been updated to support only new format inodes, then - * convert back to the old inode format. If the superblock version - * has been updated, then make the conversion permanent. - */ - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - dip->di_version = 2; - ip->i_d.di_onlink = 0; - dip->di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_pad[0]), 0, - sizeof(dip->di_pad)); - ASSERT(xfs_get_projid(ip) == 0); - } - } - - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* @@ -2980,6 +3571,7 @@ */ iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, @@ -3005,1074 +3597,5 @@ return 0; corrupt_out: - return XFS_ERROR(EFSCORRUPTED); -} - -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_host_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - -/* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new, /* items to insert */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; - xfs_extnum_t i; /* extent record index */ - - trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. - */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - int count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = count; - count -= MIN(count, (int)XFS_LINEAR_EXTS); - if (count) { - erp_idx++; - } - } - } - } - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. - * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff, /* number of extents to remove */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - trace_xfs_iext_remove(ip, idx, state, _RET_IP_); - - ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } + return -EFSCORRUPTED; } - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. - */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} - -/* - * Create, destroy, or resize a linear (direct) block of extents. - */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if (!is_power_of_2(new_size)){ - rnew_size = roundup_pow_of_two(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* - * Switch from the inline extent buffer to a direct - * extent list. Be sure to include the inline extent - * bytes in new_size. - */ - else { - new_size += ifp->if_bytes; - if (!is_power_of_2(new_size)) { - rnew_size = roundup_pow_of_two(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} - -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); - /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. - */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} - -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} - -/* - * Resize an extent indirection array to new_size bytes. - */ -STATIC void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -STATIC void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_pages(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} - -/* - * Return a pointer to the extent record for file system block bno. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_host_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. - */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return(erp); -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); - - if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. - */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Partial Compaction: Extents occupy less than 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. - */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memcpy(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } -} - -/* - * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. - */ -bool -xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) -{ - /* prealloc/delalloc exists only on regular files */ - if (!S_ISREG(ip->i_d.di_mode)) - return false; - - /* - * Zero sized files with no cached pages and delalloc blocks will not - * have speculative prealloc/delalloc blocks to remove. - */ - if (VFS_I(ip)->i_size == 0 && - VN_CACHED(VFS_I(ip)) == 0 && - ip->i_delayed_blks == 0) - return false; - - /* If we haven't read in the extent list, then don't do it now. */ - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return false; - - /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. - */ - if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) - return false; - - return true; -} -