--- zzzz-none-000/linux-3.10.107/fs/ext4/page-io.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/ext4/page-io.c 2021-02-04 17:41:59.000000000 +0000 @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -18,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -46,46 +44,134 @@ } /* - * This function is called by ext4_evict_inode() to make sure there is - * no more pending I/O completion work left to do. + * Print a buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... */ -void ext4_ioend_shutdown(struct inode *inode) +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) { - wait_queue_head_t *wq = ext4_ioend_wq(inode); + int i; + struct bio_vec *bvec; - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); - /* - * We need to make sure the work structure is finished being - * used before we let the inode get destroyed. 
- */ - if (work_pending(&EXT4_I(inode)->i_unwritten_work)) - cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct page *data_page = NULL; + struct ext4_crypto_ctx *ctx = NULL; +#endif + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!page->mapping) { + /* The bounce data pages are unmapped. */ + data_page = page; + ctx = (struct ext4_crypto_ctx *)page_private(data_page); + page = ctx->w.control_page; + } +#endif + + if (bio->bi_error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (bio->bi_error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + ext4_restore_control_page(data_page); +#endif + end_page_writeback(page); + } + } +} + +static void ext4_release_io_end(ext4_io_end_t *io_end) +{ + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); 
+ } + kmem_cache_free(io_end_cachep, io_end); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + struct inode *inode = io_end->inode; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } -/* check a range of space and convert unwritten extents to written. */ +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_release_io_end()). 
+ */ static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + handle_t *handle = io->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -93,30 +179,22 @@ "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } -static void dump_completed_IO(struct inode *inode) +static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { - ext4_debug("inode %lu completed_io list is empty\n", - inode->i_ino); + if (list_empty(head)) return; - } - ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -130,23 +208,26 @@ } /* Add the io_end to per-inode completed end_io list. 
*/ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); struct workqueue_struct *wq; unsigned long flags; - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) - queue_work(wq, &ei->i_unwritten_work); - list_add_tail(&io_end->list, &ei->i_completed_io_list); + wq = sbi->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode) +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) { ext4_io_end_t *io; struct list_head unwritten; @@ -155,8 +236,8 @@ int err, ret = 0; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - dump_completed_IO(inode); - list_replace_init(&ei->i_completed_io_list, &unwritten); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { @@ -167,30 +248,18 @@ err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } /* - * work on completed aio dio IO, to convert unwritten extents to extents + * work on completed IO, to convert unwritten extents to written extents */ -void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_rsv_work(struct work_struct *work) { struct ext4_inode_info *ei = container_of(work, struct 
ext4_inode_info, - i_unwritten_work); - ext4_do_flush_completed_IO(&ei->vfs_inode); -} - -int ext4_flush_unwritten_io(struct inode *inode) -{ - int ret; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && - !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode); - ext4_unwritten_wait(inode); - return ret; + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -200,98 +269,84 @@ atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... 
- */ -static void buffer_io_error(struct buffer_head *bh) +void ext4_put_io_end_defer(ext4_io_end_t *io_end) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; } -static void ext4_end_bio(struct bio *bio, int error) +/* BIO completion function for page writeback */ +static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; - struct inode *inode; - int i; - int blocksize; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); - inode = io_end->inode; - blocksize = 1 << inode->i_blkbits; - bio->bi_private = NULL; bio->bi_end_io = NULL; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = 0; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; - struct page *page = bvec->bv_page; - struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; - unsigned under_io = 0; - unsigned long flags; - if (!page) - continue; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - } - bh = head = page_buffers(page); - /* - * We check all buffers in the page under BH_Uptodate_Lock - * to avoid races with 
other end io clearing async_write flags - */ - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); - do { - if (bh_offset(bh) < bio_start || - bh_offset(bh) + blocksize > bio_end) { - if (buffer_async_write(bh)) - under_io++; - continue; - } - clear_buffer_async_write(bh); - if (error) - buffer_io_error(bh); - } while ((bh = bh->b_this_page) != head); - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); - local_irq_restore(flags); - if (!under_io) - end_page_writeback(page); - } - bio_put(bio); + if (bio->bi_error) { + struct inode *inode = io_end->inode; - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + bio->bi_error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, bio->bi_error); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); } - - ext4_add_complete_io(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -299,49 +354,46 @@ struct bio *bio = io->io_bio; if (bio) { + int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC : WRITE; bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + submit_bio(io_op, io->io_bio); bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_wbc = wbc; + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - int nvecs = bio_get_nr_vecs(bh->b_bdev); +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) +{ struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + if (!bio) return -ENOMEM; - bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + wbc_init_bio(io->io_wbc, bio); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct writeback_control *wbc, + struct page *page, struct buffer_head *bh) { - ext4_io_end_t *io_end; int ret; if (io->io_bio && bh->b_blocknr != io->io_next_block) { @@ -349,42 +401,45 @@ ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if (test_clear_buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + wbc_account_io(io->io_wbc, page, bh->b_size); + io->io_next_block++; return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc) + struct writeback_control *wbc, + bool keep_towrite) { + struct page *data_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; + int nr_to_submit = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - set_page_writeback(page); + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); ClearPageError(page); /* - * Comments copied from block_write_full_page_endio: + * Comments copied from block_write_full_page: * * The page straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. 
"A file is mapped @@ -423,21 +478,33 @@ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } set_buffer_async_write(bh); + nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Now submit buffers to write */ bh = head = page_buffers(page); + + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && + nr_to_submit) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); + data_page = NULL; + goto out; + } + } + + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, wbc, bh); + ret = io_submit_add_bh(io, inode, + data_page ? data_page : page, bh); if (ret) { /* * We only get here on ENOMEM. Not much else * we can do but mark the page as dirty, and * better luck next time. */ - redirty_page_for_writepage(wbc, page); break; } nr_submitted++; @@ -446,6 +513,11 @@ /* Error stopped previous loop? Clean up buffers... */ if (ret) { + out: + if (data_page) + ext4_restore_control_page(data_page); + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); do { clear_buffer_async_write(bh); bh = bh->b_this_page;