Diffstat (limited to 'fs/iomap/direct-io.c')
-rw-r--r--   fs/iomap/direct-io.c | 268
1 file changed, 141 insertions, 127 deletions
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d5d63efbd57..8e273408453a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -16,21 +16,13 @@
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
-#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
-#define IOMAP_DIO_CALLER_COMP	(1U << 26)
-#define IOMAP_DIO_INLINE_COMP	(1U << 27)
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 26)
+#define IOMAP_DIO_COMP_WORK	(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
 #define IOMAP_DIO_NEED_SYNC	(1U << 29)
 #define IOMAP_DIO_WRITE		(1U << 30)
 #define IOMAP_DIO_DIRTY		(1U << 31)
 
-/*
- * Used for sub block zeroing in iomap_dio_zero()
- */
-#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
-#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
-static struct page *zero_page;
-
 struct iomap_dio {
        struct kiocb		*iocb;
        const struct iomap_dio_ops *dops;
@@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
-static ssize_t iomap_dio_deferred_complete(void *data)
-{
-	return iomap_dio_complete(data);
-}
-
 static void iomap_dio_complete_work(struct work_struct *work)
 {
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio)
 
        WRITE_ONCE(dio->submit.waiter, NULL);
        blk_wake_io_task(waiter);
-	} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
-		WRITE_ONCE(iocb->private, NULL);
-		iomap_dio_complete_work(&dio->aio.work);
-	} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
-		/*
-		 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
-		 * schedule our completion that way to avoid an async punt to a
-		 * workqueue.
-		 */
-		/* only polled IO cares about private cleared */
-		iocb->private = dio;
-		iocb->dio_complete = iomap_dio_deferred_complete;
+		return;
+	}
 
-		/*
-		 * Invoke ->ki_complete() directly. We've assigned our
-		 * dio_complete callback handler, and since the issuer set
-		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
-		 * notice ->dio_complete being set and will defer calling that
-		 * handler until it can be done from a safe task context.
-		 *
-		 * Note that the 'res' being passed in here is not important
-		 * for this case. The actual completion value of the request
-		 * will be gotten from dio_complete when that is run by the
-		 * issuer.
-		 */
-		iocb->ki_complete(iocb, 0);
-	} else {
+	/*
+	 * Always run error completions in user context. These are not
+	 * performance critical and some code relies on taking sleeping locks
+	 * for error handling.
+	 */
+	if (dio->error)
+		dio->flags |= IOMAP_DIO_COMP_WORK;
+
+	/*
+	 * Never invalidate pages from this context to avoid deadlocks with
+	 * buffered I/O completions when called from the ioend workqueue,
+	 * or avoid sleeping when called directly from ->bi_end_io.
+	 * Tough luck if you hit the tiny race with someone dirtying the range
+	 * right between this check and the actual completion.
+	 */
+	if ((dio->flags & IOMAP_DIO_WRITE) &&
+	    !(dio->flags & IOMAP_DIO_COMP_WORK)) {
+		if (dio->iocb->ki_filp->f_mapping->nrpages)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+		else
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+	}
+
+	if (dio->flags & IOMAP_DIO_COMP_WORK) {
        struct inode *inode = file_inode(iocb->ki_filp);
 
        /*
@@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio)
        	 */
        	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
        	queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		return;
        }
+
+	WRITE_ONCE(iocb->private, NULL);
+	iomap_dio_complete_work(&dio->aio.work);
 }
 
 void iomap_dio_bio_end_io(struct bio *bio)
@@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
        	/*
        	 * Try to avoid another context switch for the completion given
        	 * that we are already called from the ioend completion
-		 * workqueue, but never invalidate pages from this thread to
-		 * avoid deadlocks with buffered I/O completions. Tough luck if
-		 * you hit the tiny race with someone dirtying the range now
-		 * between this check and the actual completion.
+		 * workqueue.
        	 */
-		if (!dio->iocb->ki_filp->f_mapping->nrpages) {
-			dio->flags |= IOMAP_DIO_INLINE_COMP;
-			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
-		}
-		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+		dio->flags &= ~IOMAP_DIO_COMP_WORK;
        	iomap_dio_done(dio);
        }
 
@@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 {
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct bio *bio;
+	struct folio *zero_folio = largest_zero_folio();
+	int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio));
 
        if (!len)
        	return 0;
+
        /*
-	 * Max block size supported is 64k
+	 * This limit shall never be reached as most filesystems have a
+	 * maximum blocksize of 64k.
        */
-	if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+	if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS))
        	return -EINVAL;
 
-	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
+	bio = iomap_dio_alloc_bio(iter, dio, nr_vecs,
+			REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
        			  GFP_KERNEL);
        bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
        bio->bi_private = dio;
        bio->bi_end_io = iomap_dio_bio_end_io;
 
-	__bio_add_page(bio, zero_page, len, 0);
+	while (len > 0) {
+		unsigned int io_len = min(len, folio_size(zero_folio));
+
+		bio_add_folio_nofail(bio, zero_folio, io_len, 0);
+		len -= io_len;
+	}
        iomap_dio_submit_bio(iter, dio, bio, pos);
-	return 0;
-}
 
-/*
- * Use a FUA write if we need datasync semantics and this is a pure data I/O
- * that doesn't require any metadata updates (including after I/O completion
- * such as unwritten extent conversion) and the underlying device either
- * doesn't have a volatile write cache or supports FUA.
- * This allows us to avoid cache flushes on I/O completion.
- */
-static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
-		struct iomap_dio *dio)
-{
-	if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
-		return false;
-	if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
-		return false;
-	return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
+	return 0;
 }
 
 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
        int nr_pages, ret = 0;
        u64 copied = 0;
        size_t orig_count;
+	unsigned int alignment;
+
+	/*
+	 * File systems that write out of place and always allocate new blocks
+	 * need each bio to be block aligned as that's the unit of allocation.
+	 */
+	if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+		alignment = fs_block_size;
+	else
+		alignment = bdev_logical_block_size(iomap->bdev);
 
-	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1))
+	if ((pos | length) & (alignment - 1))
        	return -EINVAL;
 
        if (dio->flags & IOMAP_DIO_WRITE) {
-		bio_opf |= REQ_OP_WRITE;
+		bool need_completion_work = true;
+
+		switch (iomap->type) {
+		case IOMAP_MAPPED:
+			/*
+			 * Directly mapped I/O does not inherently need to do
+			 * work at I/O completion time. But there are various
+			 * cases below where this will get set again.
+			 */
+			need_completion_work = false;
+			break;
+		case IOMAP_UNWRITTEN:
+			dio->flags |= IOMAP_DIO_UNWRITTEN;
+			need_zeroout = true;
+			break;
+		default:
+			break;
+		}
 
        	if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
        		/*
@@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
        		bio_opf |= REQ_ATOMIC;
        	}
 
-		if (iomap->type == IOMAP_UNWRITTEN) {
-			dio->flags |= IOMAP_DIO_UNWRITTEN;
-			need_zeroout = true;
-		}
-
-		if (iomap->flags & IOMAP_F_SHARED)
+		if (iomap->flags & IOMAP_F_SHARED) {
+			/*
+			 * Unsharing of needs to update metadata at I/O
+			 * completion time.
+			 */
+			need_completion_work = true;
        		dio->flags |= IOMAP_DIO_COW;
+		}
 
-		if (iomap->flags & IOMAP_F_NEW)
+		if (iomap->flags & IOMAP_F_NEW) {
+			/*
+			 * Newly allocated blocks might need recording in
+			 * metadata at I/O completion time.
+			 */
+			need_completion_work = true;
        		need_zeroout = true;
-		else if (iomap->type == IOMAP_MAPPED &&
-			 iomap_dio_can_use_fua(iomap, dio))
-			bio_opf |= REQ_FUA;
+		}
 
-		if (!(bio_opf & REQ_FUA))
-			dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+		/*
+		 * Use a FUA write if we need datasync semantics and this is a
+		 * pure overwrite that doesn't require any metadata updates.
+		 *
+		 * This allows us to avoid cache flushes on I/O completion.
+		 */
+		if (dio->flags & IOMAP_DIO_WRITE_THROUGH) {
+			if (!need_completion_work &&
+			    !(iomap->flags & IOMAP_F_DIRTY) &&
+			    (!bdev_write_cache(iomap->bdev) ||
+			     bdev_fua(iomap->bdev)))
+				bio_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+		}
 
        	/*
-		 * We can only do deferred completion for pure overwrites that
+		 * We can only do inline completion for pure overwrites that
        	 * don't require additional I/O at completion time.
        	 *
-		 * This rules out writes that need zeroing or extent conversion,
-		 * extend the file size, or issue metadata I/O or cache flushes
-		 * during completion processing.
+		 * This rules out writes that need zeroing or metdata updates to
+		 * convert unwritten or shared extents.
+		 *
+		 * Writes that extend i_size are also not supported, but this is
+		 * handled in __iomap_dio_rw().
        	 */
-		if (need_zeroout || (pos >= i_size_read(inode)) ||
-		    ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
-		     !(bio_opf & REQ_FUA)))
-			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+		if (need_completion_work)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+
+		bio_opf |= REQ_OP_WRITE;
        } else {
        	bio_opf |= REQ_OP_READ;
        }
@@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
        * ones we set for inline and deferred completions. If none of those
        * are available for this IO, clear the polled flag.
        */
-	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
+	if (dio->flags & IOMAP_DIO_COMP_WORK)
        	dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
        if (need_zeroout) {
@@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
        	bio->bi_end_io = iomap_dio_bio_end_io;
 
        	ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
-				bdev_logical_block_size(iomap->bdev) - 1);
+				alignment - 1);
        	if (unlikely(ret)) {
        		/*
        		 * We have to stop part way through an IO. We must fall
@@ -496,7 +520,7 @@ out:
        /* Undo iter limitation to current extent */
        iov_iter_reexpand(dio->submit.iter, orig_count - copied);
        if (copied)
-		return iomap_iter_advance(iter, &copied);
+		return iomap_iter_advance(iter, copied);
        return ret;
 }
 
@@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
        dio->size += length;
        if (!length)
        	return -EFAULT;
-	return iomap_iter_advance(iter, &length);
+	return iomap_iter_advance(iter, length);
 }
 
 static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
@@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
        dio->size += copied;
        if (!copied)
        	return -EFAULT;
-	return iomap_iter_advance(iomi, &copied);
+	return iomap_iter_advance(iomi, copied);
 }
 
 static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (iocb->ki_flags & IOCB_NOWAIT)
        	iomi.flags |= IOMAP_NOWAIT;
 
-	if (iov_iter_rw(iter) == READ) {
-		/* reads can always complete inline */
-		dio->flags |= IOMAP_DIO_INLINE_COMP;
+	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
 
+	if (iov_iter_rw(iter) == READ) {
        	if (iomi.pos >= dio->i_size)
        		goto out_free_dio;
 
@@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        	iomi.flags |= IOMAP_WRITE;
        	dio->flags |= IOMAP_DIO_WRITE;
 
-		/*
-		 * Flag as supporting deferred completions, if the issuer
-		 * groks it. This can avoid a workqueue punt for writes.
-		 * We may later clear this flag if we need to do other IO
-		 * as part of this IO completion.
-		 */
-		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
-			dio->flags |= IOMAP_DIO_CALLER_COMP;
-
        	if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
        		ret = -EAGAIN;
        		if (iomi.pos >= dio->i_size ||
@@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        	}
 
        	/*
+		 * i_size updates must to happen from process context.
+		 */
+		if (iomi.pos + iomi.len > dio->i_size)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+
+		/*
        	 * Try to invalidate cache pages for the range we are writing.
        	 * If this invalidation fails, let the caller fall back to
        	 * buffered I/O.
@@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        		}
        		goto out_free_dio;
        	}
+	}
 
-		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
-			ret = sb_init_dio_done_wq(inode->i_sb);
-			if (ret < 0)
-				goto out_free_dio;
-		}
+	if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			goto out_free_dio;
        }
 
        inode_dio_begin(inode);
@@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        * If all the writes we issued were already written through to the
        * media, we don't need to flush the cache on IO completion. Clear the
        * sync flag for this case.
+	 *
+	 * Otherwise clear the inline completion flag if any sync work is
+	 * needed, as that needs to be performed from process context.
        */
        if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
        	dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+	else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+		dio->flags |= IOMAP_DIO_COMP_WORK;
 
        /*
        * We are about to drop our additional submission reference, which
@@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        return iomap_dio_complete(dio);
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
-
-static int __init iomap_dio_init(void)
-{
-	zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
-			IOMAP_ZERO_PAGE_ORDER);
-
-	if (!zero_page)
-		return -ENOMEM;
-
-	return 0;
-}
-fs_initcall(iomap_dio_init);
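For reference, a user-space sketch of the completion routing the patch centralizes behind IOMAP_DIO_COMP_WORK: completions that may sleep (errors, unwritten/shared/new-extent metadata updates, i_size updates, cache flushes, page cache invalidation for writes) are punted to the workqueue, everything else completes inline from the bio end_io path. The struct and helper names below are illustrative, not kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Plain booleans standing in for the dio/iomap flags used in the patch. */
struct dio_state {
        bool is_write;          /* IOMAP_DIO_WRITE */
        bool has_error;         /* dio->error != 0 */
        bool mapping_has_pages; /* f_mapping->nrpages != 0 */
        bool unwritten_or_cow;  /* unwritten conversion or unsharing needed */
        bool newly_allocated;   /* IOMAP_F_NEW blocks */
        bool extends_i_size;    /* iomi.pos + iomi.len > i_size */
        bool needs_sync;        /* IOMAP_DIO_NEED_SYNC still set at submit */
};

/* Returns true when completion must be punted to the workqueue. */
static bool needs_completion_work(const struct dio_state *s)
{
        if (s->has_error)
                return true;    /* error paths may take sleeping locks */
        if (s->unwritten_or_cow || s->newly_allocated)
                return true;    /* metadata updates at completion time */
        if (s->extends_i_size || s->needs_sync)
                return true;    /* i_size update / fsync need process context */
        if (s->is_write && s->mapping_has_pages)
                return true;    /* page cache invalidation may sleep */
        return false;
}

int main(void)
{
        struct dio_state overwrite = { .is_write = true };
        struct dio_state append = { .is_write = true, .extends_i_size = true };

        printf("pure overwrite  -> %s\n",
               needs_completion_work(&overwrite) ? "workqueue" : "inline");
        printf("appending write -> %s\n",
               needs_completion_work(&append) ? "workqueue" : "inline");
        return 0;
}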
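The zeroing change drops the static 64k zero page and instead adds as many zero-folio-sized chunks to the bio as the sub-block range needs. A minimal model of that loop, with an assumed zero-folio size (the real size comes from largest_zero_folio()); vecs_needed() is an illustrative name:

#include <stdio.h>

#define BIO_MAX_VECS 256U       /* per-bio vector cap, matches the kernel value */

/* Count the folio-sized vectors the zeroing loop would add for 'len' bytes. */
static unsigned int vecs_needed(unsigned int len, unsigned int folio_size)
{
        unsigned int vecs = 0;

        while (len > 0) {
                unsigned int io_len = len < folio_size ? len : folio_size;

                vecs++;
                len -= io_len;
        }
        return vecs;
}

int main(void)
{
        /* e.g. zeroing 60 KiB of a 64 KiB block with a 16 KiB zero folio */
        unsigned int len = 60 * 1024, folio = 16 * 1024;

        printf("%u bytes -> %u vectors (cap %u per bio)\n",
               len, vecs_needed(len, folio), BIO_MAX_VECS);
        return 0;
}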
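The alignment check now validates against either the device's logical block size or the filesystem block size (IOMAP_DIO_FSBLOCK_ALIGNED). It relies on the usual power-of-two mask trick, shown here as a tiny standalone example (misaligned() is an illustrative name):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* alignment must be a power of two, as block sizes always are */
static bool misaligned(uint64_t pos, uint64_t len, uint64_t alignment)
{
        return ((pos | len) & (alignment - 1)) != 0;
}

int main(void)
{
        /* 512-byte-aligned I/O passes a 512 B check but fails a 4096 B one */
        printf("%d\n", misaligned(512, 1024, 512));    /* prints 0 */
        printf("%d\n", misaligned(512, 1024, 4096));   /* prints 1 */
        return 0;
}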
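The REQ_FUA logic moves from the removed iomap_dio_can_use_fua() helper into iomap_dio_bio_iter() and is now keyed off need_completion_work. A hedged model of the resulting decision, with plain booleans standing in for the iomap and bdev flags and use_fua() as an illustrative name:

#include <stdbool.h>
#include <stdio.h>

/*
 * A FUA write is only used for a write-through request that is a pure
 * overwrite (no completion work), on an inode with no dirty in-core
 * metadata, on a device that either has no volatile write cache or
 * supports FUA natively.
 */
static bool use_fua(bool write_through, bool need_completion_work,
                    bool inode_dirty, bool has_write_cache, bool supports_fua)
{
        if (!write_through)
                return false;
        if (need_completion_work || inode_dirty)
                return false;
        return !has_write_cache || supports_fua;
}

int main(void)
{
        printf("O_DSYNC overwrite, FUA-capable disk: %d\n",
               use_fua(true, false, false, true, true));   /* 1: no flush needed */
        printf("O_DSYNC overwrite, cache w/o FUA:    %d\n",
               use_fua(true, false, false, true, false));  /* 0: flush at completion */
        return 0;
}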
