author	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-09 18:14:52 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-09 18:14:52 -0800
commit	4adc13ed7c281c16152a700e47b65d17de07321a (patch)
tree	5cadc2218d2e6be035076b9456d5784ef090e54c /lib
parent	0c00ed308d0559fc216be0442a3df124e9e13533 (diff)
parent	3373503df025ab6c9a8ad2ce6b7febd2eb3c99dc (diff)
Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull bounce buffer dio for stable pages from Jens Axboe:
 "This adds support for bounce buffering of dio for stable pages. This
  was all done by Christoph. In his words:

   This series tries to address the problem that pages under I/O can be
   modified during direct I/O, even when the device or file system
   requires stable pages during I/O to calculate checksums, parity, or
   other data operations.

   It does so by adding block layer helpers to bounce buffer an
   iov_iter into a bio, then wiring that up in iomap and ultimately
   XFS. The reason the file system even needs to know about it is that
   reads need a user context to copy the data back, and the
   infrastructure to defer ioends to a workqueue currently sits in XFS.
   I'm going to look into moving that into the ioend code and enabling
   it for other file systems.

   Additionally, btrfs already has its own infrastructure for this, and
   actually an urgent need to bounce buffer, so this should be useful
   there and could be wired up easily. In fact, the idea comes from
   patches by Qu that did this in btrfs.

   This series fixes all but one xfstests failure on T10 PI capable
   devices (generic/095 still seems to have issues with a mix of mmap
   and splice; I'm looking into that separately), and makes qemu VMs
   running Windows, or Linux with swap enabled, work fine on an XFS
   file on a device using PI.

   Performance numbers on my (not exactly state of the art) NVMe PI
   test setup:

   Sequential reads using io_uring, QD=16. Bandwidth and CPU usage
   (usr/sys):

    | size | zero copy                | bounce                   |
    +------+--------------------------+--------------------------+
    | 4k   | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
    | 64K  | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
    | 1M   | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
    +------+--------------------------+--------------------------+

   Sequential writes using io_uring, QD=16. Bandwidth and CPU usage
   (usr/sys):

    | size | zero copy                | bounce                   |
    +------+--------------------------+--------------------------+
    | 4k   |  882MiB/s (11.83/33.88%) |  750MiB/s (10.53/34.08%) |
    | 64K  | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
    | 1M   | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
    +------+--------------------------+--------------------------+

   Note that the 64k read numbers look really odd to me for the
   baseline zero copy case, but they are reproducible over many
   repeated runs. The bounce read numbers should further improve when
   moving the PI validation to the file system and removing the double
   context switch, which I have patches for that will be sent out soon"

* tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  xfs: use bounce buffering direct I/O when the device requires stable pages
  iomap: add a flag to bounce buffer direct I/O
  iomap: support ioends for direct reads
  iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  iomap: free the bio before completing the dio
  iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  iomap: split out the per-bio logic from iomap_dio_bio_iter
  iomap: simplify iomap_dio_bio_iter
  iomap: fix submission side handling of completion side errors
  block: add helpers to bounce buffer an iov_iter into bios
  block: remove bio_release_page
  iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  block: open code bio_add_page and fix handling of mismatching P2P ranges
  block: refactor get_contig_folio_len
  block: add a BIO_MAX_SIZE constant and use it
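To make the stable-pages problem concrete, here is a minimal user-space
sketch (purely illustrative; checksum(), submit_zero_copy() and
submit_bounced() are made-up stand-ins, not kernel or series APIs). If the
application scribbles on a buffer while the "device" is checksumming it,
the data no longer matches the checksum generated at submission; copying
into a bounce buffer first makes the in-flight data immune to such writes:

	/* Illustrative sketch only, not kernel code. */
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Stand-in for a device-side integrity checksum (e.g. T10 PI guard tags). */
	static uint32_t checksum(const unsigned char *buf, size_t len)
	{
		uint32_t sum = 0;

		while (len--)
			sum = sum * 31 + *buf++;
		return sum;
	}

	/* Zero-copy model: the "device" reads the user buffer twice. */
	static int submit_zero_copy(unsigned char *user_buf, size_t len)
	{
		uint32_t expected = checksum(user_buf, len); /* PI generated */

		user_buf[0] ^= 0xff;	/* application scribbles mid-I/O */

		/* DMA happens "now": data no longer matches its checksum. */
		return checksum(user_buf, len) == expected ? 0 : -1;
	}

	/* Bounce model: snapshot the data first; later writes are harmless. */
	static int submit_bounced(unsigned char *user_buf, size_t len)
	{
		unsigned char *bounce = malloc(len);
		uint32_t expected;
		int ret;

		if (!bounce)
			return -1;
		memcpy(bounce, user_buf, len);	/* the stable copy */
		expected = checksum(bounce, len);

		user_buf[0] ^= 0xff;	/* same scribble, now harmless */

		ret = checksum(bounce, len) == expected ? 0 : -1;
		free(bounce);
		return ret;
	}

	int main(void)
	{
		unsigned char buf[4096];

		memset(buf, 0xaa, sizeof(buf));
		printf("zero copy: %s\n", submit_zero_copy(buf, sizeof(buf)) ?
		       "checksum mismatch" : "ok");
		printf("bounced:   %s\n", submit_bounced(buf, sizeof(buf)) ?
		       "checksum mismatch" : "ok");
		return 0;
	}

The copy is exactly the bandwidth/CPU cost visible in the tables above,
which is why the series only enables bouncing when the device or file
system actually requires stable pages.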
Diffstat (limited to 'lib')
-rw-r--r--	lib/iov_iter.c	98
1 file changed, 98 insertions(+), 0 deletions(-)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 896760bad455..545250507f08 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
+
+static unsigned int get_contig_folio_len(struct page **pages,
+ unsigned int *num_pages, size_t left, size_t offset)
+{
+ struct folio *folio = page_folio(pages[0]);
+ size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
+ unsigned int max_pages, i;
+ size_t folio_offset, len;
+
+ folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
+ len = min(folio_size(folio) - folio_offset, left);
+
+ /*
+ * We might COW a single page in the middle of a large folio, so we have
+ * to check that all pages belong to the same folio.
+ */
+ left -= contig_sz;
+ max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ for (i = 1; i < max_pages; i++) {
+ size_t next = min_t(size_t, PAGE_SIZE, left);
+
+ if (page_folio(pages[i]) != folio ||
+ pages[i] != pages[i - 1] + 1)
+ break;
+ contig_sz += next;
+ left -= next;
+ }
+
+ *num_pages = i;
+ return contig_sz;
+}
+
+#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
+
+/**
+ * iov_iter_extract_bvecs - Extract bvecs from an iterator
+ * @iter: the iterator to extract from
+ * @bv: bvec return array
+ * @max_size: maximum size to extract from @iter
+ * @nr_vecs: number of vectors already filled in @bv (on input and output)
+ * @max_vecs: maximum number of vectors in @bv, including those filled before calling
+ * @extraction_flags: flags to qualify request
+ *
+ * Like iov_iter_extract_pages(), but returns physically contiguous ranges
+ * contained in a single folio as a single bvec instead of multiple entries.
+ *
+ * Returns the number of bytes extracted when successful, or a negative errno.
+ * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes
+ * can be 0.
+ */
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+ size_t max_size, unsigned short *nr_vecs,
+ unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
+{
+ unsigned short entries_left = max_vecs - *nr_vecs;
+ unsigned short nr_pages, i = 0;
+ size_t left, offset, len;
+ struct page **pages;
+ ssize_t size;
+
+ /*
+ * Move page array up in the allocated memory for the bio vecs as far as
+ * possible so that we can start filling biovecs from the beginning
+ * without overwriting the temporary page array.
+ */
+ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+ pages = (struct page **)(bv + *nr_vecs) +
+ entries_left * (PAGE_PTRS_PER_BVEC - 1);
+
+ size = iov_iter_extract_pages(iter, &pages, max_size, entries_left,
+ extraction_flags, &offset);
+ if (unlikely(size <= 0))
+ return size ? size : -EFAULT;
+
+ nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+ for (left = size; left > 0; left -= len) {
+ unsigned int nr_to_add;
+
+ if (*nr_vecs > 0 &&
+ !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page,
+ pages[i]))
+ break;
+
+ len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
+ bvec_set_page(&bv[*nr_vecs], pages[i], len, offset);
+ i += nr_to_add;
+ (*nr_vecs)++;
+ offset = 0;
+ }
+
+ iov_iter_revert(iter, left);
+ if (iov_iter_extract_will_pin(iter)) {
+ while (i < nr_pages)
+ unpin_user_page(pages[i++]);
+ }
+ return size - left;
+}
+EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs);
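
As a usage illustration, a hedged sketch of a caller follows.
fill_bvecs_from_iter() is hypothetical, modeled on the bio
bounce-buffering helpers this code was extracted from; it is not part of
the series. Note the calling convention: @bv must be allocated for
@max_vecs entries, because the helper stashes its temporary page array in
the not-yet-filled tail of that same allocation, and a single call may
stop early (for example at a boundary between zone device pgmaps), so the
caller loops until the array is full or the iterator is drained:

	#include <linux/bvec.h>
	#include <linux/uio.h>

	/* Hypothetical caller; a sketch, not an API from this series. */
	static ssize_t fill_bvecs_from_iter(struct iov_iter *iter,
			struct bio_vec *bv, unsigned short max_vecs,
			size_t max_size)
	{
		unsigned short nr_vecs = 0;
		ssize_t copied = 0;

		while (nr_vecs < max_vecs && copied < max_size &&
		       iov_iter_count(iter)) {
			ssize_t ret = iov_iter_extract_bvecs(iter, bv,
					max_size - copied, &nr_vecs,
					max_vecs, 0);

			if (ret < 0)
				return copied ? copied : ret;
			if (!ret)	/* e.g. stopped at a pgmap boundary */
				break;
			copied += ret;
		}
		return copied;
	}

On success the pages referenced by the bvecs remain extracted (pinned
when iov_iter_extract_will_pin() returns true), so a real caller has to
release them once the I/O completes, mirroring what the bio code does;
get_contig_folio_len() above is what lets physically contiguous pages of
a single large folio collapse into one bvec instead of one entry per page.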