Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/vmlinux.lds.h    |   2
-rw-r--r--  include/linux/bio-integrity.h        |   1
-rw-r--r--  include/linux/bio.h                  |  18
-rw-r--r--  include/linux/blk-integrity.h        |  32
-rw-r--r--  include/linux/blk-mq-dma.h           |  25
-rw-r--r--  include/linux/blk-mq.h               |   4
-rw-r--r--  include/linux/blk_types.h            |  19
-rw-r--r--  include/linux/blkdev.h               |  26
-rw-r--r--  include/linux/hw_bitfield.h          |  62
-rw-r--r--  include/linux/io_uring/cmd.h         |  69
-rw-r--r--  include/linux/io_uring_types.h       |  31
-rw-r--r--  include/linux/libata.h               |   2
-rw-r--r--  include/linux/module.h               |  18
-rw-r--r--  include/linux/poison.h               |   3
-rw-r--r--  include/linux/uio.h                  |   2
-rw-r--r--  include/scsi/libsas.h                |   2
-rw-r--r--  include/scsi/scsi_host.h             |   2
-rw-r--r--  include/scsi/scsicam.h               |   7
-rw-r--r--  include/trace/events/io_uring.h      |   4
-rw-r--r--  include/uapi/linux/io_uring.h        |  38
-rw-r--r--  include/uapi/linux/io_uring/query.h  |  41
21 files changed, 331 insertions, 77 deletions
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0e35c526b281..8a9a2e732a65 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -832,6 +832,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 /* Required sections not related to debugging. */
 #define ELF_DETAILS						\
+		.modinfo : { *(.modinfo) }			\
 		.comment 0 : { *(.comment) }			\
 		.symtab 0 : { *(.symtab) }			\
 		.strtab 0 : { *(.strtab) }			\
@@ -1045,7 +1046,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 	*(.discard.*)						\
 	*(.export_symbol)					\
 	*(.no_trim_symbol)					\
-	*(.modinfo)						\
 	/* ld.bfd warns about .gnu.version* even when not emitted */ \
 	*(.gnu.version*)					\
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index 0a25716820fe..851254f36eb3 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -13,6 +13,7 @@ enum bip_flags {
 	BIP_CHECK_GUARD		= 1 << 5, /* guard check */
 	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
 	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
+	BIP_P2P_DMA		= 1 << 8, /* using P2P address */
 };
 
 struct bio_integrity_payload {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 46ffac5caab7..a64a30131031 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 void bio_trim(struct bio *bio, sector_t offset, sector_t size);
 extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp,
 		struct bio_set *bs);
-int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
-		unsigned *segs, unsigned max_bytes);
+int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
+		unsigned *segs, unsigned max_bytes, unsigned len_align);
 
 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
@@ -405,6 +405,11 @@ struct request_queue;
 
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	      unsigned short max_vecs, blk_opf_t opf);
+static inline void bio_init_inline(struct bio *bio, struct block_device *bdev,
+	      unsigned short max_vecs, blk_opf_t opf)
+{
+	bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf);
+}
 extern void bio_uninit(struct bio *);
 void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
 void bio_chain(struct bio *, struct bio *);
@@ -441,7 +446,14 @@ int submit_bio_wait(struct bio *bio);
 int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
 		size_t len, enum req_op op);
 
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+		unsigned len_align_mask);
+
+static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+	return bio_iov_iter_get_pages_aligned(bio, iter, 0);
+}
+
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index e67a2b6e8f11..b659373788f6 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -4,6 +4,7 @@
 
 #include <linux/blk-mq.h>
 #include <linux/bio-integrity.h>
+#include <linux/blk-mq-dma.h>
 
 struct request;
 
@@ -26,11 +27,25 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
+
+static inline bool blk_rq_integrity_dma_unmap(struct request *req,
+		struct device *dma_dev, struct dma_iova_state *state,
+		size_t mapped_len)
+{
+	return blk_dma_unmap(req, dma_dev, state, mapped_len,
+			bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA);
+}
+
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
 int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
 		     struct logical_block_metadata_cap __user *argp);
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev, struct dma_iova_state *state,
+		struct blk_dma_iter *iter);
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -109,12 +124,29 @@ static inline int blk_rq_map_integrity_sg(struct request *q,
 {
 	return 0;
 }
+static inline bool blk_rq_integrity_dma_unmap(struct request *req,
+		struct device *dma_dev, struct dma_iova_state *state,
+		size_t mapped_len)
+{
+	return false;
+}
 static inline int blk_rq_integrity_map_user(struct request *rq,
 					    void __user *ubuf,
 					    ssize_t bytes)
 {
 	return -EINVAL;
 }
+static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev, struct dma_iova_state *state,
+		struct blk_dma_iter *iter)
+{
+	return false;
+}
+static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter)
+{
+	return false;
+}
 static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
 {
 	return NULL;
 }
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c26a01aeae00..51829958d872 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -5,6 +5,13 @@
 #include <linux/blk-mq.h>
 #include <linux/pci-p2pdma.h>
 
+struct blk_map_iter {
+	struct bvec_iter		iter;
+	struct bio			*bio;
+	struct bio_vec			*bvecs;
+	bool				is_integrity;
+};
+
 struct blk_dma_iter {
 	/* Output address range for this iteration */
 	dma_addr_t			addr;
@@ -14,7 +21,7 @@ struct blk_dma_iter {
 	blk_status_t			status;
 
 	/* Internal to blk_rq_dma_map_iter_* */
-	struct req_iterator		iter;
+	struct blk_map_iter		iter;
 	struct pci_p2pdma_map_state	p2pdma;
 };
 
@@ -36,19 +43,20 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
 }
 
 /**
- * blk_rq_dma_unmap - try to DMA unmap a request
+ * blk_dma_unmap - try to DMA unmap a request
  * @req:	request to unmap
  * @dma_dev:	device to unmap from
  * @state:	DMA IOVA state
  * @mapped_len: number of bytes to unmap
+ * @is_p2p:	true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
 */
-static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
+static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
 {
-	if (req->cmd_flags & REQ_P2PDMA)
+	if (is_p2p)
 		return true;
 
 	if (dma_use_iova(state)) {
@@ -60,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
 	return !dma_need_unmap(dma_dev);
 }
 
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len)
+{
+	return blk_dma_unmap(req, dma_dev, state, mapped_len,
+			req->cmd_flags & REQ_P2PDMA);
+}
+
 #endif /* BLK_MQ_DMA_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a5a828f19a0..b25d12545f46 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -507,6 +507,8 @@ enum hctx_type {
  *		  request_queue.tag_set_list.
  * @srcu:	  Use as lock when type of the request queue is blocking
  *		  (BLK_MQ_F_BLOCKING).
+ * @tags_srcu:	  SRCU used to defer freeing of tags page_list to prevent
+ *		  use-after-free when iterating tags.
  * @update_nr_hwq_lock:
  *		  Synchronize updating nr_hw_queues with add/del disk &
  *		  switching elevator.
@@ -531,6 +533,7 @@ struct blk_mq_tag_set {
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
 	struct srcu_struct	*srcu;
+	struct srcu_struct	tags_srcu;
 	struct rw_semaphore	update_nr_hwq_lock;
 };
 
@@ -767,6 +770,7 @@ struct blk_mq_tags {
 	 * request pool
 	 */
 	spinlock_t lock;
+	struct rcu_head rcu_head;
 };
 
 static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09b99d52fd36..8e8d1cc8b06c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error)
 	return true;
 }
 
-struct bio_issue {
-	u64 value;
-};
-
 typedef __u32 __bitwise blk_opf_t;
 typedef unsigned int blk_qc_t;
 
@@ -242,7 +238,8 @@ struct bio {
 					 * on release of the bio.
 					 */
 	struct blkcg_gq		*bi_blkg;
-	struct bio_issue	bi_issue;
+	/* Time that this bio was issued. */
+	u64			issue_time_ns;
 #ifdef CONFIG_BLK_CGROUP_IOCOST
 	u64			bi_iocost_cost;
 #endif
@@ -269,18 +266,16 @@ struct bio {
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
 	struct bio_set		*bi_pool;
-
-	/*
-	 * We can inline a number of vecs at the end of the bio, to avoid
-	 * double allocations for a small number of bio_vecs. This member
-	 * MUST obviously be kept at the very end of the bio.
-	 */
-	struct bio_vec		bi_inline_vecs[];
 };
 
 #define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
 #define BIO_MAX_SECTORS		(UINT_MAX >> SECTOR_SHIFT)
 
+static inline struct bio_vec *bio_inline_vecs(struct bio *bio)
+{
+	return (struct bio_vec *)(bio + 1);
+}
+
 /*
  * bio flags
  */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe1797bbec42..066e5309bd45 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -657,6 +657,7 @@ enum {
 	QUEUE_FLAG_DISABLE_WBT_DEF,	/* for sched to disable/enable wbt */
 	QUEUE_FLAG_NO_ELV_SWITCH,	/* can't switch elevator any more */
 	QUEUE_FLAG_QOS_ENABLED,		/* qos is enabled */
+	QUEUE_FLAG_BIO_ISSUE_TIME,	/* record bio->issue_time_ns */
 	QUEUE_FLAG_MAX
 };
 
@@ -999,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
 struct bio *bio_split_to_limits(struct bio *bio);
+struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
+				    struct bio_set *bs);
 
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
@@ -1590,13 +1593,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
 	return queue_dma_alignment(bdev_get_queue(bdev));
 }
 
-static inline bool bdev_iter_is_aligned(struct block_device *bdev,
-					struct iov_iter *iter)
-{
-	return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev),
-				   bdev_logical_block_size(bdev) - 1);
-}
-
 static inline unsigned int
 blk_lim_dma_alignment_and_pad(struct queue_limits *lim)
 {
@@ -1660,7 +1656,7 @@ struct block_device_operations {
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	void (*unlock_native_capacity) (struct gendisk *);
-	int (*getgeo)(struct block_device *, struct hd_geometry *);
+	int (*getgeo)(struct gendisk *, struct hd_geometry *);
 	int (*set_read_only)(struct block_device *bdev, bool ro);
 	void (*free_disk)(struct gendisk *disk);
 	/* this callback is with swap_lock and sometimes page table lock held */
@@ -1870,6 +1866,20 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
 	return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
 }
 
+static inline int bio_split_rw_at(struct bio *bio,
+		const struct queue_limits *lim,
+		unsigned *segs, unsigned max_bytes)
+{
+	return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment);
+}
+
+static inline int bio_iov_iter_get_bdev_pages(struct bio *bio,
+		struct iov_iter *iter, struct block_device *bdev)
+{
+	return bio_iov_iter_get_pages_aligned(bio, iter,
+			bdev_logical_block_size(bdev) - 1);
+}
+
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
 #endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/hw_bitfield.h b/include/linux/hw_bitfield.h
new file mode 100644
index 000000000000..df202e167ce4
--- /dev/null
+++ b/include/linux/hw_bitfield.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2025, Collabora Ltd.
+ */
+
+#ifndef _LINUX_HW_BITFIELD_H
+#define _LINUX_HW_BITFIELD_H
+
+#include <linux/bitfield.h>
+#include <linux/build_bug.h>
+#include <linux/limits.h>
+
+/**
+ * FIELD_PREP_WM16() - prepare a bitfield element with a mask in the upper half
+ * @_mask: shifted mask defining the field's length and position
+ * @_val:  value to put in the field
+ *
+ * FIELD_PREP_WM16() masks and shifts up the value, as well as bitwise ORs the
+ * result with the mask shifted up by 16.
+ *
+ * This is useful for a common design of hardware registers where the upper
+ * 16-bit half of a 32-bit register is used as a write-enable mask. In such a
+ * register, a bit in the lower half is only updated if the corresponding bit
+ * in the upper half is high.
+ */
+#define FIELD_PREP_WM16(_mask, _val)					     \
+	({								     \
+		typeof(_val) __val = _val;				     \
+		typeof(_mask) __mask = _mask;				     \
+		__BF_FIELD_CHECK(__mask, ((u16)0U), __val,		     \
+				 "HWORD_UPDATE: ");			     \
+		(((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) | \
+		((__mask) << 16);					     \
+	})
+
+/**
+ * FIELD_PREP_WM16_CONST() - prepare a constant bitfield element with a mask in
+ *                           the upper half
+ * @_mask: shifted mask defining the field's length and position
+ * @_val:  value to put in the field
+ *
+ * FIELD_PREP_WM16_CONST() masks and shifts up the value, as well as bitwise ORs
+ * the result with the mask shifted up by 16.
+ *
+ * This is useful for a common design of hardware registers where the upper
+ * 16-bit half of a 32-bit register is used as a write-enable mask. In such a
+ * register, a bit in the lower half is only updated if the corresponding bit
+ * in the upper half is high.
+ *
+ * Unlike FIELD_PREP_WM16(), this is a constant expression and can therefore
+ * be used in initializers. Error checking is less comfortable for this
+ * version.
+ */
+#define FIELD_PREP_WM16_CONST(_mask, _val)				     \
+	(								     \
+		FIELD_PREP_CONST(_mask, _val) |				     \
+		(BUILD_BUG_ON_ZERO(const_true((u64)(_mask) > U16_MAX)) +     \
+		 ((_mask) << 16))					     \
+	)
+
+
+#endif /* _LINUX_HW_BITFIELD_H */
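The two helpers above encode the "write-enable mask in the upper half" register convention. As a quick illustration (not part of this change; the MYDEV_* register and field names are made up), a driver could update a two-bit field without a read-modify-write cycle:

/*
 * Illustrative sketch only; MYDEV_* names are hypothetical. The upper 16 bits
 * of the written value act as the write-enable mask, so only the CLK_SEL
 * bits of the register change.
 */
#include <linux/bits.h>
#include <linux/hw_bitfield.h>
#include <linux/io.h>

#define MYDEV_CLK_CON		0x0040		/* hypothetical register */
#define MYDEV_CLK_SEL		GENMASK(5, 4)	/* hypothetical field */

static void mydev_set_clk_sel(void __iomem *base, unsigned int sel)
{
	/* Writes FIELD_PREP(MYDEV_CLK_SEL, sel) | (MYDEV_CLK_SEL << 16). */
	writel(FIELD_PREP_WM16(MYDEV_CLK_SEL, sel), base + MYDEV_CLK_CON);
}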
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index cfa6d0c0c322..7509025b4071 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -11,11 +11,14 @@
 /* io_uring_cmd is being issued again */
 #define IORING_URING_CMD_REISSUE	(1U << 31)
 
+typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd,
+				  unsigned issue_flags);
+
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
 	/* callback to defer completions to task context */
-	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+	io_uring_cmd_tw_t task_work_cb;
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
@@ -53,11 +56,11 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
  * Note: the caller should never hard code @issue_flags and is only allowed
  * to pass the mask provided by the core io_uring code.
  */
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2,
-			unsigned issue_flags);
+void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2,
+			 unsigned issue_flags, bool is_cqe32);
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			io_uring_cmd_tw_t task_work_cb,
 			unsigned flags);
 
 /*
@@ -70,6 +73,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 /* Execute the request from a blocking context */
 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
 
+/*
+ * Select a buffer from the provided buffer group for multishot uring_cmd.
+ * Returns the selected buffer address and size.
+ */
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+					    unsigned buf_group, size_t *len,
+					    unsigned int issue_flags);
+
+/*
+ * Complete a multishot uring_cmd event. This will post a CQE to the completion
+ * queue and update the provided buffer.
+ */
+bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+				 struct io_br_sel *sel, unsigned int issue_flags);
+
 #else
 static inline int
 io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -86,13 +104,12 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
-		u64 ret2, unsigned issue_flags)
+static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
+		u64 ret2, unsigned issue_flags, bool is_cqe32)
 {
 }
 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-		void (*task_work_cb)(struct io_uring_cmd *, unsigned),
-		unsigned flags)
+		io_uring_cmd_tw_t task_work_cb, unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -102,28 +119,28 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
 {
 }
-#endif
-
-/*
- * Polled completions must ensure they are coming from a poll queue, and
- * hence are completed inside the usual poll handling loops.
- */
-static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
-					    ssize_t ret, ssize_t res2)
+static inline struct io_br_sel
+io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group,
+			   size_t *len, unsigned int issue_flags)
+{
+	return (struct io_br_sel) { .val = -EOPNOTSUPP };
+}
+static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+		struct io_br_sel *sel, unsigned int issue_flags)
 {
-	lockdep_assert(in_task());
-	io_uring_cmd_done(ioucmd, ret, res2, 0);
+	return true;
 }
+#endif
 
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
@@ -142,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
 	return cmd_to_io_kiocb(cmd)->ctx;
 }
 
+static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret,
+				     unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false);
+}
+
+static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
+				       u64 res2, unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
+}
+
 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 			    void (*release)(void *), unsigned int index,
 			    unsigned int issue_flags);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 12f5ee43850e..c2ea6280901d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -86,6 +86,25 @@ struct io_mapped_region {
 };
 
 /*
+ * Return value from io_buffer_list selection, to avoid stashing it in
+ * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
+ * across execution contexts are fine. But for ring provided buffers, the
+ * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
+ * persists, it's better to just keep the buffer local for those cases.
+ */
+struct io_br_sel {
+	struct io_buffer_list *buf_list;
+	/*
+	 * Some selection parts return the user address, others return an error.
+	 */
+	union {
+		void __user *addr;
+		ssize_t val;
+	};
+};
+
+
+/*
  * Arbitrary limit, can be raised if need be
  */
 #define IO_RINGFD_REG_MAX 16
@@ -671,12 +690,6 @@ struct io_kiocb {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
 
-		/*
-		 * stores buffer ID for ring provided buffers, valid IFF
-		 * REQ_F_BUFFER_RING is set.
-		 */
-		struct io_buffer_list	*buf_list;
-
 		struct io_rsrc_node	*buf_node;
 	};
 
@@ -724,10 +737,4 @@ struct io_overflow_cqe {
 	struct list_head list;
 	struct io_uring_cqe cqe;
 };
-
-static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
-{
-	return ctx->flags & IORING_SETUP_CQE32;
-}
-
 #endif
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0620dd67369f..21de0935775d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern u64 ata_qc_get_active(struct ata_port *ap);
 extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd);
 extern int ata_std_bios_param(struct scsi_device *sdev,
-			      struct block_device *bdev,
+			      struct gendisk *unused,
 			      sector_t capacity, int geom[]);
 extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
 extern int ata_scsi_sdev_init(struct scsi_device *sdev);
diff --git a/include/linux/module.h b/include/linux/module.h
index 3319a5269d28..e135cc79acee 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -244,14 +244,22 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name);
 /* What your module does. */
 #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)
 
-#ifdef MODULE
+/*
+ * Format: __mod_device_table__kmod_<modname>__<type>__<name>
+ * Parts of the string `__kmod_` and `__` are used as delimiters when parsing
+ * a symbol in file2alias.c
+ */
+#define __mod_device_table(type, name)					\
+	__PASTE(__mod_device_table__,					\
+	__PASTE(__KBUILD_MODNAME,					\
+	__PASTE(__,							\
+	__PASTE(type,							\
+	__PASTE(__, name)))))
+
 /* Creates an alias so file2alias.c can find device table. */
 #define MODULE_DEVICE_TABLE(type, name)					\
-static typeof(name) __mod_device_table__##type##__##name		\
+static typeof(name) __mod_device_table(type, name)			\
 __attribute__ ((used, alias(__stringify(name))))
-#else  /* !MODULE */
-#define MODULE_DEVICE_TABLE(type, name)
-#endif
 
 /* Version of form [<epoch>:]<version>[-<extra-version>].
  * Or for CVS/RCS ID version, everything but the number is stripped.
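The comment above fixes the symbol naming scheme that file2alias.c parses, and dropping the #ifdef MODULE guard (together with keeping .modinfo via the vmlinux.lds.h change) means the alias is now emitted for built-in code too. As an illustration with hypothetical names (module "foo", an OF match table "foo_of_match"):

#include <linux/mod_devicetable.h>
#include <linux/module.h>

/* Hypothetical module "foo" with an OF match table. */
static const struct of_device_id foo_of_match[] = {
	{ .compatible = "vendor,foo" },
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, foo_of_match);

/*
 * Per the format documented above, the alias symbol emitted for this table
 * is named:
 *
 *	__mod_device_table__kmod_foo__of__foo_of_match
 *
 * file2alias.c splits it on "__kmod_" and "__" to recover the module name
 * ("foo"), the table type ("of") and the table variable ("foo_of_match").
 */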
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 8ca2235f78d5..299e2dd7da6d 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -90,4 +90,7 @@
 /********** lib/stackdepot.c **********/
 #define STACK_DEPOT_POISON	((void *)(0xD390 + POISON_POINTER_DELTA))
 
+/********** io_uring/ **********/
+#define IO_URING_PTR_POISON	((void *)(0x1091UL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2e86c653186c..5b127043a151 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -286,8 +286,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 #endif
 
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
-bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
-			unsigned len_mask);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index ba460b6c0374..9c6e90829dbd 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *);
 extern int sas_target_alloc(struct scsi_target *);
 int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim);
 extern int sas_change_queue_depth(struct scsi_device *, int new_depth);
-extern int sas_bios_param(struct scsi_device *, struct block_device *,
+extern int sas_bios_param(struct scsi_device *, struct gendisk *,
 			  sector_t capacity, int *hsc);
 int sas_execute_internal_abort_single(struct domain_device *device,
 				      u16 tag, unsigned int qid,
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index c53812b9026f..f5a243261236 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -318,7 +318,7 @@ struct scsi_host_template {
 	 *
 	 * Status: OPTIONAL
 	 */
-	int (* bios_param)(struct scsi_device *, struct block_device *,
+	int (* bios_param)(struct scsi_device *, struct gendisk *,
 			sector_t, int []);
 
 	/*
diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h
index 08edd603e521..1131f51ed2c8 100644
--- a/include/scsi/scsicam.h
+++ b/include/scsi/scsicam.h
@@ -13,7 +13,8 @@
 #ifndef SCSICAM_H
 #define SCSICAM_H
 
-int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip);
-bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]);
-unsigned char *scsi_bios_ptable(struct block_device *bdev);
+struct gendisk;
+int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip);
+bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]);
+unsigned char *scsi_bios_ptable(struct gendisk *disk);
 #endif /* def SCSICAM_H */
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 178ab6f611be..45d15460b495 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 		__entry->user_data	= cqe->user_data;
 		__entry->res		= cqe->res;
 		__entry->cflags		= cqe->flags;
-		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
-		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
+		__entry->extra1		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0;
 		),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6957dc539d83..a0cc1cc0dd01 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
 
+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED		(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -298,9 +304,13 @@ enum io_uring_op {
  * sqe->uring_cmd_flags		top 8bits aren't available for userspace
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
+ * IORING_URING_CMD_MULTISHOT	must be used with buffer select, like other
+ *				multishot commands. Not compatible with
+ *				IORING_URING_CMD_FIXED, for now.
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
+#define IORING_URING_CMD_MULTISHOT	(1U << 1)
+#define IORING_URING_CMD_MASK	(IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT)
 
 
 /*
@@ -454,6 +464,7 @@ enum io_uring_msg_ring_flags {
 #define IORING_NOP_FIXED_FILE	(1U << 2)
 #define IORING_NOP_FIXED_BUFFER	(1U << 3)
 #define IORING_NOP_TW		(1U << 4)
+#define IORING_NOP_CQE32	(1U << 5)
 
 /*
  * IO completion data structure (Completion Queue Entry)
@@ -487,12 +498,22 @@ struct io_uring_cqe {
  *			other provided buffer type, all completions with a
  *			buffer passed back is automatically returned to the
  *			application.
+ * IORING_CQE_F_SKIP	If set, then the application/liburing must ignore this
+ *			CQE. It's only purpose is to fill a gap in the ring,
+ *			if a large CQE is attempted posted when the ring has
+ *			just a single small CQE worth of space left before
+ *			wrapping.
+ * IORING_CQE_F_32	If set, this is a 32b/big-cqe posting. Use with rings
+ *			setup in a mixed CQE mode, where both 16b and 32b
+ *			CQEs may be posted to the CQ ring.
 */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
 #define IORING_CQE_F_BUF_MORE		(1U << 4)
+#define IORING_CQE_F_SKIP		(1U << 5)
+#define IORING_CQE_F_32			(1U << 15)
 
 #define IORING_CQE_BUFFER_SHIFT		16
 
@@ -665,6 +686,12 @@ enum io_uring_register_op {
 
 	IORING_REGISTER_MEM_REGION		= 34,
 
+	/* query various aspects of io_uring, see linux/io_uring/query.h */
+	IORING_REGISTER_QUERY			= 35,
+
+	/* return zcrx buffers back into circulation */
+	IORING_REGISTER_ZCRX_REFILL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1046,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+struct io_uring_zcrx_sync_refill {
+	__u32	zcrx_id;
+	/* the number of entries to return */
+	__u32	nr_entries;
+	/* pointer to an array of struct io_uring_zcrx_rqe */
+	__u64	rqes;
+	__u64	__resv[2];
+};
+
 #ifdef __cplusplus
 }
 #endif
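The new IORING_SETUP_CQE_MIXED, IORING_CQE_F_32 and IORING_CQE_F_SKIP flags above imply a consumer-side convention: a 32b CQE occupies two 16b ring slots, and padding entries before a wrap must be ignored. The rough userspace sketch below is an inference from the flag descriptions, not code from this series; handle_cqe() is a hypothetical application callback and memory-ordering details are omitted:

#include <linux/io_uring.h>

/* Hypothetical application callback. */
extern void handle_cqe(const struct io_uring_cqe *cqe);

static void reap_mixed_cqes(struct io_uring_cqe *cqes, unsigned int *khead,
			    unsigned int tail, unsigned int ring_mask)
{
	unsigned int head = *khead;

	while (head != tail) {
		struct io_uring_cqe *cqe = &cqes[head & ring_mask];

		/* A big CQE is assumed to span two 16b ring slots. */
		unsigned int step = (cqe->flags & IORING_CQE_F_32) ? 2 : 1;

		/* IORING_CQE_F_SKIP entries only pad out the ring. */
		if (!(cqe->flags & IORING_CQE_F_SKIP))
			handle_cqe(cqe);

		head += step;
	}
	/* A real reaper would publish the new head with a release store. */
	*khead = head;
}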
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
new file mode 100644
index 000000000000..5d754322a27c
--- /dev/null
+++ b/include/uapi/linux/io_uring/query.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring query interface.
+ */
+#ifndef LINUX_IO_URING_QUERY_H
+#define LINUX_IO_URING_QUERY_H
+
+#include <linux/types.h>
+
+struct io_uring_query_hdr {
+	__u64	next_entry;
+	__u64	query_data;
+	__u32	query_op;
+	__u32	size;
+	__s32	result;
+	__u32	__resv[3];
+};
+
+enum {
+	IO_URING_QUERY_OPCODES			= 0,
+
+	__IO_URING_QUERY_MAX,
+};
+
+/* Doesn't require a ring */
+struct io_uring_query_opcode {
+	/* The number of supported IORING_OP_* opcodes */
+	__u32	nr_request_opcodes;
+	/* The number of supported IORING_[UN]REGISTER_* opcodes */
+	__u32	nr_register_opcodes;
+	/* Bitmask of all supported IORING_FEAT_* flags */
+	__u64	feature_flags;
+	/* Bitmask of all supported IORING_SETUP_* flags */
+	__u64	ring_setup_flags;
+	/* Bitmask of all supported IORING_ENTER_** flags */
+	__u64	enter_flags;
+	/* Bitmask of all supported IOSQE_* flags */
+	__u64	sqe_flags;
+};
+
+#endif
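To round this off, here is a rough sketch of how the new query interface might be driven from userspace via io_uring_register(2) with IORING_REGISTER_QUERY. The exact calling convention (passing the first chained io_uring_query_hdr as the argument, an nr_args of 1, and an fd of -1 since the opcode query "doesn't require a ring") is inferred from the header above and should be treated as an assumption:

#include <linux/io_uring.h>
#include <linux/io_uring/query.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int query_opcodes(void)
{
	struct io_uring_query_opcode op = { 0 };
	struct io_uring_query_hdr hdr = {
		.query_op   = IO_URING_QUERY_OPCODES,
		.query_data = (uintptr_t)&op,
		.size       = sizeof(op),
		.next_entry = 0,	/* single entry, no chaining */
	};
	int ret;

	/* fd of -1 and nr_args of 1 are assumptions, see note above. */
	ret = syscall(__NR_io_uring_register, -1, IORING_REGISTER_QUERY,
		      &hdr, 1);
	if (ret < 0)
		return ret;
	if (hdr.result < 0)
		return hdr.result;

	printf("request opcodes: %u, register opcodes: %u\n",
	       op.nr_request_opcodes, op.nr_register_opcodes);
	return 0;
}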