/*-------------------------------------------------------------------------
 *
 * aio.h
 *    Main AIO interface
 *
 * This is the header to include when actually issuing AIO. When just
 * declaring functions involving an AIO related type, it might suffice to
 * include aio_types.h. Initialization related functions are in the dedicated
 * aio_init.h.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/aio.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef AIO_H
#define AIO_H
#include "storage/aio_types.h"
#include "storage/procnumber.h"
/*
 * io_uring is incompatible with EXEC_BACKEND, so the io_uring IO method is
 * only enabled when USE_LIBURING is defined and EXEC_BACKEND is not.
 */
#ifdef USE_LIBURING
#ifndef EXEC_BACKEND
#define IOMETHOD_IO_URING_ENABLED
#endif
#endif

/*
 * Possible values of the io_method GUC.
 *
 * The numeric values are spelled out explicitly. IOMETHOD_IO_URING only
 * exists in builds in which IOMETHOD_IO_URING_ENABLED is defined.
 */
typedef enum IoMethod
{
	IOMETHOD_SYNC = 0,
	IOMETHOD_WORKER = 1,
#ifdef IOMETHOD_IO_URING_ENABLED
	IOMETHOD_IO_URING = 2,
#endif
} IoMethod;

/* Worker based execution is the default IO method. */
#define DEFAULT_IO_METHOD IOMETHOD_WORKER
/*
 * Per-IO flags, settable via pgaio_io_set_flag().
 *
 * Every flag is a distinct bit; the flags are listed in ascending bit order.
 */
typedef enum PgAioHandleFlags
{
	/*
	 * Hint that the IO is going to be executed synchronously.
	 *
	 * Setting this can make synchronous IO issued through the AIO interface
	 * a bit cheaper, which avoids needing separate AIO and non-AIO versions
	 * of code.
	 *
	 * Advantageous to set, if applicable, but not required for correctness.
	 */
	PGAIO_HF_SYNCHRONOUS = 1 << 0,

	/*
	 * The IO references backend local memory.
	 *
	 * Must be set whenever the IO references process-local memory. Some IO
	 * methods cannot execute IO that references process-local memory and
	 * therefore need to fall back to executing IOs with this flag set
	 * synchronously.
	 *
	 * Required for correctness.
	 */
	PGAIO_HF_REFERENCES_LOCAL = 1 << 1,

	/*
	 * The IO uses buffered IO; used to control heuristics in some IO
	 * methods.
	 *
	 * Advantageous to set, if applicable, but not required for correctness.
	 */
	PGAIO_HF_BUFFERED = 1 << 2,
} PgAioHandleFlags;
/*
 * IO operations the AIO subsystem supports.
 *
 * As this enum is not publicly referenced it could live in aio_internal.h.
 * However, PgAioOpData currently *does* need to be public, so it seems to
 * make sense to keep this public as well.
 */
typedef enum PgAioOp
{
	/* deliberately the zero value, to help catch zeroed memory etc */
	PGAIO_OP_INVALID = 0,

	PGAIO_OP_READV = 1,
	PGAIO_OP_WRITEV = 2,

	/*
	 * In the near term we'll need at least:
	 * - fsync / fdatasync
	 * - flush_range
	 *
	 * Eventually we'll additionally want at least:
	 * - send
	 * - recv
	 * - accept
	 */
} PgAioOp;

/* One more than the highest PgAioOp value. */
#define PGAIO_OP_COUNT	(PGAIO_OP_WRITEV + 1)
/*
 * What kind of object is the IO being performed on?
 *
 * Behaviour that is specific to a PgAioTargetID should be implemented in
 * aio_target.c.
 */
typedef enum PgAioTargetID
{
	/* deliberately the zero value, to help catch zeroed memory etc */
	PGAIO_TID_INVALID = 0,

	PGAIO_TID_SMGR = 1,
} PgAioTargetID;

/* One more than the highest PgAioTargetID value. */
#define PGAIO_TID_COUNT (PGAIO_TID_SMGR + 1)
/*
 * Data necessary to support IO operations (see PgAioOp).
 *
 * This is a union with one member per kind of operation; the "read" and
 * "write" members currently have identical layouts.
 *
 * NB: Note that the FDs in here may *not* be relied upon for re-issuing
 * requests (e.g. for partial reads/writes or in an IO worker) - the FD might
 * be from another process, or closed since. That's not a problem for staged
 * IOs, as all staged IOs are submitted when closing an FD.
 */
typedef union
{
	/*
	 * NOTE(review): presumably the data for PGAIO_OP_READV; iov_length looks
	 * like the number of iovec entries and offset like the file offset of the
	 * IO - confirm in aio_io.c.
	 */
	struct
	{
		int			fd;
		uint16		iov_length;
		uint64		offset;
	}			read;
	/*
	 * NOTE(review): presumably the data for PGAIO_OP_WRITEV, with the same
	 * field meanings as in "read" - confirm in aio_io.c.
	 */
	struct
	{
		int			fd;
		uint16		iov_length;
		uint64		offset;
	}			write;
} PgAioOpData;
/*
 * Information about the object that IO is executed on. Mostly callbacks that
 * operate on PgAioTargetData.
 *
 * NOTE(review): presumably one PgAioTargetInfo exists per PgAioTargetID -
 * confirm in aio_target.c.
 *
 * typedef is in aio_types.h
 */
struct PgAioTargetInfo
{
	/*
	 * To support executing using worker processes, the file descriptor for an
	 * IO may need to be reopened in a different process.
	 *
	 * The callback receives the IO handle whose file descriptor is affected.
	 */
	void		(*reopen) (PgAioHandle *ioh);
	/*
	 * describe the target of the IO, used for log messages and views
	 *
	 * NOTE(review): the result is a non-const char *; who owns / frees the
	 * returned string is not visible here - confirm against the callers.
	 */
	char	   *(*describe_identity) (const PgAioTargetData *sd);
	/* name of the target, used in log messages / views */
	const char *name;
};
/*
 * IDs of the callbacks that can be registered on an IO.
 *
 * An IO refers to its callbacks by ID rather than by function pointer, for
 * two main reasons:
 *
 * 1) There are many PgAioHandle structs in pre-allocated shared memory, so
 *    memory within a PgAioHandle is precious.
 *
 * 2) Due to EXEC_BACKEND, function pointers are not necessarily stable
 *    between different backends and thus cannot be stored directly in shared
 *    memory.
 *
 * If it weren't for 2), allowing new callbacks to be added would be fairly
 * easy: an ID->pointer mapping table could be filled on demand. With 2) that
 * is still doable, but harder, as every process has to re-register the
 * pointers so that a local ID->"backend local pointer" mapping can be
 * maintained.
 */
typedef enum PgAioHandleCallbackID
{
	PGAIO_HCB_INVALID = 0,

	PGAIO_HCB_MD_READV = 1,

	PGAIO_HCB_SHARED_BUFFER_READV = 2,

	PGAIO_HCB_LOCAL_BUFFER_READV = 3,
} PgAioHandleCallbackID;
/*
 * Highest callback ID in PgAioHandleCallbackID. The static assertion checks
 * at compile time that it is smaller than 1 << PGAIO_RESULT_ID_BITS.
 *
 * NOTE(review): presumably the callback ID is stored in a field of
 * PGAIO_RESULT_ID_BITS bits inside PgAioResult - confirm in aio_types.h.
 */
#define PGAIO_HCB_MAX	PGAIO_HCB_LOCAL_BUFFER_READV
StaticAssertDecl(PGAIO_HCB_MAX < (1 << PGAIO_RESULT_ID_BITS),
				 "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS");
/*
 * Signatures of the callbacks collected in struct PgAioHandleCallbacks:
 *
 * - PgAioHandleCallbackStage is the type of the "stage" member
 * - PgAioHandleCallbackComplete is the type of the "complete_shared" and
 *   "complete_local" members; it receives a prior result and returns a
 *   PgAioResult
 * - PgAioHandleCallbackReport is the type of the "report" member
 *
 * NOTE(review): cb_flags presumably carries the IO's PgAioHandleFlags -
 * confirm in aio_callback.c.
 */
typedef void (*PgAioHandleCallbackStage) (PgAioHandle *ioh, uint8 cb_flags);
typedef PgAioResult (*PgAioHandleCallbackComplete) (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_flags);
typedef void (*PgAioHandleCallbackReport) (PgAioResult result, const PgAioTargetData *target_data, int elevel);
/*
 * The set of callbacks that can be invoked for an IO at the different points
 * of its life: when it is staged, when it completes, and when its result is
 * reported.
 *
 * NOTE(review): presumably one such struct is associated with each
 * PgAioHandleCallbackID, and members that are not needed may be left NULL -
 * confirm in aio_callback.c.
 *
 * typedef is in aio_types.h
 */
struct PgAioHandleCallbacks
{
	/*
	 * Prepare resources affected by the IO for execution. This could e.g.
	 * include moving ownership of buffer pins to the AIO subsystem.
	 */
	PgAioHandleCallbackStage stage;
	/*
	 * Update the state of resources affected by the IO to reflect completion
	 * of the IO. This could e.g. include updating shared buffer state to
	 * signal the IO has finished.
	 *
	 * The _shared suffix indicates that this is executed by the backend that
	 * completed the IO, which may or may not be the backend that issued the
	 * IO.  Obviously the callback thus can only modify resources in shared
	 * memory.
	 *
	 * The latest registered callback is called first. This allows
	 * higher-level code to register callbacks that can rely on callbacks
	 * registered by lower-level code to already have been executed.
	 *
	 * NB: This is called in a critical section. Errors can be signalled by
	 * the callback's return value, it's the responsibility of the IO's issuer
	 * to react appropriately.
	 */
	PgAioHandleCallbackComplete complete_shared;
	/*
	 * Like complete_shared, except called in the issuing backend.
	 *
	 * This variant of the completion callback is useful when backend-local
	 * state has to be updated to reflect the IO's completion. E.g. a
	 * temporary buffer's BufferDesc isn't accessible in complete_shared.
	 *
	 * Local callbacks are only called after complete_shared for all
	 * registered callbacks has been called.
	 */
	PgAioHandleCallbackComplete complete_local;
	/*
	 * Report the result of an IO operation. This is e.g. used to raise an
	 * error after an IO failed at the appropriate time (i.e. not when the IO
	 * failed, but under control of the code that issued the IO).
	 *
	 * The callback is passed the result, the target data and an error level.
	 */
	PgAioHandleCallbackReport report;
};
/*
 * Upper limit on how many callbacks can be registered for one IO handle.
 * Currently we only need two, but it's not hard to imagine needing a few
 * more.
 */
#define PGAIO_HANDLE_MAX_CALLBACKS	4
/* --------------------------------------------------------------------------------
 * IO Handles
 * --------------------------------------------------------------------------------
 */
/*
 * functions in aio.c
 *
 * struct ResourceOwnerData and struct dlist_node are only forward-declared
 * here; the prototypes below merely use pointers to them.
 */
struct ResourceOwnerData;
/*
 * Obtain an IO handle, given a resource owner and a PgAioReturn.
 *
 * NOTE(review): going by its name, the _nb variant is presumably
 * non-blocking and may return NULL when no handle is available - confirm in
 * aio.c.
 */
extern PgAioHandle *pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret);
extern PgAioHandle *pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret);
/*
 * NOTE(review): presumably gives back a handle obtained from one of the
 * acquire functions above - confirm in aio.c under which conditions that is
 * allowed.
 */
extern void pgaio_io_release(PgAioHandle *ioh);
struct dlist_node;
/*
 * NOTE(review): presumably the release path used by resource owner cleanup,
 * with on_error distinguishing error cleanup from regular release - confirm
 * in aio.c.
 */
extern void pgaio_io_release_resowner(struct dlist_node *ioh_node, bool on_error);
/* Set one of the PgAioHandleFlags on an IO. */
extern void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag);
/*
 * Accessors taking an IO handle: pgaio_io_get_id() returns an int,
 * pgaio_io_get_owner() a ProcNumber, and pgaio_io_get_wref() takes a
 * PgAioWaitRef pointer (NOTE(review): presumably an output parameter that is
 * filled with a reference to the IO - confirm in aio.c).
 */
extern int	pgaio_io_get_id(PgAioHandle *ioh);
extern ProcNumber pgaio_io_get_owner(PgAioHandle *ioh);
extern void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow);
/*
 * functions in aio_io.c
 *
 * struct iovec is only forward-declared here; the prototype below merely
 * uses a pointer to it.
 */
struct iovec;
/*
 * NOTE(review): presumably stores a pointer to the handle's iovec array in
 * *iov and returns the number of usable entries - confirm in aio_io.c.
 */
extern int	pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov);
/* Accessors for the operation (PgAioOp) of an IO and its PgAioOpData. */
extern PgAioOp pgaio_io_get_op(PgAioHandle *ioh);
extern PgAioOpData *pgaio_io_get_op_data(PgAioHandle *ioh);
/*
 * Start a vectored read / write on the given IO handle.
 *
 * NOTE(review): fd, iovcnt and offset presumably end up in the "read" /
 * "write" member of PgAioOpData. Note that iovcnt is an int here, while
 * PgAioOpData's iov_length is a uint16 - confirm range handling in aio_io.c.
 */
extern void pgaio_io_start_readv(PgAioHandle *ioh,
								 int fd, int iovcnt, uint64 offset);
extern void pgaio_io_start_writev(PgAioHandle *ioh,
								  int fd, int iovcnt, uint64 offset);
/*
 * functions in aio_target.c
 *
 * These deal with the target of an IO, identified by a PgAioTargetID.
 */
/* Associate a target, given by its PgAioTargetID, with the IO. */
extern void pgaio_io_set_target(PgAioHandle *ioh, PgAioTargetID targetid);
/*
 * NOTE(review): presumably returns whether a target other than
 * PGAIO_TID_INVALID has been set - confirm in aio_target.c.
 */
extern bool pgaio_io_has_target(PgAioHandle *ioh);
/* Return a pointer to the IO's PgAioTargetData. */
extern PgAioTargetData *pgaio_io_get_target_data(PgAioHandle *ioh);
/*
 * NOTE(review): presumably built via the target's describe_identity
 * callback; ownership of the returned string is not visible here - confirm
 * in aio_target.c.
 */
extern char *pgaio_io_get_target_description(PgAioHandle *ioh);
/* functions in aio_callback.c */
/*
 * Register the callbacks identified by cb_id on the IO, together with one
 * byte of callback data. At most PGAIO_HANDLE_MAX_CALLBACKS callbacks can be
 * registered for one IO handle.
 */
extern void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id,
										uint8 cb_data);
/*
 * Attach an array of "len" 64-bit or 32-bit values to the IO, and retrieve
 * such data again.
 *
 * NOTE(review): the getter always returns uint64 *, so 32-bit values are
 * presumably widened when stored; *len is presumably set to the number of
 * entries - confirm in aio_callback.c.
 */
extern void pgaio_io_set_handle_data_64(PgAioHandle *ioh, uint64 *data, uint8 len);
extern void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len);
extern uint64 *pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len);
/* --------------------------------------------------------------------------------
 * IO Wait References
 * --------------------------------------------------------------------------------
 */
/*
 * Operations on a PgAioWaitRef (a reference can be obtained from an IO handle
 * with pgaio_io_get_wref()).
 *
 * NOTE(review): going by the names, presumably: clear resets a reference to
 * an invalid state, valid tests for that state, get_id returns the ID of the
 * referenced IO, wait blocks until the referenced IO has completed, and
 * check_done tests for completion without blocking - confirm in aio.c.
 */
extern void pgaio_wref_clear(PgAioWaitRef *iow);
extern bool pgaio_wref_valid(PgAioWaitRef *iow);
extern int	pgaio_wref_get_id(PgAioWaitRef *iow);
extern void pgaio_wref_wait(PgAioWaitRef *iow);
extern bool pgaio_wref_check_done(PgAioWaitRef *iow);
/* --------------------------------------------------------------------------------
 * IO Result
 * --------------------------------------------------------------------------------
 */
/*
 * Report the result of an IO at the given error level. The parameter list is
 * the same as that of the PgAioHandleCallbackReport callback type.
 *
 * NOTE(review): presumably dispatches to the "report" callback selected by
 * the ID embedded in the result - confirm in aio_callback.c.
 */
extern void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data,
								int elevel);
/* --------------------------------------------------------------------------------
 * Actions on multiple IOs.
 * --------------------------------------------------------------------------------
 */
/*
 * NOTE(review): presumably, between enter and exit of batch mode IOs are
 * staged rather than submitted one by one; pgaio_submit_staged() submits the
 * IOs staged so far and pgaio_have_staged() reports whether any are staged -
 * confirm in aio.c.
 */
extern void pgaio_enter_batchmode(void);
extern void pgaio_exit_batchmode(void);
extern void pgaio_submit_staged(void);
extern bool pgaio_have_staged(void);
/* --------------------------------------------------------------------------------
 * Other
 * --------------------------------------------------------------------------------
 */
/*
 * NOTE(review): presumably to be called before a file descriptor is closed.
 * The comment on PgAioOpData states that all staged IOs are submitted when
 * closing an FD; this looks like the hook for that - confirm in aio.c.
 */
extern void pgaio_closing_fd(int fd);
/*
 * GUCs
 *
 * io_method is declared as int; NOTE(review): it presumably holds one of the
 * IoMethod values, defaulting to DEFAULT_IO_METHOD - confirm in the GUC
 * tables. The unit and range of io_max_concurrency are not visible here.
 */
extern PGDLLIMPORT int io_method;
extern PGDLLIMPORT int io_max_concurrency;
#endif							/* AIO_H */