diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 39984df31b4..9f76d2683c0 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -409,7 +409,6 @@ static int pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) { struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring; - int in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios); Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); @@ -425,27 +424,6 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) pgaio_io_prepare_submit(ioh); pgaio_uring_sq_from_io(ioh, sqe); - - /* - * io_uring executes IO in process context if possible. That's - * generally good, as it reduces context switching. When performing a - * lot of buffered IO that means that copying between page cache and - * userspace memory happens in the foreground, as it can't be - * offloaded to DMA hardware as is possible when using direct IO. When - * executing a lot of buffered IO this causes io_uring to be slower - * than worker mode, as worker mode parallelizes the copying. io_uring - * can be told to offload work to worker threads instead. - * - * If an IO is buffered IO and we already have IOs in flight or - * multiple IOs are being submitted, we thus tell io_uring to execute - * the IO in the background. We don't do so for the first few IOs - * being submitted as executing in this process' context has lower - * latency. - */ - if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED)) - io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); - - in_flight_before++; } while (true) @@ -701,10 +679,65 @@ pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation) LWLockRelease(&owner_context->completion_lock); } +/* + * io_uring executes IO in process context if possible. That's generally good, + * as it reduces context switching. When performing a lot of buffered IO that + * means that copying between page cache and userspace memory happens in the + * foreground, as it can't be offloaded to DMA hardware as is possible when + * using direct IO. When executing a lot of buffered IO this causes io_uring + * to be slower than worker mode, as worker mode parallelizes the + * copying. io_uring can be told to offload work to worker threads instead. + * + * If the IOs are small, we only benefit from forcing things into the + * background if there is a lot of IO, as otherwise the overhead from context + * switching is higher than the gain. + * + * If IOs are large, there is benefit from asynchronous processing at lower + * queue depths, as IO latency is less of a crucial factor and parallelizing + * memory copies is more important. In addition, it is important to trigger + * asynchronous processing even at low queue depth, as with foreground + * processing we might never actually reach deep enough IO depths to trigger + * asynchronous processing, which in turn would deprive readahead control + * logic of information about whether a deeper look-ahead distance would be + * advantageous. + * + * We have done some basic benchmarking to validate the thresholds used, but + * it's quite plausible that there are better values. See + * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg + * for some details of this benchmarking. + */ +static bool +pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size) +{ + /* + * With DIO there's no benefit from forcing asynchronous processing, as + * io_uring will never execute direct IO synchronously during submission. + */ + if (!(ioh->flags & PGAIO_HF_BUFFERED)) + return false; + + /* + * Once the IO queue depth is not that shallow anymore, the overhead of + * dispatching to the background is a less significant factor. + */ + if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4) + return true; + + /* + * If the IO is larger, the gains from parallelizing the memory copy are + * larger and typically the impact of the latency is smaller. + */ + if (io_size >= (BLCKSZ * 4)) + return true; + + return false; +} + static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) { struct iovec *iov; + size_t io_size = 0; switch ((PgAioOp) ioh->op) { @@ -717,6 +750,8 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) iov->iov_base, iov->iov_len, ioh->op_data.read.offset); + + io_size = iov->iov_len; } else { @@ -726,7 +761,13 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) ioh->op_data.read.iov_length, ioh->op_data.read.offset); + for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++) + io_size += iov->iov_len; } + + if (pgaio_uring_should_use_async(ioh, io_size)) + io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); + break; case PGAIO_OP_WRITEV: @@ -747,6 +788,12 @@ pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe) ioh->op_data.write.iov_length, ioh->op_data.write.offset); } + + /* + * For now don't trigger use of IOSQE_ASYNC for writes, it's not + * clear there is a performance benefit in doing so. + */ + break; case PGAIO_OP_INVALID: