diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802e..e95e19081e1 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -46,6 +46,7 @@ EXPLAIN [ ( option [, ...] ) ] boolean ] SUMMARY [ boolean ] MEMORY [ boolean ] + IO [ boolean ] FORMAT { TEXT | XML | JSON | YAML } @@ -298,6 +299,17 @@ ROLLBACK; + + IO + + + Include information on I/O performed by scan nodes proving such information. + This parameter may only be used when ANALYZE is also + enabled. It defaults to FALSE. + + + + FORMAT diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f6ac5a0897c..abfd8e8970a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -43,6 +43,7 @@ #include "catalog/pg_database.h" #include "catalog/pg_database_d.h" #include "commands/vacuum.h" +#include "executor/instrument_node.h" #include "pgstat.h" #include "port/pg_bitutils.h" #include "storage/lmgr.h" @@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_base.rs_nkeys = nkeys; scan->rs_base.rs_flags = flags; scan->rs_base.rs_parallel = parallel_scan; + scan->rs_base.rs_instrument = NULL; scan->rs_strategy = NULL; /* set in initscan */ scan->rs_cbuf = InvalidBuffer; @@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + /* enable read stream instrumentation */ + if ((flags & SO_SCAN_INSTRUMENT) && (scan->rs_read_stream != NULL)) + { + scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation); + read_stream_enable_stats(scan->rs_read_stream, + &scan->rs_base.rs_instrument->io); + } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; @@ -1415,6 +1425,9 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); + if (scan->rs_base.rs_instrument) + pfree(scan->rs_base.rs_instrument); + pfree(scan); } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f151f21f9b3..20e0bc8f232 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "access/xact.h" #include "catalog/pg_type.h" #include "commands/createas.h" @@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_indexsearches_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); +static void show_scan_io_usage(ScanState *planstate, + ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); @@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; + if (es->io) + instrument_option |= INSTRUMENT_IO; /* * We always collect timing for the entire statement, even when node-level @@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_tidbitmap_info((BitmapHeapScanState *) planstate, es); + show_scan_io_usage((ScanState *) planstate, es); break; case T_SampleScan: show_tablesample(((SampleScan *) plan)->tablesample, @@ -3984,6 +3990,128 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) } } +/* + * Print I/O stats - prefetching and I/O performed + * + * This prints two types of stats - "prefetch" about the prefetching done by + * ReadStream, and "I/O" issued by the stream. The prefetch stats are based + * on buffers pulled from the stream (even if no I/O is needed). The I/O + * information is related to I/O requests issued by the stream. + * + * The prefetch stats are printed if any buffer was pulled from the stream. + * For the I/O stats it depend on the output format. In non-text formats the + * information is printed if prefetch stats were printed. In text format it + * gets printed only if there were any I/O requests. + */ +static void +print_io_usage(ExplainState *es, IOStats *stats) +{ + /* don't print prefetch stats if there's nothing to report */ + if (stats->prefetch_count > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* prefetch distance info */ + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n", + (stats->distance_sum * 1.0 / stats->prefetch_count), + stats->distance_max, + stats->distance_capacity); + + /* prefetch I/O info (only if there were actual I/Os) */ + if (stats->io_count > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64 + " size=%.2f in-progress=%.2f\n", + stats->io_count, stats->wait_count, + (stats->io_nblocks * 1.0 / stats->io_count), + (stats->io_in_progress * 1.0 / stats->io_count)); + } + } + else + { + ExplainPropertyFloat("Average Prefetch Distance", NULL, + (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es); + ExplainPropertyInteger("Max Prefetch Distance", NULL, + stats->distance_max, es); + ExplainPropertyInteger("Prefetch Capacity", NULL, + stats->distance_capacity, es); + + ExplainPropertyUInteger("I/O Count", NULL, + stats->io_count, es); + ExplainPropertyUInteger("I/O Waits", NULL, + stats->wait_count, es); + ExplainPropertyFloat("Average I/O Size", NULL, + (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es); + ExplainPropertyFloat("Average I/Os In Progress", NULL, + (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es); + } + } +} + +/* + * Show information about prefetch and I/O in a scan node. + */ +static void +show_scan_io_usage(ScanState *planstate, ExplainState *es) +{ + Plan *plan = planstate->ps.plan; + IOStats stats = {0}; + + if (!es->io) + return; + + /* + * Initialize counters with stats from the local process first. + * + * The scan descriptor may not exist, e.g. if the scan did not start, or + * because of debug_parallel_query=regress. We still want to collect data + * from workers. + */ + if (planstate->ss_currentScanDesc && + planstate->ss_currentScanDesc->rs_instrument) + { + stats = planstate->ss_currentScanDesc->rs_instrument->io; + } + + /* + * Accumulate data from parallel workers (if any). + */ + switch (nodeTag(plan)) + { + case T_BitmapHeapScan: + { + SharedBitmapHeapInstrumentation *sinstrument + = ((BitmapHeapScanState *) planstate)->sinstrument; + + if (sinstrument) + { + for (int i = 0; i < sinstrument->num_workers; ++i) + { + BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i]; + + AccumulateIOStats(&stats, &winstrument->stats.io); + + if (!es->workers_state) + continue; + + ExplainOpenWorker(i, es); + print_io_usage(es, &winstrument->stats.io); + ExplainCloseWorker(i, es); + } + } + + break; + } + default: + /* ignore other plans */ + return; + } + + print_io_usage(es, &stats); +} + /* * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node * diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c index 65dd4111459..0e07a63fca6 100644 --- a/src/backend/commands/explain_state.c +++ b/src/backend/commands/explain_state.c @@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) "EXPLAIN", opt->defname, p), parser_errposition(pstate, opt->location))); } + else if (strcmp(opt->defname, "io") == 0) + es->io = defGetBoolean(opt); else if (!ApplyExtensionExplainOption(es, opt, pstate)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("EXPLAIN option %s requires ANALYZE", "TIMING"))); + /* check that IO is used with EXPLAIN ANALYZE */ + if (es->io && !es->analyze) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("EXPLAIN option %s requires ANALYZE", "IO"))); + /* check that serialize is used with EXPLAIN ANALYZE */ if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze) ereport(ERROR, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index d65e2a87b42..83d6478bc2b 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (node->ss.ps.state->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, NULL, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + flags); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; @@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) */ si->exact_pages += node->stats.exact_pages; si->lossy_pages += node->stats.lossy_pages; + + /* collect I/O instrumentation for this process */ + if (node->ss.ss_currentScanDesc && + node->ss.ss_currentScanDesc->rs_instrument) + { + AccumulateIOStats(&si->stats.io, + &node->ss.ss_currentScanDesc->rs_instrument->io); + } } /* diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 0b6cdf7c873..b6fce4e7cc6 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -74,6 +74,7 @@ #include "postgres.h" #include "miscadmin.h" +#include "executor/instrument_node.h" #include "storage/aio.h" #include "storage/fd.h" #include "storage/smgr.h" @@ -123,6 +124,9 @@ struct ReadStream bool advice_enabled; bool temporary; + /* scan stats counters */ + IOStats *stats; + /* * One-block buffer to support 'ungetting' a block number, to resolve flow * control problems when I/Os are split. @@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream, return InvalidBlockNumber; } +/* + * Update stream stats with current pinned buffer depth. + * + * Called once per buffer returned to the consumer in read_stream_next_buffer(). + * Records the number of pinned buffers at that moment, so we can compute the + * average look-ahead depth. + */ +static inline void +read_stream_count_prefetch(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->prefetch_count++; + stats->distance_sum += stream->pinned_buffers; + if (stream->pinned_buffers > stats->distance_max) + stats->distance_max = stream->pinned_buffers; +} + +/* + * Update stream stats about size of I/O requests. + * + * We count the number of I/O requests, size of requests (counted in blocks) + * and number of in-progress I/Os. + */ +static inline void +read_stream_count_io(ReadStream *stream, int nblocks, int in_progress) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->io_count++; + stats->io_nblocks += nblocks; + stats->io_in_progress += in_progress; +} + +/* + * Update stream stats about waits for I/O when consuming buffers. + * + * We count the number of I/O waits while pulling buffers out of a stream. + */ +static inline void +read_stream_count_wait(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->wait_count++; +} + +/* + * Enable collection of stats into the provided IOStats. + */ +void +read_stream_enable_stats(ReadStream *stream, IOStats *stats) +{ + stream->stats = stats; + if (stream->stats) + stream->stats->distance_capacity = stream->max_pinned_buffers; +} + /* * Ask the callback which block it would like us to read next, with a one block * buffer in front to allow read_stream_unget_block() to work. @@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream) Assert(stream->ios_in_progress < stream->max_ios); stream->ios_in_progress++; stream->seq_blocknum = stream->pending_read_blocknum + nblocks; + + /* update I/O stats */ + read_stream_count_io(stream, nblocks, stream->ios_in_progress); } /* @@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) flags))) { /* Fast return. */ + read_stream_count_prefetch(stream); return buffer; } @@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * to avoid having to effectively do another synchronous IO for * the next block (if it were also a miss). */ + + /* update I/O stats */ + read_stream_count_io(stream, 1, stream->ios_in_progress); + + /* update prefetch distance */ + read_stream_count_prefetch(stream); } else { @@ -1116,6 +1197,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY) needed_wait = true; + /* Count it as a wait if we need to wait for IO */ + if (needed_wait) + read_stream_count_wait(stream); + /* * Have the read-ahead distance ramp up rapidly after we needed to * wait for IO. We only increase the read-ahead-distance when we @@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) } #endif + read_stream_count_prefetch(stream); + /* Pin transferred to caller. */ Assert(stream->pinned_buffers > 0); stream->pinned_buffers--; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index fd2076c582a..2ea06a67a63 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -24,6 +24,7 @@ struct ParallelTableScanDescData; +struct TableScanInstrumentation; /* * Generic descriptor for table scans. This is the base-class for table scans, @@ -64,6 +65,11 @@ typedef struct TableScanDescData struct ParallelTableScanDescData *rs_parallel; /* parallel scan * information */ + + /* + * Instrumentation counters maintained by all table AMs. + */ + struct TableScanInstrumentation *rs_instrument; } TableScanDescData; typedef struct TableScanDescData *TableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index a21c7db5439..c13f05d39db 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -69,6 +69,9 @@ typedef enum ScanOptions /* set if the query doesn't modify the relation */ SO_HINT_REL_READ_ONLY = 1 << 10, + + /* collect scan instrumentation */ + SO_SCAN_INSTRUMENT = 1 << 11, } ScanOptions; /* diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h index 6252fe11f15..97bc7ed49f6 100644 --- a/src/include/commands/explain_state.h +++ b/src/include/commands/explain_state.h @@ -55,6 +55,7 @@ typedef struct ExplainState bool summary; /* print total planning and execution timing */ bool memory; /* print planner's memory usage information */ bool settings; /* print modified settings */ + bool io; /* print info about IO (prefetch, ...) */ bool generic; /* generate a generic plan */ ExplainSerializeOption serialize; /* serialize the query's output? */ ExplainFormat format; /* output format */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index cc9fbb0e2f0..f093a52aae0 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -64,6 +64,7 @@ typedef enum InstrumentOption INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ INSTRUMENT_ROWS = 1 << 2, /* needs row count */ INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */ + INSTRUMENT_IO = 1 << 4, /* needs IO usage */ INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index e6a3f9f1941..22a75ccd863 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -48,6 +48,55 @@ typedef struct SharedAggInfo } SharedAggInfo; +/* --------------------- + * Instrumentation information about read streams and I/O + * --------------------- + */ +typedef struct IOStats +{ + /* number of buffers returned to consumer (for averaging distance) */ + uint64 prefetch_count; + + /* sum of pinned_buffers sampled at each buffer return */ + uint64 distance_sum; + + /* maximum actual pinned_buffers observed during the scan */ + int16 distance_max; + + /* maximum possible look-ahead distance (max_pinned_buffers) */ + int16 distance_capacity; + + /* number of waits for a read (for the I/O) */ + uint64 wait_count; + + /* I/O stats */ + uint64 io_count; /* number of I/Os */ + uint64 io_nblocks; /* sum of blocks for all I/Os */ + uint64 io_in_progress; /* sum of in-progress I/Os */ +} IOStats; + +typedef struct TableScanInstrumentation +{ + IOStats io; +} TableScanInstrumentation; + +/* merge IO statistics from 'src' into 'dst' */ +static inline void +AccumulateIOStats(IOStats *dst, IOStats *src) +{ + dst->prefetch_count += src->prefetch_count; + dst->distance_sum += src->distance_sum; + if (src->distance_max > dst->distance_max) + dst->distance_max = src->distance_max; + if (src->distance_capacity > dst->distance_capacity) + dst->distance_capacity = src->distance_capacity; + dst->wait_count += src->wait_count; + dst->io_count += src->io_count; + dst->io_nblocks += src->io_nblocks; + dst->io_in_progress += src->io_in_progress; +} + + /* --------------------- * Instrumentation information for indexscans (amgettuple and amgetbitmap) * --------------------- @@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation { uint64 exact_pages; uint64 lossy_pages; + TableScanInstrumentation stats; } BitmapHeapScanInstrumentation; /* diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index c9359b29b0f..48995c6d534 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -65,6 +65,7 @@ struct ReadStream; typedef struct ReadStream ReadStream; +struct IOStats; /* for block_range_read_stream_cb */ typedef struct BlockRangeReadStreamPrivate @@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream); extern void read_stream_resume(ReadStream *stream); extern void read_stream_reset(ReadStream *stream); extern void read_stream_end(ReadStream *stream); +extern void read_stream_enable_stats(ReadStream *stream, struct IOStats *stats); #endif /* READ_STREAM_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a998bb5e882..ca54c783647 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1279,6 +1279,7 @@ IOContext IOFuncSelector IOObject IOOp +IOStats IO_STATUS_BLOCK IPCompareMethod ITEM @@ -3127,6 +3128,7 @@ TableLikeClause TableSampleClause TableScanDesc TableScanDescData +TableScanInstrumentation TableSpaceCacheEntry TableSpaceOpts TableToProcess