diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 5b8b521802e..e95e19081e1 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -46,6 +46,7 @@ EXPLAIN [ ( option [, ...] ) ] boolean ]
SUMMARY [ boolean ]
MEMORY [ boolean ]
+ IO [ boolean ]
FORMAT { TEXT | XML | JSON | YAML }
@@ -298,6 +299,17 @@ ROLLBACK;
+
+ IO
+
+
+ Include information on I/O performed by scan nodes proving such information.
+ This parameter may only be used when ANALYZE is also
+ enabled. It defaults to FALSE.
+
+
+
+
FORMAT
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f6ac5a0897c..abfd8e8970a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -43,6 +43,7 @@
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
+#include "executor/instrument_node.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
#include "storage/lmgr.h"
@@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
+ scan->rs_base.rs_instrument = NULL;
scan->rs_strategy = NULL; /* set in initscan */
scan->rs_cbuf = InvalidBuffer;
@@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot,
sizeof(TBMIterateResult));
}
+ /* enable read stream instrumentation */
+ if ((flags & SO_SCAN_INSTRUMENT) && (scan->rs_read_stream != NULL))
+ {
+ scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation);
+ read_stream_enable_stats(scan->rs_read_stream,
+ &scan->rs_base.rs_instrument->io);
+ }
+
scan->rs_vmbuffer = InvalidBuffer;
return (TableScanDesc) scan;
@@ -1415,6 +1425,9 @@ heap_endscan(TableScanDesc sscan)
if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
UnregisterSnapshot(scan->rs_base.rs_snapshot);
+ if (scan->rs_base.rs_instrument)
+ pfree(scan->rs_base.rs_instrument);
+
pfree(scan);
}
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index f151f21f9b3..20e0bc8f232 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/relscan.h"
#include "access/xact.h"
#include "catalog/pg_type.h"
#include "commands/createas.h"
@@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es);
static void show_indexsearches_info(PlanState *planstate, ExplainState *es);
static void show_tidbitmap_info(BitmapHeapScanState *planstate,
ExplainState *es);
+static void show_scan_io_usage(ScanState *planstate,
+ ExplainState *es);
static void show_instrumentation_count(const char *qlabel, int which,
PlanState *planstate, ExplainState *es);
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
@@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
instrument_option |= INSTRUMENT_BUFFERS;
if (es->wal)
instrument_option |= INSTRUMENT_WAL;
+ if (es->io)
+ instrument_option |= INSTRUMENT_IO;
/*
* We always collect timing for the entire statement, even when node-level
@@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_tidbitmap_info((BitmapHeapScanState *) planstate, es);
+ show_scan_io_usage((ScanState *) planstate, es);
break;
case T_SampleScan:
show_tablesample(((SampleScan *) plan)->tablesample,
@@ -3984,6 +3990,128 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
}
}
+/*
+ * Print I/O stats - prefetching and I/O performed
+ *
+ * This prints two types of stats - "prefetch" about the prefetching done by
+ * ReadStream, and "I/O" issued by the stream. The prefetch stats are based
+ * on buffers pulled from the stream (even if no I/O is needed). The I/O
+ * information is related to I/O requests issued by the stream.
+ *
+ * The prefetch stats are printed if any buffer was pulled from the stream.
+ * For the I/O stats it depend on the output format. In non-text formats the
+ * information is printed if prefetch stats were printed. In text format it
+ * gets printed only if there were any I/O requests.
+ */
+static void
+print_io_usage(ExplainState *es, IOStats *stats)
+{
+ /* don't print prefetch stats if there's nothing to report */
+ if (stats->prefetch_count > 0)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ /* prefetch distance info */
+ ExplainIndentText(es);
+ appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n",
+ (stats->distance_sum * 1.0 / stats->prefetch_count),
+ stats->distance_max,
+ stats->distance_capacity);
+
+ /* prefetch I/O info (only if there were actual I/Os) */
+ if (stats->io_count > 0)
+ {
+ ExplainIndentText(es);
+ appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64
+ " size=%.2f in-progress=%.2f\n",
+ stats->io_count, stats->wait_count,
+ (stats->io_nblocks * 1.0 / stats->io_count),
+ (stats->io_in_progress * 1.0 / stats->io_count));
+ }
+ }
+ else
+ {
+ ExplainPropertyFloat("Average Prefetch Distance", NULL,
+ (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es);
+ ExplainPropertyInteger("Max Prefetch Distance", NULL,
+ stats->distance_max, es);
+ ExplainPropertyInteger("Prefetch Capacity", NULL,
+ stats->distance_capacity, es);
+
+ ExplainPropertyUInteger("I/O Count", NULL,
+ stats->io_count, es);
+ ExplainPropertyUInteger("I/O Waits", NULL,
+ stats->wait_count, es);
+ ExplainPropertyFloat("Average I/O Size", NULL,
+ (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es);
+ ExplainPropertyFloat("Average I/Os In Progress", NULL,
+ (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es);
+ }
+ }
+}
+
+/*
+ * Show information about prefetch and I/O in a scan node.
+ */
+static void
+show_scan_io_usage(ScanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->ps.plan;
+ IOStats stats = {0};
+
+ if (!es->io)
+ return;
+
+ /*
+ * Initialize counters with stats from the local process first.
+ *
+ * The scan descriptor may not exist, e.g. if the scan did not start, or
+ * because of debug_parallel_query=regress. We still want to collect data
+ * from workers.
+ */
+ if (planstate->ss_currentScanDesc &&
+ planstate->ss_currentScanDesc->rs_instrument)
+ {
+ stats = planstate->ss_currentScanDesc->rs_instrument->io;
+ }
+
+ /*
+ * Accumulate data from parallel workers (if any).
+ */
+ switch (nodeTag(plan))
+ {
+ case T_BitmapHeapScan:
+ {
+ SharedBitmapHeapInstrumentation *sinstrument
+ = ((BitmapHeapScanState *) planstate)->sinstrument;
+
+ if (sinstrument)
+ {
+ for (int i = 0; i < sinstrument->num_workers; ++i)
+ {
+ BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+ AccumulateIOStats(&stats, &winstrument->stats.io);
+
+ if (!es->workers_state)
+ continue;
+
+ ExplainOpenWorker(i, es);
+ print_io_usage(es, &winstrument->stats.io);
+ ExplainCloseWorker(i, es);
+ }
+ }
+
+ break;
+ }
+ default:
+ /* ignore other plans */
+ return;
+ }
+
+ print_io_usage(es, &stats);
+}
+
/*
* If it's EXPLAIN ANALYZE, show instrumentation information for a plan node
*
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 65dd4111459..0e07a63fca6 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
"EXPLAIN", opt->defname, p),
parser_errposition(pstate, opt->location)));
}
+ else if (strcmp(opt->defname, "io") == 0)
+ es->io = defGetBoolean(opt);
else if (!ApplyExtensionExplainOption(es, opt, pstate))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
@@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("EXPLAIN option %s requires ANALYZE", "TIMING")));
+ /* check that IO is used with EXPLAIN ANALYZE */
+ if (es->io && !es->analyze)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("EXPLAIN option %s requires ANALYZE", "IO")));
+
/* check that serialize is used with EXPLAIN ANALYZE */
if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze)
ereport(ERROR,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index d65e2a87b42..83d6478bc2b 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
*/
if (!node->ss.ss_currentScanDesc)
{
+ uint32 flags = SO_NONE;
+
+ if (ScanRelIsReadOnly(&node->ss))
+ flags |= SO_HINT_REL_READ_ONLY;
+
+ if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+ flags |= SO_SCAN_INSTRUMENT;
+
node->ss.ss_currentScanDesc =
table_beginscan_bm(node->ss.ss_currentRelation,
node->ss.ps.state->es_snapshot,
0,
NULL,
- ScanRelIsReadOnly(&node->ss) ?
- SO_HINT_REL_READ_ONLY : SO_NONE);
+ flags);
}
node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator;
@@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
*/
si->exact_pages += node->stats.exact_pages;
si->lossy_pages += node->stats.lossy_pages;
+
+ /* collect I/O instrumentation for this process */
+ if (node->ss.ss_currentScanDesc &&
+ node->ss.ss_currentScanDesc->rs_instrument)
+ {
+ AccumulateIOStats(&si->stats.io,
+ &node->ss.ss_currentScanDesc->rs_instrument->io);
+ }
}
/*
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 0b6cdf7c873..b6fce4e7cc6 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -74,6 +74,7 @@
#include "postgres.h"
#include "miscadmin.h"
+#include "executor/instrument_node.h"
#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/smgr.h"
@@ -123,6 +124,9 @@ struct ReadStream
bool advice_enabled;
bool temporary;
+ /* scan stats counters */
+ IOStats *stats;
+
/*
* One-block buffer to support 'ungetting' a block number, to resolve flow
* control problems when I/Os are split.
@@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream,
return InvalidBlockNumber;
}
+/*
+ * Update stream stats with current pinned buffer depth.
+ *
+ * Called once per buffer returned to the consumer in read_stream_next_buffer().
+ * Records the number of pinned buffers at that moment, so we can compute the
+ * average look-ahead depth.
+ */
+static inline void
+read_stream_count_prefetch(ReadStream *stream)
+{
+ IOStats *stats = stream->stats;
+
+ if (stats == NULL)
+ return;
+
+ stats->prefetch_count++;
+ stats->distance_sum += stream->pinned_buffers;
+ if (stream->pinned_buffers > stats->distance_max)
+ stats->distance_max = stream->pinned_buffers;
+}
+
+/*
+ * Update stream stats about size of I/O requests.
+ *
+ * We count the number of I/O requests, size of requests (counted in blocks)
+ * and number of in-progress I/Os.
+ */
+static inline void
+read_stream_count_io(ReadStream *stream, int nblocks, int in_progress)
+{
+ IOStats *stats = stream->stats;
+
+ if (stats == NULL)
+ return;
+
+ stats->io_count++;
+ stats->io_nblocks += nblocks;
+ stats->io_in_progress += in_progress;
+}
+
+/*
+ * Update stream stats about waits for I/O when consuming buffers.
+ *
+ * We count the number of I/O waits while pulling buffers out of a stream.
+ */
+static inline void
+read_stream_count_wait(ReadStream *stream)
+{
+ IOStats *stats = stream->stats;
+
+ if (stats == NULL)
+ return;
+
+ stats->wait_count++;
+}
+
+/*
+ * Enable collection of stats into the provided IOStats.
+ */
+void
+read_stream_enable_stats(ReadStream *stream, IOStats *stats)
+{
+ stream->stats = stats;
+ if (stream->stats)
+ stream->stats->distance_capacity = stream->max_pinned_buffers;
+}
+
/*
* Ask the callback which block it would like us to read next, with a one block
* buffer in front to allow read_stream_unget_block() to work.
@@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream)
Assert(stream->ios_in_progress < stream->max_ios);
stream->ios_in_progress++;
stream->seq_blocknum = stream->pending_read_blocknum + nblocks;
+
+ /* update I/O stats */
+ read_stream_count_io(stream, nblocks, stream->ios_in_progress);
}
/*
@@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
flags)))
{
/* Fast return. */
+ read_stream_count_prefetch(stream);
return buffer;
}
@@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
* to avoid having to effectively do another synchronous IO for
* the next block (if it were also a miss).
*/
+
+ /* update I/O stats */
+ read_stream_count_io(stream, 1, stream->ios_in_progress);
+
+ /* update prefetch distance */
+ read_stream_count_prefetch(stream);
}
else
{
@@ -1116,6 +1197,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY)
needed_wait = true;
+ /* Count it as a wait if we need to wait for IO */
+ if (needed_wait)
+ read_stream_count_wait(stream);
+
/*
* Have the read-ahead distance ramp up rapidly after we needed to
* wait for IO. We only increase the read-ahead-distance when we
@@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
}
#endif
+ read_stream_count_prefetch(stream);
+
/* Pin transferred to caller. */
Assert(stream->pinned_buffers > 0);
stream->pinned_buffers--;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index fd2076c582a..2ea06a67a63 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -24,6 +24,7 @@
struct ParallelTableScanDescData;
+struct TableScanInstrumentation;
/*
* Generic descriptor for table scans. This is the base-class for table scans,
@@ -64,6 +65,11 @@ typedef struct TableScanDescData
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */
+
+ /*
+ * Instrumentation counters maintained by all table AMs.
+ */
+ struct TableScanInstrumentation *rs_instrument;
} TableScanDescData;
typedef struct TableScanDescData *TableScanDesc;
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index a21c7db5439..c13f05d39db 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -69,6 +69,9 @@ typedef enum ScanOptions
/* set if the query doesn't modify the relation */
SO_HINT_REL_READ_ONLY = 1 << 10,
+
+ /* collect scan instrumentation */
+ SO_SCAN_INSTRUMENT = 1 << 11,
} ScanOptions;
/*
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 6252fe11f15..97bc7ed49f6 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
bool summary; /* print total planning and execution timing */
bool memory; /* print planner's memory usage information */
bool settings; /* print modified settings */
+ bool io; /* print info about IO (prefetch, ...) */
bool generic; /* generate a generic plan */
ExplainSerializeOption serialize; /* serialize the query's output? */
ExplainFormat format; /* output format */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index cc9fbb0e2f0..f093a52aae0 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -64,6 +64,7 @@ typedef enum InstrumentOption
INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */
INSTRUMENT_ROWS = 1 << 2, /* needs row count */
INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */
+ INSTRUMENT_IO = 1 << 4, /* needs IO usage */
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index e6a3f9f1941..22a75ccd863 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -48,6 +48,55 @@ typedef struct SharedAggInfo
} SharedAggInfo;
+/* ---------------------
+ * Instrumentation information about read streams and I/O
+ * ---------------------
+ */
+typedef struct IOStats
+{
+ /* number of buffers returned to consumer (for averaging distance) */
+ uint64 prefetch_count;
+
+ /* sum of pinned_buffers sampled at each buffer return */
+ uint64 distance_sum;
+
+ /* maximum actual pinned_buffers observed during the scan */
+ int16 distance_max;
+
+ /* maximum possible look-ahead distance (max_pinned_buffers) */
+ int16 distance_capacity;
+
+ /* number of waits for a read (for the I/O) */
+ uint64 wait_count;
+
+ /* I/O stats */
+ uint64 io_count; /* number of I/Os */
+ uint64 io_nblocks; /* sum of blocks for all I/Os */
+ uint64 io_in_progress; /* sum of in-progress I/Os */
+} IOStats;
+
+typedef struct TableScanInstrumentation
+{
+ IOStats io;
+} TableScanInstrumentation;
+
+/* merge IO statistics from 'src' into 'dst' */
+static inline void
+AccumulateIOStats(IOStats *dst, IOStats *src)
+{
+ dst->prefetch_count += src->prefetch_count;
+ dst->distance_sum += src->distance_sum;
+ if (src->distance_max > dst->distance_max)
+ dst->distance_max = src->distance_max;
+ if (src->distance_capacity > dst->distance_capacity)
+ dst->distance_capacity = src->distance_capacity;
+ dst->wait_count += src->wait_count;
+ dst->io_count += src->io_count;
+ dst->io_nblocks += src->io_nblocks;
+ dst->io_in_progress += src->io_in_progress;
+}
+
+
/* ---------------------
* Instrumentation information for indexscans (amgettuple and amgetbitmap)
* ---------------------
@@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation
{
uint64 exact_pages;
uint64 lossy_pages;
+ TableScanInstrumentation stats;
} BitmapHeapScanInstrumentation;
/*
diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index c9359b29b0f..48995c6d534 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -65,6 +65,7 @@
struct ReadStream;
typedef struct ReadStream ReadStream;
+struct IOStats;
/* for block_range_read_stream_cb */
typedef struct BlockRangeReadStreamPrivate
@@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream);
extern void read_stream_resume(ReadStream *stream);
extern void read_stream_reset(ReadStream *stream);
extern void read_stream_end(ReadStream *stream);
+extern void read_stream_enable_stats(ReadStream *stream, struct IOStats *stats);
#endif /* READ_STREAM_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a998bb5e882..ca54c783647 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1279,6 +1279,7 @@ IOContext
IOFuncSelector
IOObject
IOOp
+IOStats
IO_STATUS_BLOCK
IPCompareMethod
ITEM
@@ -3127,6 +3128,7 @@ TableLikeClause
TableSampleClause
TableScanDesc
TableScanDescData
+TableScanInstrumentation
TableSpaceCacheEntry
TableSpaceOpts
TableToProcess