diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 96ee312ebdf..b38422c47a4 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -82,6 +82,18 @@ typedef struct JoinTreeItem * lateral references */ } JoinTreeItem; +/* + * Compatibility info for one GROUP BY item, precomputed for use by + * remove_useless_groupby_columns() when matching unique-index columns against + * GROUP BY items. + */ +typedef struct GroupByColInfo +{ + AttrNumber attno; /* var->varattno */ + List *eq_opfamilies; /* mergejoin opfamilies of sgc->eqop */ + Oid coll; /* var->varcollid */ +} GroupByColInfo; + static bool is_partial_agg_memory_risky(PlannerInfo *root); static void create_agg_clause_infos(PlannerInfo *root); @@ -421,6 +433,7 @@ remove_useless_groupby_columns(PlannerInfo *root) { Query *parse = root->parse; Bitmapset **groupbyattnos; + List **groupbycols; Bitmapset **surplusvars; bool tryremove = false; ListCell *lc; @@ -437,14 +450,20 @@ remove_useless_groupby_columns(PlannerInfo *root) /* * Scan the GROUP BY clause to find GROUP BY items that are simple Vars. * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k - * that are GROUP BY items. + * that are GROUP BY items, and groupbycols[k] with a parallel list of + * GroupByColInfo records. We need the latter so that, when checking a + * unique index against this rel's GROUP BY items, we can verify that the + * index's notion of equality agrees with at least one GROUP BY item per + * index column. */ groupbyattnos = palloc0_array(Bitmapset *, list_length(parse->rtable) + 1); + groupbycols = palloc0_array(List *, list_length(parse->rtable) + 1); foreach(lc, root->processed_groupClause) { SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); Var *var = (Var *) tle->expr; + GroupByColInfo *info; /* * Ignore non-Vars and Vars from other query levels. @@ -470,6 +489,12 @@ remove_useless_groupby_columns(PlannerInfo *root) tryremove |= !bms_is_empty(groupbyattnos[relid]); groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], var->varattno - FirstLowInvalidHeapAttributeNumber); + + info = palloc(sizeof(GroupByColInfo)); + info->attno = var->varattno; + info->eq_opfamilies = get_mergejoin_opfamilies(sgc->eqop); + info->coll = var->varcollid; + groupbycols[relid] = lappend(groupbycols[relid], info); } /* @@ -524,7 +549,7 @@ remove_useless_groupby_columns(PlannerInfo *root) foreach_node(IndexOptInfo, index, rel->indexlist) { Bitmapset *ind_attnos; - bool nulls_check_ok; + bool index_check_ok; /* * Skip any non-unique and deferrable indexes. Predicate indexes @@ -539,9 +564,14 @@ remove_useless_groupby_columns(PlannerInfo *root) continue; ind_attnos = NULL; - nulls_check_ok = true; + index_check_ok = true; for (int i = 0; i < index->nkeycolumns; i++) { + AttrNumber indkey_attno = index->indexkeys[i]; + Oid indkey_opfamily = index->opfamily[i]; + Oid indkey_coll = index->indexcollations[i]; + ListCell *lc2; + /* * We must insist that the index columns are all defined NOT * NULL otherwise duplicate NULLs could exist. However, we @@ -551,20 +581,41 @@ remove_useless_groupby_columns(PlannerInfo *root) * despite the NULL. */ if (!index->nullsnotdistinct && - !bms_is_member(index->indexkeys[i], - rel->notnullattnums)) + !bms_is_member(indkey_attno, rel->notnullattnums)) { - nulls_check_ok = false; + index_check_ok = false; + break; + } + + /* + * The index proves uniqueness only under its own opfamily and + * collation. Require some GROUP BY item on this column to + * use a compatible eqop and collation, the same check + * relation_has_unique_index_for() applies to join clauses. + */ + foreach(lc2, groupbycols[relid]) + { + GroupByColInfo *info = (GroupByColInfo *) lfirst(lc2); + + if (info->attno != indkey_attno) + continue; + if (list_member_oid(info->eq_opfamilies, indkey_opfamily) && + collations_agree_on_equality(indkey_coll, info->coll)) + break; + } + if (lc2 == NULL) + { + index_check_ok = false; break; } ind_attnos = bms_add_member(ind_attnos, - index->indexkeys[i] - + indkey_attno - FirstLowInvalidHeapAttributeNumber); } - if (!nulls_check_ok) + if (!index_check_ok) continue; /* diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index ff80869fb33..fbda0e3bbc2 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1645,6 +1645,37 @@ explain (costs off) select y,z from t2 group by y,z; -> Seq Scan on t2 (3 rows) +-- A unique index proves uniqueness only under its own opfamily. When the +-- GROUP BY's eqop comes from a different opfamily with looser equality, +-- rows the index regards as distinct can collapse into one GROUP BY group, +-- so the index is not usable for removing redundant columns. +create type t_rec as (x numeric); +create temp table t_opf (a t_rec not null, b text); +create unique index on t_opf (a record_image_ops); +-- (1.0) and (1.00) are bytewise distinct but logically equal as records; +-- the index admits both, but GROUP BY a (default record_ops) would merge +-- them, so b must be retained as a grouping key. +insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y'); +explain (costs off) +select a, b from t_opf group by a, b order by b; + QUERY PLAN +------------------------------- + Group + Group Key: b, a + -> Sort + Sort Key: b, a + -> Seq Scan on t_opf +(5 rows) + +select a, b from t_opf group by a, b order by b; + a | b +--------+--- + (1.0) | X + (1.00) | Y +(2 rows) + +drop table t_opf; +drop type t_rec; drop table t1 cascade; NOTICE: drop cascades to table t1c drop table t2; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 9c4b8ccff78..04e2f6df037 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -3321,6 +3321,38 @@ GROUP BY t1.id, t1.val; DROP TABLE eager_agg_t1; DROP TABLE eager_agg_t2; +-- +-- A unique index can prove functional dependency for GROUP BY column +-- removal only if its per-column collation agrees on equality with +-- the GROUP BY column's collation. An index built under a different +-- (deterministic) collation would otherwise let remove_useless_groupby_columns +-- drop other columns whose values still differ within a nondeterministic +-- group. +-- +CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text); +INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y'); +CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C"); +-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and +-- 'FOO' would merge, but they have distinct b values. +EXPLAIN (COSTS OFF) +SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b; + QUERY PLAN +------------------------------------------------- + Group + Group Key: a, b + -> Sort + Sort Key: a COLLATE case_insensitive, b + -> Seq Scan on groupby_collation_t +(5 rows) + +SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b; + a | b +-----+--- + foo | X + FOO | Y +(2 rows) + +DROP TABLE groupby_collation_t; -- virtual generated columns CREATE TABLE t5 ( a int, diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 89bb83718e0..580f364ba97 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -577,6 +577,23 @@ alter table t2 alter column z drop not null; create unique index t2_z_uidx on t2(z) nulls not distinct; explain (costs off) select y,z from t2 group by y,z; +-- A unique index proves uniqueness only under its own opfamily. When the +-- GROUP BY's eqop comes from a different opfamily with looser equality, +-- rows the index regards as distinct can collapse into one GROUP BY group, +-- so the index is not usable for removing redundant columns. +create type t_rec as (x numeric); +create temp table t_opf (a t_rec not null, b text); +create unique index on t_opf (a record_image_ops); +-- (1.0) and (1.00) are bytewise distinct but logically equal as records; +-- the index admits both, but GROUP BY a (default record_ops) would merge +-- them, so b must be retained as a grouping key. +insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y'); +explain (costs off) +select a, b from t_opf group by a, b order by b; +select a, b from t_opf group by a, b order by b; +drop table t_opf; +drop type t_rec; + drop table t1 cascade; drop table t2; drop table t3; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index a8e34017e07..18c47e6e05a 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -1219,6 +1219,26 @@ GROUP BY t1.id, t1.val; DROP TABLE eager_agg_t1; DROP TABLE eager_agg_t2; +-- +-- A unique index can prove functional dependency for GROUP BY column +-- removal only if its per-column collation agrees on equality with +-- the GROUP BY column's collation. An index built under a different +-- (deterministic) collation would otherwise let remove_useless_groupby_columns +-- drop other columns whose values still differ within a nondeterministic +-- group. +-- +CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text); +INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y'); +CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C"); + +-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and +-- 'FOO' would merge, but they have distinct b values. +EXPLAIN (COSTS OFF) +SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b; +SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b; + +DROP TABLE groupby_collation_t; + -- virtual generated columns CREATE TABLE t5 ( a int, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3ade7c08b6d..06532bf7152 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1166,6 +1166,7 @@ GraphPattern GraphPropertyRef GraphTableParseState Group +GroupByColInfo GroupByOrdering GroupClause GroupPath