From 5c214b58b0599e9900dc777b3e00ea7120e7e10d Mon Sep 17 00:00:00 2001
From: Richard Guo <rguo@postgresql.org>
Date: Fri, 8 May 2026 12:47:26 +0900
Subject: [PATCH] Consider opfamily and collation when removing redundant GROUP
 BY columns

remove_useless_groupby_columns() uses a relation's unique indexes to
prove that some GROUP BY columns are functionally dependent on others,
and so can be dropped from the GROUP BY clause.  The match between
index columns and GROUP BY columns was done by attno alone, ignoring
two equality-relation issues.

A type may belong to multiple btree opfamilies whose notions of
equality differ.  The record type, for instance, has record_ops
(per-field equality) and record_image_ops (bytewise equality).  A
unique index under one opfamily does not prove uniqueness under the
equality used by GROUP BY when the SortGroupClause's eqop comes from a
different opfamily.

Likewise, since nondeterministic collations were introduced in PG 12,
two collations may disagree on equality, and a unique index under one
collation does not prove uniqueness under another.

In either case, rows that the index considers distinct can collapse
into a single GROUP BY group, taking ungrouped columns of differing
values with them, so the planner drops a column that is not in fact
functionally dependent and produces wrong results.

Fix by requiring, for each unique-index key column, that some GROUP BY
item on the same column has an eqop in the index's opfamily and a
collation that agrees on equality with the index's collation.  This
mirrors the combined check relation_has_unique_index_for() applies to
join clauses.

This is a v18 regression: commit bd10ec529 extended
remove_useless_groupby_columns() from primary-key constraints to
arbitrary unique indexes.  Before that, the function consulted only
primary keys, whose enforcement index is required by parse_utilcmd.c
to use the default opclass and the column's declared collation, so
neither mismatch could arise.  Back-patch to v18 only.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: Ayush Tiwari <ayushtiwari.slg01@gmail.com>
Discussion: https://postgr.es/m/CAMbWs49t6uArWoTT-cHY+nhsi23nJJKcF9Xb9cYGzaZ9kNJ98g@mail.gmail.com
Backpatch-through: 18
---
 src/backend/optimizer/plan/initsplan.c        | 67 ++++++++++++++++---
 src/test/regress/expected/aggregates.out      | 31 +++++++++
 .../regress/expected/collate.icu.utf8.out     | 32 +++++++++
 src/test/regress/sql/aggregates.sql           | 17 +++++
 src/test/regress/sql/collate.icu.utf8.sql     | 20 ++++++
 src/tools/pgindent/typedefs.list              |  1 +
 6 files changed, 160 insertions(+), 8 deletions(-)

diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index 01804b085b3..8112509e872 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -80,6 +80,18 @@ typedef struct JoinTreeItem
 									 * lateral references */
 } JoinTreeItem;
 
+/*
+ * Compatibility info for one GROUP BY item, precomputed for use by
+ * remove_useless_groupby_columns() when matching unique-index columns against
+ * GROUP BY items.
+ */
+typedef struct GroupByColInfo
+{
+	AttrNumber	attno;			/* var->varattno */
+	List	   *eq_opfamilies;	/* mergejoin opfamilies of sgc->eqop */
+	Oid			coll;			/* var->varcollid */
+} GroupByColInfo;
+
 
 static void extract_lateral_references(PlannerInfo *root, RelOptInfo *brel,
 									   Index rtindex);
@@ -413,6 +425,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
 {
 	Query	   *parse = root->parse;
 	Bitmapset **groupbyattnos;
+	List	  **groupbycols;
 	Bitmapset **surplusvars;
 	bool		tryremove = false;
 	ListCell   *lc;
@@ -429,15 +442,21 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	/*
 	 * Scan the GROUP BY clause to find GROUP BY items that are simple Vars.
 	 * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k
-	 * that are GROUP BY items.
+	 * that are GROUP BY items, and groupbycols[k] with a parallel list of
+	 * GroupByColInfo records.  We need the latter so that, when checking a
+	 * unique index against this rel's GROUP BY items, we can verify that the
+	 * index's notion of equality agrees with at least one GROUP BY item per
+	 * index column.
 	 */
 	groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
 										   (list_length(parse->rtable) + 1));
+	groupbycols = palloc0_array(List *, list_length(parse->rtable) + 1);
 	foreach(lc, root->processed_groupClause)
 	{
 		SortGroupClause *sgc = lfirst_node(SortGroupClause, lc);
 		TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
 		Var		   *var = (Var *) tle->expr;
+		GroupByColInfo *info;
 
 		/*
 		 * Ignore non-Vars and Vars from other query levels.
@@ -463,6 +482,12 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		tryremove |= !bms_is_empty(groupbyattnos[relid]);
 		groupbyattnos[relid] = bms_add_member(groupbyattnos[relid],
 											  var->varattno - FirstLowInvalidHeapAttributeNumber);
+
+		info = palloc(sizeof(GroupByColInfo));
+		info->attno = var->varattno;
+		info->eq_opfamilies = get_mergejoin_opfamilies(sgc->eqop);
+		info->coll = var->varcollid;
+		groupbycols[relid] = lappend(groupbycols[relid], info);
 	}
 
 	/*
@@ -517,7 +542,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		foreach_node(IndexOptInfo, index, rel->indexlist)
 		{
 			Bitmapset  *ind_attnos;
-			bool		nulls_check_ok;
+			bool		index_check_ok;
 
 			/*
 			 * Skip any non-unique and deferrable indexes.  Predicate indexes
@@ -532,9 +557,14 @@ remove_useless_groupby_columns(PlannerInfo *root)
 				continue;
 
 			ind_attnos = NULL;
-			nulls_check_ok = true;
+			index_check_ok = true;
 			for (int i = 0; i < index->nkeycolumns; i++)
 			{
+				AttrNumber	indkey_attno = index->indexkeys[i];
+				Oid			indkey_opfamily = index->opfamily[i];
+				Oid			indkey_coll = index->indexcollations[i];
+				ListCell   *lc2;
+
 				/*
 				 * We must insist that the index columns are all defined NOT
 				 * NULL otherwise duplicate NULLs could exist.  However, we
@@ -544,20 +574,41 @@ remove_useless_groupby_columns(PlannerInfo *root)
 				 * despite the NULL.
 				 */
 				if (!index->nullsnotdistinct &&
-					!bms_is_member(index->indexkeys[i],
-								   rel->notnullattnums))
+					!bms_is_member(indkey_attno, rel->notnullattnums))
 				{
-					nulls_check_ok = false;
+					index_check_ok = false;
+					break;
+				}
+
+				/*
+				 * The index proves uniqueness only under its own opfamily and
+				 * collation.  Require some GROUP BY item on this column to
+				 * use a compatible eqop and collation, the same check
+				 * relation_has_unique_index_for() applies to join clauses.
+				 */
+				foreach(lc2, groupbycols[relid])
+				{
+					GroupByColInfo *info = (GroupByColInfo *) lfirst(lc2);
+
+					if (info->attno != indkey_attno)
+						continue;
+					if (list_member_oid(info->eq_opfamilies, indkey_opfamily) &&
+						collations_agree_on_equality(indkey_coll, info->coll))
+						break;
+				}
+				if (lc2 == NULL)
+				{
+					index_check_ok = false;
 					break;
 				}
 
 				ind_attnos =
 					bms_add_member(ind_attnos,
-								   index->indexkeys[i] -
+								   indkey_attno -
 								   FirstLowInvalidHeapAttributeNumber);
 			}
 
-			if (!nulls_check_ok)
+			if (!index_check_ok)
 				continue;
 
 			/*
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 6e0c86d79fc..cdb4ecf288f 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1515,6 +1515,37 @@ explain (costs off) select y,z from t2 group by y,z;
    ->  Seq Scan on t2
 (3 rows)
 
+-- A unique index proves uniqueness only under its own opfamily.  When the
+-- GROUP BY's eqop comes from a different opfamily with looser equality,
+-- rows the index regards as distinct can collapse into one GROUP BY group,
+-- so the index is not usable for removing redundant columns.
+create type t_rec as (x numeric);
+create temp table t_opf (a t_rec not null, b text);
+create unique index on t_opf (a record_image_ops);
+-- (1.0) and (1.00) are bytewise distinct but logically equal as records;
+-- the index admits both, but GROUP BY a (default record_ops) would merge
+-- them, so b must be retained as a grouping key.
+insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y');
+explain (costs off)
+select a, b from t_opf group by a, b order by b;
+          QUERY PLAN           
+-------------------------------
+ Group
+   Group Key: b, a
+   ->  Sort
+         Sort Key: b, a
+         ->  Seq Scan on t_opf
+(5 rows)
+
+select a, b from t_opf group by a, b order by b;
+   a    | b 
+--------+---
+ (1.0)  | X
+ (1.00) | Y
+(2 rows)
+
+drop table t_opf;
+drop type t_rec;
 drop table t1 cascade;
 NOTICE:  drop cascades to table t1c
 drop table t2;
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 783dce6b887..66093040965 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -3245,6 +3245,38 @@ DROP TABLE pagg_tab6;
 RESET enable_partitionwise_aggregate;
 RESET max_parallel_workers_per_gather;
 RESET enable_incremental_sort;
+--
+-- A unique index can prove functional dependency for GROUP BY column
+-- removal only if its per-column collation agrees on equality with
+-- the GROUP BY column's collation.  An index built under a different
+-- (deterministic) collation would otherwise let remove_useless_groupby_columns
+-- drop other columns whose values still differ within a nondeterministic
+-- group.
+--
+CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text);
+INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y');
+CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C");
+-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and
+-- 'FOO' would merge, but they have distinct b values.
+EXPLAIN (COSTS OFF)
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Group
+   Group Key: a, b
+   ->  Sort
+         Sort Key: a COLLATE case_insensitive, b
+         ->  Seq Scan on groupby_collation_t
+(5 rows)
+
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+  a  | b 
+-----+---
+ foo | X
+ FOO | Y
+(2 rows)
+
+DROP TABLE groupby_collation_t;
 -- virtual generated columns
 CREATE TABLE t5 (
     a int,
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index c383890bc85..b751783e833 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -539,6 +539,23 @@ alter table t2 alter column z drop not null;
 create unique index t2_z_uidx on t2(z) nulls not distinct;
 explain (costs off) select y,z from t2 group by y,z;
 
+-- A unique index proves uniqueness only under its own opfamily.  When the
+-- GROUP BY's eqop comes from a different opfamily with looser equality,
+-- rows the index regards as distinct can collapse into one GROUP BY group,
+-- so the index is not usable for removing redundant columns.
+create type t_rec as (x numeric);
+create temp table t_opf (a t_rec not null, b text);
+create unique index on t_opf (a record_image_ops);
+-- (1.0) and (1.00) are bytewise distinct but logically equal as records;
+-- the index admits both, but GROUP BY a (default record_ops) would merge
+-- them, so b must be retained as a grouping key.
+insert into t_opf values (row(1.0)::t_rec, 'X'), (row(1.00)::t_rec, 'Y');
+explain (costs off)
+select a, b from t_opf group by a, b order by b;
+select a, b from t_opf group by a, b order by b;
+drop table t_opf;
+drop type t_rec;
+
 drop table t1 cascade;
 drop table t2;
 drop table t3;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 724df44d9ae..90ea90d64e0 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -1170,6 +1170,26 @@ RESET enable_partitionwise_aggregate;
 RESET max_parallel_workers_per_gather;
 RESET enable_incremental_sort;
 
+--
+-- A unique index can prove functional dependency for GROUP BY column
+-- removal only if its per-column collation agrees on equality with
+-- the GROUP BY column's collation.  An index built under a different
+-- (deterministic) collation would otherwise let remove_useless_groupby_columns
+-- drop other columns whose values still differ within a nondeterministic
+-- group.
+--
+CREATE TABLE groupby_collation_t (a text COLLATE case_insensitive NOT NULL, b text);
+INSERT INTO groupby_collation_t VALUES ('foo', 'X'), ('FOO', 'Y');
+CREATE UNIQUE INDEX ON groupby_collation_t (a COLLATE "C");
+
+-- Column b must NOT be dropped: under case_insensitive on a, 'foo' and
+-- 'FOO' would merge, but they have distinct b values.
+EXPLAIN (COSTS OFF)
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+SELECT a, b FROM groupby_collation_t GROUP BY a, b ORDER BY a, b;
+
+DROP TABLE groupby_collation_t;
+
 -- virtual generated columns
 CREATE TABLE t5 (
     a int,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index c2fef51cd7b..8cd74c4e5b6 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1103,6 +1103,7 @@ GrantRoleStmt
 GrantStmt
 GrantTargetType
 Group
+GroupByColInfo
 GroupByOrdering
 GroupClause
 GroupPath