mirror of
https://github.com/postgres/postgres.git
synced 2026-06-09 00:32:10 -04:00
Improve overflow/underflow handling in regr_r2().
Commit6498287696improved corr()'s final function to cope with overflow/underflow in the final calculation, and clamped its result to [-1, 1] in case of roundoff error. Improve regr_r2() in a similar way, clamping its result to [0, 1]. Arguably this is a bug fix, but given the lack of prior complaints, refrain from back-patching, as we did with6498287696. Reported-by: Chengpeng Yan <chengpeng_yan@outlook.com> Author: Chengpeng Yan <chengpeng_yan@outlook.com> Reviewed-by: Dean Rasheed <dean.a.rasheed@gmail.com> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/33E01656-BB3B-46E9-A41F-24A01A7C35F4@outlook.com
This commit is contained in:
parent
95b6ec52e6
commit
d58ec50e0f
3 changed files with 87 additions and 27 deletions
|
|
@ -3916,7 +3916,12 @@ float8_regr_r2(PG_FUNCTION_ARGS)
|
|||
float8 N,
|
||||
Sxx,
|
||||
Syy,
|
||||
Sxy;
|
||||
Sxy,
|
||||
numerator,
|
||||
denominator,
|
||||
sqrtdenominator,
|
||||
sqrtresult,
|
||||
result;
|
||||
|
||||
transvalues = check_float8_array(transarray, "float8_regr_r2", 8);
|
||||
N = transvalues[0];
|
||||
|
|
@ -3938,7 +3943,35 @@ float8_regr_r2(PG_FUNCTION_ARGS)
|
|||
if (Syy == 0)
|
||||
PG_RETURN_FLOAT8(1.0);
|
||||
|
||||
PG_RETURN_FLOAT8((Sxy * Sxy) / (Sxx * Syy));
|
||||
/*
|
||||
* The products Sxy * Sxy and/or Sxx * Syy might underflow or overflow. If
|
||||
* so, we can recover by computing Sxy / (sqrt(Sxx) * sqrt(Syy)) and
|
||||
* squaring it instead. However, the double sqrt() calculation is a bit
|
||||
* slower and less accurate, so don't do it if we don't have to.
|
||||
*/
|
||||
numerator = Sxy * Sxy;
|
||||
denominator = Sxx * Syy;
|
||||
if (numerator == 0 || isinf(numerator) ||
|
||||
denominator == 0 || isinf(denominator))
|
||||
{
|
||||
sqrtdenominator = sqrt(Sxx) * sqrt(Syy);
|
||||
sqrtresult = Sxy / sqrtdenominator;
|
||||
result = sqrtresult * sqrtresult;
|
||||
}
|
||||
else
|
||||
result = numerator / denominator;
|
||||
|
||||
/*
|
||||
* Despite all these precautions, this formula can yield results outside
|
||||
* [0, 1] due to roundoff error. Clamp it to the expected range.
|
||||
*
|
||||
* Note that result is guaranteed to be non-negative becase Sxx and Syy
|
||||
* are non-negative, so we only need to clamp the upper end of the range.
|
||||
*/
|
||||
if (result > 1)
|
||||
result = 1;
|
||||
|
||||
PG_RETURN_FLOAT8(result);
|
||||
}
|
||||
|
||||
Datum
|
||||
|
|
|
|||
|
|
@ -516,6 +516,7 @@ SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8);
|
|||
(1 row)
|
||||
|
||||
-- check some cases that formerly had poor roundoff-error behavior
|
||||
-- note: regr_r2() differs from corr() for a horizontal line, per spec
|
||||
SELECT corr(0.09, g), regr_r2(0.09, g)
|
||||
FROM generate_series(1, 30) g;
|
||||
corr | regr_r2
|
||||
|
|
@ -537,38 +538,55 @@ SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16)
|
|||
|
||||
(1 row)
|
||||
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
-- check some cases that formerly suffered from internal overflow/underflow
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
|
||||
regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
FROM generate_series(1, 3) g;
|
||||
corr
|
||||
------
|
||||
1
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
|
||||
regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
FROM generate_series(1, 30) g;
|
||||
corr
|
||||
------
|
||||
1
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95),
|
||||
regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95)
|
||||
FROM generate_series(1, 2) g;
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
-- these examples pose definitional questions for NaN inputs,
|
||||
-- which we resolve by saying that an all-NaN input column is not all equal
|
||||
SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
|
||||
corr
|
||||
------
|
||||
NaN
|
||||
SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g;
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
NaN | NaN
|
||||
(1 row)
|
||||
|
||||
SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
|
||||
corr
|
||||
------
|
||||
|
||||
SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g;
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
| 1
|
||||
(1 row)
|
||||
|
||||
SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
|
||||
corr
|
||||
------
|
||||
NaN
|
||||
SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g;
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
|
|
||||
(1 row)
|
||||
|
||||
SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g;
|
||||
corr | regr_r2
|
||||
------+---------
|
||||
NaN | NaN
|
||||
(1 row)
|
||||
|
||||
-- test accum and combine functions directly
|
||||
|
|
|
|||
|
|
@ -141,22 +141,31 @@ SELECT covar_pop(1::float8,'inf'::float8), covar_samp(3::float8,'inf'::float8);
|
|||
SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8);
|
||||
|
||||
-- check some cases that formerly had poor roundoff-error behavior
|
||||
-- note: regr_r2() differs from corr() for a horizontal line, per spec
|
||||
SELECT corr(0.09, g), regr_r2(0.09, g)
|
||||
FROM generate_series(1, 30) g;
|
||||
SELECT corr(g, 0.09), regr_r2(g, 0.09), regr_slope(g, 0.09), regr_intercept(g, 0.09)
|
||||
FROM generate_series(1, 30) g;
|
||||
SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16)
|
||||
FROM generate_series(1, 3) g;
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
|
||||
-- check some cases that formerly suffered from internal overflow/underflow
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
|
||||
regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
FROM generate_series(1, 3) g;
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
|
||||
regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
|
||||
FROM generate_series(1, 30) g;
|
||||
SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95),
|
||||
regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95)
|
||||
FROM generate_series(1, 2) g;
|
||||
|
||||
-- these examples pose definitional questions for NaN inputs,
|
||||
-- which we resolve by saying that an all-NaN input column is not all equal
|
||||
SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
|
||||
SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
|
||||
SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
|
||||
SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g;
|
||||
SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g;
|
||||
SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g;
|
||||
SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g;
|
||||
|
||||
-- test accum and combine functions directly
|
||||
CREATE TABLE regr_test (x float8, y float8);
|
||||
|
|
|
|||
Loading…
Reference in a new issue