Skip to content

Commit

Permalink
BUG: stats: Reformulate loggamma._rvs to handle c << 1. (scipy#13349)
Browse files Browse the repository at this point in the history
* BUG: stats: Reformulate loggamma._rvs to handle c << 1.

Closes scipygh-11094.

Several tests in test_morestats.py call loggamma.rvs to generate
test data.  These tests have hardcoded expected results that
depend on the values in the input data.  Because the stream
of variates generated by loggamma.rvs has changed, those tests
will fail if the input data is generated by loggamma.rvs.  Instead,
the data is now generated by a helper function that takes the log of
random variates from `stats.gamma.rvs`, which reproduces the old stream.

Co-authored-by: Matt Haberland <[email protected]>
  • Loading branch information
WarrenWeckesser and mdhaber authored Aug 18, 2022
1 parent bf9e262 commit 2e871e9
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 15 deletions.
14 changes: 13 additions & 1 deletion scipy/stats/_continuous_distns.py
Original file line number Diff line number Diff line change
Expand Up @@ -5355,11 +5355,23 @@ class loggamma_gen(rv_continuous):
%(example)s
"""

def _shape_info(self):
    # Describe the single shape parameter c: strictly positive and
    # unbounded above (both endpoints excluded), not integer-valued.
    c_info = _ShapeInfo("c", False, (0, np.inf), (False, False))
    return [c_info]

def _rvs(self, c, size=None, random_state=None):
return np.log(random_state.gamma(c, size=size))
# Use the property of the gamma distribution Gamma(c)
# Gamma(c) ~ Gamma(c + 1)*U**(1/c),
# where U is uniform on [0, 1]. (See, e.g.,
# G. Marsaglia and W.W. Tsang, "A simple method for generating gamma
# variables", https://doi.org/10.1145/358407.358414)
# So
# log(Gamma(c)) ~ log(Gamma(c + 1)) + log(U)/c
# Generating a sample with this formulation is a bit slower
# than the more obvious log(Gamma(c)), but it avoids loss
# of precision when c << 1.
return (np.log(random_state.gamma(c + 1, size=size))
+ np.log(random_state.uniform(size=size))/c)

def _pdf(self, x, c):
# loggamma.pdf(x, c) = exp(c*x-exp(x)) / gamma(c)
Expand Down
6 changes: 3 additions & 3 deletions scipy/stats/_morestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1182,11 +1182,11 @@ def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None):
>>> lmax_pearsonr = stats.boxcox_normmax(x)
>>> lmax_mle
1.4613865614008015
2.217563431465757
>>> lmax_pearsonr
1.6685004886804342
2.238318660200961
>>> stats.boxcox_normmax(x, method='all')
array([1.66850049, 1.46138656])
array([2.23831866, 2.21756343])
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
Expand Down
2 changes: 1 addition & 1 deletion scipy/stats/tests/test_continuous_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def test_rvs_broadcast(dist, shape_args):
shape_only = dist in ['argus', 'betaprime', 'dgamma', 'dweibull',
'exponnorm', 'genhyperbolic', 'geninvgauss',
'levy_stable', 'nct', 'norminvgauss', 'rice',
'skewnorm', 'semicircular', 'gennorm']
'skewnorm', 'semicircular', 'gennorm', 'loggamma']

distfunc = getattr(stats, dist)
loc = np.zeros(2)
Expand Down
14 changes: 14 additions & 0 deletions scipy/stats/tests/test_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,6 +1324,20 @@ def test_stats(self):
assert_array_almost_equal(computed, [mean, var, skew, kurt],
decimal=4)

@pytest.mark.parametrize('c', [0.1, 0.001])
def test_rvs(self, c):
    # Regression test for gh-11094: before the fix, c=0.001 produced
    # many -inf values.
    sample = stats.loggamma.rvs(c, size=100000)
    assert np.isfinite(sample).all()
    # Crude statistical sanity check: roughly half of the sample
    # should fall below the distribution median.  Verify via a
    # binomial-proportion confidence interval.
    n_below = np.count_nonzero(sample < stats.loggamma.median(c))
    interval = stats.binomtest(n_below, len(sample)).proportion_ci(
        confidence_level=0.999)
    assert interval.low < 0.5 < interval.high


class TestLogistic:
# gh-6226
Expand Down
26 changes: 16 additions & 10 deletions scipy/stats/tests/test_morestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
g10 = [0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]


# The loggamma RVS stream is changing due to gh-13349; this version
# preserves the old stream so that tests don't change.
def _old_loggamma_rvs(*args, **kwargs):
return np.log(stats.gamma.rvs(*args, **kwargs))


class TestBayes_mvs:
def test_basic(self):
# Expected values in this test simply taken from the function. For
Expand Down Expand Up @@ -1643,7 +1649,7 @@ def test_bad_arg(self):

class TestPpccPlot:
def setup_method(self):
    # Fixed-seed fixture.  Uses the pre-gh-13349 loggamma stream so the
    # hardcoded expected values in this class remain valid.  (The stale
    # duplicate assignment via stats.loggamma.rvs was a dead store and
    # has been removed.)
    self.x = _old_loggamma_rvs(5, size=500, random_state=7654321) + 5

def test_basic(self):
N = 5
Expand Down Expand Up @@ -1825,7 +1831,7 @@ def test_gh_6873(self):
class TestBoxcox:

def test_fixed_lmbda(self):
x = stats.loggamma.rvs(5, size=50, random_state=12345) + 5
x = _old_loggamma_rvs(5, size=50, random_state=12345) + 5
xt = stats.boxcox(x, lmbda=1)
assert_allclose(xt, x - 1)
xt = stats.boxcox(x, lmbda=-1)
Expand Down Expand Up @@ -1854,7 +1860,7 @@ def test_lmbda_None(self):

def test_alpha(self):
rng = np.random.RandomState(1234)
x = stats.loggamma.rvs(5, size=50, random_state=rng) + 5
x = _old_loggamma_rvs(5, size=50, random_state=rng) + 5

# Some regular values for alpha, on a small sample size
_, _, interval = stats.boxcox(x, alpha=0.75)
Expand All @@ -1863,7 +1869,7 @@ def test_alpha(self):
assert_allclose(interval, [1.2138178554857557, 8.209033272375663])

# Try some extreme values, see we don't hit the N=500 limit
x = stats.loggamma.rvs(7, size=500, random_state=rng) + 15
x = _old_loggamma_rvs(7, size=500, random_state=rng) + 15
_, _, interval = stats.boxcox(x, alpha=0.001)
assert_allclose(interval, [0.3988867, 11.40553131])
_, _, interval = stats.boxcox(x, alpha=0.999)
Expand Down Expand Up @@ -1941,7 +1947,7 @@ def optimizer(fun):

class TestBoxcoxNormmax:
def setup_method(self):
    # Fixed-seed fixture.  Uses the pre-gh-13349 loggamma stream so the
    # hardcoded expected values in this class remain valid.  (The stale
    # duplicate assignment via stats.loggamma.rvs was a dead store and
    # has been removed.)
    self.x = _old_loggamma_rvs(5, size=50, random_state=12345) + 5

def test_pearsonr(self):
maxlog = stats.boxcox_normmax(self.x)
Expand Down Expand Up @@ -2013,7 +2019,7 @@ def test_user_defined_optimizer_and_brack_raises_error(self):

class TestBoxcoxNormplot:
def setup_method(self):
    # Fixed-seed fixture.  Uses the pre-gh-13349 loggamma stream so the
    # hardcoded expected values in this class remain valid.  (The stale
    # duplicate assignment via stats.loggamma.rvs was a dead store and
    # has been removed.)
    self.x = _old_loggamma_rvs(5, size=500, random_state=7654321) + 5

def test_basic(self):
N = 5
Expand Down Expand Up @@ -2072,7 +2078,7 @@ def test_fixed_lmbda(self):
rng = np.random.RandomState(12345)

# Test positive input
x = stats.loggamma.rvs(5, size=50, random_state=rng) + 5
x = _old_loggamma_rvs(5, size=50, random_state=rng) + 5
assert np.all(x > 0)
xt = stats.yeojohnson(x, lmbda=1)
assert_allclose(xt, x)
Expand All @@ -2084,7 +2090,7 @@ def test_fixed_lmbda(self):
assert_allclose(xt, x)

# Test negative input
x = stats.loggamma.rvs(5, size=50, random_state=rng) - 5
x = _old_loggamma_rvs(5, size=50, random_state=rng) - 5
assert np.all(x < 0)
xt = stats.yeojohnson(x, lmbda=2)
assert_allclose(xt, -np.log(-x + 1))
Expand All @@ -2094,7 +2100,7 @@ def test_fixed_lmbda(self):
assert_allclose(xt, 1 / (-x + 1) - 1)

# test both positive and negative input
x = stats.loggamma.rvs(5, size=50, random_state=rng) - 2
x = _old_loggamma_rvs(5, size=50, random_state=rng) - 2
assert not np.all(x < 0)
assert not np.all(x >= 0)
pos = x >= 0
Expand Down Expand Up @@ -2195,7 +2201,7 @@ def test_input_high_variance(self):

class TestYeojohnsonNormmax:
def setup_method(self):
    # Fixed-seed fixture.  Uses the pre-gh-13349 loggamma stream so the
    # hardcoded expected values in this class remain valid.  (The stale
    # duplicate assignment via stats.loggamma.rvs was a dead store and
    # has been removed.)
    self.x = _old_loggamma_rvs(5, size=50, random_state=12345) + 5

def test_mle(self):
maxlog = stats.yeojohnson_normmax(self.x)
Expand Down

0 comments on commit 2e871e9

Please sign in to comment.