From 49f34716234bd02b09ca086419e6843532c240a1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 00:01:55 -0500 Subject: [PATCH 01/37] Add kde_rand() --- Lib/statistics.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 58fb31def8896e..089802136140db 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -138,12 +138,13 @@ from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod -from math import isfinite, isinf, pi, cos, sin, cosh, atan +from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan from functools import reduce from operator import itemgetter from collections import Counter, namedtuple, defaultdict _SQRT2 = sqrt(2.0) +_random = random # === Exceptions === @@ -1697,3 +1698,92 @@ def __getstate__(self): def __setstate__(self, state): self._mu, self._sigma = state + + +## kde_rand() ################################################################ + +def _newton_raphson(f_inv_est, f, f_prime, tolerance=1e-12): + def f_inv(y): + "Return x such that f(x) ≈ y within the specified tolerance." + x = f_inv_est(y) + while abs(diff := y - f(x)) > tolerance: + x += diff / f_prime(x) + return x + return f_inv + +_parabolic_invcdf = _newton_raphson( + f_inv_est = lambda p: ((2.0 * p) ** 0.583367470424302 - 1.0 + if p <= 1/2 else + 1.0 - (2.0 - 2.0*p) ** 0.583367470424302), + f = lambda t: -1/4 * t**3 + 3/4 * t + 1/2, + f_prime = lambda t: 3/4 * (1.0 - t * t)) + +_quartic_invcdf = _newton_raphson( + f_inv_est = lambda p: ((2.0 * p) ** 0.4258865685331 - 1.0 + if p <= 1/2 else + 1.0 - (2.0 - 2.0*p) ** 0.4258865685331), + f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, + f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) + +_triweight_invcdf = _newton_raphson( + f_inv_est = lambda p: ((2.0 * p) ** 0.3400218741872791 - 1.0 + if p <= 1/2 else + 1.0 - (2.0 - 2.0*p) ** 0.3400218741872791), + f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, + f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) + +_kernel_invcdfs = { + 'normal': NormalDist().inv_cdf, + 'logisitic': lambda p: log(p / (1 - p)), + 'sigmoid': lambda p: log(tan(p * pi/2)), + 'rectangular': lambda p: 2*p - 1, + 'parabolic': _parabolic_invcdf, + 'quartic': _quartic_invcdf, + 'triweight': _triweight_invcdf, + 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), + 'cosine': lambda p: 2*asin(2*p - 1)/pi, +} +_kernel_invcdfs['gauss'] = _kernel_invcdfs['normal'] +_kernel_invcdfs['uniform'] = _kernel_invcdfs['rectangular'] +_kernel_invcdfs['epanechnikov'] = _kernel_invcdfs['parabolic'] +_kernel_invcdfs['biweight'] = _kernel_invcdfs['quartic'] + +def kde_random(data, h, kernel='normal', *, seed=None): + """Return a function that makes a random selection from the estimated + probability density function created by: kde(data, h, kernel) + + For reproducible results, set *seed* to an integer, float, str, or bytes. + Not thread-safe without a lock around calls. + + A StatisticsError will be raised if the data sequence is empty. + + """ + n = len(data) + if not n: + raise StatisticsError('Empty data sequence') + + if not isinstance(data[0], (int, float)): + raise TypeError('Data sequence must contain ints or floats') + + if h <= 0.0: + raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}') + + if seed is None: + random = _random.random + choice = _random.choice + else: + prng = _random.Random(seed) + random = prng.random + choice = prng.choice + + try: + kernel_invcdf = _kernel_invcdfs[kernel] + except KeyError: + raise StatisticsError(f'Unknown kernel name: {kernel!r}') + + def rand(): + return choice(data) + h * kernel_invcdf(random()) + + rand.__doc__ = f'Random KDE selection with {h=!r} and {kernel=!r}' + + return rand From 0fd40f3cecbf1a8bd20e0d624ba8d253d9a9550f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 00:09:56 -0500 Subject: [PATCH 02/37] Update __all__ --- Lib/statistics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 089802136140db..8ca63b8ccc9749 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -113,6 +113,7 @@ 'geometric_mean', 'harmonic_mean', 'kde', + 'kde_random', 'linear_regression', 'mean', 'median', From 75680e5fca6aa01f12b963fcca333725ab0a58e8 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 21:24:33 -0500 Subject: [PATCH 03/37] Auto detect size changes --- Lib/statistics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8ca63b8ccc9749..8b746e8fce5255 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -976,9 +976,11 @@ def kde(data, h, kernel='normal', *, cumulative=False): if support is None: def pdf(x): + n = len(data) return sum(K((x - x_i) / h) for x_i in data) / (n * h) def cdf(x): + n = len(data) return sum(I((x - x_i) / h) for x_i in data) / n else: @@ -987,12 +989,20 @@ def cdf(x): bandwidth = h * support def pdf(x): + nonlocal n, sample + if len(data) != n: + sample = sorted(data) + n = len(data) i = bisect_left(sample, x - bandwidth) j = bisect_right(sample, x + bandwidth) supported = sample[i : j] return sum(K((x - x_i) / h) for x_i in supported) / (n * h) def cdf(x): + nonlocal n, sample + if len(data) != n: + sample = sorted(data) + n = len(data) i = bisect_left(sample, x - bandwidth) j = bisect_right(sample, x + bandwidth) supported = sample[i : j] From 63ba396e5edbf1393edec8813740b6d3157d651e Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 21:31:55 -0500 Subject: [PATCH 04/37] . --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8b746e8fce5255..95e99432e268e2 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1711,7 +1711,7 @@ def __setstate__(self, state): self._mu, self._sigma = state -## kde_rand() ################################################################ +## kde_random() ############################################################## def _newton_raphson(f_inv_est, f, f_prime, tolerance=1e-12): def f_inv(y): From c69c5042cd4b3166490279d97767d0077eb97de5 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 21:46:34 -0500 Subject: [PATCH 05/37] Factor-out s-scurve function --- Lib/statistics.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 95e99432e268e2..0eec821f69b0ee 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1722,24 +1722,25 @@ def f_inv(y): return x return f_inv +def _simple_s_curve(power): + "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." + # Approximates the invcdf for a kernels with support==1.0 + return lambda p: ((2 * p) ** power - 1 + if p <= 1/2 else + 1 - (2 - 2*p) ** power) + _parabolic_invcdf = _newton_raphson( - f_inv_est = lambda p: ((2.0 * p) ** 0.583367470424302 - 1.0 - if p <= 1/2 else - 1.0 - (2.0 - 2.0*p) ** 0.583367470424302), + f_inv_est = _simple_s_curve(0.583367470424302), f = lambda t: -1/4 * t**3 + 3/4 * t + 1/2, f_prime = lambda t: 3/4 * (1.0 - t * t)) _quartic_invcdf = _newton_raphson( - f_inv_est = lambda p: ((2.0 * p) ** 0.4258865685331 - 1.0 - if p <= 1/2 else - 1.0 - (2.0 - 2.0*p) ** 0.4258865685331), + f_inv_est = _simple_s_curve(0.4258865685331), f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) _triweight_invcdf = _newton_raphson( - f_inv_est = lambda p: ((2.0 * p) ** 0.3400218741872791 - 1.0 - if p <= 1/2 else - 1.0 - (2.0 - 2.0*p) ** 0.3400218741872791), + f_inv_est = _simple_s_curve(0.3400218741872791), f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) @@ -1751,7 +1752,7 @@ def f_inv(y): 'parabolic': _parabolic_invcdf, 'quartic': _quartic_invcdf, 'triweight': _triweight_invcdf, - 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), + 'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p), 'cosine': lambda p: 2*asin(2*p - 1)/pi, } _kernel_invcdfs['gauss'] = _kernel_invcdfs['normal'] From dcd83f2204d8cf791aec4e5d0c41860bbfe03077 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 21:50:13 -0500 Subject: [PATCH 06/37] . --- Lib/statistics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 0eec821f69b0ee..8c64a294ee82a0 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1713,10 +1713,10 @@ def __setstate__(self, state): ## kde_random() ############################################################## -def _newton_raphson(f_inv_est, f, f_prime, tolerance=1e-12): +def _newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12): def f_inv(y): "Return x such that f(x) ≈ y within the specified tolerance." - x = f_inv_est(y) + x = f_inv_estimate(y) while abs(diff := y - f(x)) > tolerance: x += diff / f_prime(x) return x @@ -1730,17 +1730,17 @@ def _simple_s_curve(power): 1 - (2 - 2*p) ** power) _parabolic_invcdf = _newton_raphson( - f_inv_est = _simple_s_curve(0.583367470424302), + f_inv_estimate = _simple_s_curve(0.583367470424302), f = lambda t: -1/4 * t**3 + 3/4 * t + 1/2, f_prime = lambda t: 3/4 * (1.0 - t * t)) _quartic_invcdf = _newton_raphson( - f_inv_est = _simple_s_curve(0.4258865685331), + f_inv_estimate = _simple_s_curve(0.4258865685331), f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) _triweight_invcdf = _newton_raphson( - f_inv_est = _simple_s_curve(0.3400218741872791), + f_inv_estimate = _simple_s_curve(0.3400218741872791), f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) From 57492bfd8201eaa871c759719af2048364e40aab Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 5 Apr 2024 22:20:19 -0500 Subject: [PATCH 07/37] . --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8c64a294ee82a0..13c9c3a6f73e46 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1724,7 +1724,7 @@ def f_inv(y): def _simple_s_curve(power): "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." - # Approximates the invcdf for a kernels with support==1.0 + # Approximates the invcdf for kernels with support==1.0 return lambda p: ((2 * p) ** power - 1 if p <= 1/2 else 1 - (2 - 2*p) ** power) @@ -1753,7 +1753,7 @@ def _simple_s_curve(power): 'quartic': _quartic_invcdf, 'triweight': _triweight_invcdf, 'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p), - 'cosine': lambda p: 2*asin(2*p - 1)/pi, + 'cosine': lambda p: 2 * asin(2*p - 1) / pi, } _kernel_invcdfs['gauss'] = _kernel_invcdfs['normal'] _kernel_invcdfs['uniform'] = _kernel_invcdfs['rectangular'] From 5b41598493a04d00b07eee2bfd952a820c29aade Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 6 Apr 2024 21:03:30 -0500 Subject: [PATCH 08/37] . --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 13c9c3a6f73e46..9e1083e54d5b2d 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1722,9 +1722,9 @@ def f_inv(y): return x return f_inv -def _simple_s_curve(power): +def _simple_s_curve(power=0.5): "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." - # Approximates the invcdf for kernels with support==1.0 + # Approximates the invcdf for kernels with support: -1 <= x <= 1. return lambda p: ((2 * p) ** power - 1 if p <= 1/2 else 1 - (2 - 2*p) ** power) From 89a6033824356c73252d994fbc571af708f2baaa Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 6 Apr 2024 22:11:06 -0500 Subject: [PATCH 09/37] Add docs for kde_random(). --- Doc/library/statistics.rst | 80 +++++++++++--------------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 197c123f8356d8..c99f16e12ada7a 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -311,6 +311,29 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.13 +.. function:: kde_random(data, h, kernel='normal', *, seed=None) + + Return a function that makes a random selection from the estimated + probability density function created by ``kde(data, h, kernel)``. + + For reproducible results, set *seed* to an integer, float, str, or bytes. + Not thread-safe without a lock around calls. + + A :exc:`StatisticsError` will be raised if the *data* sequence is empty. + + Continuing the above example for :func:`kde`, we can use + :func:`kde_random` to generate new random selections from an + estimated probability density function: + + >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> rand = kde_random(sample, h=1.5, seed=8675309) + >>> new_selections = [rand() for i in range(10)] + >>> [round(x, 1) for x in new_selections] + [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] + + .. versionadded:: 3.13 + + .. function:: median(data) Return the median (middle value) of numeric data, using the common "mean of @@ -1148,63 +1171,6 @@ The final prediction goes to the largest posterior. This is known as the 'female' -Sampling from kernel density estimation -*************************************** - -The :func:`kde()` function creates a continuous probability density -function from discrete samples. Some applications need a way to make -random selections from that distribution. - -The technique is to pick a sample from a bandwidth scaled kernel -function and recenter the result around a randomly chosen point from -the input data. This can be done with any kernel that has a known or -accurately approximated inverse cumulative distribution function. - -.. testcode:: - - from random import choice, random, seed - from math import sqrt, log, pi, tan, asin - from statistics import NormalDist - - kernel_invcdfs = { - 'normal': NormalDist().inv_cdf, - 'logistic': lambda p: log(p / (1 - p)), - 'sigmoid': lambda p: log(tan(p * pi/2)), - 'rectangular': lambda p: 2*p - 1, - 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), - 'cosine': lambda p: 2*asin(2*p - 1)/pi, - } - - def kde_random(data, h, kernel='normal'): - 'Return a function that samples from kde() smoothed data.' - kernel_invcdf = kernel_invcdfs[kernel] - def rand(): - return h * kernel_invcdf(random()) + choice(data) - return rand - -For example: - -.. doctest:: - - >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> rand = kde_random(discrete_samples, h=1.5) - >>> seed(8675309) - >>> selections = [rand() for i in range(10)] - >>> [round(x, 1) for x in selections] - [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7] - -.. testcode:: - :hide: - - from statistics import kde - from math import isclose - - # Verify that cdf / invcdf will round trip - xarr = [i/100 for i in range(-100, 101)] - for kernel, invcdf in kernel_invcdfs.items(): - cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True) - for x in xarr: - assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9) .. # This modelines must appear within the last ten lines of the file. From 4d4f7928b2e31a525c7db268eec16db006a17306 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 6 Apr 2024 22:18:34 -0500 Subject: [PATCH 10/37] . --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index c99f16e12ada7a..8510658c183039 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -314,7 +314,7 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: kde_random(data, h, kernel='normal', *, seed=None) Return a function that makes a random selection from the estimated - probability density function created by ``kde(data, h, kernel)``. + probability density function produced by ``kde(data, h, kernel)``. For reproducible results, set *seed* to an integer, float, str, or bytes. Not thread-safe without a lock around calls. From 7848ccabc21ae14b6e80874a4dd3652230cd1ba3 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 6 Apr 2024 22:38:43 -0500 Subject: [PATCH 11/37] . --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9e1083e54d5b2d..236ca78715737f 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1722,7 +1722,7 @@ def f_inv(y): return x return f_inv -def _simple_s_curve(power=0.5): +def _simple_s_curve(power): "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." # Approximates the invcdf for kernels with support: -1 <= x <= 1. return lambda p: ((2 * p) ** power - 1 From 37a89553ae953ee4598c3dbb4feffabd8654d045 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 7 Apr 2024 13:19:07 -0500 Subject: [PATCH 12/37] Improve docstrings --- Doc/library/statistics.rst | 7 +++++-- Lib/statistics.py | 21 ++++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 8510658c183039..0dd81d5510660b 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -316,8 +316,11 @@ However, for reading convenience, most of the examples show sorted sequences. Return a function that makes a random selection from the estimated probability density function produced by ``kde(data, h, kernel)``. - For reproducible results, set *seed* to an integer, float, str, or bytes. - Not thread-safe without a lock around calls. + Providing a *seed* allows reproducible selections within a single + thread (or with a lock around calls). In the future, the selection + method for the *parabolic*, *quartic*, and *triweight* kernels may be + replaced with faster algorithms that give different results. The + seed may be an integer, float, str, or bytes. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. diff --git a/Lib/statistics.py b/Lib/statistics.py index 236ca78715737f..a35c7826d171ce 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1724,7 +1724,7 @@ def f_inv(y): def _simple_s_curve(power): "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." - # Approximates the invcdf for kernels with support: -1 <= x <= 1. + # Approximates the inverse CDF for kernels with support: -1 < x < 1. return lambda p: ((2 * p) ** power - 1 if p <= 1/2 else 1 - (2 - 2*p) ** power) @@ -1762,12 +1762,23 @@ def _simple_s_curve(power): def kde_random(data, h, kernel='normal', *, seed=None): """Return a function that makes a random selection from the estimated - probability density function created by: kde(data, h, kernel) + probability density function created by kde(data, h, kernel). - For reproducible results, set *seed* to an integer, float, str, or bytes. - Not thread-safe without a lock around calls. + Providing a *seed* allows reproducible selections within a single + thread (or with a lock around calls). In the future, the selection + method for the parabolic, quartic, and triweight kernels may be + replaced with faster algorithms that give different results. + The seed may be an integer, float, str, or bytes. - A StatisticsError will be raised if the data sequence is empty. + A StatisticsError will be raised if the *data* sequence is empty. + + Example: + + >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> rand = kde_random(sample, h=1.5, seed=8675309) + >>> new_selections = [rand() for i in range(10)] + >>> [round(x, 1) for x in new_selections] + [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] """ n = len(data) From a30fb9d9dd2b07fd9df64c348bf6021ed307da9d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 7 Apr 2024 13:23:58 -0500 Subject: [PATCH 13/37] The "with locks" comment was inaccurate. --- Doc/library/statistics.rst | 8 ++++---- Lib/statistics.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 0dd81d5510660b..22c7b988cd8d39 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -317,10 +317,10 @@ However, for reading convenience, most of the examples show sorted sequences. probability density function produced by ``kde(data, h, kernel)``. Providing a *seed* allows reproducible selections within a single - thread (or with a lock around calls). In the future, the selection - method for the *parabolic*, *quartic*, and *triweight* kernels may be - replaced with faster algorithms that give different results. The - seed may be an integer, float, str, or bytes. + thread. In the future, the selection method for the *parabolic*, + *quartic*, and *triweight* kernels may be replaced with faster + algorithms that give different results. The seed may be an integer, + float, str, or bytes. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. diff --git a/Lib/statistics.py b/Lib/statistics.py index a35c7826d171ce..ccbb825ddf9bf4 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1765,10 +1765,10 @@ def kde_random(data, h, kernel='normal', *, seed=None): probability density function created by kde(data, h, kernel). Providing a *seed* allows reproducible selections within a single - thread (or with a lock around calls). In the future, the selection - method for the parabolic, quartic, and triweight kernels may be - replaced with faster algorithms that give different results. - The seed may be an integer, float, str, or bytes. + thread. In the future, the selection method for the parabolic, + quartic, and triweight kernels may be replaced with faster + algorithms that give different results. The seed may be an + integer, float, str, or bytes. A StatisticsError will be raised if the *data* sequence is empty. From a2a077f107721cb310ed4375023ecaed64e2cea7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 7 Apr 2024 18:51:46 -0500 Subject: [PATCH 14/37] Add summary table entry. Fix doctests. --- Doc/library/statistics.rst | 3 ++- Lib/statistics.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 22c7b988cd8d39..5e442caa154499 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -77,6 +77,7 @@ or sample. :func:`geometric_mean` Geometric mean of data. :func:`harmonic_mean` Harmonic mean of data. :func:`kde` Estimate the probability density distribution of the data. +:func:`kde_random` Random sampling from the PDF generated by kde(). :func:`median` Median (middle value) of data. :func:`median_low` Low median of data. :func:`median_high` High median of data. @@ -329,7 +330,7 @@ However, for reading convenience, most of the examples show sorted sequences. estimated probability density function: >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> rand = kde_random(sample, h=1.5, seed=8675309) + >>> rand = kde_random(data, h=1.5, seed=8675309) >>> new_selections = [rand() for i in range(10)] >>> [round(x, 1) for x in new_selections] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] diff --git a/Lib/statistics.py b/Lib/statistics.py index ccbb825ddf9bf4..ee6ab4e958f31f 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1775,7 +1775,7 @@ def kde_random(data, h, kernel='normal', *, seed=None): Example: >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> rand = kde_random(sample, h=1.5, seed=8675309) + >>> rand = kde_random(data, h=1.5, seed=8675309) >>> new_selections = [rand() for i in range(10)] >>> [round(x, 1) for x in new_selections] [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6] From a82b100daf0f68c2ab0f7640786ade3a05d3ba54 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 7 Apr 2024 19:58:19 -0500 Subject: [PATCH 15/37] Fix typo --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index ee6ab4e958f31f..75ceb18eb7cb51 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1746,7 +1746,7 @@ def _simple_s_curve(power): _kernel_invcdfs = { 'normal': NormalDist().inv_cdf, - 'logisitic': lambda p: log(p / (1 - p)), + 'logistic': lambda p: log(p / (1 - p)), 'sigmoid': lambda p: log(tan(p * pi/2)), 'rectangular': lambda p: 2*p - 1, 'parabolic': _parabolic_invcdf, From 1397e91d094cc9735d5f7282edc17c6f09962f3d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 11 Apr 2024 03:17:40 -0500 Subject: [PATCH 16/37] . --- Lib/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 75ceb18eb7cb51..4e97bddeaa8710 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1730,17 +1730,17 @@ def _simple_s_curve(power): 1 - (2 - 2*p) ** power) _parabolic_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.583367470424302), + f_inv_estimate = _simple_s_curve(0.583367470424302), # (7/12) f = lambda t: -1/4 * t**3 + 3/4 * t + 1/2, f_prime = lambda t: 3/4 * (1.0 - t * t)) _quartic_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.4258865685331), + f_inv_estimate = _simple_s_curve(0.4258865685331), # (204/479) f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) _triweight_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.3400218741872791), + f_inv_estimate = _simple_s_curve(0.3400218741872791), (17/50) f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) From 630d942a6a54aeae76fbb2f9f3c20dbb88889962 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 13 Apr 2024 17:52:10 -0500 Subject: [PATCH 17/37] Change variable name to match the supporting reference. --- Lib/statistics.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 4e97bddeaa8710..2fe7d6ac0133b7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -921,13 +921,13 @@ def kde(data, h, kernel='normal', *, cumulative=False): sqrt2pi = sqrt(2 * pi) sqrt2 = sqrt(2) K = lambda t: exp(-1/2 * t * t) / sqrt2pi - I = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) + W = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) support = None case 'logistic': # 1.0 / (exp(t) + 2.0 + exp(-t)) K = lambda t: 1/2 / (1.0 + cosh(t)) - I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) + W = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) support = None case 'sigmoid': @@ -935,39 +935,39 @@ def kde(data, h, kernel='normal', *, cumulative=False): c1 = 1 / pi c2 = 2 / pi K = lambda t: c1 / cosh(t) - I = lambda t: c2 * atan(exp(t)) + W = lambda t: c2 * atan(exp(t)) support = None case 'rectangular' | 'uniform': K = lambda t: 1/2 - I = lambda t: 1/2 * t + 1/2 + W = lambda t: 1/2 * t + 1/2 support = 1.0 case 'triangular': K = lambda t: 1.0 - abs(t) - I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 + W = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 support = 1.0 case 'parabolic' | 'epanechnikov': K = lambda t: 3/4 * (1.0 - t * t) - I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 + W = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 support = 1.0 case 'quartic' | 'biweight': K = lambda t: 15/16 * (1.0 - t * t) ** 2 - I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 + W = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 support = 1.0 case 'triweight': K = lambda t: 35/32 * (1.0 - t * t) ** 3 - I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 + W = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 support = 1.0 case 'cosine': c1 = pi / 4 c2 = pi / 2 K = lambda t: c1 * cos(c2 * t) - I = lambda t: 1/2 * sin(c2 * t) + 1/2 + W = lambda t: 1/2 * sin(c2 * t) + 1/2 support = 1.0 case _: @@ -981,7 +981,7 @@ def pdf(x): def cdf(x): n = len(data) - return sum(I((x - x_i) / h) for x_i in data) / n + return sum(W((x - x_i) / h) for x_i in data) / n else: @@ -1006,7 +1006,7 @@ def cdf(x): i = bisect_left(sample, x - bandwidth) j = bisect_right(sample, x + bandwidth) supported = sample[i : j] - return sum((I((x - x_i) / h) for x_i in supported), i) / n + return sum((W((x - x_i) / h) for x_i in supported), i) / n if cumulative: cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}' From e9e5d543fb4ed633c44d0aa2c900b0b9f793424e Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 14 Apr 2024 22:06:25 -0500 Subject: [PATCH 18/37] Closed-form for _parabolic_invcdf(). --- Lib/statistics.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 2fe7d6ac0133b7..74afc080562c37 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1729,18 +1729,13 @@ def _simple_s_curve(power): if p <= 1/2 else 1 - (2 - 2*p) ** power) -_parabolic_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.583367470424302), # (7/12) - f = lambda t: -1/4 * t**3 + 3/4 * t + 1/2, - f_prime = lambda t: 3/4 * (1.0 - t * t)) - _quartic_invcdf = _newton_raphson( f_inv_estimate = _simple_s_curve(0.4258865685331), # (204/479) f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) _triweight_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.3400218741872791), (17/50) + f_inv_estimate = _simple_s_curve(0.3400218741872791), # (17/50) f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) @@ -1749,7 +1744,7 @@ def _simple_s_curve(power): 'logistic': lambda p: log(p / (1 - p)), 'sigmoid': lambda p: log(tan(p * pi/2)), 'rectangular': lambda p: 2*p - 1, - 'parabolic': _parabolic_invcdf, + 'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3), 'quartic': _quartic_invcdf, 'triweight': _triweight_invcdf, 'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p), From 8a0fd6994acc7d03d94727d9bb6d3e9f037b77c8 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 14 Apr 2024 23:03:47 -0500 Subject: [PATCH 19/37] Flip sign in Newton-Raphson to match the common presentation. --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 74afc080562c37..cc80505ace965e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1717,8 +1717,8 @@ def _newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12): def f_inv(y): "Return x such that f(x) ≈ y within the specified tolerance." x = f_inv_estimate(y) - while abs(diff := y - f(x)) > tolerance: - x += diff / f_prime(x) + while abs(diff := f(x) - y) > tolerance: + x -= diff / f_prime(x) return x return f_inv From 3459bf20ac203c591445b727d956ade0e84fe189 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 16 Apr 2024 23:09:20 -0500 Subject: [PATCH 20/37] Add kernel_invcdf tests --- Lib/statistics.py | 2 +- Lib/test/test_statistics.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index cc80505ace965e..04833aeff5cb8e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -139,7 +139,7 @@ from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod -from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan +from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan, acos from functools import reduce from operator import itemgetter from collections import Counter, namedtuple, defaultdict diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 204787a88a9c5f..ff8d743fa58726 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2426,6 +2426,17 @@ def integrate(func, low, high, steps=10_000): self.assertEqual(f_hat(-1.0), 1/2) self.assertEqual(f_hat(1.0), 1/2) + def test_kde_kernel_invcdfs(self): + kernel_invcdfs = statistics._kernel_invcdfs + kde = statistics.kde + + # Verify that cdf / invcdf will round trip + xarr = [i/100 for i in range(-100, 101)] + for kernel, invcdf in kernel_invcdfs.items(): + with self.subTest(kernel=kernel): + cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True) + for x in xarr: + self.assertAlmostEqual(invcdf(cdf(x)), x, places=5) class TestQuantiles(unittest.TestCase): From 1ca870ace5af8e5babe2e9436671de902c66d2be Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 19 Apr 2024 01:21:38 -0500 Subject: [PATCH 21/37] Better _quartic_invcdf_estimate --- Lib/statistics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 04833aeff5cb8e..912148c1a209ae 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1729,8 +1729,15 @@ def _simple_s_curve(power): if p <= 1/2 else 1 - (2 - 2*p) ** power) +def _quartic_invcdf_estimate(p): + sign, p = (1, p) if p <= 1/2 else (-1, 1 - p) + x = (2 * p) ** 0.4258865685331 - 1 # (204/479) + if p >= 0.004: + x += 0.026818732 * sin(7.101753784 * p + 2.732305658) + return x * sign + _quartic_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.4258865685331), # (204/479) + f_inv_estimate = _quartic_invcdf_estimate, f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) From 96a5f14d2f04c4441fecb9e084228fd9f8caede1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 19 Apr 2024 01:36:41 -0500 Subject: [PATCH 22/37] Make standalone _triweight_invcdf_estimate() --- Lib/statistics.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 912148c1a209ae..2a93297db98e0a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1722,13 +1722,6 @@ def f_inv(y): return x return f_inv -def _simple_s_curve(power): - "S-curve that crosses (0, -1), (1/2, 0), (1, 1)." - # Approximates the inverse CDF for kernels with support: -1 < x < 1. - return lambda p: ((2 * p) ** power - 1 - if p <= 1/2 else - 1 - (2 - 2*p) ** power) - def _quartic_invcdf_estimate(p): sign, p = (1, p) if p <= 1/2 else (-1, 1 - p) x = (2 * p) ** 0.4258865685331 - 1 # (204/479) @@ -1741,8 +1734,13 @@ def _quartic_invcdf_estimate(p): f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2, f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) +def _triweight_invcdf_estimate(p): + sign, p = (1, p) if p <= 1/2 else (-1, 1 - p) + x = (2 * p) ** 0.3400218741872791 - 1 # (17/50) + return x * sign + _triweight_invcdf = _newton_raphson( - f_inv_estimate = _simple_s_curve(0.3400218741872791), # (17/50) + f_inv_estimate = _triweight_invcdf_estimate, f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2, f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3) @@ -1767,10 +1765,7 @@ def kde_random(data, h, kernel='normal', *, seed=None): probability density function created by kde(data, h, kernel). Providing a *seed* allows reproducible selections within a single - thread. In the future, the selection method for the parabolic, - quartic, and triweight kernels may be replaced with faster - algorithms that give different results. The seed may be an - integer, float, str, or bytes. + thread. The seed may be an integer, float, str, or bytes. A StatisticsError will be raised if the *data* sequence is empty. From f05eddbda6c1174b9407a6ff790447123a2f023e Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 19 Apr 2024 01:43:10 -0500 Subject: [PATCH 23/37] Floats everywhere --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 2a93297db98e0a..0ec57108ead6e8 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1723,8 +1723,8 @@ def f_inv(y): return f_inv def _quartic_invcdf_estimate(p): - sign, p = (1, p) if p <= 1/2 else (-1, 1 - p) - x = (2 * p) ** 0.4258865685331 - 1 # (204/479) + sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p) + x = (2.0 * p) ** 0.4258865685331 - 1.0 if p >= 0.004: x += 0.026818732 * sin(7.101753784 * p + 2.732305658) return x * sign @@ -1735,8 +1735,8 @@ def _quartic_invcdf_estimate(p): f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2) def _triweight_invcdf_estimate(p): - sign, p = (1, p) if p <= 1/2 else (-1, 1 - p) - x = (2 * p) ** 0.3400218741872791 - 1 # (17/50) + sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p) + x = (2.0 * p) ** 0.3400218741872791 - 1.0 return x * sign _triweight_invcdf = _newton_raphson( From 065be116969cdb8bda6f967da14b656a8be5090f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 23 Apr 2024 18:25:50 -0500 Subject: [PATCH 24/37] . --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 0ec57108ead6e8..011d95065d03ac 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1725,8 +1725,8 @@ def f_inv(y): def _quartic_invcdf_estimate(p): sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p) x = (2.0 * p) ** 0.4258865685331 - 1.0 - if p >= 0.004: - x += 0.026818732 * sin(7.101753784 * p + 2.732305658) + if p >= 0.004 < 0.499: + x += 0.026818732 * sin(7.101753784 * p + 2.73230839482953) return x * sign _quartic_invcdf = _newton_raphson( From d71ca9d396ca41f62cf2253c518cf0a868be7b48 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 29 Apr 2024 21:47:34 -0700 Subject: [PATCH 25/37] Never used the global, shared Random instance. --- Lib/statistics.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 011d95065d03ac..4fd17cccf1412d 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1788,13 +1788,9 @@ def kde_random(data, h, kernel='normal', *, seed=None): if h <= 0.0: raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}') - if seed is None: - random = _random.random - choice = _random.choice - else: - prng = _random.Random(seed) - random = prng.random - choice = prng.choice + prng = _random.Random(seed) + random = prng.random + choice = prng.choice try: kernel_invcdf = _kernel_invcdfs[kernel] From ee43ed76328f1a642b102a65b5c64112224dcd9d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 29 Apr 2024 21:48:36 -0700 Subject: [PATCH 26/37] . --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 4fd17cccf1412d..01c5c49e65b74a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1788,15 +1788,15 @@ def kde_random(data, h, kernel='normal', *, seed=None): if h <= 0.0: raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}') - prng = _random.Random(seed) - random = prng.random - choice = prng.choice - try: kernel_invcdf = _kernel_invcdfs[kernel] except KeyError: raise StatisticsError(f'Unknown kernel name: {kernel!r}') + prng = _random.Random(seed) + random = prng.random + choice = prng.choice + def rand(): return choice(data) + h * kernel_invcdf(random()) From f85c3037639e303de732f366bdb04510f72f7552 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 30 Apr 2024 00:27:14 -0700 Subject: [PATCH 27/37] Update whatsnew --- Doc/whatsnew/3.13.rst | 3 ++- Lib/statistics.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 98349a5984bb7e..b04cfd59f96236 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -722,7 +722,8 @@ statistics * Add :func:`statistics.kde` for kernel density estimation. This makes it possible to estimate a continuous probability density function - from a fixed number of discrete samples. + from a fixed number of discrete samples. Also added :func:`kde_random` + for sampling the estimated probability density function. (Contributed by Raymond Hettinger in :gh:`115863`.) .. _whatsnew313-subprocess: diff --git a/Lib/statistics.py b/Lib/statistics.py index 01c5c49e65b74a..f3ce2d8b6b442a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1078,6 +1078,7 @@ def quantiles(data, *, n=4, method='exclusive'): if ld == 1: return data * (n - 1) raise StatisticsError('must have at least one data point') + if method == 'inclusive': m = ld - 1 result = [] @@ -1086,6 +1087,7 @@ def quantiles(data, *, n=4, method='exclusive'): interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n result.append(interpolated) return result + if method == 'exclusive': m = ld + 1 result = [] @@ -1096,6 +1098,7 @@ def quantiles(data, *, n=4, method='exclusive'): interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n result.append(interpolated) return result + raise ValueError(f'Unknown method: {method!r}') From 544ca8fee9feb3afd7646fdcc80e8c37cebfb1cf Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 30 Apr 2024 00:36:27 -0700 Subject: [PATCH 28/37] Expand "it" variable name to "iterator" --- Doc/library/itertools.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Doc/library/itertools.rst b/Doc/library/itertools.rst index 9a5cb8be37d349..df64b7cda8b3b5 100644 --- a/Doc/library/itertools.rst +++ b/Doc/library/itertools.rst @@ -903,9 +903,9 @@ and :term:`generators ` which incur interpreter overhead. def sliding_window(iterable, n): "Collect data into overlapping fixed-length chunks or blocks." # sliding_window('ABCDEFG', 4) → ABCD BCDE CDEF DEFG - it = iter(iterable) - window = collections.deque(islice(it, n-1), maxlen=n) - for x in it: + iterator = iter(iterable) + window = collections.deque(islice(iterator, n-1), maxlen=n) + for x in iterator: window.append(x) yield tuple(window) @@ -955,8 +955,8 @@ and :term:`generators ` which incur interpreter overhead. seq_index = getattr(iterable, 'index', None) if seq_index is None: # Path for general iterables - it = islice(iterable, start, stop) - for i, element in enumerate(it, start): + iterator = islice(iterable, start, stop) + for i, element in enumerate(iterator, start): if element is value or element == value: yield i else: From 1249d14388c87f47e4ce0c1985c2b47cf690291b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 19:17:29 -0500 Subject: [PATCH 29/37] Test the kde_random() outer function --- Lib/test/test_statistics.py | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index ff8d743fa58726..1ad917af1be376 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2438,6 +2438,49 @@ def test_kde_kernel_invcdfs(self): for x in xarr: self.assertAlmostEqual(invcdf(cdf(x)), x, places=5) + def test_kde_random(self): + kde_random = statistics.kde_random + StatisticsError = statistics.StatisticsError + kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular', + 'uniform', 'triangular', 'parabolic', 'epanechnikov', + 'quartic', 'biweight', 'triweight', 'cosine'] + sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + + # Smoke test + + for kernel in kernels: + with self.subTest(kernel=kernel): + rand = kde_random(sample, h=1.5, kernel=kernel) + selections = [rand() for i in range(10)] + + + # Check error cases + + with self.assertRaises(StatisticsError): + kde_random([], h=1.0) # Empty dataset + with self.assertRaises(TypeError): + kde_random(['abc', 'def'], 1.5) # Non-numeric data + with self.assertRaises(TypeError): + kde_random(iter(sample), 1.5) # Data is not a sequence + with self.assertRaises(StatisticsError): + kde_random(sample, h=0.0) # Zero bandwidth + with self.assertRaises(StatisticsError): + kde_random(sample, h=0.0) # Negative bandwidth + with self.assertRaises(TypeError): + kde_random(sample, h='str') # Wrong bandwidth type + with self.assertRaises(StatisticsError): + kde_random(sample, h=1.0, kernel='bogus') # Invalid kernel + + # Test name and docstring of the generated function + + h = 1.5 + kernel = 'cosine' + prng = kde_random(sample, h, kernel) + self.assertEqual(prng.__name__, 'rand') + self.assertIn(kernel, prng.__doc__) + self.assertIn(repr(h), prng.__doc__) + + class TestQuantiles(unittest.TestCase): def test_specific_cases(self): From a09d30b7322935532ba8c957db03a33115d399a2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 19:27:47 -0500 Subject: [PATCH 30/37] Add fully qualified reference --- Doc/whatsnew/3.13.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 05579cf3f0a590..c2f106231438e4 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -745,7 +745,7 @@ statistics * Add :func:`statistics.kde` for kernel density estimation. This makes it possible to estimate a continuous probability density function - from a fixed number of discrete samples. Also added :func:`kde_random` + from a fixed number of discrete samples. Also added :func:`statistics.kde_random` for sampling the estimated probability density function. (Contributed by Raymond Hettinger in :gh:`115863`.) From 405d443dc4ce27f4787a148588ba01069345470f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:18:37 -0500 Subject: [PATCH 31/37] Add an approximate distribution test --- Lib/test/test_statistics.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 1ad917af1be376..39d4376c96e5a1 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2453,7 +2453,6 @@ def test_kde_random(self): rand = kde_random(sample, h=1.5, kernel=kernel) selections = [rand() for i in range(10)] - # Check error cases with self.assertRaises(StatisticsError): @@ -2480,6 +2479,31 @@ def test_kde_random(self): self.assertIn(kernel, prng.__doc__) self.assertIn(repr(h), prng.__doc__) + # Approximate distribution test + + data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2, 7.8, 14.3, 15.1, 15.3, 15.8, 17.0] + n = 1_000_000 + h = 1.75 + + for kernel in kernels: + with self.subTest(kernel=kernel): + + f_hat = statistics.kde(data, h, kernel) + rand = kde_random(data, h, kernel, seed=8675309**2) + big_sample = sorted([rand() for i in range(n)]) + + def pdf_observed(x, dx=0.1): + # P(x-dx <= X < x+dx) / (2*dx) + i = bisect.bisect_left(big_sample, x - dx) + j = bisect.bisect_right(big_sample, x + dx) + p_nearby = (j - i) / len(big_sample) + return p_nearby / (2 * dx) + + for x in range(-4, 19): + pd1 = f_hat(x) + pd2 = pdf_observed(x) + self.assertTrue(math.isclose(pd1, pd2, abs_tol=0.01)) + class TestQuantiles(unittest.TestCase): From a4692bf45f1d9aa75598b2bd650127c94c674429 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:25:55 -0500 Subject: [PATCH 32/37] Use F_hat() for a better estimate --- Lib/test/test_statistics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 39d4376c96e5a1..351a35ee08b4b7 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2484,25 +2484,26 @@ def test_kde_random(self): data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2, 7.8, 14.3, 15.1, 15.3, 15.8, 17.0] n = 1_000_000 h = 1.75 + dx = 0.1 + + def p_expected(x): + return F_hat(x + dx) - F_hat(x - dx) + + def p_observed(x): + # P(x-dx <= X < x+dx) / (2*dx) + i = bisect.bisect_left(big_sample, x - dx) + j = bisect.bisect_right(big_sample, x + dx) + return (j - i) / len(big_sample) for kernel in kernels: with self.subTest(kernel=kernel): - f_hat = statistics.kde(data, h, kernel) + F_hat = statistics.kde(data, h, kernel, cumulative=True) rand = kde_random(data, h, kernel, seed=8675309**2) big_sample = sorted([rand() for i in range(n)]) - def pdf_observed(x, dx=0.1): - # P(x-dx <= X < x+dx) / (2*dx) - i = bisect.bisect_left(big_sample, x - dx) - j = bisect.bisect_right(big_sample, x + dx) - p_nearby = (j - i) / len(big_sample) - return p_nearby / (2 * dx) - for x in range(-4, 19): - pd1 = f_hat(x) - pd2 = pdf_observed(x) - self.assertTrue(math.isclose(pd1, pd2, abs_tol=0.01)) + self.assertTrue(math.isclose(p_observed(x), p_expected(x), abs_tol=0.001)) class TestQuantiles(unittest.TestCase): From ec2d9f29ab1b3054333e0394da7aad8187d965c2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:30:01 -0500 Subject: [PATCH 33/37] Test the curve in more places --- Lib/test/test_statistics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 351a35ee08b4b7..fbf15bca1f5e46 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2479,7 +2479,7 @@ def test_kde_random(self): self.assertIn(kernel, prng.__doc__) self.assertIn(repr(h), prng.__doc__) - # Approximate distribution test + # Approximate distribution test: Compare a random sample to expected distribution data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2, 7.8, 14.3, 15.1, 15.3, 15.8, 17.0] n = 1_000_000 @@ -2502,7 +2502,8 @@ def p_observed(x): rand = kde_random(data, h, kernel, seed=8675309**2) big_sample = sorted([rand() for i in range(n)]) - for x in range(-4, 19): + for x in range(-40, 190): + x /= 10 self.assertTrue(math.isclose(p_observed(x), p_expected(x), abs_tol=0.001)) From c612c1f0f4de55b9921e983d85ec83f16219b965 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:39:35 -0500 Subject: [PATCH 34/37] Refine the reproducibility note. --- Doc/library/statistics.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index e9f6eec0eba0ba..0a4219921e2921 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -317,15 +317,13 @@ However, for reading convenience, most of the examples show sorted sequences. Return a function that makes a random selection from the estimated probability density function produced by ``kde(data, h, kernel)``. - Providing a *seed* allows reproducible selections within a single - thread. In the future, the selection method for the *parabolic*, - *quartic*, and *triweight* kernels may be replaced with faster - algorithms that give different results. The seed may be an integer, - float, str, or bytes. + Providing a *seed* allows reproducible selections. In the future, the + values may change slightly as more accurate kernel inverse estimates + are implemented. The seed may be an integer, float, str, or bytes. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. - Continuing the above example for :func:`kde`, we can use + Continuing the example for :func:`kde`, we can use :func:`kde_random` to generate new random selections from an estimated probability density function: From 1324952149bdf11aca357ebd3f2aa48e03c05d2d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:41:13 -0500 Subject: [PATCH 35/37] Missing CDF qualifier --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 0a4219921e2921..d5a316e45ee3e2 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -318,7 +318,7 @@ However, for reading convenience, most of the examples show sorted sequences. probability density function produced by ``kde(data, h, kernel)``. Providing a *seed* allows reproducible selections. In the future, the - values may change slightly as more accurate kernel inverse estimates + values may change slightly as more accurate kernel inverse CDF estimates are implemented. The seed may be an integer, float, str, or bytes. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. From 13e702f35772980e0304900d97130a5ce95f884d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:42:48 -0500 Subject: [PATCH 36/37] Word smithing --- Doc/whatsnew/3.13.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index c2f106231438e4..269a7cc985ad19 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -746,7 +746,7 @@ statistics * Add :func:`statistics.kde` for kernel density estimation. This makes it possible to estimate a continuous probability density function from a fixed number of discrete samples. Also added :func:`statistics.kde_random` - for sampling the estimated probability density function. + for sampling from the estimated probability density function. (Contributed by Raymond Hettinger in :gh:`115863`.) .. _whatsnew313-subprocess: From 15a4764887a3dad5212ca53e50548f74b98d49bd Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 3 May 2024 22:44:02 -0500 Subject: [PATCH 37/37] Word smithing --- Lib/test/test_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index fbf15bca1f5e46..fe6c59c30dae28 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2479,7 +2479,7 @@ def test_kde_random(self): self.assertIn(kernel, prng.__doc__) self.assertIn(repr(h), prng.__doc__) - # Approximate distribution test: Compare a random sample to expected distribution + # Approximate distribution test: Compare a random sample to the expected distribution data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2, 7.8, 14.3, 15.1, 15.3, 15.8, 17.0] n = 1_000_000