From 777def49f7e934946b0a266f8c6859991cfa400e Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 00:15:50 -0600 Subject: [PATCH 1/6] Working 2-fold summation. Running time goes from 660 ns to 740 ns. --- Modules/mathmodule.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 9545ad2a99eb43..750f4ab56bf566 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2859,8 +2859,9 @@ dl_to_d() converts from extended precision to double precision. */ typedef struct{ double hi; double lo; } DoubleLength; +typedef struct{ double hi; double lo; double tiny; } TripleLength; -static const DoubleLength dl_zero = {0.0, 0.0}; +static const TripleLength tl_zero = {0.0, 0.0, 0.0}; static inline DoubleLength twosum(double a, double b) @@ -2874,11 +2875,12 @@ twosum(double a, double b) return (DoubleLength) {s, t}; } -static inline DoubleLength -dl_add(DoubleLength total, double x) +static inline TripleLength +tl_add(TripleLength total, double x) { DoubleLength s = twosum(total.hi, x); - return (DoubleLength) {s.hi, total.lo + s.lo}; + DoubleLength t = twosum(total.lo, s.lo); + return (TripleLength) {s.hi, t.hi, total.tiny + t.lo}; } static inline DoubleLength @@ -2902,18 +2904,18 @@ dl_mul(double x, double y) return (DoubleLength) {z, zz}; } -static inline DoubleLength -dl_fma(DoubleLength total, double p, double q) +static inline TripleLength +tl_fma(TripleLength total, double p, double q) { DoubleLength product = dl_mul(p, q); - total = dl_add(total, product.hi); - return dl_add(total, product.lo); + total = tl_add(total, product.hi); + return tl_add(total, product.lo); } static inline double -dl_to_d(DoubleLength total) +tl_to_d(TripleLength total) { - return total.hi + total.lo; + return total.tiny + total.lo + total.hi; } /*[clinic input] @@ -2944,7 +2946,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q) bool int_path_enabled = true, int_total_in_use = false; bool flt_path_enabled = true, flt_total_in_use = false; long int_total = 0; - DoubleLength flt_total = dl_zero; + TripleLength flt_total = tl_zero; p_it = PyObject_GetIter(p); if (p_it == NULL) { @@ -3079,7 +3081,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q) } else { goto finalize_flt_path; } - DoubleLength new_flt_total = dl_fma(flt_total, flt_p, flt_q); + TripleLength new_flt_total = tl_fma(flt_total, flt_p, flt_q); if (isfinite(new_flt_total.hi)) { flt_total = new_flt_total; flt_total_in_use = true; @@ -3093,7 +3095,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q) // We're finished, overflowed, have a non-float, or got a non-finite value flt_path_enabled = false; if (flt_total_in_use) { - term_i = PyFloat_FromDouble(dl_to_d(flt_total)); + term_i = PyFloat_FromDouble(tl_to_d(flt_total)); if (term_i == NULL) { goto err_exit; } @@ -3104,7 +3106,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q) Py_SETREF(total, new_total); new_total = NULL; Py_CLEAR(term_i); - flt_total = dl_zero; + flt_total = tl_zero; flt_total_in_use = false; } } From 05884b735e937c7ba4ada3b1987bbdfafec2c9b6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 02:11:16 -0600 Subject: [PATCH 2/6] Update comments --- Modules/mathmodule.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 750f4ab56bf566..31e268c1dec68f 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2845,16 +2845,15 @@ based on ideas from three sources: Ultimately Fast Accurate Summation by Siegfried M. Rump https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf -The double length routines allow for quite a bit of instruction -level parallelism. On a 3.22 Ghz Apple M1 Max, the incremental -cost of increasing the input vector size by one is 6.0 nsec. - -dl_zero() returns an extended precision zero -dl_split() exactly splits a double into two half precision components. -dl_add() performs compensated summation to keep a running total. -dl_mul() implements lossless multiplication of doubles. -dl_fma() implements an extended precision fused-multiply-add. -dl_to_d() converts from extended precision to double precision. +Double length API: +* dl_split() exactly splits a C double into two half precision components. +* dl_mul() implements lossless multiplication of two C doubles. + +Triple length API: +* tl_zero is a triple length zero for starting or resetting an accumulation. +* tl_add() adds a C double to a triple length number. +* tl_fma() performs a triple length fused-multiply-add. +* tl_to_d() converts from triple length number to a C double. */ From 7a993cdf03a6fb96f08d7c3c75405ff4c4acf265 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 12:47:44 -0600 Subject: [PATCH 3/6] ASCII art --- Modules/mathmodule.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 31e268c1dec68f..cdb508747a5a5d 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2877,9 +2877,17 @@ twosum(double a, double b) static inline TripleLength tl_add(TripleLength total, double x) { - DoubleLength s = twosum(total.hi, x); - DoubleLength t = twosum(total.lo, s.lo); - return (TripleLength) {s.hi, t.hi, total.tiny + t.lo}; + /* Input: x total.hi total.lo total.tiny + |--- twosum ---| + s.hi s.lo + |--- twosum ---| + t.hi t.lo + |--- single sum ---| + Output: s.hi t.hi tiny + */ + DoubleLength s = twosum(x, total.hi); + DoubleLength t = twosum(s.lo, total.lo); + return (TripleLength) {s.hi, t.hi, t.lo + total.tiny}; } static inline DoubleLength From 04160fd7d1123c6591c9690d77a8d6fb1f4e3237 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 12:58:34 -0600 Subject: [PATCH 4/6] Tweak comments --- Modules/mathmodule.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index cdb508747a5a5d..11e815c6f17e1d 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2832,7 +2832,7 @@ long_add_would_overflow(long a, long b) } /* -Double length extended precision floating point arithmetic +Double and triple length extended precision floating point arithmetic based on ideas from three sources: Improved Kahan–Babuška algorithm by Arnold Neumaier @@ -2845,15 +2845,15 @@ based on ideas from three sources: Ultimately Fast Accurate Summation by Siegfried M. Rump https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf -Double length API: -* dl_split() exactly splits a C double into two half precision components. -* dl_mul() implements lossless multiplication of two C doubles. +Double length functions: +* dl_split() exact split of a C double into two half precision components. +* dl_mul() exact multiplication of two C doubles. -Triple length API: +Triple length functions and constant: * tl_zero is a triple length zero for starting or resetting an accumulation. -* tl_add() adds a C double to a triple length number. +* tl_add() compensated addition of a C double to a triple length number. * tl_fma() performs a triple length fused-multiply-add. -* tl_to_d() converts from triple length number to a C double. +* tl_to_d() converts from triple length number back to a C double. */ From 91723fa1e5568e60dc9e4104572b0f3d18e541cf Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 13:03:03 -0600 Subject: [PATCH 5/6] Limit test resource utilization --- Lib/test/test_math.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py index 65fe169671eae2..b8ac8f32055d33 100644 --- a/Lib/test/test_math.py +++ b/Lib/test/test_math.py @@ -1294,6 +1294,7 @@ def test_sumprod_accuracy(self): self.assertEqual(sumprod([0.1] * 20, [True, False] * 10), 1.0) self.assertEqual(sumprod([1.0, 10E100, 1.0, -10E100], [1.0]*4), 2.0) + @support.requires_resource('cpu') def test_sumprod_stress(self): sumprod = math.sumprod product = itertools.product From 2180ef92644bcb1fd51c2c1ef01e2424835b78de Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 8 Jan 2023 13:08:33 -0600 Subject: [PATCH 6/6] Add whatsnew entry --- Doc/whatsnew/3.12.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 2f50ece4dab3fb..b882bb607f911c 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -262,6 +262,12 @@ dis :data:`~dis.hasarg` collection instead. (Contributed by Irit Katriel in :gh:`94216`.) +math +---- + +* Added :func:`math.sumprod` for computing a sum of products. + (Contributed by Raymond Hettinger in :gh:`100485`.) + os --