From 777def49f7e934946b0a266f8c6859991cfa400e Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 00:15:50 -0600
Subject: [PATCH 1/6] Working 2-fold summation.  Running time goes from 660 ns
 to 740 ns.

---
 Modules/mathmodule.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 9545ad2a99eb43..750f4ab56bf566 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2859,8 +2859,9 @@ dl_to_d() converts from extended precision to double precision.
 */
 
 typedef struct{ double hi; double lo; } DoubleLength;
+typedef struct{ double hi; double lo; double tiny; } TripleLength;
 
-static const DoubleLength dl_zero = {0.0, 0.0};
+static const TripleLength tl_zero = {0.0, 0.0, 0.0};
 
 static inline DoubleLength
 twosum(double a, double b)
@@ -2874,11 +2875,12 @@ twosum(double a, double b)
     return  (DoubleLength) {s, t};
 }
 
-static inline DoubleLength
-dl_add(DoubleLength total, double x)
+static inline TripleLength
+tl_add(TripleLength total, double x)
 {
     DoubleLength s = twosum(total.hi, x);
-    return (DoubleLength) {s.hi, total.lo + s.lo};
+    DoubleLength t = twosum(total.lo, s.lo);
+    return (TripleLength) {s.hi, t.hi, total.tiny + t.lo};
 }
 
 static inline DoubleLength
@@ -2902,18 +2904,18 @@ dl_mul(double x, double y)
     return (DoubleLength) {z, zz};
 }
 
-static inline DoubleLength
-dl_fma(DoubleLength total, double p, double q)
+static inline TripleLength
+tl_fma(TripleLength total, double p, double q)
 {
     DoubleLength product = dl_mul(p, q);
-    total = dl_add(total, product.hi);
-    return  dl_add(total, product.lo);
+    total = tl_add(total, product.hi);
+    return  tl_add(total, product.lo);
 }
 
 static inline double
-dl_to_d(DoubleLength total)
+tl_to_d(TripleLength total)
 {
-    return total.hi + total.lo;
+    return total.tiny + total.lo + total.hi;
 }
 
 /*[clinic input]
@@ -2944,7 +2946,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
     bool int_path_enabled = true, int_total_in_use = false;
     bool flt_path_enabled = true, flt_total_in_use = false;
     long int_total = 0;
-    DoubleLength flt_total = dl_zero;
+    TripleLength flt_total = tl_zero;
 
     p_it = PyObject_GetIter(p);
     if (p_it == NULL) {
@@ -3079,7 +3081,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
                 } else {
                     goto finalize_flt_path;
                 }
-                DoubleLength new_flt_total = dl_fma(flt_total, flt_p, flt_q);
+                TripleLength new_flt_total = tl_fma(flt_total, flt_p, flt_q);
                 if (isfinite(new_flt_total.hi)) {
                     flt_total = new_flt_total;
                     flt_total_in_use = true;
@@ -3093,7 +3095,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
             // We're finished, overflowed, have a non-float, or got a non-finite value
             flt_path_enabled = false;
             if (flt_total_in_use) {
-                term_i = PyFloat_FromDouble(dl_to_d(flt_total));
+                term_i = PyFloat_FromDouble(tl_to_d(flt_total));
                 if (term_i == NULL) {
                     goto err_exit;
                 }
@@ -3104,7 +3106,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
                 Py_SETREF(total, new_total);
                 new_total = NULL;
                 Py_CLEAR(term_i);
-                flt_total = dl_zero;
+                flt_total = tl_zero;
                 flt_total_in_use = false;
             }
         }

From 05884b735e937c7ba4ada3b1987bbdfafec2c9b6 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 02:11:16 -0600
Subject: [PATCH 2/6] Update comments

---
 Modules/mathmodule.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 750f4ab56bf566..31e268c1dec68f 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2845,16 +2845,15 @@ based on ideas from three sources:
   Ultimately Fast Accurate Summation by Siegfried M. Rump
   https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf
 
-The double length routines allow for quite a bit of instruction
-level parallelism.  On a 3.22 Ghz Apple M1 Max, the incremental
-cost of increasing the input vector size by one is 6.0 nsec.
-
-dl_zero() returns an extended precision zero
-dl_split() exactly splits a double into two half precision components.
-dl_add() performs compensated summation to keep a running total.
-dl_mul() implements lossless multiplication of doubles.
-dl_fma() implements an extended precision fused-multiply-add.
-dl_to_d() converts from extended precision to double precision.
+Double length API:
+* dl_split() exactly splits a C double into two half precision components.
+* dl_mul() implements lossless multiplication of two C doubles.
+
+Triple length API:
+* tl_zero is a triple length zero for starting or resetting an accumulation.
+* tl_add() adds a C double to a triple length number.
+* tl_fma() performs a triple length fused-multiply-add.
+* tl_to_d() converts from triple length number to a C double.
 
 */
 

From 7a993cdf03a6fb96f08d7c3c75405ff4c4acf265 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 12:47:44 -0600
Subject: [PATCH 3/6] ASCII art

---
 Modules/mathmodule.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 31e268c1dec68f..cdb508747a5a5d 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2877,9 +2877,17 @@ twosum(double a, double b)
 static inline TripleLength
 tl_add(TripleLength total, double x)
 {
-    DoubleLength s = twosum(total.hi, x);
-    DoubleLength t = twosum(total.lo, s.lo);
-    return (TripleLength) {s.hi, t.hi, total.tiny + t.lo};
+    /* Input:       x     total.hi   total.lo    total.tiny
+                   |--- twosum ---|
+                    s.hi      s.lo
+                             |--- twosum ---|
+                              t.hi      t.lo
+                                       |--- single sum ---|
+       Output:      s.hi     t.hi       tiny
+     */
+    DoubleLength s = twosum(x, total.hi);
+    DoubleLength t = twosum(s.lo, total.lo);
+    return (TripleLength) {s.hi, t.hi, t.lo + total.tiny};
 }
 
 static inline DoubleLength

From 04160fd7d1123c6591c9690d77a8d6fb1f4e3237 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 12:58:34 -0600
Subject: [PATCH 4/6] Tweak comments

---
 Modules/mathmodule.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index cdb508747a5a5d..11e815c6f17e1d 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2832,7 +2832,7 @@ long_add_would_overflow(long a, long b)
 }
 
 /*
-Double length extended precision floating point arithmetic
+Double and triple length extended precision floating point arithmetic
 based on ideas from three sources:
 
   Improved Kahan–Babuška algorithm by Arnold Neumaier
@@ -2845,15 +2845,15 @@ based on ideas from three sources:
   Ultimately Fast Accurate Summation by Siegfried M. Rump
   https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf
 
-Double length API:
-* dl_split() exactly splits a C double into two half precision components.
-* dl_mul() implements lossless multiplication of two C doubles.
+Double length functions:
+* dl_split() exact split of a C double into two half precision components.
+* dl_mul() exact multiplication of two C doubles.
 
-Triple length API:
+Triple length functions and constant:
 * tl_zero is a triple length zero for starting or resetting an accumulation.
-* tl_add() adds a C double to a triple length number.
+* tl_add() compensated addition of a C double to a triple length number.
 * tl_fma() performs a triple length fused-multiply-add.
-* tl_to_d() converts from triple length number to a C double.
+* tl_to_d() converts from triple length number back to a C double.
 
 */
 

From 91723fa1e5568e60dc9e4104572b0f3d18e541cf Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 13:03:03 -0600
Subject: [PATCH 5/6] Limit test resource utilization

---
 Lib/test/test_math.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index 65fe169671eae2..b8ac8f32055d33 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -1294,6 +1294,7 @@ def test_sumprod_accuracy(self):
         self.assertEqual(sumprod([0.1] * 20, [True, False] * 10), 1.0)
         self.assertEqual(sumprod([1.0, 10E100, 1.0, -10E100], [1.0]*4), 2.0)
 
+    @support.requires_resource('cpu')
     def test_sumprod_stress(self):
         sumprod = math.sumprod
         product = itertools.product

From 2180ef92644bcb1fd51c2c1ef01e2424835b78de Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 8 Jan 2023 13:08:33 -0600
Subject: [PATCH 6/6] Add whatsnew entry

---
 Doc/whatsnew/3.12.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst
index 2f50ece4dab3fb..b882bb607f911c 100644
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -262,6 +262,12 @@ dis
   :data:`~dis.hasarg` collection instead.
   (Contributed by Irit Katriel in :gh:`94216`.)
 
+math
+----
+
+* Added :func:`math.sumprod` for computing a sum of products.
+  (Contributed by Raymond Hettinger in :gh:`100485`.)
+
 os
 --