From 376df8a48e7f762522360e190c2e51b0c3a785be Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 17:12:58 -0600
Subject: [PATCH 1/7] FMA version

---
 Modules/mathmodule.c | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 2da50bbd54b57f..f7c0f3692af38f 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2835,10 +2835,6 @@ long_add_would_overflow(long a, long b)
 Double and triple length extended precision floating point arithmetic
 based on:
 
-  A Floating-Point Technique for Extending the Available Precision
-  by T. J. Dekker
-  https://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
-
   Accurate Sum and Dot Product
   by Takeshi Ogita, Siegfried M. Rump, and Shin’Ichi Oishi
   https://doi.org/10.1137/030601818
@@ -2858,26 +2854,12 @@ twosum(double a, double b)
     return (DoubleLength) {x, y};
 }
 
-static inline DoubleLength
-dl_split(double x) {
-    // Rump Algorithm 3.2 Error-free splitting of a floating point number
-    // Dekker (5.5) and (5.6).
-    double t = x * 134217729.0;  // Veltkamp constant = 2.0 ** 27 + 1
-    double hi = t - (t - x);
-    double lo = x - hi;
-    return (DoubleLength) {hi, lo};
-}
-
 static inline DoubleLength
 dl_mul(double x, double y)
 {
-    // Dekker (5.12) and mul12()
-    DoubleLength xx = dl_split(x);
-    DoubleLength yy = dl_split(y);
-    double p = xx.hi * yy.hi;
-    double q = xx.hi * yy.lo + xx.lo * yy.hi;
-    double z = p + q;
-    double zz = p - z + q + xx.lo * yy.lo;
+    // Rump Algorithm 3.5. Error-free transformation of a product
+    double z = x * y;
+    double zz = fma(x, y, -z);
     return (DoubleLength) {z, zz};
 }
 

From 3f700be54074178102fce1020e20b5cb9f7819de Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 17:35:49 -0600
Subject: [PATCH 2/7] Neated-up

---
 Modules/mathmodule.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index f7c0f3692af38f..0159fcc1e7a446 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2844,20 +2844,20 @@ based on:
 
 typedef struct{ double hi; double lo; } DoubleLength;
 
-static inline DoubleLength
-twosum(double a, double b)
+static DoubleLength
+dl_sum(double a, double b)
 {
-    // Rump Algorithm 3.1 Error-free transformation of the sum
+    // Algorithm 3.1 Error-free transformation of the sum
     double x = a + b;
     double z = x - a;
     double y = (a - (x - z)) + (b - z);
     return (DoubleLength) {x, y};
 }
 
-static inline DoubleLength
+static DoubleLength
 dl_mul(double x, double y)
 {
-    // Rump Algorithm 3.5. Error-free transformation of a product
+    // Algorithm 3.5. Error-free transformation of a product
     double z = x * y;
     double zz = fma(x, y, -z);
     return (DoubleLength) {z, zz};
@@ -2867,21 +2867,21 @@ typedef struct { double hi; double lo; double tiny; } TripleLength;
 
 static const TripleLength tl_zero = {0.0, 0.0, 0.0};
 
-static inline TripleLength
+static TripleLength
 tl_fma(TripleLength total, double x, double y)
 {
-    // Rump Algorithm 5.10 with K=3 and using SumKVert
+    // Algorithm 5.10 with SumKVert for K=3
     DoubleLength pr = dl_mul(x, y);
-    DoubleLength sm = twosum(total.hi, pr.hi);
-    DoubleLength r1 = twosum(total.lo, pr.lo);
-    DoubleLength r2 = twosum(r1.hi, sm.lo);
+    DoubleLength sm = dl_sum(total.hi, pr.hi);
+    DoubleLength r1 = dl_sum(total.lo, pr.lo);
+    DoubleLength r2 = dl_sum(r1.hi, sm.lo);
     return (TripleLength) {sm.hi, r2.hi, total.tiny + r1.lo + r2.lo};
 }
 
-static inline double
+static double
 tl_to_d(TripleLength total)
 {
-    DoubleLength last = twosum(total.lo, total.hi);
+    DoubleLength last = dl_sum(total.lo, total.hi);
     return total.tiny + last.lo + last.hi;
 }
 

From 43a7ba9afa8f64c4e8ee3f208f0790dc737397ef Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 17:57:12 -0600
Subject: [PATCH 3/7] Compute TWO_PRODUCT() with FMA

---
 Lib/test/test_math.py | 86 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index b8ac8f32055d33..513ee65f29decd 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -1369,6 +1369,92 @@ def run(func, *args):
                             args,
                         )
 
+    @requires_IEEE_754
+    @unittest.skipIf(HAVE_DOUBLE_ROUNDING,
+                         "sumprod() accuracy not guaranteed on machines with double rounding")
+    @support.cpython_only    # Other implementations may choose a different algorithm
+    @support.requires_resource('cpu')
+    def test_sumprod_extended_precision_accuracy(self):
+        sumprod = math.sumprod
+        import operator
+        from fractions import Fraction
+        from itertools import starmap
+        from collections import namedtuple
+        from math import log2, exp2, log10, fabs
+        from random import choices, uniform, shuffle
+        from statistics import median
+        from functools import partial
+        from pprint import pp
+
+        DotExample = namedtuple('DotExample', ('x', 'y', 'target_sumprod', 'condition'))
+
+        def DotExact(x, y):
+            vec1 = map(Fraction, x)
+            vec2 = map(Fraction, y)
+            return sum(starmap(operator.mul, zip(vec1, vec2, strict=True)))
+
+        def Condition(x, y):
+            return 2.0 * DotExact(map(abs, x), map(abs, y)) / abs(DotExact(x, y))
+
+        def linspace(lo, hi, n):
+            width = (hi - lo) / (n - 1)
+            return [lo + width * i for i in range(n)]
+
+        def GenDot(n, c=1e12):
+            """ Algorithm 6.1 (GenDot) works as follows. The condition number (5.7) of
+            the dot product xT y is proportional to the degree of cancellation. In
+            order to achieve a prescribed cancellation, we generate the first half of
+            the vectors x and y randomly within a large exponent range. This range is
+            chosen according to the anticipated condition number. The second half of x
+            and y is then constructed choosing xi randomly with decreasing exponent,
+            and calculating yi such that some cancellation occurs. Finally, we permute
+            the vectors x, y randomly and calculate the achieved condition number.
+            """
+
+            assert n >= 6
+            n2 = n // 2
+            x = [0.0] * n
+            y = [0.0] * n
+            b = log2(c)
+
+            # First half with exponents from 0 to |_b/2_| and random ints in between
+            e = choices(range(int(b/2)), k=n2)
+            e[0] = int(b / 2) + 1
+            e[-1] = 0.0
+
+            x[:n2] = [uniform(-1.0, 1.0) * exp2(p) for p in e]
+            y[:n2] = [uniform(-1.0, 1.0) * exp2(p) for p in e]
+
+            # Second half
+            e = list(map(round, linspace(b/2, 0.0 , n-n2)))
+            for i in range(n2, n):
+                x[i] = uniform(-1.0, 1.0) * exp2(e[i - n2])
+                y[i] = (uniform(-1.0, 1.0) * exp2(e[i - n2]) - DotExact(x, y)) / x[i]
+
+            # Shuffle
+            pairs = list(zip(x, y))
+            shuffle(pairs)
+            x, y = zip(*pairs)
+
+            return DotExample(x, y, DotExact(x, y), Condition(x, y))
+
+        def RelativeError(res, ex):
+            x, y, target_sumprod, condition = ex
+            n = DotExact(list(x) + [-res], list(y) + [1])
+            return fabs(n / target_sumprod)
+
+        def Trial(dotfunc, c=10e7, n=10):
+            ex = GenDot(10, c)
+            res = dotfunc(ex.x, ex.y)
+            return RelativeError(res, ex)
+
+        times = 1000          # Number of trials
+        n = 20                # Length of vectors
+        c = 1e30              # Target condition number
+
+        relative_err = median(Trial(sumprod, c=c, n=n) for i in range(times))
+        self.assertLess(relative_err, 1e-16)
+
     def testModf(self):
         self.assertRaises(TypeError, math.modf)
 

From 925235f8a212a10c3a30c468573173fa0944d6f6 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 18:05:15 -0600
Subject: [PATCH 4/7] Remove default values from the test support functions

---
 Lib/test/test_math.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index 513ee65f29decd..0687f87a2a7023 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -1400,7 +1400,7 @@ def linspace(lo, hi, n):
             width = (hi - lo) / (n - 1)
             return [lo + width * i for i in range(n)]
 
-        def GenDot(n, c=1e12):
+        def GenDot(n, c):
             """ Algorithm 6.1 (GenDot) works as follows. The condition number (5.7) of
             the dot product xT y is proportional to the degree of cancellation. In
             order to achieve a prescribed cancellation, we generate the first half of
@@ -1443,7 +1443,7 @@ def RelativeError(res, ex):
             n = DotExact(list(x) + [-res], list(y) + [1])
             return fabs(n / target_sumprod)
 
-        def Trial(dotfunc, c=10e7, n=10):
+        def Trial(dotfunc, c, n):
             ex = GenDot(10, c)
             res = dotfunc(ex.x, ex.y)
             return RelativeError(res, ex)
@@ -1452,7 +1452,7 @@ def Trial(dotfunc, c=10e7, n=10):
         n = 20                # Length of vectors
         c = 1e30              # Target condition number
 
-        relative_err = median(Trial(sumprod, c=c, n=n) for i in range(times))
+        relative_err = median(Trial(sumprod, c, n) for i in range(times))
         self.assertLess(relative_err, 1e-16)
 
     def testModf(self):

From 16ea0dbc5ed28049dc92d599af9d127c6df32b1b Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 18:10:58 -0600
Subject: [PATCH 5/7] Remove unused imports

---
 Lib/test/test_math.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Lib/test/test_math.py b/Lib/test/test_math.py
index 0687f87a2a7023..fcdec93484e000 100644
--- a/Lib/test/test_math.py
+++ b/Lib/test/test_math.py
@@ -1375,16 +1375,13 @@ def run(func, *args):
     @support.cpython_only    # Other implementations may choose a different algorithm
     @support.requires_resource('cpu')
     def test_sumprod_extended_precision_accuracy(self):
-        sumprod = math.sumprod
         import operator
         from fractions import Fraction
         from itertools import starmap
         from collections import namedtuple
-        from math import log2, exp2, log10, fabs
+        from math import log2, exp2, fabs
         from random import choices, uniform, shuffle
         from statistics import median
-        from functools import partial
-        from pprint import pp
 
         DotExample = namedtuple('DotExample', ('x', 'y', 'target_sumprod', 'condition'))
 
@@ -1452,7 +1449,7 @@ def Trial(dotfunc, c, n):
         n = 20                # Length of vectors
         c = 1e30              # Target condition number
 
-        relative_err = median(Trial(sumprod, c, n) for i in range(times))
+        relative_err = median(Trial(math.sumprod, c, n) for i in range(times))
         self.assertLess(relative_err, 1e-16)
 
     def testModf(self):

From 66bcff34064132f9dae0dd82e503aec1ec4970dd Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 20:32:34 -0600
Subject: [PATCH 6/7] Make tl_signature match argument order of the fma() in C

---
 Modules/mathmodule.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 0159fcc1e7a446..22abeaa6e34222 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2868,7 +2868,7 @@ typedef struct { double hi; double lo; double tiny; } TripleLength;
 static const TripleLength tl_zero = {0.0, 0.0, 0.0};
 
 static TripleLength
-tl_fma(TripleLength total, double x, double y)
+tl_fma(double x, double y, TripleLength total)
 {
     // Algorithm 5.10 with SumKVert for K=3
     DoubleLength pr = dl_mul(x, y);
@@ -3048,7 +3048,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
                 } else {
                     goto finalize_flt_path;
                 }
-                TripleLength new_flt_total = tl_fma(flt_total, flt_p, flt_q);
+                TripleLength new_flt_total = tl_fma(flt_p, flt_q, flt_total);
                 if (isfinite(new_flt_total.hi)) {
                     flt_total = new_flt_total;
                     flt_total_in_use = true;

From 1e97b8a648d08e2e867aaa7440d4efb82f7c12d5 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 27 Jan 2023 20:34:38 -0600
Subject: [PATCH 7/7] Beautify comments

---
 Modules/mathmodule.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 22abeaa6e34222..481e29915464f1 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2832,8 +2832,7 @@ long_add_would_overflow(long a, long b)
 }
 
 /*
-Double and triple length extended precision floating point arithmetic
-based on:
+Double and triple length extended precision algorithms from:
 
   Accurate Sum and Dot Product
   by Takeshi Ogita, Siegfried M. Rump, and Shin’Ichi Oishi
@@ -2847,7 +2846,7 @@ typedef struct{ double hi; double lo; } DoubleLength;
 static DoubleLength
 dl_sum(double a, double b)
 {
-    // Algorithm 3.1 Error-free transformation of the sum
+    /* Algorithm 3.1 Error-free transformation of the sum */
     double x = a + b;
     double z = x - a;
     double y = (a - (x - z)) + (b - z);
@@ -2857,7 +2856,7 @@ dl_sum(double a, double b)
 static DoubleLength
 dl_mul(double x, double y)
 {
-    // Algorithm 3.5. Error-free transformation of a product
+    /* Algorithm 3.5. Error-free transformation of a product */
     double z = x * y;
     double zz = fma(x, y, -z);
     return (DoubleLength) {z, zz};
@@ -2870,7 +2869,7 @@ static const TripleLength tl_zero = {0.0, 0.0, 0.0};
 static TripleLength
 tl_fma(double x, double y, TripleLength total)
 {
-    // Algorithm 5.10 with SumKVert for K=3
+    /* Algorithm 5.10 with SumKVert for K=3 */
     DoubleLength pr = dl_mul(x, y);
     DoubleLength sm = dl_sum(total.hi, pr.hi);
     DoubleLength r1 = dl_sum(total.lo, pr.lo);