From 2bb6aa85e854654ee387175f6269060624c25cc6 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 9 Nov 2024 23:43:15 +0000 Subject: [PATCH 1/2] pca --- Source/astcenc_averages_and_directions.cpp | 54 +++++++++++----------- Source/astcenc_vecmathlib.h | 14 ++++++ 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/Source/astcenc_averages_and_directions.cpp b/Source/astcenc_averages_and_directions.cpp index 8e2f8d8c..a674bc72 100644 --- a/Source/astcenc_averages_and_directions.cpp +++ b/Source/astcenc_averages_and_directions.cpp @@ -586,44 +586,44 @@ void compute_avgs_and_dirs_3_comp_rgb( vfloat4 average = partition_averages[partition]; pm[partition].avg = average; - vfloat4 sum_xp = vfloat4::zero(); - vfloat4 sum_yp = vfloat4::zero(); - vfloat4 sum_zp = vfloat4::zero(); + // Compute covariance matrix and bounding box + vfloat4 cov[3] = { vfloat4::zero(), vfloat4::zero(), vfloat4::zero() }; + vfloat4 maxc(-1e38f); + vfloat4 minc(1e38f); for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - vfloat4 texel_datum = blk.texel3(iwt); - texel_datum = texel_datum - average; - - vfloat4 zero = vfloat4::zero(); + vfloat4 c = blk.texel3(iwt); + vfloat4 d = (c - average); - vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; - sum_xp += select(zero, texel_datum, tdm0); + cov[0] += d.swz<0, 0, 0>() * d; // xx, xy, xz + cov[1] += d.swz<0, 1, 1>() * d.swz<1, 1, 2>(); // xy, yy, yz + cov[2] += d * d.swz<2, 2, 2>(); // xz, yz, zz - vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; - sum_yp += select(zero, texel_datum, tdm1); - - vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; - sum_zp += select(zero, texel_datum, tdm2); + minc = min(minc, c); + maxc = max(maxc, c); } - vfloat4 prod_xp = dot(sum_xp, sum_xp); - vfloat4 prod_yp = dot(sum_yp, sum_yp); - vfloat4 prod_zp = dot(sum_zp, sum_zp); - - vfloat4 best_vector = sum_xp; - vfloat4 best_sum = prod_xp; + // Use best axis of the bounding box based on the signs of the covariance matrix + 
vfloat4 dir = maxc - minc; + dir = change_sign(dir, cov[2]); - vmask4 mask = prod_yp > best_sum; - best_vector = select(best_vector, sum_yp, mask); - best_sum = select(best_sum, prod_yp, mask); + // Perform single power iteration + vfloat4 d; + d = dir.swz<0, 0, 0, 0>() * cov[0] + + dir.swz<1, 1, 1, 1>() * cov[1] + + dir.swz<2, 2, 2, 2>() * cov[2]; - mask = prod_zp > best_sum; - best_vector = select(best_vector, sum_zp, mask); - - pm[partition].dir = best_vector; + if (all(abs(d) < vfloat4(0.00001f))) + { + pm[partition].dir = dir; + } + else + { + pm[partition].dir = d; + } } } diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index e6ae97cc..86382633 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -276,6 +276,20 @@ ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b) return int_as_float(r); } +#if ASTCENC_SIMD_WIDTH != 4 +/** + * @brief Return @c a with lanes negated if the @c b lane is negative. + */ +ASTCENC_SIMD_INLINE vfloat4 change_sign(vfloat4 a, vfloat4 b) +{ + vint4 ia = float_as_int(a); + vint4 ib = float_as_int(b); + vint4 sign_mask(static_cast<int>(0x80000000)); + vint4 r = ia ^ (ib & sign_mask); + return int_as_float(r); +} +#endif + /** * @brief Return fast, but approximate, vector atan(x). * From fe25f8b2a40bb3c4584d64ba8fa939f338b14535 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sun, 10 Nov 2024 12:08:44 +0000 Subject: [PATCH 2/2] Use 4 lane swizzles The alpha lane is already zero, so we don't need to use a 3 lane swizzle to force zero it. 
--- Source/astcenc_averages_and_directions.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/astcenc_averages_and_directions.cpp b/Source/astcenc_averages_and_directions.cpp index a674bc72..a56b5371 100644 --- a/Source/astcenc_averages_and_directions.cpp +++ b/Source/astcenc_averages_and_directions.cpp @@ -598,9 +598,9 @@ void compute_avgs_and_dirs_3_comp_rgb( vfloat4 c = blk.texel3(iwt); vfloat4 d = (c - average); - cov[0] += d.swz<0, 0, 0>() * d; // xx, xy, xz - cov[1] += d.swz<0, 1, 1>() * d.swz<1, 1, 2>(); // xy, yy, yz - cov[2] += d * d.swz<2, 2, 2>(); // xz, yz, zz + cov[0] += d.swz<0, 0, 0, 3>() * d; // xx, xy, xz + cov[1] += d.swz<0, 1, 1, 3>() * d.swz<1, 1, 2, 3>(); // xy, yy, yz + cov[2] += d * d.swz<2, 2, 2, 3>(); // xz, yz, zz minc = min(minc, c); maxc = max(maxc, c);