From 09292bb87e170d7d225b5bd377675b88fdcaeeff Mon Sep 17 00:00:00 2001 From: Bananymous Date: Sat, 21 Mar 2026 03:38:11 +0200 Subject: [PATCH] BAN: Cleanup math code and add SSE sqrt We should prefer SSE instructions when they are easily available. For other functions x87 is just simpler. It's hard to write faster and close to as accurate approximations with SSE. This does not use xmmintrin.h as clangd does not like that file and starts throwing errors in every file that includes this :) --- BAN/include/BAN/Math.h | 88 ++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/BAN/include/BAN/Math.h b/BAN/include/BAN/Math.h index 0a663471..2f5a0745 100644 --- a/BAN/include/BAN/Math.h +++ b/BAN/include/BAN/Math.h @@ -6,6 +6,19 @@ #include +// This is ugly but my clangd does not like including +// intrinsic headers at all +#if !defined(__SSE__) || !defined(__SSE2__) +#pragma GCC push_options +#ifndef __SSE__ +#pragma GCC target("sse") +#endif +#ifndef __SSE2__ +#pragma GCC target("sse2") +#endif +#define BAN_MATH_POP_OPTIONS +#endif + namespace BAN::Math { @@ -167,7 +180,7 @@ namespace BAN::Math template static T modf(T x, T* iptr) { - const T frac = BAN::Math::fmod(x, 1); + const T frac = BAN::Math::fmod(x, (T)1.0); *iptr = x - frac; return frac; } @@ -175,15 +188,15 @@ namespace BAN::Math template inline constexpr T frexp(T num, int* exp) { - if (num == 0.0) + if (num == (T)0.0) { *exp = 0; - return 0.0; + return (T)0.0; } - T _exp; - asm("fxtract" : "+t"(num), "=u"(_exp)); - *exp = (int)_exp + 1; + T e; + asm("fxtract" : "+t"(num), "=u"(e)); + *exp = (int)e + 1; return num / (T)2.0; } @@ -251,6 +264,7 @@ namespace BAN::Math "fstp %%st(1);" : "+t"(x) ); + return x; } @@ -263,18 +277,9 @@ namespace BAN::Math template inline constexpr T pow(T x, T y) { - asm( - "fyl2x;" - "fld1;" - "fld %%st(1);" - "fprem;" - "f2xm1;" - "faddp;" - "fscale;" - : "+t"(x), "+u"(y) - ); - - return x; + if (x == (T)0.0) + return (T)0.0; + return exp2(y * log2(x)); } template @@ -310,16 +315,27 @@ namespace BAN::Math template inline constexpr T sqrt(T x) { - asm("fsqrt" : "+t"(x)); - return x; + if constexpr(BAN::is_same_v) + { + using v4sf = float __attribute__((vector_size(16))); + return __builtin_ia32_sqrtss((v4sf) { x, 0.0f, 0.0f, 0.0f })[0]; + } + else if constexpr(BAN::is_same_v) + { + using v2df = double __attribute__((vector_size(16))); + return __builtin_ia32_sqrtsd((v2df) { x, 0.0 })[0]; + } + else if constexpr(BAN::is_same_v) + { + asm("fsqrt" : "+t"(x)); + return x; + } } template inline constexpr T cbrt(T value) { - if (value == 0.0) - return 0.0; - return pow(value, 1.0 / 3.0); + return pow(value, (T)1.0 / (T)3.0); } template @@ -346,30 +362,21 @@ namespace BAN::Math inline constexpr T tan(T x) { T one, ret; - asm( - "fptan" - : "=t"(one), "=u"(ret) - : "0"(x) - ); + asm("fptan" : "=t"(one), "=u"(ret) : "0"(x)); return ret; } template inline constexpr T atan2(T y, T x) { - asm( - "fpatan" - : "+t"(x) - : "u"(y) - : "st(1)" - ); + asm("fpatan" : "+t"(x) : "u"(y) : "st(1)"); return x; } template inline constexpr T atan(T x) { - return atan2(x, 1.0); + return atan2(x, (T)1.0); } template @@ -378,10 +385,10 @@ namespace BAN::Math if (x == (T)0.0) return (T)0.0; if (x == (T)1.0) - return numbers::pi_v / (T)2.0; + return +numbers::pi_v / (T)2.0; if (x == (T)-1.0) return -numbers::pi_v / (T)2.0; - return (T)2.0 * atan(x / (T(1.0) + sqrt((T)1.0 - x * x))); + return (T)2.0 * atan(x / ((T)1.0 + sqrt((T)1.0 - x * x))); } template @@ -411,7 +418,7 @@ namespace BAN::Math template inline constexpr T tanh(T x) { - const T exp_px = exp(x); + const T exp_px = exp(+x); const T exp_nx = exp(-x); return (exp_px - exp_nx) / (exp_px + exp_nx); } @@ -441,3 +448,8 @@ namespace BAN::Math } } + +#ifdef BAN_MATH_POP_OPTIONS +#undef BAN_MATH_POP_OPTIONS +#pragma GCC pop_options +#endif