From 50233de6264f114769a2db752d33a658a447b27a Mon Sep 17 00:00:00 2001 From: thibaudk Date: Sun, 27 Oct 2024 18:13:58 +0000 Subject: [PATCH] [processor] start port of frequency_analyzer --- Amuencha/AmuenchaModel.cpp | 188 +++++++++- Amuencha/AmuenchaModel.hpp | 72 +++- Amuencha/AmuenchaUi.hpp | 10 +- Amuencha/SpiralDisplay.cpp | 7 + Amuencha/SpiralDisplay.hpp | 17 +- Amuencha/sse_mathfun.h | 711 +++++++++++++++++++++++++++++++++++++ CMakeLists.txt | 1 + 7 files changed, 983 insertions(+), 23 deletions(-) create mode 100644 Amuencha/sse_mathfun.h diff --git a/Amuencha/AmuenchaModel.cpp b/Amuencha/AmuenchaModel.cpp index 34b2631..3845422 100644 --- a/Amuencha/AmuenchaModel.cpp +++ b/Amuencha/AmuenchaModel.cpp @@ -1,14 +1,196 @@ #include "Amuencha.hpp" +#include "sse_mathfun.h" + +#include +#include + +#include +#include +#include namespace Amuencha { +void Analyser::prepare(halp::setup info) +{ + sampling_rate = info.rate; + samplerate_div_2pi = sampling_rate/two_pi; + analyzer_setup(); +} + void Analyser::operator()(halp::tick t) { // Process the input buffer auto* in = inputs.audio[0]; - // for(int j{0}; j < t.frames; j++) - // { - // } + for(int j{0}; j < t.frames; j++) + { + + } +} + +void Analyser::analyzer_setup(float max_buffer_duration) +{ + using namespace boost::math::float_constants; + using namespace std; + + // Prepare the windows + //std::vector> windowed_sines; + windowed_sines.resize(frequencies.size()); + power_normalization_factors.resize(frequencies.size()); + + int big_buffer_size = 0; + + for (int idx{0}; idx < frequencies.size(); ++idx) + { + // for each freq, span at least 20 periods for more precise measurements + // This still gives reasonable latencies, e.g. 50ms at 400Hz, 100ms at 200Hz, 400ms at 50Hz... 
+ // Could also span more for even better measurements, with larger
+ // computation cost and latency
+ float f = frequencies[idx];
+ int window_size = (int)(min(inputs.periods_nb.value / f, max_buffer_duration * 0.001f)
+ * sampling_rate);
+ vector<float> window(window_size);
+ vector<float> window_deriv(window_size);
+
+ if (!read_from_cache(window, window_deriv))
+ {
+ initialize_window(window);
+ initialize_window_deriv(window_deriv);
+ write_to_cache(window, window_deriv);
+ }
+ windowed_sines[idx].resize(window_size);
+ float wsum = 0;
+
+ for (int i{0}; i < window_size;)
+ {
+ if (i < window_size - 4)
+ {
+ // NOTE(review): the time offset (i - window_size - 1) is kept as-is from
+ // the port; confirm against the original frequency_analyzer whether
+ // (i - (window_size - 1)) was intended. A constant phase offset does not
+ // change the measured power, so this is benign for the power display.
+ v4sf tfs = {
+ (float)(i - window_size - 1) / sampling_rate,
+ (float)(i + 1 - window_size - 1) / sampling_rate,
+ (float)(i + 2 - window_size - 1) / sampling_rate,
+ (float)(i + 3 - window_size - 1) / sampling_rate
+ };
+ tfs *= (float)(-two_pi*f);
+ v4sf sin_tf, cos_tf;
+ sincos_ps(tfs, &sin_tf, &cos_tf);
+
+ // FIX: iterate over all 4 SIMD lanes. The previous "j < 3" dropped the
+ // 4th lane while still advancing i by 4, leaving every 4th entry of
+ // windowed_sines[idx] uninitialized and making wsum (hence the power
+ // normalization factor) wrong.
+ for (int j{0}; j < 4; ++j)
+ {
+ v4sf ws = {
+ cos_tf[j] * window[i + j],
+ sin_tf[j] * window[i + j],
+ cos_tf[j] * window_deriv[i + j],
+ sin_tf[j] * window_deriv[i + j]
+ };
+ windowed_sines[idx][i + j] = ws;
+ wsum += window[i + j];
+ }
+ i += 4;
+ continue;
+ }
+ // Scalar tail for the last (window_size % 4) samples.
+ float t = (float)(i-window_size-1) / sampling_rate;
+ float re = cosf(-two_pi*t*f);
+ float im = sinf(-two_pi*t*f);
+ v4sf ws = {
+ re * window[i],
+ im * window[i],
+ re * window_deriv[i],
+ im * window_deriv[i]
+ };
+ windowed_sines[idx][i] = ws;
+ wsum += window[i];
+ ++i;
+ }
+ power_normalization_factors[idx] = 1. / (wsum*wsum);
+ big_buffer_size = max(big_buffer_size, window_size);
+ }
+
+ big_buffer.clear();
+ // fill with 0 signal content to start with
+ big_buffer.resize(big_buffer_size, 0.f);
+}
+
+void Analyser::initialize_window(std::vector<float>& window)
+{
+ // Kaiser window with a parameter of alpha=3 that nullifies the window on edges
+ int size = window.size();
+ const float two_over_N = 2.
/ size;
+ const float alpha = 3.;
+ const float alpha_pi = alpha * pi;
+ const float inv_denom = 1. / boost::math::cyl_bessel_i(0., alpha_pi);
+ for (int i{0}; i < size; ++i)
+ {
+ float p = i * two_over_N - 1.;
+ window[i] = boost::math::cyl_bessel_i(0., alpha_pi * sqrt(1. - p*p)) * inv_denom;
+ }
+}
+
+void Analyser::initialize_window_deriv(std::vector<float> &window)
+{
+ // Derivative of the Kaiser window with a parameter of alpha=3 that nullifies the window on edges
+ int size = window.size();
+ const float two_over_N = 2. / size;
+ const float alpha = 3.;
+ const float alpha_pi = alpha * pi;
+ const float inv_denom = 1. / boost::math::cyl_bessel_i(0., alpha_pi);
+ for (int i{1}; i < size; ++i)
+ {
+ float p = i * two_over_N - 1.;
+ window[i] = boost::math::cyl_bessel_i(1., alpha_pi * sqrt(1. - p*p)) *
+ inv_denom * alpha_pi / sqrt(1. - p*p) * (-p)*two_over_N;
+ }
+ // lim I1(x)/x as x->0 = 1/2
+ window[0] = 0.5 * inv_denom * alpha_pi * alpha_pi * two_over_N;
+}
+
+bool Analyser::read_from_cache(std::vector<float> &window, std::vector<float> &window_deriv)
+{
+ // Fast path: an in-memory copy already exists for this window size.
+ auto it = mem_win_cache.find(window.size());
+
+ if (it != mem_win_cache.end())
+ {
+ window = it->second;
+ auto itd = mem_winderiv_cache.find(window.size());
+
+ if (itd != mem_winderiv_cache.end())
+ {
+ window_deriv = itd->second;
+ return true;
+ }
+ // else, load from disk
+ }
+
+ using namespace std;
+ // TODO: make the cache location parametrizable (and an option to not use it)
+ ifstream file(".amuencha_cache/w"+to_string(window.size())+".bin", ios::binary);
+ if (file.is_open())
+ {
+ file.read(reinterpret_cast<char*>(window.data()), window.size()*sizeof(float));
+ file.read(reinterpret_cast<char*>(window_deriv.data()), window_deriv.size()*sizeof(float));
+
+ // FIX: tellg() returns a signed streampos (-1 after a failed read); the
+ // previous comparison against an unsigned size product was a
+ // signed/unsigned mismatch that could accept a short or corrupt cache
+ // file. Check the stream state and compare signed byte counts.
+ const auto expected = static_cast<streamoff>((window.size() + window_deriv.size()) * sizeof(float));
+ if (!file || file.tellg() != expected)
+ {
+ cerr << "Error: invalid cache .amuencha_cache/w"+to_string(window.size())+".bin\n";
+ return false;
+ }
+ // Populate the in-memory cache so the disk is read at most once per size.
+ mem_win_cache[window.size()] = window;
+ mem_winderiv_cache[window.size()] = window_deriv;
+ return true;
+ }
+ return false;
+}
+
+void Analyser::write_to_cache(std::vector<float> &window,
std::vector &window_deriv) +{ + using namespace std; +#if defined(_WIN32) || defined(_WIN64) + mkdir(".amuencha_cache"); +#else + mkdir(".amuencha_cache", 0755); +#endif + ofstream file(".amuencha_cache/w"+to_string(window.size())+".bin", ios::binary|ios::trunc); + file.write(reinterpret_cast(&window[0]), window.size()*sizeof(float)); + file.write(reinterpret_cast(&window_deriv[0]), window_deriv.size()*sizeof(float)); + mem_win_cache[window.size()] = window; + mem_winderiv_cache[window.size()] = window_deriv; } } diff --git a/Amuencha/AmuenchaModel.hpp b/Amuencha/AmuenchaModel.hpp index ae13601..26c193a 100644 --- a/Amuencha/AmuenchaModel.hpp +++ b/Amuencha/AmuenchaModel.hpp @@ -15,6 +15,15 @@ public: halp_meta(c_name, "amuencha") halp_meta(uuid, "b37351b4-7b8d-4150-9e1c-708eec9182b2") + // This one will be memcpy'd as it is a trivial type + struct processor_to_ui + { + int min; + int max; + }; + + std::function send_message; + // Define inputs and outputs ports. // See the docs at https://github.com/celtera/avendish struct ins @@ -34,18 +43,18 @@ public: self.send_message({.min = self.inputs.min.value, .max = this->value}); } } max; - // halp::hslider_i32<"Min", halp::range{.min = 0, .max = 127, .init = 24}> min; - // halp::hslider_i32<"Max", halp::range{.min = 0, .max = 127, .init = 72}> max; + halp::spinbox_i32<"Analyze num. periods", + halp::range{.min = 0, .max = 99, .init = 30}> periods_nb; } inputs; - // This one will be memcpy'd as it is a trivial type - struct processor_to_ui + void process_message(const std::vector& frequencies) { - int min; - int max; - }; + this->frequencies = frequencies; + reassigned_frequencies = frequencies; + power_spectrum.resize(frequencies.size()); - std::function send_message; + if (sampling_rate != 0) analyzer_setup(); + } struct outs { @@ -53,10 +62,7 @@ public: } outputs; using setup = halp::setup; - void prepare(halp::setup info) - { - // Initialization, this method will be called with buffer size, etc. 
- } + void prepare(halp::setup info); // Do our processing for N samples using tick = halp::tick; @@ -66,6 +72,48 @@ public: // UI is defined in another file to keep things clear. struct ui; + + // Arguments are: frequency bins [f,f+1), and power in each bin + // hence the first vector size is 1 more than the second + // TODO if useful: make a proper listener API with id, etc + typedef std::function&, const std::vector&)> PowerHandler; + PowerHandler power_handler; + + // sampling rate in Hz + // freqs in Hz + // PowerHandler for the callback + // max_buffer_duration in milliseconds, specifies the largest buffer for computing the frequency content + // At lower frequencies, long buffers are needed for accurate frequency separation. + // When that max buffer duration is reached, then it is capped and the frequency resolution decreases + // Too low buffers also limit the min_freq, duration must be >= period + void analyzer_setup(float max_buffer_duration = 500); + +private: + // The window is the usual Kaiser with alpha=3 + static void initialize_window(std::vector& window); + static void initialize_window_deriv(std::vector& window); + // The filter bank. One filter per frequency + // The 4 entries in the v4sf are the real, imaginary parts of the windowed + // sine wavelet, and the real, imaginary parts of the derived windowed sine + // used for reassigning the power spectrum. 
+ // Hopefully, with SIMD, computing all 4 of them is the same price as just one
+ // TODO: v8sf and compute 2 freqs at the same time
+ typedef float v4sf __attribute__ ((vector_size (16)));
+ std::vector<std::vector<v4sf>> windowed_sines;
+ std::vector<float> frequencies;
+ std::vector<float> power_normalization_factors;
+ // FIX: value-initialize. process_message() tests sampling_rate != 0 to
+ // decide whether analyzer_setup() can run, and it may be invoked before
+ // prepare() has ever set the rate; reading the previously uninitialized
+ // member there was undefined behavior.
+ float sampling_rate{0};
+ float samplerate_div_2pi{0};
+ std::vector<float> big_buffer;
+ std::vector<float> reassigned_frequencies;
+ std::vector<float> power_spectrum;
+
+ // caching computations for faster init
+ // on disk for persistence between executions,
+ // in memory for avoiding reloading from disk when changing the spiral size
+ bool read_from_cache(std::vector<float>& window, std::vector<float>& window_deriv);
+ void write_to_cache(std::vector<float>& window, std::vector<float>& window_deriv);
+ std::map<std::size_t, std::vector<float>> mem_win_cache, mem_winderiv_cache;
 };
 }
diff --git a/Amuencha/AmuenchaUi.hpp b/Amuencha/AmuenchaUi.hpp
index 359d137..5d54085 100644
--- a/Amuencha/AmuenchaUi.hpp
+++ b/Amuencha/AmuenchaUi.hpp
@@ -20,16 +20,24 @@ struct Analyser::ui
 halp_meta(layout, vbox)
 halp::item<&ins::min> min;
 halp::item<&ins::max> max;
+ halp::item<&ins::periods_nb> periods;
 } controls;

 halp::custom_actions_item spiral{.x = 0, .y = 0};
+
+ // Define the communication between UI and processor.
struct bus { + // std::function&)> send_message; + // // Set up connections - // init(ui& self) + // void init(ui& self) // { + // self.spiral.on_new_frequencies = [&] + // { + // send_message(self.spiral.get_frequencies()); + // }; // } // Receive a message on the UI thread from the processing thread diff --git a/Amuencha/SpiralDisplay.cpp b/Amuencha/SpiralDisplay.cpp index b706425..ead5ece 100644 --- a/Amuencha/SpiralDisplay.cpp +++ b/Amuencha/SpiralDisplay.cpp @@ -23,6 +23,11 @@ void Amuencha::SpiralDisplay::set_min_max_notes(int min_midi_note, int max_midi_ // update(); } +std::vector Amuencha::SpiralDisplay::get_frequencies() const noexcept +{ + return frequencies; +} + void Amuencha::SpiralDisplay::compute_frequencies() { // Now the spiral @@ -84,6 +89,8 @@ void Amuencha::SpiralDisplay::compute_frequencies() display_spectrum[id].resize(num_bins); fill(display_spectrum[id].begin(), display_spectrum[id].end(), 0.); } + + // on_new_frequencies(); } void Amuencha::SpiralDisplay::power_handler(int ID, const std::vector &reassigned_frequencies, const std::vector &power_spectrum) diff --git a/Amuencha/SpiralDisplay.hpp b/Amuencha/SpiralDisplay.hpp index 79bb20d..a583db8 100644 --- a/Amuencha/SpiralDisplay.hpp +++ b/Amuencha/SpiralDisplay.hpp @@ -75,6 +75,16 @@ struct SpiralDisplay ctx.stroke(); } + [[nodiscard]] std::vector get_frequencies() const noexcept; + + std::function on_new_frequencies; + + // // Callback when the power spectrum is available at the prescribed frequencies + // // The ID is that of the caller, setting the color of the display + void power_handler(int ID, + const std::vector& reassigned_frequencies, + const std::vector& power_spectrum); + private: static const constexpr std::string_view note_names[12] {"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"}; @@ -127,13 +137,6 @@ private: } void compute_frequencies(); - - // // Callback when the power spectrum is available at the prescribed frequencies - // // The ID is that of the 
caller, setting the color of the display - void power_handler(int ID, - const std::vector& reassigned_frequencies, - const std::vector& power_spectrum); - }; } diff --git a/Amuencha/sse_mathfun.h b/Amuencha/sse_mathfun.h new file mode 100644 index 0000000..2a7862a --- /dev/null +++ b/Amuencha/sse_mathfun.h @@ -0,0 +1,711 @@ +/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library + + The default is to use the SSE1 version. If you define USE_SSE2 the + the SSE2 intrinsics will be used in place of the MMX intrinsics. Do + not expect any significant performance improvement with SSE2. +*/ + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + (this is the zlib license) +*/ + +#include + +/* yes I know, the top of this file is quite ugly */ + +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +/* __m128 is ugly to write */ +typedef __m128 v4sf; // vector of 4 float (sse1) + +#ifdef USE_SSE2 +# include +typedef __m128i v4si; // vector of 4 int (sse2) +#else +typedef __m64 v2si; // vector of 2 int (mmx) +#endif + +/* declare some SSE constants -- why can't I figure a better way to do that? */ +#define _PS_CONST(Name, Val) \ + static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } +#define _PI32_CONST(Name, Val) \ + static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } +#define _PS_CONST_TYPE(Name, Type, Val) \ + static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +#ifndef USE_SSE2 
+typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ + xmm_mm_union u; u.xmm = xmm_; \ + mm0_ = u.mm[0]; \ + mm1_ = u.mm[1]; \ +} + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ + xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ + } + +#endif // USE_SSE2 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +v4sf log_ps(v4sf x) { +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */ + +#ifndef USE_SSE2 + /* part 1: x = frexpf(x, &e); */ + COPY_XMM_TO_MM(x, mm0, mm1); + mm0 = _mm_srli_pi32(mm0, 23); + mm1 = _mm_srli_pi32(mm1, 23); +#else + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); +#endif + /* keep only the fractional part */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask); + x = _mm_or_ps(x, *(v4sf*)_ps_0p5); + +#ifndef USE_SSE2 + /* now e=mm0:mm1 contain the really base-2 exponent */ + mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f); + v4sf e = _mm_cvtpi32x2_ps(mm0, mm1); + _mm_empty(); /* bye bye mmx */ +#else + emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f); + v4sf e = _mm_cvtepi32_ps(emm0); +#endif + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF); + v4sf tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + v4sf z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, 
*(v4sf*)_ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +v4sf exp_ps(v4sf x) { + v4sf tmp = _mm_setzero_ps(), fx; +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); + x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); + + /* how to perform a floorf with SSE: just below */ +#ifndef USE_SSE2 + /* step 1 : cast to int */ + tmp = _mm_movehl_ps(tmp, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(tmp); + /* step 2 : cast back to float */ + tmp = _mm_cvtpi32x2_ps(mm0, mm1); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); +#endif + /* if greater, substract 1 */ + v4sf mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = 
_mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); + v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ +#ifndef USE_SSE2 + z = _mm_movehl_ps(z, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(z); + mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f); + mm0 = _mm_slli_pi32(mm0, 23); + mm1 = _mm_slli_pi32(mm1, 23); + + v4sf pow2n; + COPY_MM_TO_XMM(mm0, mm1, pow2n); + _mm_empty(); +#else + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + v4sf pow2n = _mm_castsi128_ps(emm0); +#endif + y = _mm_mul_ps(y, pow2n); + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); +_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so + it runs also on old athlons XPs and the pentium III of your grand + mother. + + The code is the exact rewriting of the cephes sinf function. 
+ Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Performance is also surprisingly good, 1.33 times faster than the + macos vsinf SSE2 function, and 1.5 times faster than the + __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not + too bad for an SSE1 function (with no special tuning) ! + However the latter libraries probably have a much better handling of NaN, + Inf, denormalized and other special arguments.. + + On my core 1 duo, the execution of this function takes approximately 95 cycles. + + From what I have observed on the experiments with Intel AMath lib, switching to an + SSE2 version would improve the perf by only 10%. + + Since it is based on SSE intrinsics, it has to be compiled at -O2 to + deliver full speed. +*/ +v4sf sin_ps(v4sf x) { // any x + v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + +#ifdef USE_SSE2 + v4si emm0, emm2; +#else + v2si mm0, mm1, mm2, mm3; +#endif + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + +#ifdef USE_SSE2 + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4