[processor] start port of frequency_analyzer
This commit is contained in:
parent
2c17c57580
commit
50233de626
7 changed files with 983 additions and 23 deletions
|
@ -1,14 +1,196 @@
|
||||||
#include "Amuencha.hpp"
|
#include "Amuencha.hpp"
|
||||||
|
#include "sse_mathfun.h"
|
||||||
|
|
||||||
|
#include <boost/math/constants/constants.hpp>
|
||||||
|
#include <boost/math/special_functions/bessel.hpp>
|
||||||
|
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
namespace Amuencha
|
namespace Amuencha
|
||||||
{
|
{
|
||||||
|
void Analyser::prepare(halp::setup info)
|
||||||
|
{
|
||||||
|
sampling_rate = info.rate;
|
||||||
|
samplerate_div_2pi = sampling_rate/two_pi;
|
||||||
|
analyzer_setup();
|
||||||
|
}
|
||||||
|
|
||||||
void Analyser::operator()(halp::tick t)
|
void Analyser::operator()(halp::tick t)
|
||||||
{
|
{
|
||||||
// Process the input buffer
|
// Process the input buffer
|
||||||
auto* in = inputs.audio[0];
|
auto* in = inputs.audio[0];
|
||||||
|
|
||||||
// for(int j{0}; j < t.frames; j++)
|
for(int j{0}; j < t.frames; j++)
|
||||||
// {
|
{
|
||||||
// }
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Analyser::analyzer_setup(float max_buffer_duration)
|
||||||
|
{
|
||||||
|
using namespace boost::math::float_constants;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Prepare the windows
|
||||||
|
//std::vector<std::vector<v4sf>> windowed_sines;
|
||||||
|
windowed_sines.resize(frequencies.size());
|
||||||
|
power_normalization_factors.resize(frequencies.size());
|
||||||
|
|
||||||
|
int big_buffer_size = 0;
|
||||||
|
|
||||||
|
for (int idx{0}; idx < frequencies.size(); ++idx)
|
||||||
|
{
|
||||||
|
// for each freq, span at least 20 periods for more precise measurements
|
||||||
|
// This still gives reasonable latencies, e.g. 50ms at 400Hz, 100ms at 200Hz, 400ms at 50Hz...
|
||||||
|
// Could also span more for even better measurements, with larger
|
||||||
|
// computation cost and latency
|
||||||
|
float f = frequencies[idx];
|
||||||
|
int window_size = (int)(min(inputs.periods_nb.value / f, max_buffer_duration * 0.001f)
|
||||||
|
* sampling_rate);
|
||||||
|
vector<float> window(window_size);
|
||||||
|
vector<float> window_deriv(window_size);
|
||||||
|
|
||||||
|
if (!read_from_cache(window, window_deriv))
|
||||||
|
{
|
||||||
|
initialize_window(window);
|
||||||
|
initialize_window_deriv(window_deriv);
|
||||||
|
write_to_cache(window, window_deriv);
|
||||||
|
}
|
||||||
|
windowed_sines[idx].resize(window_size);
|
||||||
|
float wsum = 0;
|
||||||
|
|
||||||
|
for (int i{0}; i < window_size;)
|
||||||
|
{
|
||||||
|
if (i < window_size - 4)
|
||||||
|
{
|
||||||
|
v4sf tfs = {
|
||||||
|
(float)(i - window_size - 1) / sampling_rate,
|
||||||
|
(float)(i + 1 - window_size - 1) / sampling_rate,
|
||||||
|
(float)(i + 2 - window_size - 1) / sampling_rate,
|
||||||
|
(float)(i + 3 - window_size - 1) / sampling_rate
|
||||||
|
};
|
||||||
|
tfs *= (float)(-two_pi*f);
|
||||||
|
v4sf sin_tf, cos_tf;
|
||||||
|
sincos_ps(tfs, &sin_tf, &cos_tf);
|
||||||
|
|
||||||
|
for (int j{0}; j < 3; ++j)
|
||||||
|
{
|
||||||
|
v4sf ws = {
|
||||||
|
cos_tf[j] * window[i + j],
|
||||||
|
sin_tf[j] * window[i + j],
|
||||||
|
cos_tf[j] * window_deriv[i + j],
|
||||||
|
sin_tf[j] * window_deriv[i + j]
|
||||||
|
};
|
||||||
|
windowed_sines[idx][i + j] = ws;
|
||||||
|
wsum += window[i + j];
|
||||||
|
}
|
||||||
|
i += 4;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
float t = (float)(i-window_size-1) / sampling_rate;
|
||||||
|
float re = cosf(-two_pi*t*f);
|
||||||
|
float im = sinf(-two_pi*t*f);
|
||||||
|
v4sf ws = {
|
||||||
|
re * window[i],
|
||||||
|
im * window[i],
|
||||||
|
re * window_deriv[i],
|
||||||
|
im * window_deriv[i]
|
||||||
|
};
|
||||||
|
windowed_sines[idx][i] = ws;
|
||||||
|
wsum += window[i];
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
power_normalization_factors[idx] = 1. / (wsum*wsum);
|
||||||
|
big_buffer_size = max(big_buffer_size, window_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
big_buffer.clear();
|
||||||
|
// fill with 0 signal content to start with
|
||||||
|
big_buffer.resize(big_buffer_size, 0.f);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Analyser::initialize_window(std::vector<float>& window)
|
||||||
|
{
|
||||||
|
// Kaiser window with a parameter of alpha=3 that nullifies the window on edges
|
||||||
|
int size = window.size();
|
||||||
|
const float two_over_N = 2. / size;
|
||||||
|
const float alpha = 3.;
|
||||||
|
const float alpha_pi = alpha * pi;
|
||||||
|
const float inv_denom = 1. / boost::math::cyl_bessel_i(0., alpha_pi);
|
||||||
|
for (int i{0}; i < size; ++i)
|
||||||
|
{
|
||||||
|
float p = i * two_over_N - 1.;
|
||||||
|
window[i] = boost::math::cyl_bessel_i(0., alpha_pi * sqrt(1. - p*p)) * inv_denom;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Analyser::initialize_window_deriv(std::vector<float> &window)
|
||||||
|
{
|
||||||
|
// Derivative of the Kaiser window with a parameter of alpha=3 that nullifies the window on edges
|
||||||
|
int size = window.size();
|
||||||
|
const float two_over_N = 2. / size;
|
||||||
|
const float alpha = 3.;
|
||||||
|
const float alpha_pi = alpha * pi;
|
||||||
|
const float inv_denom = 1. / boost::math::cyl_bessel_i(0., alpha_pi);
|
||||||
|
for (int i{1}; i < size; ++i)
|
||||||
|
{
|
||||||
|
float p = i * two_over_N - 1.;
|
||||||
|
window[i] = boost::math::cyl_bessel_i(1., alpha_pi * sqrt(1. - p*p)) *
|
||||||
|
inv_denom * alpha_pi / sqrt(1. - p*p) * (-p)*two_over_N;
|
||||||
|
}
|
||||||
|
// lim I1(x)/x as x->0 = 1/2
|
||||||
|
window[0] = 0.5 * inv_denom * alpha_pi * alpha_pi * two_over_N;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Analyser::read_from_cache(std::vector<float> &window, std::vector<float> &window_deriv)
|
||||||
|
{
|
||||||
|
auto it = mem_win_cache.find(window.size());
|
||||||
|
|
||||||
|
if (it != mem_win_cache.end())
|
||||||
|
{
|
||||||
|
window = it->second;
|
||||||
|
auto itd = mem_winderiv_cache.find(window.size());
|
||||||
|
|
||||||
|
if (itd != mem_winderiv_cache.end())
|
||||||
|
{
|
||||||
|
window_deriv = itd->second;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// else, load from disk
|
||||||
|
}
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
// TODO: make the cache location parametrizable (and an option to not use it)
|
||||||
|
ifstream file(".amuencha_cache/w"+to_string(window.size())+".bin", ios::binary);
|
||||||
|
if (file.is_open())
|
||||||
|
{
|
||||||
|
file.read(reinterpret_cast<char*>(&window[0]), window.size()*sizeof(float));
|
||||||
|
file.read(reinterpret_cast<char*>(&window_deriv[0]), window_deriv.size()*sizeof(float));
|
||||||
|
|
||||||
|
if (file.tellg() != (window.size() + window_deriv.size()) * sizeof(float))
|
||||||
|
{
|
||||||
|
cerr << "Error: invalid cache .amuencha_cache/w"+to_string(window.size())+".bin\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Analyser::write_to_cache(std::vector<float> &window, std::vector<float> &window_deriv)
|
||||||
|
{
|
||||||
|
using namespace std;
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
mkdir(".amuencha_cache");
|
||||||
|
#else
|
||||||
|
mkdir(".amuencha_cache", 0755);
|
||||||
|
#endif
|
||||||
|
ofstream file(".amuencha_cache/w"+to_string(window.size())+".bin", ios::binary|ios::trunc);
|
||||||
|
file.write(reinterpret_cast<char*>(&window[0]), window.size()*sizeof(float));
|
||||||
|
file.write(reinterpret_cast<char*>(&window_deriv[0]), window_deriv.size()*sizeof(float));
|
||||||
|
mem_win_cache[window.size()] = window;
|
||||||
|
mem_winderiv_cache[window.size()] = window_deriv;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,15 @@ public:
|
||||||
halp_meta(c_name, "amuencha")
|
halp_meta(c_name, "amuencha")
|
||||||
halp_meta(uuid, "b37351b4-7b8d-4150-9e1c-708eec9182b2")
|
halp_meta(uuid, "b37351b4-7b8d-4150-9e1c-708eec9182b2")
|
||||||
|
|
||||||
|
// This one will be memcpy'd as it is a trivial type
|
||||||
|
struct processor_to_ui
|
||||||
|
{
|
||||||
|
int min;
|
||||||
|
int max;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::function<void(const processor_to_ui&)> send_message;
|
||||||
|
|
||||||
// Define inputs and outputs ports.
|
// Define inputs and outputs ports.
|
||||||
// See the docs at https://github.com/celtera/avendish
|
// See the docs at https://github.com/celtera/avendish
|
||||||
struct ins
|
struct ins
|
||||||
|
@ -34,18 +43,18 @@ public:
|
||||||
self.send_message({.min = self.inputs.min.value, .max = this->value});
|
self.send_message({.min = self.inputs.min.value, .max = this->value});
|
||||||
}
|
}
|
||||||
} max;
|
} max;
|
||||||
// halp::hslider_i32<"Min", halp::range{.min = 0, .max = 127, .init = 24}> min;
|
halp::spinbox_i32<"Analyze num. periods",
|
||||||
// halp::hslider_i32<"Max", halp::range{.min = 0, .max = 127, .init = 72}> max;
|
halp::range{.min = 0, .max = 99, .init = 30}> periods_nb;
|
||||||
} inputs;
|
} inputs;
|
||||||
|
|
||||||
// This one will be memcpy'd as it is a trivial type
|
void process_message(const std::vector<float>& frequencies)
|
||||||
struct processor_to_ui
|
|
||||||
{
|
{
|
||||||
int min;
|
this->frequencies = frequencies;
|
||||||
int max;
|
reassigned_frequencies = frequencies;
|
||||||
};
|
power_spectrum.resize(frequencies.size());
|
||||||
|
|
||||||
std::function<void(processor_to_ui)> send_message;
|
if (sampling_rate != 0) analyzer_setup();
|
||||||
|
}
|
||||||
|
|
||||||
struct outs
|
struct outs
|
||||||
{
|
{
|
||||||
|
@ -53,10 +62,7 @@ public:
|
||||||
} outputs;
|
} outputs;
|
||||||
|
|
||||||
using setup = halp::setup;
|
using setup = halp::setup;
|
||||||
void prepare(halp::setup info)
|
void prepare(halp::setup info);
|
||||||
{
|
|
||||||
// Initialization, this method will be called with buffer size, etc.
|
|
||||||
}
|
|
||||||
|
|
||||||
// Do our processing for N samples
|
// Do our processing for N samples
|
||||||
using tick = halp::tick;
|
using tick = halp::tick;
|
||||||
|
@ -66,6 +72,48 @@ public:
|
||||||
|
|
||||||
// UI is defined in another file to keep things clear.
|
// UI is defined in another file to keep things clear.
|
||||||
struct ui;
|
struct ui;
|
||||||
|
|
||||||
|
// Arguments are: frequency bins [f,f+1), and power in each bin
|
||||||
|
// hence the first vector size is 1 more than the second
|
||||||
|
// TODO if useful: make a proper listener API with id, etc
|
||||||
|
typedef std::function<void(const std::vector<float>&, const std::vector<float>&)> PowerHandler;
|
||||||
|
PowerHandler power_handler;
|
||||||
|
|
||||||
|
// sampling rate in Hz
|
||||||
|
// freqs in Hz
|
||||||
|
// PowerHandler for the callback
|
||||||
|
// max_buffer_duration in milliseconds, specifies the largest buffer for computing the frequency content
|
||||||
|
// At lower frequencies, long buffers are needed for accurate frequency separation.
|
||||||
|
// When that max buffer duration is reached, then it is capped and the frequency resolution decreases
|
||||||
|
// Too low buffers also limit the min_freq, duration must be >= period
|
||||||
|
void analyzer_setup(float max_buffer_duration = 500);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// The window is the usual Kaiser with alpha=3
|
||||||
|
static void initialize_window(std::vector<float>& window);
|
||||||
|
static void initialize_window_deriv(std::vector<float>& window);
|
||||||
|
// The filter bank. One filter per frequency
|
||||||
|
// The 4 entries in the v4sf are the real, imaginary parts of the windowed
|
||||||
|
// sine wavelet, and the real, imaginary parts of the derived windowed sine
|
||||||
|
// used for reassigning the power spectrum.
|
||||||
|
// Hopefully, with SIMD, computing all 4 of them is the same price as just one
|
||||||
|
// TODO: v8sf and compute 2 freqs at the same time
|
||||||
|
typedef float v4sf __attribute__ ((vector_size (16)));
|
||||||
|
std::vector<std::vector<v4sf>> windowed_sines;
|
||||||
|
std::vector<float> frequencies;
|
||||||
|
std::vector<float> power_normalization_factors;
|
||||||
|
float sampling_rate;
|
||||||
|
float samplerate_div_2pi;
|
||||||
|
std::vector<float> big_buffer;
|
||||||
|
std::vector<float> reassigned_frequencies;
|
||||||
|
std::vector<float> power_spectrum;
|
||||||
|
|
||||||
|
// caching computations for faster init
|
||||||
|
// on disk for persistence between executions,
|
||||||
|
// in memory for avoiding reloading from disk when changing the spiral size
|
||||||
|
bool read_from_cache(std::vector<float>& window, std::vector<float>& window_deriv);
|
||||||
|
void write_to_cache(std::vector<float>& window, std::vector<float>& window_deriv);
|
||||||
|
std::map<int, std::vector<float>> mem_win_cache, mem_winderiv_cache;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,16 +20,24 @@ struct Analyser::ui
|
||||||
halp_meta(layout, vbox)
|
halp_meta(layout, vbox)
|
||||||
halp::item<&ins::min> min;
|
halp::item<&ins::min> min;
|
||||||
halp::item<&ins::max> max;
|
halp::item<&ins::max> max;
|
||||||
|
halp::item<&ins::periods_nb> periods;
|
||||||
} controls;
|
} controls;
|
||||||
|
|
||||||
halp::custom_actions_item<SpiralDisplay> spiral{.x = 0, .y = 0};
|
halp::custom_actions_item<SpiralDisplay> spiral{.x = 0, .y = 0};
|
||||||
|
|
||||||
|
|
||||||
// Define the communication between UI and processor.
|
// Define the communication between UI and processor.
|
||||||
struct bus
|
struct bus
|
||||||
{
|
{
|
||||||
|
// std::function<void(const std::vector<float>&)> send_message;
|
||||||
|
|
||||||
// // Set up connections
|
// // Set up connections
|
||||||
// init(ui& self)
|
// void init(ui& self)
|
||||||
// {
|
// {
|
||||||
|
// self.spiral.on_new_frequencies = [&]
|
||||||
|
// {
|
||||||
|
// send_message(self.spiral.get_frequencies());
|
||||||
|
// };
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// Receive a message on the UI thread from the processing thread
|
// Receive a message on the UI thread from the processing thread
|
||||||
|
|
|
@ -23,6 +23,11 @@ void Amuencha::SpiralDisplay::set_min_max_notes(int min_midi_note, int max_midi_
|
||||||
// update();
|
// update();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<float> Amuencha::SpiralDisplay::get_frequencies() const noexcept
|
||||||
|
{
|
||||||
|
return frequencies;
|
||||||
|
}
|
||||||
|
|
||||||
void Amuencha::SpiralDisplay::compute_frequencies()
|
void Amuencha::SpiralDisplay::compute_frequencies()
|
||||||
{
|
{
|
||||||
// Now the spiral
|
// Now the spiral
|
||||||
|
@ -84,6 +89,8 @@ void Amuencha::SpiralDisplay::compute_frequencies()
|
||||||
display_spectrum[id].resize(num_bins);
|
display_spectrum[id].resize(num_bins);
|
||||||
fill(display_spectrum[id].begin(), display_spectrum[id].end(), 0.);
|
fill(display_spectrum[id].begin(), display_spectrum[id].end(), 0.);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// on_new_frequencies();
|
||||||
}
|
}
|
||||||
|
|
||||||
void Amuencha::SpiralDisplay::power_handler(int ID, const std::vector<float> &reassigned_frequencies, const std::vector<float> &power_spectrum)
|
void Amuencha::SpiralDisplay::power_handler(int ID, const std::vector<float> &reassigned_frequencies, const std::vector<float> &power_spectrum)
|
||||||
|
|
|
@ -75,6 +75,16 @@ struct SpiralDisplay
|
||||||
ctx.stroke();
|
ctx.stroke();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] std::vector<float> get_frequencies() const noexcept;
|
||||||
|
|
||||||
|
std::function<void()> on_new_frequencies;
|
||||||
|
|
||||||
|
// // Callback when the power spectrum is available at the prescribed frequencies
|
||||||
|
// // The ID is that of the caller, setting the color of the display
|
||||||
|
void power_handler(int ID,
|
||||||
|
const std::vector<float>& reassigned_frequencies,
|
||||||
|
const std::vector<float>& power_spectrum);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static const constexpr std::string_view note_names[12]
|
static const constexpr std::string_view note_names[12]
|
||||||
{"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"};
|
{"C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"};
|
||||||
|
@ -127,13 +137,6 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
void compute_frequencies();
|
void compute_frequencies();
|
||||||
|
|
||||||
// // Callback when the power spectrum is available at the prescribed frequencies
|
|
||||||
// // The ID is that of the caller, setting the color of the display
|
|
||||||
void power_handler(int ID,
|
|
||||||
const std::vector<float>& reassigned_frequencies,
|
|
||||||
const std::vector<float>& power_spectrum);
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
711
Amuencha/sse_mathfun.h
Normal file
711
Amuencha/sse_mathfun.h
Normal file
|
@ -0,0 +1,711 @@
|
||||||
|
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
|
||||||
|
|
||||||
|
Inspired by Intel Approximate Math library, and based on the
|
||||||
|
corresponding algorithms of the cephes math library
|
||||||
|
|
||||||
|
The default is to use the SSE1 version. If you define USE_SSE2 the
|
||||||
|
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
|
||||||
|
not expect any significant performance improvement with SSE2.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Copyright (C) 2007 Julien Pommier
|
||||||
|
|
||||||
|
This software is provided 'as-is', without any express or implied
|
||||||
|
warranty. In no event will the authors be held liable for any damages
|
||||||
|
arising from the use of this software.
|
||||||
|
|
||||||
|
Permission is granted to anyone to use this software for any purpose,
|
||||||
|
including commercial applications, and to alter it and redistribute it
|
||||||
|
freely, subject to the following restrictions:
|
||||||
|
|
||||||
|
1. The origin of this software must not be misrepresented; you must not
|
||||||
|
claim that you wrote the original software. If you use this software
|
||||||
|
in a product, an acknowledgment in the product documentation would be
|
||||||
|
appreciated but is not required.
|
||||||
|
2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
misrepresented as being the original software.
|
||||||
|
3. This notice may not be removed or altered from any source distribution.
|
||||||
|
|
||||||
|
(this is the zlib license)
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
|
/* yes I know, the top of this file is quite ugly */
|
||||||
|
|
||||||
|
#ifdef _MSC_VER /* visual c++ */
|
||||||
|
# define ALIGN16_BEG __declspec(align(16))
|
||||||
|
# define ALIGN16_END
|
||||||
|
#else /* gcc or icc */
|
||||||
|
# define ALIGN16_BEG
|
||||||
|
# define ALIGN16_END __attribute__((aligned(16)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* __m128 is ugly to write */
|
||||||
|
typedef __m128 v4sf; // vector of 4 float (sse1)
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
# include <emmintrin.h>
|
||||||
|
typedef __m128i v4si; // vector of 4 int (sse2)
|
||||||
|
#else
|
||||||
|
typedef __m64 v2si; // vector of 2 int (mmx)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* declare some SSE constants -- why can't I figure a better way to do that? */
|
||||||
|
#define _PS_CONST(Name, Val) \
|
||||||
|
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
||||||
|
#define _PI32_CONST(Name, Val) \
|
||||||
|
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
||||||
|
#define _PS_CONST_TYPE(Name, Type, Val) \
|
||||||
|
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
||||||
|
|
||||||
|
_PS_CONST(1 , 1.0f);
|
||||||
|
_PS_CONST(0p5, 0.5f);
|
||||||
|
/* the smallest non denormalized float number */
|
||||||
|
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
|
||||||
|
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
|
||||||
|
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
|
||||||
|
|
||||||
|
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
|
||||||
|
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
|
||||||
|
|
||||||
|
_PI32_CONST(1, 1);
|
||||||
|
_PI32_CONST(inv1, ~1);
|
||||||
|
_PI32_CONST(2, 2);
|
||||||
|
_PI32_CONST(4, 4);
|
||||||
|
_PI32_CONST(0x7f, 0x7f);
|
||||||
|
|
||||||
|
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
|
||||||
|
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
|
||||||
|
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
|
||||||
|
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
|
||||||
|
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
|
||||||
|
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
|
||||||
|
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
|
||||||
|
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
|
||||||
|
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
|
||||||
|
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
|
||||||
|
_PS_CONST(cephes_log_q1, -2.12194440e-4);
|
||||||
|
_PS_CONST(cephes_log_q2, 0.693359375);
|
||||||
|
|
||||||
|
#ifndef USE_SSE2
|
||||||
|
typedef union xmm_mm_union {
|
||||||
|
__m128 xmm;
|
||||||
|
__m64 mm[2];
|
||||||
|
} xmm_mm_union;
|
||||||
|
|
||||||
|
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
|
||||||
|
xmm_mm_union u; u.xmm = xmm_; \
|
||||||
|
mm0_ = u.mm[0]; \
|
||||||
|
mm1_ = u.mm[1]; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
|
||||||
|
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // USE_SSE2
|
||||||
|
|
||||||
|
/* natural logarithm computed for 4 simultaneous float
|
||||||
|
return NaN for x <= 0
|
||||||
|
*/
|
||||||
|
v4sf log_ps(v4sf x) {
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
v4si emm0;
|
||||||
|
#else
|
||||||
|
v2si mm0, mm1;
|
||||||
|
#endif
|
||||||
|
v4sf one = *(v4sf*)_ps_1;
|
||||||
|
|
||||||
|
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
|
||||||
|
|
||||||
|
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
|
||||||
|
|
||||||
|
#ifndef USE_SSE2
|
||||||
|
/* part 1: x = frexpf(x, &e); */
|
||||||
|
COPY_XMM_TO_MM(x, mm0, mm1);
|
||||||
|
mm0 = _mm_srli_pi32(mm0, 23);
|
||||||
|
mm1 = _mm_srli_pi32(mm1, 23);
|
||||||
|
#else
|
||||||
|
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
|
||||||
|
#endif
|
||||||
|
/* keep only the fractional part */
|
||||||
|
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
|
||||||
|
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
|
||||||
|
|
||||||
|
#ifndef USE_SSE2
|
||||||
|
/* now e=mm0:mm1 contain the really base-2 exponent */
|
||||||
|
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
|
||||||
|
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
|
||||||
|
v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
|
||||||
|
_mm_empty(); /* bye bye mmx */
|
||||||
|
#else
|
||||||
|
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
|
||||||
|
v4sf e = _mm_cvtepi32_ps(emm0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
e = _mm_add_ps(e, one);
|
||||||
|
|
||||||
|
/* part2:
|
||||||
|
if( x < SQRTHF ) {
|
||||||
|
e -= 1;
|
||||||
|
x = x + x - 1.0;
|
||||||
|
} else { x = x - 1.0; }
|
||||||
|
*/
|
||||||
|
v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
|
||||||
|
v4sf tmp = _mm_and_ps(x, mask);
|
||||||
|
x = _mm_sub_ps(x, one);
|
||||||
|
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
|
||||||
|
x = _mm_add_ps(x, tmp);
|
||||||
|
|
||||||
|
|
||||||
|
v4sf z = _mm_mul_ps(x,x);
|
||||||
|
|
||||||
|
v4sf y = *(v4sf*)_ps_cephes_log_p0;
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
|
||||||
|
|
||||||
|
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
|
||||||
|
y = _mm_add_ps(y, tmp);
|
||||||
|
|
||||||
|
|
||||||
|
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
|
||||||
|
y = _mm_sub_ps(y, tmp);
|
||||||
|
|
||||||
|
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
|
||||||
|
x = _mm_add_ps(x, y);
|
||||||
|
x = _mm_add_ps(x, tmp);
|
||||||
|
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
_PS_CONST(exp_hi, 88.3762626647949f);
|
||||||
|
_PS_CONST(exp_lo, -88.3762626647949f);
|
||||||
|
|
||||||
|
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
|
||||||
|
_PS_CONST(cephes_exp_C1, 0.693359375);
|
||||||
|
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
|
||||||
|
|
||||||
|
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
|
||||||
|
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
|
||||||
|
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
|
||||||
|
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
|
||||||
|
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
|
||||||
|
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
|
||||||
|
|
||||||
|
v4sf exp_ps(v4sf x) {
|
||||||
|
v4sf tmp = _mm_setzero_ps(), fx;
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
v4si emm0;
|
||||||
|
#else
|
||||||
|
v2si mm0, mm1;
|
||||||
|
#endif
|
||||||
|
v4sf one = *(v4sf*)_ps_1;
|
||||||
|
|
||||||
|
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
|
||||||
|
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
|
||||||
|
|
||||||
|
/* express exp(x) as exp(g + n*log(2)) */
|
||||||
|
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
|
||||||
|
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
|
||||||
|
|
||||||
|
/* how to perform a floorf with SSE: just below */
|
||||||
|
#ifndef USE_SSE2
|
||||||
|
/* step 1 : cast to int */
|
||||||
|
tmp = _mm_movehl_ps(tmp, fx);
|
||||||
|
mm0 = _mm_cvttps_pi32(fx);
|
||||||
|
mm1 = _mm_cvttps_pi32(tmp);
|
||||||
|
/* step 2 : cast back to float */
|
||||||
|
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
|
||||||
|
#else
|
||||||
|
emm0 = _mm_cvttps_epi32(fx);
|
||||||
|
tmp = _mm_cvtepi32_ps(emm0);
|
||||||
|
#endif
|
||||||
|
/* if greater, substract 1 */
|
||||||
|
v4sf mask = _mm_cmpgt_ps(tmp, fx);
|
||||||
|
mask = _mm_and_ps(mask, one);
|
||||||
|
fx = _mm_sub_ps(tmp, mask);
|
||||||
|
|
||||||
|
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
|
||||||
|
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
|
||||||
|
x = _mm_sub_ps(x, tmp);
|
||||||
|
x = _mm_sub_ps(x, z);
|
||||||
|
|
||||||
|
z = _mm_mul_ps(x,x);
|
||||||
|
|
||||||
|
v4sf y = *(v4sf*)_ps_cephes_exp_p0;
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
|
||||||
|
y = _mm_mul_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, x);
|
||||||
|
y = _mm_add_ps(y, one);
|
||||||
|
|
||||||
|
/* build 2^n */
|
||||||
|
#ifndef USE_SSE2
|
||||||
|
z = _mm_movehl_ps(z, fx);
|
||||||
|
mm0 = _mm_cvttps_pi32(fx);
|
||||||
|
mm1 = _mm_cvttps_pi32(z);
|
||||||
|
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
|
||||||
|
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
|
||||||
|
mm0 = _mm_slli_pi32(mm0, 23);
|
||||||
|
mm1 = _mm_slli_pi32(mm1, 23);
|
||||||
|
|
||||||
|
v4sf pow2n;
|
||||||
|
COPY_MM_TO_XMM(mm0, mm1, pow2n);
|
||||||
|
_mm_empty();
|
||||||
|
#else
|
||||||
|
emm0 = _mm_cvttps_epi32(fx);
|
||||||
|
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
|
||||||
|
emm0 = _mm_slli_epi32(emm0, 23);
|
||||||
|
v4sf pow2n = _mm_castsi128_ps(emm0);
|
||||||
|
#endif
|
||||||
|
y = _mm_mul_ps(y, pow2n);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
_PS_CONST(minus_cephes_DP1, -0.78515625);
|
||||||
|
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
|
||||||
|
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
|
||||||
|
_PS_CONST(sincof_p0, -1.9515295891E-4);
|
||||||
|
_PS_CONST(sincof_p1, 8.3321608736E-3);
|
||||||
|
_PS_CONST(sincof_p2, -1.6666654611E-1);
|
||||||
|
_PS_CONST(coscof_p0, 2.443315711809948E-005);
|
||||||
|
_PS_CONST(coscof_p1, -1.388731625493765E-003);
|
||||||
|
_PS_CONST(coscof_p2, 4.166664568298827E-002);
|
||||||
|
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
|
||||||
|
|
||||||
|
|
||||||
|
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
|
||||||
|
it runs also on old athlons XPs and the pentium III of your grand
|
||||||
|
mother.
|
||||||
|
|
||||||
|
The code is the exact rewriting of the cephes sinf function.
|
||||||
|
Precision is excellent as long as x < 8192 (I did not bother to
|
||||||
|
take into account the special handling they have for greater values
|
||||||
|
-- it does not return garbage for arguments over 8192, though, but
|
||||||
|
the extra precision is missing).
|
||||||
|
|
||||||
|
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
|
||||||
|
surprising but correct result.
|
||||||
|
|
||||||
|
Performance is also surprisingly good, 1.33 times faster than the
|
||||||
|
macos vsinf SSE2 function, and 1.5 times faster than the
|
||||||
|
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
|
||||||
|
too bad for an SSE1 function (with no special tuning) !
|
||||||
|
However the latter libraries probably have a much better handling of NaN,
|
||||||
|
Inf, denormalized and other special arguments..
|
||||||
|
|
||||||
|
On my core 1 duo, the execution of this function takes approximately 95 cycles.
|
||||||
|
|
||||||
|
From what I have observed on the experiments with Intel AMath lib, switching to an
|
||||||
|
SSE2 version would improve the perf by only 10%.
|
||||||
|
|
||||||
|
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
|
||||||
|
deliver full speed.
|
||||||
|
*/
|
||||||
|
v4sf sin_ps(v4sf x) { // any x
|
||||||
|
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
v4si emm0, emm2;
|
||||||
|
#else
|
||||||
|
v2si mm0, mm1, mm2, mm3;
|
||||||
|
#endif
|
||||||
|
sign_bit = x;
|
||||||
|
/* take the absolute value */
|
||||||
|
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
|
||||||
|
/* extract the sign bit (upper one) */
|
||||||
|
sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
|
||||||
|
|
||||||
|
/* scale by 4/Pi */
|
||||||
|
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
/* store the integer part of y in mm0 */
|
||||||
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
|
||||||
|
y = _mm_cvtepi32_ps(emm2);
|
||||||
|
|
||||||
|
/* get the swap sign flag */
|
||||||
|
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
|
||||||
|
emm0 = _mm_slli_epi32(emm0, 29);
|
||||||
|
/* get the polynom selection mask
|
||||||
|
there is one polynom for 0 <= x <= Pi/4
|
||||||
|
and another one for Pi/4<x<=Pi/2
|
||||||
|
|
||||||
|
Both branches will be computed.
|
||||||
|
*/
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
|
||||||
|
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||||
|
|
||||||
|
v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
|
||||||
|
v4sf poly_mask = _mm_castsi128_ps(emm2);
|
||||||
|
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
|
||||||
|
|
||||||
|
#else
|
||||||
|
/* store the integer part of y in mm0:mm1 */
|
||||||
|
xmm2 = _mm_movehl_ps(xmm2, y);
|
||||||
|
mm2 = _mm_cvttps_pi32(y);
|
||||||
|
mm3 = _mm_cvttps_pi32(xmm2);
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
|
||||||
|
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
|
||||||
|
y = _mm_cvtpi32x2_ps(mm2, mm3);
|
||||||
|
/* get the swap sign flag */
|
||||||
|
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
|
||||||
|
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
|
||||||
|
mm0 = _mm_slli_pi32(mm0, 29);
|
||||||
|
mm1 = _mm_slli_pi32(mm1, 29);
|
||||||
|
/* get the polynom selection mask */
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
|
||||||
|
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
|
||||||
|
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
|
||||||
|
v4sf swap_sign_bit, poly_mask;
|
||||||
|
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
|
||||||
|
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
|
||||||
|
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
|
||||||
|
_mm_empty(); /* good-bye mmx */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The magic pass: "Extended precision modular arithmetic"
|
||||||
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
|
||||||
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
|
||||||
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
|
||||||
|
xmm1 = _mm_mul_ps(y, xmm1);
|
||||||
|
xmm2 = _mm_mul_ps(y, xmm2);
|
||||||
|
xmm3 = _mm_mul_ps(y, xmm3);
|
||||||
|
x = _mm_add_ps(x, xmm1);
|
||||||
|
x = _mm_add_ps(x, xmm2);
|
||||||
|
x = _mm_add_ps(x, xmm3);
|
||||||
|
|
||||||
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
|
y = *(v4sf*)_ps_coscof_p0;
|
||||||
|
v4sf z = _mm_mul_ps(x,x);
|
||||||
|
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
|
||||||
|
y = _mm_sub_ps(y, tmp);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_1);
|
||||||
|
|
||||||
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
|
|
||||||
|
v4sf y2 = *(v4sf*)_ps_sincof_p0;
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_mul_ps(y2, x);
|
||||||
|
y2 = _mm_add_ps(y2, x);
|
||||||
|
|
||||||
|
/* select the correct result from the two polynoms */
|
||||||
|
xmm3 = poly_mask;
|
||||||
|
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
|
||||||
|
y = _mm_andnot_ps(xmm3, y);
|
||||||
|
y = _mm_add_ps(y,y2);
|
||||||
|
/* update the sign */
|
||||||
|
y = _mm_xor_ps(y, sign_bit);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* almost the same as sin_ps */
|
||||||
|
v4sf cos_ps(v4sf x) { // any x
|
||||||
|
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
v4si emm0, emm2;
|
||||||
|
#else
|
||||||
|
v2si mm0, mm1, mm2, mm3;
|
||||||
|
#endif
|
||||||
|
/* take the absolute value */
|
||||||
|
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
|
||||||
|
|
||||||
|
/* scale by 4/Pi */
|
||||||
|
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
/* store the integer part of y in mm0 */
|
||||||
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
|
||||||
|
y = _mm_cvtepi32_ps(emm2);
|
||||||
|
|
||||||
|
emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
|
||||||
|
|
||||||
|
/* get the swap sign flag */
|
||||||
|
emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
|
||||||
|
emm0 = _mm_slli_epi32(emm0, 29);
|
||||||
|
/* get the polynom selection mask */
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
|
||||||
|
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||||
|
|
||||||
|
v4sf sign_bit = _mm_castsi128_ps(emm0);
|
||||||
|
v4sf poly_mask = _mm_castsi128_ps(emm2);
|
||||||
|
#else
|
||||||
|
/* store the integer part of y in mm0:mm1 */
|
||||||
|
xmm2 = _mm_movehl_ps(xmm2, y);
|
||||||
|
mm2 = _mm_cvttps_pi32(y);
|
||||||
|
mm3 = _mm_cvttps_pi32(xmm2);
|
||||||
|
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
|
||||||
|
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
|
||||||
|
|
||||||
|
y = _mm_cvtpi32x2_ps(mm2, mm3);
|
||||||
|
|
||||||
|
|
||||||
|
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
|
||||||
|
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
|
||||||
|
|
||||||
|
/* get the swap sign flag in mm0:mm1 and the
|
||||||
|
polynom selection mask in mm2:mm3 */
|
||||||
|
|
||||||
|
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
|
||||||
|
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
|
||||||
|
mm0 = _mm_slli_pi32(mm0, 29);
|
||||||
|
mm1 = _mm_slli_pi32(mm1, 29);
|
||||||
|
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
|
||||||
|
|
||||||
|
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
|
||||||
|
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
|
||||||
|
|
||||||
|
v4sf sign_bit, poly_mask;
|
||||||
|
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
|
||||||
|
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
|
||||||
|
_mm_empty(); /* good-bye mmx */
|
||||||
|
#endif
|
||||||
|
/* The magic pass: "Extended precision modular arithmetic"
|
||||||
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
|
||||||
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
|
||||||
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
|
||||||
|
xmm1 = _mm_mul_ps(y, xmm1);
|
||||||
|
xmm2 = _mm_mul_ps(y, xmm2);
|
||||||
|
xmm3 = _mm_mul_ps(y, xmm3);
|
||||||
|
x = _mm_add_ps(x, xmm1);
|
||||||
|
x = _mm_add_ps(x, xmm2);
|
||||||
|
x = _mm_add_ps(x, xmm3);
|
||||||
|
|
||||||
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
|
y = *(v4sf*)_ps_coscof_p0;
|
||||||
|
v4sf z = _mm_mul_ps(x,x);
|
||||||
|
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
|
||||||
|
y = _mm_sub_ps(y, tmp);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_1);
|
||||||
|
|
||||||
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
|
|
||||||
|
v4sf y2 = *(v4sf*)_ps_sincof_p0;
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_mul_ps(y2, x);
|
||||||
|
y2 = _mm_add_ps(y2, x);
|
||||||
|
|
||||||
|
/* select the correct result from the two polynoms */
|
||||||
|
xmm3 = poly_mask;
|
||||||
|
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
|
||||||
|
y = _mm_andnot_ps(xmm3, y);
|
||||||
|
y = _mm_add_ps(y,y2);
|
||||||
|
/* update the sign */
|
||||||
|
y = _mm_xor_ps(y, sign_bit);
|
||||||
|
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
|
||||||
|
it is almost as fast, and gives you a free cosine with your sine */
|
||||||
|
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
|
||||||
|
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
v4si emm0, emm2, emm4;
|
||||||
|
#else
|
||||||
|
v2si mm0, mm1, mm2, mm3, mm4, mm5;
|
||||||
|
#endif
|
||||||
|
sign_bit_sin = x;
|
||||||
|
/* take the absolute value */
|
||||||
|
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
|
||||||
|
/* extract the sign bit (upper one) */
|
||||||
|
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
|
||||||
|
|
||||||
|
/* scale by 4/Pi */
|
||||||
|
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
/* store the integer part of y in emm2 */
|
||||||
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
|
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
|
||||||
|
y = _mm_cvtepi32_ps(emm2);
|
||||||
|
|
||||||
|
emm4 = emm2;
|
||||||
|
|
||||||
|
/* get the swap sign flag for the sine */
|
||||||
|
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
|
||||||
|
emm0 = _mm_slli_epi32(emm0, 29);
|
||||||
|
v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
||||||
|
|
||||||
|
/* get the polynom selection mask for the sine*/
|
||||||
|
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
|
||||||
|
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||||
|
v4sf poly_mask = _mm_castsi128_ps(emm2);
|
||||||
|
#else
|
||||||
|
/* store the integer part of y in mm2:mm3 */
|
||||||
|
xmm3 = _mm_movehl_ps(xmm3, y);
|
||||||
|
mm2 = _mm_cvttps_pi32(y);
|
||||||
|
mm3 = _mm_cvttps_pi32(xmm3);
|
||||||
|
|
||||||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
|
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
|
||||||
|
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
|
||||||
|
|
||||||
|
y = _mm_cvtpi32x2_ps(mm2, mm3);
|
||||||
|
|
||||||
|
mm4 = mm2;
|
||||||
|
mm5 = mm3;
|
||||||
|
|
||||||
|
/* get the swap sign flag for the sine */
|
||||||
|
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
|
||||||
|
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
|
||||||
|
mm0 = _mm_slli_pi32(mm0, 29);
|
||||||
|
mm1 = _mm_slli_pi32(mm1, 29);
|
||||||
|
v4sf swap_sign_bit_sin;
|
||||||
|
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
|
||||||
|
|
||||||
|
/* get the polynom selection mask for the sine */
|
||||||
|
|
||||||
|
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
|
||||||
|
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
|
||||||
|
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
|
||||||
|
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
|
||||||
|
v4sf poly_mask;
|
||||||
|
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The magic pass: "Extended precision modular arithmetic"
|
||||||
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
|
||||||
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
|
||||||
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
|
||||||
|
xmm1 = _mm_mul_ps(y, xmm1);
|
||||||
|
xmm2 = _mm_mul_ps(y, xmm2);
|
||||||
|
xmm3 = _mm_mul_ps(y, xmm3);
|
||||||
|
x = _mm_add_ps(x, xmm1);
|
||||||
|
x = _mm_add_ps(x, xmm2);
|
||||||
|
x = _mm_add_ps(x, xmm3);
|
||||||
|
|
||||||
|
#ifdef USE_SSE2
|
||||||
|
emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
|
||||||
|
emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
|
||||||
|
emm4 = _mm_slli_epi32(emm4, 29);
|
||||||
|
v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
|
||||||
|
#else
|
||||||
|
/* get the sign flag for the cosine */
|
||||||
|
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
|
||||||
|
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
|
||||||
|
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
|
||||||
|
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
|
||||||
|
mm4 = _mm_slli_pi32(mm4, 29);
|
||||||
|
mm5 = _mm_slli_pi32(mm5, 29);
|
||||||
|
v4sf sign_bit_cos;
|
||||||
|
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
|
||||||
|
_mm_empty(); /* good-bye mmx */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||||
|
|
||||||
|
|
||||||
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
|
v4sf z = _mm_mul_ps(x,x);
|
||||||
|
y = *(v4sf*)_ps_coscof_p0;
|
||||||
|
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
y = _mm_mul_ps(y, z);
|
||||||
|
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
|
||||||
|
y = _mm_sub_ps(y, tmp);
|
||||||
|
y = _mm_add_ps(y, *(v4sf*)_ps_1);
|
||||||
|
|
||||||
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
|
|
||||||
|
v4sf y2 = *(v4sf*)_ps_sincof_p0;
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
|
||||||
|
y2 = _mm_mul_ps(y2, z);
|
||||||
|
y2 = _mm_mul_ps(y2, x);
|
||||||
|
y2 = _mm_add_ps(y2, x);
|
||||||
|
|
||||||
|
/* select the correct result from the two polynoms */
|
||||||
|
xmm3 = poly_mask;
|
||||||
|
v4sf ysin2 = _mm_and_ps(xmm3, y2);
|
||||||
|
v4sf ysin1 = _mm_andnot_ps(xmm3, y);
|
||||||
|
y2 = _mm_sub_ps(y2,ysin2);
|
||||||
|
y = _mm_sub_ps(y, ysin1);
|
||||||
|
|
||||||
|
xmm1 = _mm_add_ps(ysin1,ysin2);
|
||||||
|
xmm2 = _mm_add_ps(y,y2);
|
||||||
|
|
||||||
|
/* update the sign */
|
||||||
|
*s = _mm_xor_ps(xmm1, sign_bit_sin);
|
||||||
|
*c = _mm_xor_ps(xmm2, sign_bit_cos);
|
||||||
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ avnd_score_plugin_add(
|
||||||
Amuencha/AmuenchaUi.hpp
|
Amuencha/AmuenchaUi.hpp
|
||||||
Amuencha/SpiralDisplay.hpp
|
Amuencha/SpiralDisplay.hpp
|
||||||
Amuencha/SpiralDisplay.cpp
|
Amuencha/SpiralDisplay.cpp
|
||||||
|
Amuencha/sse_mathfun.h
|
||||||
TARGET amuencha
|
TARGET amuencha
|
||||||
MAIN_CLASS Analyser
|
MAIN_CLASS Analyser
|
||||||
NAMESPACE Amuencha
|
NAMESPACE Amuencha
|
||||||
|
|
Loading…
Add table
Reference in a new issue