/usr/include/bse/bseresamplerimpl.hh is in libbse-dev 0.7.8-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 | // Licensed GNU LGPL v2.1 or later: http://www.gnu.org/licenses/lgpl.html
#ifndef __BSE_RESAMPLER_TCC__
#define __BSE_RESAMPLER_TCC__
#include <vector>
#include <bse/bseresampler.hh>
#include <birnet/birnet.hh>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef __SSE__
#include <xmmintrin.h>
#endif
namespace Bse {
namespace Resampler {
using std::vector;
using std::min;
using std::max;
using std::copy;
using Birnet::AlignedArray;
/* see: http://ds9a.nl/gcc-simd/ */
union F4Vector
{
float f[4];
#ifdef __SSE__
__m128 v; // vector of four single floats
#endif
};
/**
* FIR filter routine
*
* A FIR filter has the characteristic that it has a finite impulse response,
* and can be computed by convolution of the input signal with that finite
* impulse response.
*
* Thus, we use this for computing the output of the FIR filter
*
* output = input[0] * taps[0] + input[1] * taps[1] + ... + input[N-1] * taps[N-1]
*
* where input is the input signal, taps are the filter coefficients, in
* other texts sometimes called h[0]..h[N-1] (impulse response) or a[0]..a[N-1]
* (non recursive part of a digital filter), and N is the filter order.
*/
template<class Accumulator> static inline Accumulator
fir_process_one_sample (const float *input,
const float *taps, /* [0..order-1] */
const guint order)
{
Accumulator out = 0;
for (guint i = 0; i < order; i++)
out += input[i] * taps[i];
return out;
}
/**
* FIR filter routine for 4 samples simultaneously
*
* This routine produces (approximately) the same result as fir_process_one_sample
* but computes four consecutive output values at once using vectorized SSE
* instructions. Note that input and sse_taps need to be 16-byte aligned here.
*
* Also note that sse_taps is not a plain impulse response here, but a special
* version that needs to be computed with fir_compute_sse_taps.
*/
static inline void
fir_process_4samples_sse (const float *input,
const float *sse_taps,
const guint order,
float *out0,
float *out1,
float *out2,
float *out3)
{
#ifdef __SSE__
/* input and taps must be 16-byte aligned */
const F4Vector *input_v = reinterpret_cast<const F4Vector *> (input);
const F4Vector *sse_taps_v = reinterpret_cast<const F4Vector *> (sse_taps);
F4Vector out0_v, out1_v, out2_v, out3_v;
out0_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[0].v);
out1_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[1].v);
out2_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[2].v);
out3_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[3].v);
for (guint i = 1; i < (order + 6) / 4; i++)
{
out0_v.v = _mm_add_ps (out0_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 0].v));
out1_v.v = _mm_add_ps (out1_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 1].v));
out2_v.v = _mm_add_ps (out2_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 2].v));
out3_v.v = _mm_add_ps (out3_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 3].v));
}
*out0 = out0_v.f[0] + out0_v.f[1] + out0_v.f[2] + out0_v.f[3];
*out1 = out1_v.f[0] + out1_v.f[1] + out1_v.f[2] + out1_v.f[3];
*out2 = out2_v.f[0] + out2_v.f[1] + out2_v.f[2] + out2_v.f[3];
*out3 = out3_v.f[0] + out3_v.f[1] + out3_v.f[2] + out3_v.f[3];
#else
g_assert_not_reached();
#endif
}
/**
* fir_compute_sse_taps takes a normal vector of FIR taps as argument and
* computes a specially scrambled version of these taps, ready to be used
* for SSE operations (by fir_process_4samples_sse).
*
* we require a special ordering of the FIR taps, to get maximum benefit of the SSE operations
*
* example: suppose the FIR taps are [ x1 x2 x3 x4 x5 x6 x7 x8 x9 ], then the SSE taps become
*
* [ x1 x2 x3 x4 0 x1 x2 x3 0 0 x1 x2 0 0 0 x1 <- for input[0]
* x5 x6 x7 x8 x4 x5 x6 x7 x3 x4 x5 x6 x2 x3 x4 x5 <- for input[1]
* x9 0 0 0 x8 x9 0 0 x7 x8 x9 0 x6 x7 x8 x9 ] <- for input[2]
* \------------/\-----------/\-----------/\-----------/
* for out0 for out1 for out2 for out3
*
* so that we can compute out0, out1, out2 and out3 simultaneously
* from input[0]..input[2]
*/
static inline vector<float>
fir_compute_sse_taps (const vector<float>& taps)
{
const int order = taps.size();
vector<float> sse_taps ((order + 6) / 4 * 16);
for (int j = 0; j < 4; j++)
for (int i = 0; i < order; i++)
{
int k = i + j;
sse_taps[(k / 4) * 16 + (k % 4) + j * 4] = taps[i];
}
return sse_taps;
}
/**
* This function tests the SSEified FIR filter code (that is, the reordering
* done by fir_compute_sse_taps and the actual computation implemented in
* fir_process_4samples_sse).
*
* It prints diagnostic information, and returns true if the filter
* implementation works correctly, and false otherwise. The maximum filter
* order to be tested can be optionally specified as argument.
*/
static inline bool
fir_test_filter_sse (bool verbose,
const guint max_order = 64)
{
int errors = 0;
if (verbose)
printf ("testing SSE filter implementation:\n\n");
for (guint order = 0; order < max_order; order++)
{
vector<float> taps (order);
for (guint i = 0; i < order; i++)
taps[i] = i + 1;
AlignedArray<float,16> sse_taps (fir_compute_sse_taps (taps));
if (verbose)
{
for (uint i = 0; i < sse_taps.size(); i++)
{
printf ("%3d", (int) (sse_taps[i] + 0.5));
if (i % 4 == 3)
printf (" |");
if (i % 16 == 15)
printf (" ||| upper bound = %d\n", (order + 6) / 4);
}
printf ("\n\n");
}
AlignedArray<float,16> random_mem (order + 4);
for (guint i = 0; i < order + 4; i++)
random_mem[i] = 1.0 - rand() / (0.5 * RAND_MAX);
/* FIXME: the problem with this test is that we explicitely test SSE code
* here, but the test case is not compiled with -msse within the BEAST tree
*/
float out[4];
fir_process_4samples_sse (&random_mem[0], &sse_taps[0], order,
&out[0], &out[1], &out[2], &out[3]);
double avg_diff = 0.0;
for (int i = 0; i < 4; i++)
{
double diff = fir_process_one_sample<double> (&random_mem[i], &taps[0], order) - out[i];
avg_diff += fabs (diff);
}
avg_diff /= (order + 1);
bool is_error = (avg_diff > 0.00001);
if (is_error || verbose)
printf ("*** order = %d, avg_diff = %g\n", order, avg_diff);
if (is_error)
errors++;
}
if (errors)
printf ("*** %d errors detected\n", errors);
else
printf ("filter implementation ok.\n");
return (errors == 0);
}
/**
* Factor 2 upsampling of a data stream
*
* Template arguments:
* ORDER number of resampling filter coefficients
* USE_SSE whether to use SSE (vectorized) instructions or not
*/
template<guint ORDER, bool USE_SSE>
class Upsampler2 : public Resampler2 {
vector<float> taps;
AlignedArray<float,16> history;
AlignedArray<float,16> sse_taps;
protected:
/* fast SSE optimized convolution */
void
process_4samples_aligned (const float *input /* aligned */,
float *output)
{
const guint H = (ORDER / 2); /* half the filter length */
output[1] = input[H];
output[3] = input[H + 1];
output[5] = input[H + 2];
output[7] = input[H + 3];
fir_process_4samples_sse (input, &sse_taps[0], ORDER, &output[0], &output[2], &output[4], &output[6]);
}
/* slow convolution */
void
process_sample_unaligned (const float *input,
float *output)
{
const guint H = (ORDER / 2); /* half the filter length */
output[0] = fir_process_one_sample<float> (&input[0], &taps[0], ORDER);
output[1] = input[H];
}
void
process_block_aligned (const float *input,
guint n_input_samples,
float *output)
{
uint i = 0;
if (USE_SSE)
{
while (i + 3 < n_input_samples)
{
process_4samples_aligned (&input[i], &output[i*2]);
i += 4;
}
}
while (i < n_input_samples)
{
process_sample_unaligned (&input[i], &output[2*i]);
i++;
}
}
void
process_block_unaligned (const float *input,
guint n_input_samples,
float *output)
{
uint i = 0;
if (USE_SSE)
{
while ((reinterpret_cast<ptrdiff_t> (&input[i]) & 15) && i < n_input_samples)
{
process_sample_unaligned (&input[i], &output[2 * i]);
i++;
}
}
process_block_aligned (&input[i], n_input_samples - i, &output[2 * i]);
}
public:
/**
* Constructs an Upsampler2 object with a given set of filter coefficients.
*
* init_taps: coefficients for the upsampling FIR halfband filter
*/
Upsampler2 (float *init_taps) :
taps (init_taps, init_taps + ORDER),
history (2 * ORDER),
sse_taps (fir_compute_sse_taps (taps))
{
g_assert ((ORDER & 1) == 0); /* even order filter */
}
/**
* The function process_block() takes a block of input samples and produces a
* block with twice the length, containing interpolated output samples.
*/
void
process_block (const float *input,
guint n_input_samples,
float *output)
{
const uint history_todo = min (n_input_samples, ORDER - 1);
copy (input, input + history_todo, &history[ORDER - 1]);
process_block_aligned (&history[0], history_todo, output);
if (n_input_samples > history_todo)
{
process_block_unaligned (input, n_input_samples - history_todo, &output [2 * history_todo]);
// build new history from new input
copy (input + n_input_samples - history_todo, input + n_input_samples, &history[0]);
}
else
{
// build new history from end of old history
// (very expensive if n_input_samples tends to be a lot smaller than ORDER often)
g_memmove (&history[0], &history[n_input_samples], sizeof (history[0]) * (ORDER - 1));
}
}
/**
* Returns the FIR filter order.
*/
guint
order() const
{
return ORDER;
}
double
delay() const
{
return order() - 1;
}
};
/**
* Factor 2 downsampling of a data stream
*
* Template arguments:
* ORDER number of resampling filter coefficients
* USE_SSE whether to use SSE (vectorized) instructions or not
*/
template<guint ORDER, bool USE_SSE>
class Downsampler2 : public Resampler2 {
vector<float> taps;
AlignedArray<float,16> history_even;
AlignedArray<float,16> history_odd;
AlignedArray<float,16> sse_taps;
/* fast SSE optimized convolution */
template<int ODD_STEPPING> void
process_4samples_aligned (const float *input_even /* aligned */,
const float *input_odd,
float *output)
{
const guint H = (ORDER / 2) - 1; /* half the filter length */
fir_process_4samples_sse (input_even, &sse_taps[0], ORDER, &output[0], &output[1], &output[2], &output[3]);
output[0] += 0.5 * input_odd[H * ODD_STEPPING];
output[1] += 0.5 * input_odd[(H + 1) * ODD_STEPPING];
output[2] += 0.5 * input_odd[(H + 2) * ODD_STEPPING];
output[3] += 0.5 * input_odd[(H + 3) * ODD_STEPPING];
}
/* slow convolution */
template<int ODD_STEPPING> float
process_sample_unaligned (const float *input_even,
const float *input_odd)
{
const guint H = (ORDER / 2) - 1; /* half the filter length */
return fir_process_one_sample<float> (&input_even[0], &taps[0], ORDER) + 0.5 * input_odd[H * ODD_STEPPING];
}
template<int ODD_STEPPING> void
process_block_aligned (const float *input_even,
const float *input_odd,
float *output,
guint n_output_samples)
{
uint i = 0;
if (USE_SSE)
{
while (i + 3 < n_output_samples)
{
process_4samples_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i]);
i += 4;
}
}
while (i < n_output_samples)
{
output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
i++;
}
}
template<int ODD_STEPPING> void
process_block_unaligned (const float *input_even,
const float *input_odd,
float *output,
guint n_output_samples)
{
uint i = 0;
if (USE_SSE)
{
while ((reinterpret_cast<ptrdiff_t> (&input_even[i]) & 15) && i < n_output_samples)
{
output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
i++;
}
}
process_block_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i], n_output_samples);
}
void
deinterleave2 (const float *data,
guint n_data_values,
float *output)
{
for (uint i = 0; i < n_data_values; i += 2)
output[i / 2] = data[i];
}
public:
/**
* Constructs a Downsampler2 class using a given set of filter coefficients.
*
* init_taps: coefficients for the downsampling FIR halfband filter
*/
Downsampler2 (float *init_taps) :
taps (init_taps, init_taps + ORDER),
history_even (2 * ORDER),
history_odd (2 * ORDER),
sse_taps (fir_compute_sse_taps (taps))
{
g_assert ((ORDER & 1) == 0); /* even order filter */
}
/**
* The function process_block() takes a block of input samples and produces
* a block with half the length, containing downsampled output samples.
*/
void
process_block (const float *input,
guint n_input_samples,
float *output)
{
g_assert ((n_input_samples & 1) == 0);
const uint BLOCKSIZE = 1024;
F4Vector block[BLOCKSIZE / 4]; /* using F4Vector ensures 16-byte alignment */
float *input_even = &block[0].f[0];
while (n_input_samples)
{
uint n_input_todo = min (n_input_samples, BLOCKSIZE * 2);
/* since the halfband filter contains zeros every other sample
* and since we're using SSE instructions, which expect the
* data to be consecutively represented in memory, we prepare
* a block of samples containing only even-indexed samples
*
* we keep the deinterleaved data on the stack (instead of per-class
* allocated memory), to ensure that even running a lot of these
* downsampler streams will not result in cache trashing
*
* FIXME: this implementation is suboptimal for non-SSE, because it
* performs an extra deinterleaving step in any case, but deinterleaving
* is only required for SSE instructions
*/
deinterleave2 (input, n_input_todo, input_even);
const float *input_odd = input + 1; /* we process this one with a stepping of 2 */
const uint n_output_todo = n_input_todo / 2;
const uint history_todo = min (n_output_todo, ORDER - 1);
copy (input_even, input_even + history_todo, &history_even[ORDER - 1]);
deinterleave2 (input_odd, history_todo * 2, &history_odd[ORDER - 1]);
process_block_aligned <1> (&history_even[0], &history_odd[0], output, history_todo);
if (n_output_todo > history_todo)
{
process_block_unaligned<2> (input_even, input_odd, &output[history_todo], n_output_todo - history_todo);
// build new history from new input (here: history_todo == ORDER - 1)
copy (input_even + n_output_todo - history_todo, input_even + n_output_todo, &history_even[0]);
deinterleave2 (input_odd + n_input_todo - history_todo * 2, history_todo * 2, &history_odd[0]); /* FIXME: can be optimized */
}
else
{
// build new history from end of old history
// (very expensive if n_output_todo tends to be a lot smaller than ORDER often)
g_memmove (&history_even[0], &history_even[n_output_todo], sizeof (history_even[0]) * (ORDER - 1));
g_memmove (&history_odd[0], &history_odd[n_output_todo], sizeof (history_odd[0]) * (ORDER - 1));
}
n_input_samples -= n_input_todo;
input += n_input_todo;
output += n_output_todo;
}
}
/**
* Returns the filter order.
*/
guint
order() const
{
return ORDER;
}
double
delay() const
{
return order() / 2 - 0.5;
}
};
template<bool USE_SSE> Resampler2*
Resampler2::create_impl (BseResampler2Mode mode,
BseResampler2Precision precision)
{
if (mode == BSE_RESAMPLER2_MODE_UPSAMPLE)
{
switch (precision)
{
case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Upsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 2.0);
case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Upsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 2.0);
case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Upsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 2.0);
case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Upsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 2.0);
case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Upsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 2.0);
case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Upsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 2.0);
}
}
else if (mode == BSE_RESAMPLER2_MODE_DOWNSAMPLE)
{
switch (precision)
{
case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Downsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 1.0);
case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Downsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 1.0);
case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Downsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 1.0);
case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Downsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 1.0);
case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Downsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 1.0);
case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Downsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 1.0);
}
}
return 0;
}
} // Resampler
} // Bse
#endif /* __BSE_RESAMPLER_TCC__ */
|