diff --git a/convert.c b/convert.c index e1d1d601d..94f49312f 100644 --- a/convert.c +++ b/convert.c @@ -91,6 +91,54 @@ static void convert_sc16q11(void *iq_data, } } +/* Fills mag_data by running the starch magnitude_s16 kernel over iq_data viewed as int16_t, then optionally has starch_mean_power_u16 fill out_mean_level and out_mean_power from the result. */ static void convert_s16(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) +{ + MODES_NOTUSED(state); /* state is never read by this converter */ + + const int16_t *in = (const int16_t *) iq_data; + + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) /* the _aligned kernel is used only when both buffers pass the check */ + starch_magnitude_s16_aligned(in, mag_data, nsamples); + else + starch_magnitude_s16(in, mag_data, nsamples); + + if (out_mean_level && out_mean_power) { /* statistics are skipped unless the caller supplies both outputs */ + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); + } +} + +/* Same flow as convert_s16, but iq_data is viewed as uint16_t and handed to the starch magnitude_u16o12 kernel. */ static void convert_u16o12(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) +{ + MODES_NOTUSED(state); /* state is never read by this converter */ + + const uint16_t *in = (const uint16_t *) iq_data; + + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) /* the _aligned kernel is used only when both buffers pass the check */ + starch_magnitude_u16o12_aligned(in, mag_data, nsamples); + else + starch_magnitude_u16o12(in, mag_data, nsamples); + + if (out_mean_level && out_mean_power) { /* statistics are skipped unless the caller supplies both outputs */ + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); + } +} + iq_convert_fn init_converter(input_format_t format, double sample_rate, int filter_dc, @@ -111,6 +159,10 @@ iq_convert_fn init_converter(input_format_t format, return convert_sc16; case INPUT_SC16Q11: return convert_sc16q11; + case INPUT_S16: /* the two new input formats select the converters added by this patch */ + return convert_s16; + case INPUT_U16O12: + return convert_u16o12; default: fprintf(stderr, "no suitable converter for format=%d\n", format); return NULL; diff --git a/convert.h b/convert.h index 6b861b114..9881e7c5a 100644 
--- a/convert.h +++ b/convert.h @@ -21,7 +21,7 @@ #define DUMP1090_CONVERT_H struct converter_state; -typedef enum { INPUT_UC8=0, INPUT_SC16, INPUT_SC16Q11 } input_format_t; +typedef enum { INPUT_UC8=0, INPUT_SC16, INPUT_SC16Q11, INPUT_S16, INPUT_U16O12 } input_format_t; typedef void (*iq_convert_fn)(void *iq_data, uint16_t *mag_data, @@ -36,5 +36,8 @@ iq_convert_fn init_converter(input_format_t format, struct converter_state **out_state); void cleanup_converter(struct converter_state *state); +const char *formatGetName(input_format_t format_type); +input_format_t formatGetByName(const char *name); +int formatGetBytesPerSample(input_format_t format_type); #endif diff --git a/dsp/benchmark/magnitude_s16_benchmark.c b/dsp/benchmark/magnitude_s16_benchmark.c new file mode 100644 index 000000000..bd87d6ff4 --- /dev/null +++ b/dsp/benchmark/magnitude_s16_benchmark.c @@ -0,0 +1,42 @@ +#include +#include +#include + + +/* Benchmark driver for magnitude_s16: allocates len inputs sweeping -32767..32767 plus a cleared output buffer, runs every registered implementation over them, then frees both buffers. */ void STARCH_BENCHMARK(magnitude_s16) (void) +{ + int16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65535; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, int16_t)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, uint16_t))) { + goto done; + } + + unsigned i = 0; + + for (; i < len; ++i) { + in[i] = (int16_t) ((int) i - 32767); /* signed arithmetic, so the sweep does not depend on unsigned wraparound and an implementation-defined narrowing */ + out_mag[i] = 0; /* do not pre-load the expected answer: a kernel that writes nothing must fail verification */ + } + + STARCH_BENCHMARK_RUN( magnitude_s16, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +/* Returns true only if out[i] equals abs(in[i]) for every one of the len samples; each mismatch is reported on stderr. */ bool STARCH_BENCHMARK_VERIFY(magnitude_s16) (const int16_t *in, uint16_t *out, unsigned len) +{ + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + if (out[i] != abs(in[i])) { /* any mismatch fails the run; a later matching sample must not reset the verdict */ + okay = false; + fprintf(stderr, "verification failed: in[%u]=%d out[%u]=%u\n", i, in[i], i, out[i]); + } + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_u16o12_benchmark.c b/dsp/benchmark/magnitude_u16o12_benchmark.c new file mode 100644 index 000000000..28e927b7e --- /dev/null +++ b/dsp/benchmark/magnitude_u16o12_benchmark.c @@ -0,0 +1,56 @@ +#include +#include +#include + 
+/* The magic scaler is (65535.0 / 2047.0) */ +#define MAGIC_SCALER 32.01514f +#define CONVERT_AND_SCALE(__in) \ +({ \ + uint16_t __out = abs(le16toh(__in) - 2048); \ + ceil(__out * MAGIC_SCALER); \ +}) + +/* Benchmark driver for magnitude_u16o12: allocates len inputs where in[i] == i for i >= 1, runs every registered implementation over them, then frees both buffers. */ void STARCH_BENCHMARK(magnitude_u16o12) (void) +{ + uint16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 4096; + unsigned i; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, uint16_t)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, uint16_t))) { + goto done; + } + in[0] = 1; /* slot 0 was previously left unwritten yet still read by the kernel; 1 keeps it defined and avoids sample 0, whose scaled magnitude (2048 * MAGIC_SCALER) does not fit in uint16_t */ + for (i = 1; i < len; i++) { + in[i] = i; + } + + STARCH_BENCHMARK_RUN( magnitude_u16o12, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +/* Returns true only if, for every index from 1 to len - 1, the input value reconstructed from out[i] equals in[i]; index 0 is not checked. Each mismatch is reported on stderr. */ bool STARCH_BENCHMARK_VERIFY(magnitude_u16o12) (const uint16_t *in, uint16_t *out, unsigned len) +{ + bool okay = true; + + for (unsigned i = 1; i < len; ++i) { + uint16_t j; /* input value reconstructed from the output magnitude, rounded back onto the side of 2048 that in[i] lies on */ + if (in[i] < 2048) { + j = 2048 - (out[i] * 2047.0 / 65535.0f - 0.5f); + } else { + j = 2048 + (out[i] * 2047.0 / 65535.0f + 0.5f); + } + if (j != in[i]) { /* compare with the sample that produced out[i]; in[j] == j holds for every j and could never fail */ + okay = false; + fprintf(stderr, "verification failed: in[%u]=%d out[%u]=%u\n", i, in[i], i, out[i]); + } + } + + return okay; +} + +#undef CONVERT_AND_SCALE +#undef MAGIC_SCALER diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c index ce38e0186..d02659c56 100644 --- a/dsp/generated/benchmark.c +++ b/dsp/generated/benchmark.c @@ -560,6 +560,230 @@ static void starch_benchmark_run_magnitude_power_uc8_aligned( const uc8_t * arg0 } } +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_s16_benchmark (void); +bool starch_magnitude_s16_benchmark_verify ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_s16_benchmark(void); + +static void starch_benchmark_one_magnitude_s16( starch_magnitude_s16_regentry * _entry, const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* 
test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { /* a NULL flavor_supported callback means the entry needs no runtime check */ + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_s16_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; /* doubled before the first timed batch, so timing starts at 254 calls */ + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; /* NOTE(review): assumes starch_benchmark_elapsed returns nanoseconds - confirm */ + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; /* fastest and slowest iteration are tracked so they can be excluded from the average computed after the loop */ + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if 
(_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) /* with more than two iterations, the fastest and slowest are dropped before averaging */ + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else /* NOTE(review): divides by starch_benchmark_iterations; assumes it is never 0 - confirm where it is set */ + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { /* results array is full: grow to 64 entries first, then double */ + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); /* the old pointer is overwritten; tolerable only because failure exits immediately below */ + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_s16"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +/* Benchmarks every entry of starch_magnitude_s16_registry in order with the same arguments. */ static void starch_benchmark_run_magnitude_s16( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_s16_regentry *_entry = starch_magnitude_s16_registry; _entry->name; ++_entry) { /* the walk stops at the first entry whose name is NULL */ + starch_benchmark_one_magnitude_s16( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_s16_aligned_benchmark (void); +bool starch_magnitude_s16_aligned_benchmark_verify ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_s16_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_s16_aligned( starch_magnitude_s16_aligned_regentry * _entry, const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " 
%-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_s16_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = 
starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) /* with more than two iterations, the fastest and slowest are dropped before averaging */ + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else /* NOTE(review): divides by starch_benchmark_iterations; assumes it is never 0 - confirm where it is set */ + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { /* results array is full: grow to 64 entries first, then double */ + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); /* the old pointer is overwritten; tolerable only because failure exits immediately below */ + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_s16_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +/* Benchmarks every entry of starch_magnitude_s16_aligned_registry in order with the same arguments. */ static void starch_benchmark_run_magnitude_s16_aligned( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_s16_aligned_regentry *_entry = starch_magnitude_s16_aligned_registry; _entry->name; ++_entry) { /* the walk stops at the first entry whose name is NULL */ + starch_benchmark_one_magnitude_s16_aligned( _entry, arg0, arg1, arg2 ); + } +} + /* prototypes for benchmark helpers provided by user code */ void starch_magnitude_sc16_benchmark (void); bool starch_magnitude_sc16_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -784,6 +1008,230 @@ static void starch_benchmark_run_magnitude_sc16_aligned( const sc16_t * arg0, ui } } +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_u16o12_benchmark (void); +bool starch_magnitude_u16o12_benchmark_verify 
( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_u16o12_benchmark(void); + +/* Benchmarks one registry entry for magnitude_u16o12: prints its name, returns early if it is unsupported, filtered out or only being listed, otherwise warms up, verifies the output and times repeated calls. */ static void starch_benchmark_one_magnitude_u16o12( starch_magnitude_u16o12_regentry * _entry, const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { /* a NULL flavor_supported callback means the entry needs no runtime check */ + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! 
starch_magnitude_u16o12_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; /* doubled before the first timed batch, so timing starts at 254 calls */ + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; /* NOTE(review): assumes starch_benchmark_elapsed returns nanoseconds - confirm */ + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) /* with more than two iterations, the fastest and slowest are dropped before averaging */ + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else /* NOTE(review): divides by starch_benchmark_iterations; assumes it is never 0 - confirm where it is set */ + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { /* results array is full: grow to 64 entries first, then double */ + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { 
+ fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); /* growing the results array failed; the run cannot continue */ + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_u16o12"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +/* Benchmarks every entry of starch_magnitude_u16o12_registry in order with the same arguments. */ static void starch_benchmark_run_magnitude_u16o12( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_u16o12_regentry *_entry = starch_magnitude_u16o12_registry; _entry->name; ++_entry) { /* the walk stops at the first entry whose name is NULL */ + starch_benchmark_one_magnitude_u16o12( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_u16o12_aligned_benchmark (void); +bool starch_magnitude_u16o12_aligned_benchmark_verify ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_u16o12_aligned_benchmark(void); + +/* Benchmarks one registry entry for magnitude_u16o12_aligned: prints its name, returns early if it is unsupported, filtered out or only being listed, otherwise warms up, verifies the output and times repeated calls. */ static void starch_benchmark_one_magnitude_u16o12_aligned( starch_magnitude_u16o12_aligned_regentry * _entry, const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { /* a NULL flavor_supported callback means the entry needs no runtime check */ + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, 
arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_u16o12_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; /* recorded in a flag shared with the other benchmarks; the entry is not timed */ + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; /* doubled before the first timed batch, so timing starts at 254 calls */ + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; /* NOTE(review): assumes starch_benchmark_elapsed returns nanoseconds - confirm */ + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) /* with more than two iterations, the fastest and slowest are dropped before averaging */ + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else /* NOTE(review): divides by starch_benchmark_iterations; assumes it is never 0 - confirm where it is set */ + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { /* results array is full: grow to 64 entries first, then double */ + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * 
sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); /* growing the results array failed; the run cannot continue */ + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_u16o12_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +/* Benchmarks every entry of starch_magnitude_u16o12_aligned_registry in order with the same arguments. */ static void starch_benchmark_run_magnitude_u16o12_aligned( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_u16o12_aligned_regentry *_entry = starch_magnitude_u16o12_aligned_registry; _entry->name; ++_entry) { /* the walk stops at the first entry whose name is NULL */ + starch_benchmark_one_magnitude_u16o12_aligned( _entry, arg0, arg1, arg2 ); + } +} + /* prototypes for benchmark helpers provided by user code */ void starch_magnitude_sc16q11_benchmark (void); bool starch_magnitude_sc16q11_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -1251,6 +1699,8 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #include "../benchmark/magnitude_sc16q11_benchmark.c" #include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_s16_benchmark.c" /* first of two inclusions of the new benchmark sources */ +#include "../benchmark/magnitude_u16o12_benchmark.c" #undef STARCH_ALIGNMENT #undef STARCH_ALIGNED @@ -1279,6 +1729,8 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #include "../benchmark/magnitude_sc16q11_benchmark.c" #include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_s16_benchmark.c" /* second inclusion of the same sources; NOTE(review): presumably built with different STARCH_ALIGNMENT settings to yield the plain and _aligned variants - confirm */ +#include "../benchmark/magnitude_u16o12_benchmark.c" static void starch_benchmark_all_magnitude_uc8(void) { @@ -1300,6 +1752,16 @@ static void starch_benchmark_all_magnitude_power_uc8_aligned(void) fprintf(stderr, "==== magnitude_power_uc8_aligned ===\n"); starch_magnitude_power_uc8_aligned_benchmark (); } 
+/* Prints a section banner, then runs the user-provided magnitude_s16 benchmark driver. */ static void starch_benchmark_all_magnitude_s16(void) +{ + fprintf(stderr, "==== magnitude_s16 ===\n"); + starch_magnitude_s16_benchmark (); +} +/* Prints a section banner, then runs the user-provided magnitude_s16_aligned benchmark driver. */ static void starch_benchmark_all_magnitude_s16_aligned(void) +{ + fprintf(stderr, "==== magnitude_s16_aligned ===\n"); + starch_magnitude_s16_aligned_benchmark (); +} static void starch_benchmark_all_magnitude_sc16(void) { fprintf(stderr, "==== magnitude_sc16 ===\n"); @@ -1310,6 +1772,16 @@ static void starch_benchmark_all_magnitude_sc16_aligned(void) fprintf(stderr, "==== magnitude_sc16_aligned ===\n"); starch_magnitude_sc16_aligned_benchmark (); } +/* Prints a section banner, then runs the user-provided magnitude_u16o12 benchmark driver. */ static void starch_benchmark_all_magnitude_u16o12(void) +{ + fprintf(stderr, "==== magnitude_u16o12 ===\n"); + starch_magnitude_u16o12_benchmark (); +} +/* Prints a section banner, then runs the user-provided magnitude_u16o12_aligned benchmark driver. */ static void starch_benchmark_all_magnitude_u16o12_aligned(void) +{ + fprintf(stderr, "==== magnitude_u16o12_aligned ===\n"); + starch_magnitude_u16o12_aligned_benchmark (); +} static void starch_benchmark_all_magnitude_sc16q11(void) { fprintf(stderr, "==== magnitude_sc16q11 ===\n"); @@ -1387,8 +1859,12 @@ static void starch_benchmark_usage(const char *argv0) "magnitude_uc8_aligned " "magnitude_power_uc8 " "magnitude_power_uc8_aligned " + "magnitude_s16 " + "magnitude_s16_aligned " "magnitude_sc16 " "magnitude_sc16_aligned " + "magnitude_u16o12 " + "magnitude_u16o12_aligned " "magnitude_sc16q11 " "magnitude_sc16q11_aligned " "mean_power_u16 " @@ -1498,6 +1974,16 @@ int main(int argc, char **argv) starch_benchmark_all_magnitude_power_uc8_aligned(); continue; } + if (!strcmp(argv[i], "magnitude_s16")) { /* a command-line argument equal to the benchmark name runs just that benchmark */ + specific = 1; + starch_benchmark_all_magnitude_s16(); + continue; + } + if (!strcmp(argv[i], "magnitude_s16_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_s16_aligned(); + continue; + } if (!strcmp(argv[i], "magnitude_sc16")) { specific = 1; starch_benchmark_all_magnitude_sc16(); @@ -1508,6 +1994,16 @@ int main(int argc, char **argv) starch_benchmark_all_magnitude_sc16_aligned(); continue; } + if (!strcmp(argv[i], 
"magnitude_u16o12")) { + specific = 1; /* NOTE(review): presumably suppresses the run-everything fallback later in main - confirm */ + starch_benchmark_all_magnitude_u16o12(); + continue; + } + if (!strcmp(argv[i], "magnitude_u16o12_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_u16o12_aligned(); + continue; + } if (!strcmp(argv[i], "magnitude_sc16q11")) { specific = 1; starch_benchmark_all_magnitude_sc16q11(); @@ -1538,8 +2034,12 @@ int main(int argc, char **argv) starch_benchmark_all_magnitude_uc8_aligned(); starch_benchmark_all_magnitude_power_uc8(); starch_benchmark_all_magnitude_power_uc8_aligned(); + starch_benchmark_all_magnitude_s16(); + starch_benchmark_all_magnitude_s16_aligned(); starch_benchmark_all_magnitude_sc16(); starch_benchmark_all_magnitude_sc16_aligned(); + starch_benchmark_all_magnitude_u16o12(); + starch_benchmark_all_magnitude_u16o12_aligned(); starch_benchmark_all_magnitude_sc16q11(); starch_benchmark_all_magnitude_sc16q11_aligned(); starch_benchmark_all_mean_power_u16(); diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 7a0bce49f..90f7d8a99 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -405,6 +405,159 @@ starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_r { 0, NULL, NULL, NULL, NULL } }; +/* dispatcher / registry for magnitude_s16 */ + +/* Returns the first registry entry whose flavor_supported callback is NULL or reports support, or NULL if the walk reaches the terminating entry. */ starch_magnitude_s16_regentry * starch_magnitude_s16_select() { + for (starch_magnitude_s16_regentry *entry = starch_magnitude_s16_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +/* Initial target of the public function pointer: selects an implementation, stores it in the pointer so later calls bypass this function, then forwards the current call. */ static void starch_magnitude_s16_dispatch ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_s16_regentry *entry = starch_magnitude_s16_select(); + if (!entry) + abort(); /* no registered implementation is usable */ + + starch_magnitude_s16 = entry->callable; + starch_magnitude_s16 ( arg0, arg1, arg2 ); +} + +starch_magnitude_s16_ptr starch_magnitude_s16 = starch_magnitude_s16_dispatch; /* starts at the dispatcher so the first call triggers selection */ + +void starch_magnitude_s16_set_wisdom (const char * 
const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_s16_regentry *entry; + for (entry = starch_magnitude_s16_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_s16_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_s16_registry, entry - starch_magnitude_s16_registry, sizeof(starch_magnitude_s16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_s16 = starch_magnitude_s16_dispatch; +} + +starch_magnitude_s16_regentry starch_magnitude_s16_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_s16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_s16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_s16_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 
0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_s16_aligned */ + +starch_magnitude_s16_aligned_regentry * starch_magnitude_s16_aligned_select() { + for (starch_magnitude_s16_aligned_regentry *entry = starch_magnitude_s16_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_s16_aligned_dispatch ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_s16_aligned_regentry *entry = starch_magnitude_s16_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_s16_aligned = entry->callable; + starch_magnitude_s16_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_s16_aligned_ptr starch_magnitude_s16_aligned = starch_magnitude_s16_aligned_dispatch; + +void starch_magnitude_s16_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_s16_aligned_regentry *entry; + for (entry = starch_magnitude_s16_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_s16_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_s16_aligned_registry, entry - starch_magnitude_s16_aligned_registry, sizeof(starch_magnitude_s16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_s16_aligned = starch_magnitude_s16_aligned_dispatch; +} + +starch_magnitude_s16_aligned_regentry 
starch_magnitude_s16_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "exact_u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_s16_aligned_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_s16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 2, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_s16_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_s16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_u32_x86_avx2_aligned", "x86_avx2", starch_magnitude_s16_aligned_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_s16_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 2, "exact_u32_generic", "generic", starch_magnitude_s16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + /* dispatcher / registry for magnitude_sc16 */ starch_magnitude_sc16_regentry * starch_magnitude_sc16_select() { @@ -581,6 +734,244 @@ starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] { 0, NULL, NULL, NULL, NULL } }; +/* dispatcher / registry for magnitude_u16o12 */ + +starch_magnitude_u16o12_regentry * starch_magnitude_u16o12_select() { + for (starch_magnitude_u16o12_regentry *entry = starch_magnitude_u16o12_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && 
!(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_u16o12_dispatch ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_u16o12_regentry *entry = starch_magnitude_u16o12_select(); + if (!entry) + abort(); + + starch_magnitude_u16o12 = entry->callable; + starch_magnitude_u16o12 ( arg0, arg1, arg2 ); +} + +starch_magnitude_u16o12_ptr starch_magnitude_u16o12 = starch_magnitude_u16o12_dispatch; + +void starch_magnitude_u16o12_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_u16o12_regentry *entry; + for (entry = starch_magnitude_u16o12_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_u16o12_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_u16o12_registry, entry - starch_magnitude_u16o12_registry, sizeof(starch_magnitude_u16o12_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_u16o12 = starch_magnitude_u16o12_dispatch; +} + +starch_magnitude_u16o12_regentry starch_magnitude_u16o12_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 1, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 2, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 3, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL 
}, + { 4, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 5, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 2, "exact_unroll_8_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_unroll_8_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 7, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 8, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 9, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 11, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_unroll_4_armv8_neon_simd", "armv8_neon_simd", 
starch_magnitude_u16o12_exact_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "exact_unroll_8_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_exact_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "lookup_unroll_8_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 7, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 8, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 9, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 11, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 2, "exact_unroll_8_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_unroll_8_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_generic", "generic", 
starch_magnitude_u16o12_exact_generic, NULL }, + { 7, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 8, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 9, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 11, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_u16o12_aligned */ + +starch_magnitude_u16o12_aligned_regentry * starch_magnitude_u16o12_aligned_select() { + for (starch_magnitude_u16o12_aligned_regentry *entry = starch_magnitude_u16o12_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_u16o12_aligned_dispatch ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_u16o12_aligned_regentry *entry = starch_magnitude_u16o12_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_u16o12_aligned = entry->callable; + starch_magnitude_u16o12_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_u16o12_aligned_ptr starch_magnitude_u16o12_aligned = starch_magnitude_u16o12_aligned_dispatch; + +void starch_magnitude_u16o12_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_u16o12_aligned_regentry *entry; + for (entry = starch_magnitude_u16o12_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - 
received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_u16o12_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_u16o12_aligned_registry, entry - starch_magnitude_u16o12_aligned_registry, sizeof(starch_magnitude_u16o12_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_u16o12_aligned = starch_magnitude_u16o12_aligned_dispatch; +} + +starch_magnitude_u16o12_aligned_regentry starch_magnitude_u16o12_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 1, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 2, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 3, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 4, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 5, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "exact_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_exact_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 2, "exact_unroll_8_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_exact_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, 
"lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_unroll_8_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_u16o12_aligned_lookup_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "exact_unroll_8_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_exact_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 10, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 11, "lookup_unroll_8_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_u16o12_lookup_unroll_8_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 12, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 13, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 14, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 15, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 16, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 17, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_exact_armv8_neon_simd, 
cpu_supports_armv8_simd }, + { 1, "exact_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_exact_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "exact_unroll_8_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_exact_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "lookup_unroll_8_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_u16o12_aligned_lookup_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "exact_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_exact_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "exact_unroll_8_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_exact_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 11, "lookup_unroll_8_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_u16o12_lookup_unroll_8_armv8_neon_simd, cpu_supports_armv8_simd }, + { 12, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 13, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 14, "exact_unroll_8_generic", "generic", starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 15, "lookup_generic", "generic", 
starch_magnitude_u16o12_lookup_generic, NULL }, + { 16, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 17, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_exact_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_exact_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 2, "exact_unroll_8_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_exact_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_unroll_8_x86_avx2_aligned", "x86_avx2", starch_magnitude_u16o12_aligned_lookup_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_x86_avx2, cpu_supports_avx2 }, + { 7, "exact_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 8, "exact_unroll_8_x86_avx2", "x86_avx2", starch_magnitude_u16o12_exact_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 9, "lookup_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_x86_avx2, cpu_supports_avx2 }, + { 10, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 11, "lookup_unroll_8_x86_avx2", "x86_avx2", starch_magnitude_u16o12_lookup_unroll_8_x86_avx2, cpu_supports_avx2 }, + { 12, "exact_generic", "generic", starch_magnitude_u16o12_exact_generic, NULL }, + { 13, "exact_unroll_4_generic", "generic", starch_magnitude_u16o12_exact_unroll_4_generic, NULL }, + { 14, "exact_unroll_8_generic", "generic", 
starch_magnitude_u16o12_exact_unroll_8_generic, NULL }, + { 15, "lookup_generic", "generic", starch_magnitude_u16o12_lookup_generic, NULL }, + { 16, "lookup_unroll_4_generic", "generic", starch_magnitude_u16o12_lookup_unroll_4_generic, NULL }, + { 17, "lookup_unroll_8_generic", "generic", starch_magnitude_u16o12_lookup_unroll_8_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + /* dispatcher / registry for magnitude_sc16q11 */ starch_magnitude_sc16q11_regentry * starch_magnitude_sc16q11_select() { @@ -1008,6 +1399,14 @@ int starch_read_wisdom (const char * path) for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { entry->rank = 0; } + int rank_magnitude_s16 = 0; + for (starch_magnitude_s16_regentry *entry = starch_magnitude_s16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_s16_aligned = 0; + for (starch_magnitude_s16_aligned_regentry *entry = starch_magnitude_s16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } int rank_magnitude_sc16 = 0; for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { entry->rank = 0; @@ -1016,6 +1415,14 @@ int starch_read_wisdom (const char * path) for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { entry->rank = 0; } + int rank_magnitude_u16o12 = 0; + for (starch_magnitude_u16o12_regentry *entry = starch_magnitude_u16o12_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_u16o12_aligned = 0; + for (starch_magnitude_u16o12_aligned_regentry *entry = starch_magnitude_u16o12_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } int rank_magnitude_sc16q11 = 0; for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { entry->rank = 0; @@ -1101,6 +1508,24 @@ int starch_read_wisdom (const char * 
path) } continue; } + if (!strcmp(name, "magnitude_s16")) { + for (starch_magnitude_s16_regentry *entry = starch_magnitude_s16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_s16; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_s16_aligned")) { + for (starch_magnitude_s16_aligned_regentry *entry = starch_magnitude_s16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_s16_aligned; + break; + } + } + continue; + } if (!strcmp(name, "magnitude_sc16")) { for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { if (!strcmp(impl, entry->name)) { @@ -1119,6 +1544,24 @@ int starch_read_wisdom (const char * path) } continue; } + if (!strcmp(name, "magnitude_u16o12")) { + for (starch_magnitude_u16o12_regentry *entry = starch_magnitude_u16o12_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_u16o12; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_u16o12_aligned")) { + for (starch_magnitude_u16o12_aligned_regentry *entry = starch_magnitude_u16o12_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_u16o12_aligned; + break; + } + } + continue; + } if (!strcmp(name, "magnitude_sc16q11")) { for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { if (!strcmp(impl, entry->name)) { @@ -1209,6 +1652,28 @@ int starch_read_wisdom (const char * path) /* reset the implementation pointer so the next call will re-select */ starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; } + { + starch_magnitude_s16_regentry *entry; + for (entry = starch_magnitude_s16_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_s16; + } + qsort(starch_magnitude_s16_registry, entry - 
starch_magnitude_s16_registry, sizeof(starch_magnitude_s16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_s16 = starch_magnitude_s16_dispatch; + } + { + starch_magnitude_s16_aligned_regentry *entry; + for (entry = starch_magnitude_s16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_s16_aligned; + } + qsort(starch_magnitude_s16_aligned_registry, entry - starch_magnitude_s16_aligned_registry, sizeof(starch_magnitude_s16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_s16_aligned = starch_magnitude_s16_aligned_dispatch; + } { starch_magnitude_sc16_regentry *entry; for (entry = starch_magnitude_sc16_registry; entry->name; ++entry) { @@ -1231,6 +1696,28 @@ int starch_read_wisdom (const char * path) /* reset the implementation pointer so the next call will re-select */ starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; } + { + starch_magnitude_u16o12_regentry *entry; + for (entry = starch_magnitude_u16o12_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_u16o12; + } + qsort(starch_magnitude_u16o12_registry, entry - starch_magnitude_u16o12_registry, sizeof(starch_magnitude_u16o12_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_u16o12 = starch_magnitude_u16o12_dispatch; + } + { + starch_magnitude_u16o12_aligned_regentry *entry; + for (entry = starch_magnitude_u16o12_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_u16o12_aligned; + } + qsort(starch_magnitude_u16o12_aligned_registry, entry - starch_magnitude_u16o12_aligned_registry, sizeof(starch_magnitude_u16o12_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the 
next call will re-select */ + starch_magnitude_u16o12_aligned = starch_magnitude_u16o12_aligned_dispatch; + } { starch_magnitude_sc16q11_regentry *entry; for (entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c index cf8b5cce2..dc40dbfd9 100644 --- a/dsp/generated/flavor.armv7a_neon_vfpv4.c +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -19,6 +19,8 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" #undef STARCH_ALIGNMENT @@ -38,4 +40,6 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" diff --git a/dsp/generated/flavor.armv8_a.c b/dsp/generated/flavor.armv8_a.c index 2c7a9c23e..8ce8782e2 100644 --- a/dsp/generated/flavor.armv8_a.c +++ b/dsp/generated/flavor.armv8_a.c @@ -18,6 +18,8 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" #undef STARCH_ALIGNMENT @@ -37,4 +39,6 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" diff --git a/dsp/generated/flavor.armv8_neon_simd.c b/dsp/generated/flavor.armv8_neon_simd.c index 276e47e5d..74805b9ed 100644 --- a/dsp/generated/flavor.armv8_neon_simd.c +++ b/dsp/generated/flavor.armv8_neon_simd.c @@ -19,6 +19,8 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" #undef STARCH_ALIGNMENT @@ -38,4 +40,6 @@ #include "../impl/magnitude_sc16q11.c" 
#include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c index 8b8fa0b24..d0ddf165f 100644 --- a/dsp/generated/flavor.generic.c +++ b/dsp/generated/flavor.generic.c @@ -18,4 +18,6 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c index de56b0dc1..ee419bd71 100644 --- a/dsp/generated/flavor.x86_avx2.c +++ b/dsp/generated/flavor.x86_avx2.c @@ -18,6 +18,8 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" #undef STARCH_ALIGNMENT @@ -37,4 +39,6 @@ #include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_uc8.c" #include "../impl/mean_power_u16.c" +#include "../impl/magnitude_s16.c" +#include "../impl/magnitude_u16o12.c" diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 96c0044db..dc9c57799 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_ARM -dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math 
dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_s16_benchmark.c dsp/benchmark/magnitude_u16o12_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o 
STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index 18c6787bc..fe23741bb 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -21,16 +21,16 @@ STARCH_CFLAGS := -DSTARCH_MIX_GENERIC -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c 
dsp/benchmark/magnitude_s16_benchmark.c dsp/benchmark/magnitude_u16o12_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index 8d21e85c3..130b587a7 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_X86 -dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_s16.c 
dsp/impl/magnitude_u16o12.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_s16_benchmark.c dsp/benchmark/magnitude_u16o12_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index dabf95050..61bb63f03 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -105,6 +105,36 @@ extern starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_al starch_magnitude_power_uc8_aligned_regentry * starch_magnitude_power_uc8_aligned_select(); void starch_magnitude_power_uc8_aligned_set_wisdom( const char * const * received_wisdom ); +typedef void (* starch_magnitude_s16_ptr) ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_s16_ptr starch_magnitude_s16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_s16_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_s16_regentry; + +extern starch_magnitude_s16_regentry starch_magnitude_s16_registry[]; +starch_magnitude_s16_regentry * starch_magnitude_s16_select(); +void starch_magnitude_s16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* 
starch_magnitude_s16_aligned_ptr) ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_s16_aligned_ptr starch_magnitude_s16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_s16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_s16_aligned_regentry; + +extern starch_magnitude_s16_aligned_regentry starch_magnitude_s16_aligned_registry[]; +starch_magnitude_s16_aligned_regentry * starch_magnitude_s16_aligned_select(); +void starch_magnitude_s16_aligned_set_wisdom( const char * const * received_wisdom ); + typedef void (* starch_magnitude_sc16_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); extern starch_magnitude_sc16_ptr starch_magnitude_sc16; @@ -135,6 +165,36 @@ extern starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_regi starch_magnitude_sc16_aligned_regentry * starch_magnitude_sc16_aligned_select(); void starch_magnitude_sc16_aligned_set_wisdom( const char * const * received_wisdom ); +typedef void (* starch_magnitude_u16o12_ptr) ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_u16o12_ptr starch_magnitude_u16o12; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_u16o12_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_u16o12_regentry; + +extern starch_magnitude_u16o12_regentry starch_magnitude_u16o12_registry[]; +starch_magnitude_u16o12_regentry * starch_magnitude_u16o12_select(); +void starch_magnitude_u16o12_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_u16o12_aligned_ptr) ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_u16o12_aligned_ptr starch_magnitude_u16o12_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_u16o12_aligned_ptr callable; + int (*flavor_supported)(); +} 
starch_magnitude_u16o12_aligned_regentry; + +extern starch_magnitude_u16o12_aligned_regentry starch_magnitude_u16o12_aligned_registry[]; +starch_magnitude_u16o12_aligned_regentry * starch_magnitude_u16o12_aligned_select(); +void starch_magnitude_u16o12_aligned_set_wisdom( const char * const * received_wisdom ); + typedef void (* starch_magnitude_sc16q11_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); extern starch_magnitude_sc16q11_ptr starch_magnitude_sc16q11; @@ -213,6 +273,13 @@ void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, u void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_s16_exact_u32_generic ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_4_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_8_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_4_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_8_generic ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_GENERIC */ int starch_read_wisdom (const char * path); @@ -259,6 +326,20 @@ void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsign void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void 
starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_s16_exact_u32_armv7a_neon_vfpv4 ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_s16_aligned_exact_u32_armv7a_neon_vfpv4 ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_4_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_4_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_8_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_8_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_4_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_8_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_8_armv7a_neon_vfpv4 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* 
STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ int starch_read_wisdom (const char * path); @@ -305,6 +386,20 @@ void starch_mean_power_u16_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned void starch_mean_power_u16_aligned_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_aligned_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_s16_exact_u32_armv8_neon_simd ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_s16_aligned_exact_u32_armv8_neon_simd ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_4_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_4_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_8_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_8_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_4_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_4_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned 
arg2 ); +void starch_magnitude_u16o12_lookup_unroll_8_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_8_armv8_neon_simd ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_ARMV8_NEON_SIMD */ int starch_read_wisdom (const char * path); @@ -341,6 +436,20 @@ void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_s16_exact_u32_x86_avx2 ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_s16_aligned_exact_u32_x86_avx2 ( const int16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_4_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_4_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_exact_unroll_8_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_exact_unroll_8_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_4_x86_avx2 ( const uint16_t * arg0, 
uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_4_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_lookup_unroll_8_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_u16o12_aligned_lookup_unroll_8_x86_avx2 ( const uint16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_X86_AVX2 */ int starch_read_wisdom (const char * path); diff --git a/dsp/helpers/tables.c b/dsp/helpers/tables.c index 1ab7eccdb..c36546d1f 100644 --- a/dsp/helpers/tables.c +++ b/dsp/helpers/tables.c @@ -103,3 +103,33 @@ const uint16_t * get_sc16q11_mag_12bit_table() return table; } +// The magic scaler is (65535.0 / 2047.0): a full-scale magnitude of 2047 maps to 65535 +#define MAGIC_SCALER 32.01514f +#define CONVERT_AND_SCALE(__in) \ +({ \ + double __out = ceil(abs(le16toh(__in) - 2048) * MAGIC_SCALER); \ + (uint16_t) (__out > 65535.0 ? 65535.0 : __out); /* saturate: an out-of-range double -> uint16_t conversion is undefined */ \ +}) + +/* Build once and return the 65536-entry magnitude table, indexed by the raw (little-endian) 16-bit sample; aborts if allocation fails. */ +const uint16_t * get_u16o12_mag_table() +{ + static uint16_t *table = NULL; + + if (table) { + return table; + } + + table = malloc(sizeof(*table) * 65536); /* one slot for every possible uint16_t index */ + if (!table) { + fprintf(stderr, "can't allocate U16O12 conversion lookup table\n"); + abort(); + } + + for (unsigned i = 0; i < 65536; i++) { /* initialise every slot, including 0 and values above 4095 */ + table[i] = CONVERT_AND_SCALE(i); + } + + return table; +} + diff --git a/dsp/helpers/tables.h b/dsp/helpers/tables.h index cfb86d3c9..1d65e0f7b 100644 --- a/dsp/helpers/tables.h +++ b/dsp/helpers/tables.h @@ -6,5 +6,6 @@ const uint16_t * get_uc8_mag_table(); const uint16_t * get_sc16q11_mag_11bit_table(); const uint16_t * get_sc16q11_mag_12bit_table(); +const uint16_t * get_u16o12_mag_table(); #endif diff --git a/dsp/impl/magnitude_s16.c b/dsp/impl/magnitude_s16.c new file mode 100644 index 000000000..6f457dc92 --- /dev/null +++ b/dsp/impl/magnitude_s16.c @@ -0,0 +1,17 @@ +#include +#include + +/* Convert (little-endian) signed 16-bit values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_s16, exact_u32) (const int16_t *in, uint16_t *out, unsigned len) +{ + const
int16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = abs((int16_t) le16toh(in_align[0])); + + out_align += 1; + in_align += 1; + } +} diff --git a/dsp/impl/magnitude_u16o12.c b/dsp/impl/magnitude_u16o12.c new file mode 100644 index 000000000..aabbc6f29 --- /dev/null +++ b/dsp/impl/magnitude_u16o12.c @@ -0,0 +1,189 @@ +#include +#include +#include +#include "dsp/helpers/tables.h" + +/* + * Convert (little-endian) unsigned 16-bit, offset-12 (excess-2048 format) + * values to unsigned 16-bit magnitudes: ceil(|input - 2048| * 65535 / 2047). + * + * Offset 12: + * + * Signed Scaled + * Input Value Magnitude + * ------------------------------ + * 0 -2048 (65567, does not fit in uint16_t) + * 1 -2047 65535 + * 2 -2046 65503 + * ... + * 2046 -2 65 + * 2047 -1 33 + * 2048 0 0 + * 2049 1 33 + * 2050 2 65 + * + * ... + * 4093 2045 65471 + * 4094 2046 65503 + * 4095 2047 65535 + * + */ + +// The magic scaler is (65535.0 / 2047.0): a full-scale magnitude of 2047 maps to 65535 +#define MAGIC_SCALER 32.01514f +#define CONVERT_AND_SCALE(__in) \ +({ \ + uint16_t __out = abs(le16toh(__in) - 2048); \ + ceil(__out * 32.01514f); \ +}) + +void STARCH_IMPL(magnitude_u16o12, exact) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = CONVERT_AND_SCALE(in_align[0]); + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_u16o12, exact_unroll_4) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + + out_align[0] = CONVERT_AND_SCALE(in_align[0]); + out_align[1] = CONVERT_AND_SCALE(in_align[1]); + out_align[2] = CONVERT_AND_SCALE(in_align[2]); + out_align[3] = CONVERT_AND_SCALE(in_align[3]); + + out_align += 4; + in_align += 4; + } + + while (len1--) { + out_align[0] =
CONVERT_AND_SCALE(in_align[0]); + + out_align += 1; + in_align += 1; + } + +} +/* Exact conversion, eight samples per iteration; the remaining (len mod 8) samples are converted one at a time. */ +void STARCH_IMPL(magnitude_u16o12, exact_unroll_8) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len8 = len >> 3; + unsigned len1 = len & 7; /* leftover samples after the 8-wide loop: len mod 8 (mask 7, not 4) */ + + while (len8--) { + + out_align[0] = CONVERT_AND_SCALE(in_align[0]); + out_align[1] = CONVERT_AND_SCALE(in_align[1]); + out_align[2] = CONVERT_AND_SCALE(in_align[2]); + out_align[3] = CONVERT_AND_SCALE(in_align[3]); + out_align[4] = CONVERT_AND_SCALE(in_align[4]); + out_align[5] = CONVERT_AND_SCALE(in_align[5]); + out_align[6] = CONVERT_AND_SCALE(in_align[6]); + out_align[7] = CONVERT_AND_SCALE(in_align[7]); + + + out_align += 8; + in_align += 8; + } + + while (len1--) { + out_align[0] = CONVERT_AND_SCALE(in_align[0]); + + out_align += 1; + in_align += 1; + } + +} + +void STARCH_IMPL(magnitude_u16o12, lookup) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + const uint16_t * const mag_table = get_u16o12_mag_table(); + + while (len--) { + out_align[0] = mag_table[in_align[0]]; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_u16o12, lookup_unroll_4) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + const uint16_t * const mag_table = get_u16o12_mag_table(); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + out_align[0] = mag_table[in_align[0]]; + out_align[1] = mag_table[in_align[1]]; + out_align[2] = mag_table[in_align[2]]; + out_align[3] = mag_table[in_align[3]]; + + out_align += 4; + in_align += 4; + } + + while (len1--) { + out_align[0] = mag_table[in_align[0]]; + + out_align += 1; + in_align += 1; + } + +} + +void
STARCH_IMPL(magnitude_u16o12, lookup_unroll_8) (const uint16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + const uint16_t * const mag_table = get_u16o12_mag_table(); + + unsigned len8 = len >> 3; /* number of full 8-sample groups */ + unsigned len1 = len & 7; /* leftover samples after the 8-wide loop: len mod 8 (mask 7, not 4) */ + + while (len8--) { + out_align[0] = mag_table[in_align[0]]; + out_align[1] = mag_table[in_align[1]]; + out_align[2] = mag_table[in_align[2]]; + out_align[3] = mag_table[in_align[3]]; + out_align[4] = mag_table[in_align[4]]; + out_align[5] = mag_table[in_align[5]]; + out_align[6] = mag_table[in_align[6]]; + out_align[7] = mag_table[in_align[7]]; + + out_align += 8; + in_align += 8; + } + + while (len1--) { + out_align[0] = mag_table[in_align[0]]; + + out_align += 1; + in_align += 1; + } + +} + +#undef CONVERT_AND_SCALE +#undef MAGIC_SCALER diff --git a/dsp/starchgen.py b/dsp/starchgen.py index 81e5818b1..b42b3490b 100755 --- a/dsp/starchgen.py +++ b/dsp/starchgen.py @@ -17,7 +17,9 @@ gen.add_function(name = 'magnitude_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'magnitude_power_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) +gen.add_function(name = 'magnitude_s16', argtypes = ['const int16_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'magnitude_sc16', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'magnitude_u16o12', argtypes = ['const uint16_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'magnitude_sc16q11', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'mean_power_u16', argtypes = ['const uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) diff --git a/wisdom/wisdom.aarch64.pi4b b/wisdom/wisdom.aarch64.pi4b index 093c70e39..6edc95ad0 100644 --- 
a/wisdom/wisdom.aarch64.pi4b +++ b/wisdom/wisdom.aarch64.pi4b @@ -1,101 +1,140 @@ # generated by ./starch-benchmark -i 15 -o wisdom.aarch64.pi4b -magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 242171 ns/call -magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 309918 ns/call -magnitude_power_uc8 lookup_unroll_4_generic # 310083 ns/call -magnitude_power_uc8 twopass_armv8_neon_simd # 331999 ns/call -magnitude_power_uc8 twopass_generic # 332283 ns/call -magnitude_power_uc8 lookup_armv8_neon_simd # 354725 ns/call -magnitude_power_uc8 lookup_generic # 354993 ns/call +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 242178 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 310600 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 310661 ns/call +magnitude_power_uc8 twopass_generic # 349781 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 355078 ns/call +magnitude_power_uc8 lookup_generic # 355115 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 407323 ns/call -magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 231223 ns/call -magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 231231 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 317120 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 317202 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_generic # 317261 ns/call -magnitude_power_uc8_aligned twopass_armv8_neon_simd # 326316 ns/call -magnitude_power_uc8_aligned twopass_generic # 326441 ns/call -magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 339548 ns/call -magnitude_power_uc8_aligned lookup_generic # 353854 ns/call -magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 353897 ns/call -magnitude_power_uc8_aligned lookup_armv8_neon_simd # 354025 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 231235 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 231235 ns/call +magnitude_power_uc8_aligned 
lookup_unroll_4_armv8_neon_simd # 317273 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 317305 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 326955 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 339809 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 354228 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 354375 ns/call +magnitude_power_uc8_aligned twopass_generic # 392324 ns/call +magnitude_power_uc8_aligned lookup_generic # 478261 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 485894 ns/call -magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 687064 ns/call -magnitude_sc16 exact_u32_armv8_neon_simd # 1337885 ns/call -magnitude_sc16 exact_float_armv8_neon_simd # 1409773 ns/call -magnitude_sc16 exact_u32_generic # 3331842 ns/call -magnitude_sc16 exact_float_generic # 3414790 ns/call +magnitude_s16 exact_u32_armv8_neon_simd # 42270 ns/call +magnitude_s16 exact_u32_generic # 65615 ns/call -magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 669434 ns/call -magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 770926 ns/call -magnitude_sc16_aligned exact_u32_armv8_neon_simd # 1336333 ns/call -magnitude_sc16_aligned exact_float_armv8_neon_simd # 1397618 ns/call -magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 1808644 ns/call -magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 1927454 ns/call -magnitude_sc16_aligned exact_u32_generic # 2750034 ns/call -magnitude_sc16_aligned exact_float_generic # 3167265 ns/call +magnitude_s16_aligned exact_u32_armv8_neon_simd # 77429 ns/call +magnitude_s16_aligned exact_u32_generic # 78551 ns/call +magnitude_s16_aligned exact_u32_armv8_neon_simd_aligned # 78641 ns/call -magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 166265 ns/call -magnitude_sc16q11 exact_float_armv8_neon_simd # 347400 ns/call -magnitude_sc16q11 exact_u32_armv8_neon_simd # 350422 ns/call -magnitude_sc16q11 
exact_u32_generic # 951466 ns/call -magnitude_sc16q11 exact_float_generic # 1041727 ns/call -magnitude_sc16q11 12bit_table_generic # 2008901 ns/call -magnitude_sc16q11 12bit_table_armv8_neon_simd # 2117606 ns/call -magnitude_sc16q11 11bit_table_generic # 2315294 ns/call -magnitude_sc16q11 11bit_table_armv8_neon_simd # 2317090 ns/call +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 714831 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 1409688 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 1525298 ns/call +magnitude_sc16 exact_u32_generic # 2774527 ns/call +magnitude_sc16 exact_float_generic # 3178482 ns/call -magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 155062 ns/call -magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 212453 ns/call -magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 329287 ns/call -magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 345611 ns/call -magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 426742 ns/call -magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 493451 ns/call -magnitude_sc16q11_aligned exact_u32_generic # 993016 ns/call -magnitude_sc16q11_aligned exact_float_generic # 1041225 ns/call -magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 2008440 ns/call -magnitude_sc16q11_aligned 12bit_table_generic # 2010237 ns/call -magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 2010954 ns/call -magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 2314544 ns/call -magnitude_sc16q11_aligned 11bit_table_generic # 2317709 ns/call -magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 2672466 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 643589 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 643837 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 1335633 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 1336341 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 
1398166 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd # 1398459 ns/call +magnitude_sc16_aligned exact_u32_generic # 2883375 ns/call +magnitude_sc16_aligned exact_float_generic # 3800509 ns/call -magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 213353 ns/call -magnitude_uc8 lookup_generic # 285617 ns/call -magnitude_uc8 lookup_armv8_neon_simd # 285723 ns/call -magnitude_uc8 lookup_unroll_4_generic # 288439 ns/call -magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 288520 ns/call -magnitude_uc8 exact_armv8_neon_simd # 533721 ns/call -magnitude_uc8 exact_generic # 1703775 ns/call +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 166225 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 328709 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 347332 ns/call +magnitude_sc16q11 exact_u32_generic # 1020089 ns/call +magnitude_sc16q11 exact_float_generic # 1427598 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 2022833 ns/call +magnitude_sc16q11 12bit_table_generic # 2134803 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 2305257 ns/call +magnitude_sc16q11 11bit_table_generic # 2656868 ns/call -magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 214464 ns/call -magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 280649 ns/call -magnitude_uc8_aligned lookup_generic # 280742 ns/call -magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 293121 ns/call -magnitude_uc8_aligned lookup_unroll_4_generic # 293163 ns/call -magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 294461 ns/call -magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 313567 ns/call -magnitude_uc8_aligned lookup_armv8_neon_simd # 340192 ns/call -magnitude_uc8_aligned exact_armv8_neon_simd # 533623 ns/call -magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 731823 ns/call -magnitude_uc8_aligned exact_generic # 1705445 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 155050 ns/call +magnitude_sc16q11_aligned 
exact_u32_armv8_neon_simd # 329258 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 345569 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 346999 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 354947 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 551563 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 2023169 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 2213044 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 2315496 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 2315505 ns/call +magnitude_sc16q11_aligned exact_float_generic # 2404910 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 3039219 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 3302417 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 3642372 ns/call -mean_power_u16 u32_armv8_neon_simd # 45663 ns/call -mean_power_u16 u32_generic # 45672 ns/call -mean_power_u16 neon_float_armv8_neon_simd # 72283 ns/call -mean_power_u16 u64_armv8_neon_simd # 89187 ns/call -mean_power_u16 u64_generic # 89199 ns/call -mean_power_u16 float_armv8_neon_simd # 94634 ns/call -mean_power_u16 float_generic # 176676 ns/call +magnitude_u16o12 lookup_unroll_8_armv8_neon_simd # 5543 ns/call +magnitude_u16o12 lookup_unroll_4_armv8_neon_simd # 6022 ns/call +magnitude_u16o12 lookup_unroll_4_generic # 6024 ns/call +magnitude_u16o12 lookup_armv8_neon_simd # 6419 ns/call +magnitude_u16o12 lookup_generic # 6419 ns/call +magnitude_u16o12 lookup_unroll_8_generic # 8379 ns/call +magnitude_u16o12 exact_generic # 9682 ns/call +magnitude_u16o12 exact_unroll_8_generic # 9682 ns/call +magnitude_u16o12 exact_unroll_4_generic # 9683 ns/call +magnitude_u16o12 exact_unroll_4_armv8_neon_simd # 10069 ns/call +magnitude_u16o12 exact_unroll_8_armv8_neon_simd # 13021 ns/call +magnitude_u16o12 exact_armv8_neon_simd # 14536 ns/call -mean_power_u16_aligned 
u32_armv8_neon_simd # 44865 ns/call -mean_power_u16_aligned u32_generic # 52958 ns/call -mean_power_u16_aligned u32_armv8_neon_simd_aligned # 60579 ns/call -mean_power_u16_aligned neon_float_armv8_neon_simd # 77277 ns/call -mean_power_u16_aligned u64_armv8_neon_simd # 86287 ns/call -mean_power_u16_aligned u64_generic # 86295 ns/call -mean_power_u16_aligned float_armv8_neon_simd_aligned # 87501 ns/call -mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 94315 ns/call -mean_power_u16_aligned float_armv8_neon_simd # 104800 ns/call -mean_power_u16_aligned u64_armv8_neon_simd_aligned # 119504 ns/call -mean_power_u16_aligned float_generic # 176475 ns/call +magnitude_u16o12_aligned lookup_unroll_8_armv8_neon_simd # 5538 ns/call +magnitude_u16o12_aligned lookup_armv8_neon_simd_aligned # 6427 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv8_neon_simd_aligned # 6474 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv8_neon_simd # 6496 ns/call +magnitude_u16o12_aligned lookup_armv8_neon_simd # 6940 ns/call +magnitude_u16o12_aligned lookup_unroll_8_armv8_neon_simd_aligned # 7448 ns/call +magnitude_u16o12_aligned lookup_generic # 8761 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv8_neon_simd_aligned # 9256 ns/call +magnitude_u16o12_aligned exact_generic # 9256 ns/call +magnitude_u16o12_aligned exact_unroll_8_generic # 9256 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv8_neon_simd_aligned # 9257 ns/call +magnitude_u16o12_aligned exact_unroll_4_generic # 9257 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv8_neon_simd # 9870 ns/call +magnitude_u16o12_aligned lookup_unroll_8_generic # 10142 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv8_neon_simd # 10249 ns/call +magnitude_u16o12_aligned exact_armv8_neon_simd_aligned # 10994 ns/call +magnitude_u16o12_aligned lookup_unroll_4_generic # 11246 ns/call +magnitude_u16o12_aligned exact_armv8_neon_simd # 11466 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 213351 ns/call 
+magnitude_uc8 lookup_generic # 285769 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 285886 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 289026 ns/call +magnitude_uc8 lookup_unroll_4_generic # 303185 ns/call +magnitude_uc8 exact_armv8_neon_simd # 533711 ns/call +magnitude_uc8 exact_generic # 1705307 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 214463 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 223971 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 281222 ns/call +magnitude_uc8_aligned lookup_generic # 281593 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 293814 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 294267 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 335826 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 353925 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 533612 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 729929 ns/call +magnitude_uc8_aligned exact_generic # 1703376 ns/call + +mean_power_u16 u32_generic # 104479 ns/call +mean_power_u16 u32_armv8_neon_simd # 108001 ns/call +mean_power_u16 neon_float_armv8_neon_simd # 165800 ns/call +mean_power_u16 u64_armv8_neon_simd # 202718 ns/call +mean_power_u16 u64_generic # 205818 ns/call +mean_power_u16 float_armv8_neon_simd # 217034 ns/call +mean_power_u16 float_generic # 416181 ns/call + +mean_power_u16_aligned neon_float_armv8_neon_simd # 100361 ns/call +mean_power_u16_aligned u32_armv8_neon_simd # 103584 ns/call +mean_power_u16_aligned u32_generic # 104844 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 105711 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 141045 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 177954 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 181779 ns/call +mean_power_u16_aligned u64_generic # 193860 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 199711 ns/call 
+mean_power_u16_aligned float_armv8_neon_simd_aligned # 200928 ns/call +mean_power_u16_aligned float_generic # 408905 ns/call diff --git a/wisdom/wisdom.aarch64.tegra b/wisdom/wisdom.aarch64.tegra index 48c66e022..c2e61fdc7 100644 --- a/wisdom/wisdom.aarch64.tegra +++ b/wisdom/wisdom.aarch64.tegra @@ -1,101 +1,140 @@ # generated by ./starch-benchmark -i 15 -o wisdom.aarch64.tegra -magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 94796 ns/call -magnitude_power_uc8 lookup_armv8_neon_simd # 192167 ns/call -magnitude_power_uc8 lookup_generic # 192384 ns/call -magnitude_power_uc8 lookup_unroll_4_generic # 201674 ns/call -magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 202605 ns/call -magnitude_power_uc8 twopass_armv8_neon_simd # 211684 ns/call -magnitude_power_uc8 twopass_generic # 212405 ns/call +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 94781 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 191342 ns/call +magnitude_power_uc8 lookup_generic # 192116 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 196104 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 199271 ns/call +magnitude_power_uc8 twopass_generic # 211844 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 212377 ns/call -magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 94539 ns/call -magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 96537 ns/call -magnitude_power_uc8_aligned lookup_armv8_neon_simd # 194018 ns/call -magnitude_power_uc8_aligned lookup_generic # 194129 ns/call -magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 194586 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 202656 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_generic # 203133 ns/call -magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 203492 ns/call -magnitude_power_uc8_aligned twopass_armv8_neon_simd # 218867 ns/call -magnitude_power_uc8_aligned twopass_generic # 219683 ns/call -magnitude_power_uc8_aligned 
twopass_armv8_neon_simd_aligned # 232710 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 94718 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 95092 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 191174 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 191763 ns/call +magnitude_power_uc8_aligned lookup_generic # 194173 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 201119 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 201311 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 204118 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 210881 ns/call +magnitude_power_uc8_aligned twopass_generic # 216370 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 229570 ns/call -magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 248412 ns/call -magnitude_sc16 exact_u32_armv8_neon_simd # 497100 ns/call -magnitude_sc16 exact_float_armv8_neon_simd # 499026 ns/call -magnitude_sc16 exact_u32_generic # 2498651 ns/call -magnitude_sc16 exact_float_generic # 2630913 ns/call +magnitude_s16 exact_u32_armv8_neon_simd # 22370 ns/call +magnitude_s16 exact_u32_generic # 22375 ns/call -magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 251091 ns/call -magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 251917 ns/call -magnitude_sc16_aligned exact_u32_armv8_neon_simd # 495168 ns/call -magnitude_sc16_aligned exact_float_armv8_neon_simd # 496604 ns/call -magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 497295 ns/call -magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 497677 ns/call -magnitude_sc16_aligned exact_u32_generic # 2502639 ns/call -magnitude_sc16_aligned exact_float_generic # 2508165 ns/call +magnitude_s16_aligned exact_u32_armv8_neon_simd # 22275 ns/call +magnitude_s16_aligned exact_u32_generic # 22290 ns/call +magnitude_s16_aligned exact_u32_armv8_neon_simd_aligned # 22309 
ns/call -magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 61889 ns/call -magnitude_sc16q11 exact_u32_armv8_neon_simd # 121180 ns/call -magnitude_sc16q11 exact_float_armv8_neon_simd # 122913 ns/call -magnitude_sc16q11 12bit_table_generic # 600092 ns/call -magnitude_sc16q11 12bit_table_armv8_neon_simd # 602741 ns/call -magnitude_sc16q11 11bit_table_armv8_neon_simd # 713333 ns/call -magnitude_sc16q11 11bit_table_generic # 747792 ns/call -magnitude_sc16q11 exact_float_generic # 819436 ns/call -magnitude_sc16q11 exact_u32_generic # 830130 ns/call +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 252184 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 492235 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 498425 ns/call +magnitude_sc16 exact_u32_generic # 2284035 ns/call +magnitude_sc16 exact_float_generic # 2498803 ns/call -magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 62013 ns/call -magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 62417 ns/call -magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 121349 ns/call -magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 121531 ns/call -magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 122073 ns/call -magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 122670 ns/call -magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 589282 ns/call -magnitude_sc16q11_aligned 12bit_table_generic # 590574 ns/call -magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 591626 ns/call -magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 708434 ns/call -magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 712503 ns/call -magnitude_sc16q11_aligned 11bit_table_generic # 739828 ns/call -magnitude_sc16q11_aligned exact_float_generic # 822781 ns/call -magnitude_sc16q11_aligned exact_u32_generic # 831139 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 251193 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 251930 ns/call 
+magnitude_sc16_aligned exact_float_armv8_neon_simd # 491778 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 495439 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 495968 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 496616 ns/call +magnitude_sc16_aligned exact_u32_generic # 2419011 ns/call +magnitude_sc16_aligned exact_float_generic # 2503908 ns/call -magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 75259 ns/call -magnitude_uc8 lookup_armv8_neon_simd # 185908 ns/call -magnitude_uc8 lookup_generic # 187426 ns/call -magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 203217 ns/call -magnitude_uc8 lookup_unroll_4_generic # 205435 ns/call -magnitude_uc8 exact_armv8_neon_simd # 211685 ns/call -magnitude_uc8 exact_generic # 1143963 ns/call +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 62216 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 122302 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 122327 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 591601 ns/call +magnitude_sc16q11 12bit_table_generic # 592334 ns/call +magnitude_sc16q11 exact_u32_generic # 727231 ns/call +magnitude_sc16q11 11bit_table_generic # 738250 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 750103 ns/call +magnitude_sc16q11 exact_float_generic # 823944 ns/call -magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 74829 ns/call -magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 75205 ns/call -magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 176228 ns/call -magnitude_uc8_aligned lookup_armv8_neon_simd # 176801 ns/call -magnitude_uc8_aligned lookup_generic # 177103 ns/call -magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 196536 ns/call -magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 197343 ns/call -magnitude_uc8_aligned lookup_unroll_4_generic # 198190 ns/call -magnitude_uc8_aligned exact_armv8_neon_simd # 210215 ns/call -magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 
211766 ns/call -magnitude_uc8_aligned exact_generic # 1129546 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 61770 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 62217 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 121534 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 121992 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 122068 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 122240 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 572093 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 581602 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 584198 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 730392 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 731592 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 738102 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 753089 ns/call +magnitude_sc16q11_aligned exact_float_generic # 798343 ns/call -mean_power_u16 neon_float_armv8_neon_simd # 39477 ns/call -mean_power_u16 u32_generic # 42560 ns/call -mean_power_u16 u32_armv8_neon_simd # 44544 ns/call -mean_power_u16 float_armv8_neon_simd # 52529 ns/call -mean_power_u16 u64_generic # 85141 ns/call -mean_power_u16 u64_armv8_neon_simd # 85219 ns/call -mean_power_u16 float_generic # 155312 ns/call +magnitude_u16o12 exact_unroll_8_armv8_neon_simd # 3361 ns/call +magnitude_u16o12 exact_armv8_neon_simd # 3374 ns/call +magnitude_u16o12 exact_unroll_4_armv8_neon_simd # 3375 ns/call +magnitude_u16o12 exact_generic # 3383 ns/call +magnitude_u16o12 exact_unroll_4_generic # 3394 ns/call +magnitude_u16o12 exact_unroll_8_generic # 3394 ns/call +magnitude_u16o12 lookup_unroll_4_armv8_neon_simd # 3747 ns/call +magnitude_u16o12 lookup_unroll_4_generic # 3774 ns/call +magnitude_u16o12 lookup_armv8_neon_simd # 4124 ns/call +magnitude_u16o12 
lookup_unroll_8_armv8_neon_simd # 4142 ns/call +magnitude_u16o12 lookup_unroll_8_generic # 4161 ns/call +magnitude_u16o12 lookup_generic # 4164 ns/call -mean_power_u16_aligned neon_float_armv8_neon_simd # 39385 ns/call -mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 39524 ns/call -mean_power_u16_aligned u32_generic # 42604 ns/call -mean_power_u16_aligned u32_armv8_neon_simd_aligned # 42712 ns/call -mean_power_u16_aligned u32_armv8_neon_simd # 44513 ns/call -mean_power_u16_aligned float_armv8_neon_simd # 52471 ns/call -mean_power_u16_aligned float_armv8_neon_simd_aligned # 52593 ns/call -mean_power_u16_aligned u64_armv8_neon_simd # 85041 ns/call -mean_power_u16_aligned u64_generic # 85056 ns/call -mean_power_u16_aligned u64_armv8_neon_simd_aligned # 85239 ns/call -mean_power_u16_aligned float_generic # 153697 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv8_neon_simd # 3349 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv8_neon_simd # 3351 ns/call +magnitude_u16o12_aligned exact_unroll_4_generic # 3352 ns/call +magnitude_u16o12_aligned exact_generic # 3353 ns/call +magnitude_u16o12_aligned exact_armv8_neon_simd # 3359 ns/call +magnitude_u16o12_aligned exact_unroll_8_generic # 3361 ns/call +magnitude_u16o12_aligned exact_armv8_neon_simd_aligned # 3392 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv8_neon_simd_aligned # 3397 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv8_neon_simd_aligned # 3420 ns/call +magnitude_u16o12_aligned lookup_unroll_4_generic # 3721 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv8_neon_simd # 3730 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv8_neon_simd_aligned # 3786 ns/call +magnitude_u16o12_aligned lookup_unroll_8_generic # 4086 ns/call +magnitude_u16o12_aligned lookup_generic # 4091 ns/call +magnitude_u16o12_aligned lookup_unroll_8_armv8_neon_simd # 4092 ns/call +magnitude_u16o12_aligned lookup_armv8_neon_simd # 4102 ns/call +magnitude_u16o12_aligned 
lookup_unroll_8_armv8_neon_simd_aligned # 4159 ns/call +magnitude_u16o12_aligned lookup_armv8_neon_simd_aligned # 4170 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 75488 ns/call +magnitude_uc8 lookup_generic # 176176 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 176214 ns/call +magnitude_uc8 lookup_unroll_4_generic # 196504 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 196996 ns/call +magnitude_uc8 exact_armv8_neon_simd # 211805 ns/call +magnitude_uc8 exact_generic # 1139651 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 75180 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 75733 ns/call +magnitude_uc8_aligned lookup_generic # 174369 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 175660 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 178247 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 195812 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 195981 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 196430 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 209988 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 211549 ns/call +magnitude_uc8_aligned exact_generic # 1157366 ns/call + +mean_power_u16 neon_float_armv8_neon_simd # 39883 ns/call +mean_power_u16 u32_generic # 42945 ns/call +mean_power_u16 u32_armv8_neon_simd # 43126 ns/call +mean_power_u16 float_armv8_neon_simd # 53033 ns/call +mean_power_u16 u64_generic # 85898 ns/call +mean_power_u16 u64_armv8_neon_simd # 86019 ns/call +mean_power_u16 float_generic # 166467 ns/call + +mean_power_u16_aligned neon_float_armv8_neon_simd # 39643 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 39751 ns/call +mean_power_u16_aligned u32_generic # 42883 ns/call +mean_power_u16_aligned u32_armv8_neon_simd # 43040 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 43105 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 52847 ns/call 
+mean_power_u16_aligned float_armv8_neon_simd_aligned # 53129 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 85639 ns/call +mean_power_u16_aligned u64_generic # 85826 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 86041 ns/call +mean_power_u16_aligned float_generic # 163232 ns/call diff --git a/wisdom/wisdom.arm.pi4b b/wisdom/wisdom.arm.pi4b new file mode 100644 index 000000000..09317d0d7 --- /dev/null +++ b/wisdom/wisdom.arm.pi4b @@ -0,0 +1,140 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.armv7a.rpi4b + +magnitude_power_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 225540 ns/call +magnitude_power_uc8 twopass_armv7a_neon_vfpv4 # 233211 ns/call +magnitude_power_uc8 twopass_generic # 233232 ns/call +magnitude_power_uc8 lookup_generic # 304156 ns/call +magnitude_power_uc8 lookup_armv7a_neon_vfpv4 # 304537 ns/call +magnitude_power_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 338250 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 372099 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 212274 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 212275 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4 # 232032 ns/call +magnitude_power_uc8_aligned twopass_generic # 232251 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4_aligned # 232326 ns/call +magnitude_power_uc8_aligned lookup_generic # 303654 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 303814 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4 # 303852 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 339941 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 340382 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 373623 ns/call + +magnitude_s16 exact_u32_armv7a_neon_vfpv4 # 39425 ns/call +magnitude_s16 exact_u32_generic # 131966 ns/call + +magnitude_s16_aligned exact_u32_armv7a_neon_vfpv4_aligned # 39269 ns/call 
+magnitude_s16_aligned exact_u32_armv7a_neon_vfpv4 # 39269 ns/call +magnitude_s16_aligned exact_u32_generic # 131936 ns/call + +magnitude_sc16 neon_vrsqrte_armv7a_neon_vfpv4 # 684872 ns/call +magnitude_sc16 exact_u32_armv7a_neon_vfpv4 # 2469975 ns/call +magnitude_sc16 exact_float_armv7a_neon_vfpv4 # 2490141 ns/call +magnitude_sc16 exact_u32_generic # 3488709 ns/call +magnitude_sc16 exact_float_generic # 3643835 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 643701 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 643890 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4 # 2461804 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4_aligned # 2462217 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4_aligned # 2480493 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4 # 2488359 ns/call +magnitude_sc16_aligned exact_u32_generic # 3491985 ns/call +magnitude_sc16_aligned exact_float_generic # 3645385 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv7a_neon_vfpv4 # 166720 ns/call +magnitude_sc16q11 exact_u32_armv7a_neon_vfpv4 # 614620 ns/call +magnitude_sc16q11 exact_float_armv7a_neon_vfpv4 # 823358 ns/call +magnitude_sc16q11 exact_u32_generic # 1169385 ns/call +magnitude_sc16q11 exact_float_generic # 1226916 ns/call +magnitude_sc16q11 11bit_table_armv7a_neon_vfpv4 # 1949962 ns/call +magnitude_sc16q11 12bit_table_armv7a_neon_vfpv4 # 2017967 ns/call +magnitude_sc16q11 12bit_table_generic # 2398223 ns/call +magnitude_sc16q11 11bit_table_generic # 2671997 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 155244 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 155247 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4 # 612178 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4_aligned # 612179 ns/call +magnitude_sc16q11_aligned exact_float_armv7a_neon_vfpv4 # 823110 ns/call +magnitude_sc16q11_aligned 
exact_float_armv7a_neon_vfpv4_aligned # 824402 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 1172600 ns/call +magnitude_sc16q11_aligned exact_float_generic # 1224333 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4 # 1951147 ns/call +magnitude_sc16q11_aligned 12bit_table_armv7a_neon_vfpv4 # 2020144 ns/call +magnitude_sc16q11_aligned 12bit_table_armv7a_neon_vfpv4_aligned # 2020611 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4_aligned # 2090768 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 2404217 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 2672598 ns/call + +magnitude_u16o12 lookup_unroll_8_generic # 6265 ns/call +magnitude_u16o12 lookup_unroll_8_armv7a_neon_vfpv4 # 6419 ns/call +magnitude_u16o12 lookup_unroll_4_generic # 6563 ns/call +magnitude_u16o12 lookup_unroll_4_armv7a_neon_vfpv4 # 6569 ns/call +magnitude_u16o12 lookup_armv7a_neon_vfpv4 # 8235 ns/call +magnitude_u16o12 lookup_generic # 8238 ns/call +magnitude_u16o12 exact_unroll_4_generic # 57157 ns/call +magnitude_u16o12 exact_unroll_8_armv7a_neon_vfpv4 # 57402 ns/call +magnitude_u16o12 exact_unroll_8_generic # 57600 ns/call +magnitude_u16o12 exact_armv7a_neon_vfpv4 # 58053 ns/call +magnitude_u16o12 exact_generic # 58054 ns/call +magnitude_u16o12 exact_unroll_4_armv7a_neon_vfpv4 # 58319 ns/call + +magnitude_u16o12_aligned lookup_unroll_8_generic # 6019 ns/call +magnitude_u16o12_aligned lookup_unroll_8_armv7a_neon_vfpv4 # 6203 ns/call +magnitude_u16o12_aligned lookup_unroll_8_armv7a_neon_vfpv4_aligned # 6210 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 6420 ns/call +magnitude_u16o12_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 6420 ns/call +magnitude_u16o12_aligned lookup_unroll_4_generic # 6437 ns/call +magnitude_u16o12_aligned lookup_armv7a_neon_vfpv4_aligned # 7049 ns/call +magnitude_u16o12_aligned lookup_armv7a_neon_vfpv4 # 8225 ns/call +magnitude_u16o12_aligned lookup_generic # 8225 ns/call 
+magnitude_u16o12_aligned exact_unroll_4_generic # 57153 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv7a_neon_vfpv4_aligned # 57333 ns/call +magnitude_u16o12_aligned exact_unroll_8_armv7a_neon_vfpv4 # 57392 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv7a_neon_vfpv4_aligned # 57498 ns/call +magnitude_u16o12_aligned exact_unroll_8_generic # 57599 ns/call +magnitude_u16o12_aligned exact_armv7a_neon_vfpv4_aligned # 58043 ns/call +magnitude_u16o12_aligned exact_armv7a_neon_vfpv4 # 58052 ns/call +magnitude_u16o12_aligned exact_generic # 58052 ns/call +magnitude_u16o12_aligned exact_unroll_4_armv7a_neon_vfpv4 # 58315 ns/call + +magnitude_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 188811 ns/call +magnitude_uc8 lookup_unroll_4_generic # 280179 ns/call +magnitude_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 283230 ns/call +magnitude_uc8 lookup_armv7a_neon_vfpv4 # 284119 ns/call +magnitude_uc8 lookup_generic # 284149 ns/call +magnitude_uc8 exact_armv7a_neon_vfpv4 # 921110 ns/call +magnitude_uc8 exact_generic # 1696849 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 187253 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 187260 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 281212 ns/call +magnitude_uc8_aligned lookup_generic # 284611 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4 # 284657 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 284821 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 284981 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 285150 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4_aligned # 920284 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4 # 920285 ns/call +magnitude_uc8_aligned exact_generic # 1700141 ns/call + +mean_power_u16 u32_armv7a_neon_vfpv4 # 45568 ns/call +mean_power_u16 neon_float_armv7a_neon_vfpv4 # 58630 ns/call +mean_power_u16 u64_armv7a_neon_vfpv4 # 79653 ns/call +mean_power_u16 
float_armv7a_neon_vfpv4 # 94305 ns/call +mean_power_u16 u64_generic # 131637 ns/call +mean_power_u16 u32_generic # 132114 ns/call +mean_power_u16 float_generic # 187107 ns/call + +mean_power_u16_aligned u32_armv7a_neon_vfpv4 # 45155 ns/call +mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 45158 ns/call +mean_power_u16_aligned neon_float_armv7a_neon_vfpv4 # 58522 ns/call +mean_power_u16_aligned neon_float_armv7a_neon_vfpv4_aligned # 58523 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4 # 80467 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4_aligned # 80685 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4 # 86374 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4_aligned # 86376 ns/call +mean_power_u16_aligned u64_generic # 131642 ns/call +mean_power_u16_aligned u32_generic # 132122 ns/call +mean_power_u16_aligned float_generic # 187184 ns/call diff --git a/wisdom/wisdom.x86_64.i7-6950X@4.00GHz b/wisdom/wisdom.x86_64.i7-6950X@4.00GHz new file mode 100644 index 000000000..10b81b175 --- /dev/null +++ b/wisdom/wisdom.x86_64.i7-6950X@4.00GHz @@ -0,0 +1,125 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.x86_64.i7-6950X@4.00GHz + +magnitude_power_uc8 twopass_generic # 47717 ns/call +magnitude_power_uc8 twopass_x86_avx2 # 47769 ns/call +magnitude_power_uc8 lookup_generic # 50307 ns/call +magnitude_power_uc8 lookup_x86_avx2 # 50516 ns/call +magnitude_power_uc8 lookup_unroll_4_x86_avx2 # 52375 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 52783 ns/call + +magnitude_power_uc8_aligned twopass_generic # 45410 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2_aligned # 45453 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2 # 45592 ns/call +magnitude_power_uc8_aligned lookup_generic # 46982 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2_aligned # 47048 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2 # 47565 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 48766 ns/call 
+magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2 # 48902 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 48979 ns/call + +magnitude_s16 exact_u32_x86_avx2 # 5819 ns/call +magnitude_s16 exact_u32_generic # 5931 ns/call + +magnitude_s16_aligned exact_u32_x86_avx2_aligned # 4942 ns/call +magnitude_s16_aligned exact_u32_x86_avx2 # 5017 ns/call +magnitude_s16_aligned exact_u32_generic # 5255 ns/call + +magnitude_sc16 exact_float_x86_avx2 # 229273 ns/call +magnitude_sc16 exact_u32_x86_avx2 # 241288 ns/call +magnitude_sc16 exact_u32_generic # 848228 ns/call +magnitude_sc16 exact_float_generic # 866695 ns/call + +magnitude_sc16_aligned exact_float_x86_avx2 # 211764 ns/call +magnitude_sc16_aligned exact_float_x86_avx2_aligned # 212186 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2_aligned # 223779 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2 # 230133 ns/call +magnitude_sc16_aligned exact_float_generic # 861515 ns/call +magnitude_sc16_aligned exact_u32_generic # 863065 ns/call + +magnitude_sc16q11 exact_float_x86_avx2 # 59644 ns/call +magnitude_sc16q11 exact_u32_x86_avx2 # 65329 ns/call +magnitude_sc16q11 12bit_table_generic # 250976 ns/call +magnitude_sc16q11 12bit_table_x86_avx2 # 251008 ns/call +magnitude_sc16q11 11bit_table_generic # 299927 ns/call +magnitude_sc16q11 11bit_table_x86_avx2 # 301723 ns/call +magnitude_sc16q11 exact_float_generic # 313735 ns/call +magnitude_sc16q11 exact_u32_generic # 317950 ns/call + +magnitude_sc16q11_aligned exact_float_x86_avx2_aligned # 53073 ns/call +magnitude_sc16q11_aligned exact_float_x86_avx2 # 53187 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2_aligned # 54149 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2 # 56085 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 251234 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2_aligned # 251541 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2 # 251966 ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2_aligned # 297765 
ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2 # 298004 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 298078 ns/call +magnitude_sc16q11_aligned exact_float_generic # 309472 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 310400 ns/call + +magnitude_u16o12 exact_unroll_8_x86_avx2 # 1134 ns/call +magnitude_u16o12 exact_unroll_4_x86_avx2 # 1135 ns/call +magnitude_u16o12 exact_x86_avx2 # 1144 ns/call +magnitude_u16o12 lookup_unroll_8_generic # 1489 ns/call +magnitude_u16o12 lookup_unroll_8_x86_avx2 # 1490 ns/call +magnitude_u16o12 lookup_unroll_4_x86_avx2 # 1493 ns/call +magnitude_u16o12 lookup_unroll_4_generic # 1494 ns/call +magnitude_u16o12 lookup_x86_avx2 # 1563 ns/call +magnitude_u16o12 lookup_generic # 1574 ns/call +magnitude_u16o12 exact_unroll_8_generic # 10061 ns/call +magnitude_u16o12 exact_unroll_4_generic # 10157 ns/call +magnitude_u16o12 exact_generic # 10586 ns/call + +magnitude_u16o12_aligned exact_unroll_8_x86_avx2_aligned # 900 ns/call +magnitude_u16o12_aligned exact_x86_avx2_aligned # 905 ns/call +magnitude_u16o12_aligned exact_unroll_4_x86_avx2_aligned # 912 ns/call +magnitude_u16o12_aligned exact_x86_avx2 # 949 ns/call +magnitude_u16o12_aligned exact_unroll_4_x86_avx2 # 952 ns/call +magnitude_u16o12_aligned exact_unroll_8_x86_avx2 # 952 ns/call +magnitude_u16o12_aligned lookup_unroll_8_x86_avx2 # 1134 ns/call +magnitude_u16o12_aligned lookup_unroll_8_x86_avx2_aligned # 1135 ns/call +magnitude_u16o12_aligned lookup_unroll_8_generic # 1151 ns/call +magnitude_u16o12_aligned lookup_unroll_4_generic # 1161 ns/call +magnitude_u16o12_aligned lookup_unroll_4_x86_avx2_aligned # 1163 ns/call +magnitude_u16o12_aligned lookup_unroll_4_x86_avx2 # 1164 ns/call +magnitude_u16o12_aligned lookup_generic # 1545 ns/call +magnitude_u16o12_aligned lookup_x86_avx2_aligned # 1549 ns/call +magnitude_u16o12_aligned lookup_x86_avx2 # 1550 ns/call +magnitude_u16o12_aligned exact_unroll_8_generic # 9627 ns/call +magnitude_u16o12_aligned 
exact_unroll_4_generic # 9633 ns/call +magnitude_u16o12_aligned exact_generic # 10028 ns/call + +magnitude_uc8 lookup_unroll_4_generic # 37182 ns/call +magnitude_uc8 lookup_unroll_4_x86_avx2 # 37196 ns/call +magnitude_uc8 lookup_generic # 37373 ns/call +magnitude_uc8 lookup_x86_avx2 # 37477 ns/call +magnitude_uc8 exact_x86_avx2 # 68020 ns/call +magnitude_uc8 exact_generic # 425967 ns/call + +magnitude_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 37053 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 37055 ns/call +magnitude_uc8_aligned lookup_unroll_4_x86_avx2 # 37133 ns/call +magnitude_uc8_aligned lookup_generic # 37327 ns/call +magnitude_uc8_aligned lookup_x86_avx2 # 37361 ns/call +magnitude_uc8_aligned lookup_x86_avx2_aligned # 37396 ns/call +magnitude_uc8_aligned exact_x86_avx2_aligned # 66275 ns/call +magnitude_uc8_aligned exact_x86_avx2 # 67078 ns/call +magnitude_uc8_aligned exact_generic # 424799 ns/call + +mean_power_u16 u32_x86_avx2 # 9411 ns/call +mean_power_u16 u32_generic # 10594 ns/call +mean_power_u16 float_x86_avx2 # 14919 ns/call +mean_power_u16 u64_x86_avx2 # 21358 ns/call +mean_power_u16 u64_generic # 24858 ns/call +mean_power_u16 float_generic # 82535 ns/call + +mean_power_u16_aligned u32_x86_avx2 # 9128 ns/call +mean_power_u16_aligned u32_x86_avx2_aligned # 9148 ns/call +mean_power_u16_aligned u32_generic # 10299 ns/call +mean_power_u16_aligned float_x86_avx2_aligned # 13018 ns/call +mean_power_u16_aligned float_x86_avx2 # 14312 ns/call +mean_power_u16_aligned u64_x86_avx2 # 20915 ns/call +mean_power_u16_aligned u64_x86_avx2_aligned # 21280 ns/call +mean_power_u16_aligned u64_generic # 24653 ns/call +mean_power_u16_aligned float_generic # 82146 ns/call