diff --git a/sys/include/matstat.h b/sys/include/matstat.h new file mode 100644 index 0000000000..74c99df101 --- /dev/null +++ b/sys/include/matstat.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2018 Eistec AB + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License v2.1. See the file LICENSE in the top level + * directory for more details. + */ + +/** + * @defgroup sys_matstat Matstat - Integer mathematical statistics library + * @ingroup sys + * @brief Library for computing 1-pass statistics + * + * The Matstat library uses single pass algorithms to compute statistic measures + * such as mean and variance over many values. The values can be immediately + * discarded after processing, keeping the memory requirement constant + * regardless of how many values need to be processed. + * + * The design goal is to provide basic mathematical statistics operations on + * constrained devices with a "good enough" accuracy to be able to provide some + * descriptive measures of data. For more accurate measures of statistics, use a + * fancier library, or copy the data to a PC. + * + * It is important to know that using integer operations will result in lower + * precision in the computed measures because of truncation. + * + * @{ + * @file + * @brief Matstat library declarations + * + * @author Joakim Nohlgård + */ + +#ifndef MATSTAT_H +#define MATSTAT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Internal state for computing running statistics + */ +typedef struct { + int64_t sum; /**< Sum of values added */ + uint64_t sum_sq; /**< Sum of squared differences */ + uint32_t count; /**< Number of values added */ + int32_t mean; /**< Mean value */ + int32_t min; /**< Minimum value seen */ + int32_t max; /**< Maximum value seen */ +} matstat_state_t; + +/** + * @brief Empty state initializer + */ +#define MATSTAT_STATE_INIT (const matstat_state_t) { \ + .sum = 0, \ + .sum_sq = 0, \ + .count = 0, \ + .mean = 0, \ + .min = INT32_MAX, \ + .max = INT32_MIN, \ + } + +/** + * @brief Reset state + * + * @param[in] state State struct to clear + */ +void matstat_clear(matstat_state_t *state); + +/** + * @brief Add a sample to state + * + * @param[in] state State struct to operate on + * @param[in] value Value to add to the state + */ +void matstat_add(matstat_state_t *state, int32_t value); + +/** + * @brief Return the computed mean value of all samples so far + * + * @param[in] state State struct to operate on + * + * @return arithmetic mean + */ +static inline int32_t matstat_mean(const matstat_state_t *state) +{ + return state->mean; +} + +/** + * @brief Compute the sample variance of all samples so far + * + * @param[in] state State struct to operate on + * + * @return sample variance + */ +uint64_t matstat_variance(const matstat_state_t *state); + +/** + * @brief Combine two states + * + * Add the sums and count of @p src and @p dest, take the maximum of the max + * values and minimum of the min values. The result is written to @p dest. + * + * @param[inout] dest destination state struct + * @param[out] src source state struct + */ +void matstat_merge(matstat_state_t *dest, const matstat_state_t *src); + +#ifdef __cplusplus +} +#endif + +#endif /* MATSTAT_H */ + +/** @} */ diff --git a/sys/matstat/Makefile b/sys/matstat/Makefile new file mode 100644 index 0000000000..48422e909a --- /dev/null +++ b/sys/matstat/Makefile @@ -0,0 +1 @@ +include $(RIOTBASE)/Makefile.base diff --git a/sys/matstat/matstat.c b/sys/matstat/matstat.c new file mode 100644 index 0000000000..35f549c188 --- /dev/null +++ b/sys/matstat/matstat.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2018 Eistec AB + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License v2.1. See the file LICENSE in the top level + * directory for more details. + */ + +#include +#include "matstat.h" + +#define ENABLE_DEBUG (0) +#include "debug.h" + +void matstat_clear(matstat_state_t *state) +{ + *state = MATSTAT_STATE_INIT; +} + +void matstat_add(matstat_state_t *state, int32_t value) +{ + if (value > state->max) { + state->max = value; + } + if (value < state->min) { + state->min = value; + } + state->sum += value; + /* Using Welford's algorithm for variance */ + ++state->count; + if (state->count == 1) { + state->sum_sq = 0; + state->mean = value; + } + else { + int32_t new_mean = state->sum / state->count; + int64_t diff = (value - state->mean) * (value - new_mean); + if ((diff < 0) && ((uint64_t)(-diff) > state->sum_sq)) { + /* Handle corner cases where sum_sq becomes negative */ + state->sum_sq = 0; + } + else { + state->sum_sq += diff; + } + state->mean = new_mean; + } +} + +uint64_t matstat_variance(const matstat_state_t *state) +{ + if (state->count < 2) { + /* We don't have any way of returning an error */ + return 0; + } + uint64_t variance = state->sum_sq / (state->count - 1); + DEBUG("Var: (%" PRIu64 " / (%" PRId32 " - 1)) = %" PRIu64 "\n", + state->sum_sq, state->count, variance); + return variance; +} + +void matstat_merge(matstat_state_t *dest, const matstat_state_t *src) +{ + if (src->count == 0) { + /* src is empty, no-op */ + return; + } + if (dest->count == 0) { + /* dest is empty, straight copy */ + *dest = *src; + return; + } + /* Combining the variance of the two samples needs some extra + * handling if the means are different between the two states, + * source: https://stats.stackexchange.com/a/43183 + * (using sum_sq = sigma2 * n, instead of sum_sq = sigma2 * (n-1) to simplify algorithm) + */ + dest->sum_sq = (dest->sum_sq + dest->sum * dest->mean + src->sum_sq + src->sum * src->mean); + dest->count += src->count; + dest->sum += src->sum; + int32_t new_mean = dest->sum / dest->count; + int64_t diff = -new_mean * dest->sum; + if ((diff < 0) && ((uint64_t)(-diff) > dest->sum_sq)) { + /* Handle corner cases where sum_sq becomes negative */ + dest->sum_sq = 0; + } + else { + dest->sum_sq += diff; + } + dest->mean = new_mean; + if (src->max > dest->max) { + dest->max = src->max; + } + if (src->min < dest->min) { + dest->min = src->min; + } +} diff --git a/tests/unittests/tests-matstat/Makefile b/tests/unittests/tests-matstat/Makefile new file mode 100644 index 0000000000..48422e909a --- /dev/null +++ b/tests/unittests/tests-matstat/Makefile @@ -0,0 +1 @@ +include $(RIOTBASE)/Makefile.base diff --git a/tests/unittests/tests-matstat/Makefile.include b/tests/unittests/tests-matstat/Makefile.include new file mode 100644 index 0000000000..7e5d88411c --- /dev/null +++ b/tests/unittests/tests-matstat/Makefile.include @@ -0,0 +1 @@ +USEMODULE += matstat diff --git a/tests/unittests/tests-matstat/tests-matstat.c b/tests/unittests/tests-matstat/tests-matstat.c new file mode 100644 index 0000000000..cd6f83a483 --- /dev/null +++ b/tests/unittests/tests-matstat/tests-matstat.c @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2018 Eistec AB + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License v2.1. See the file LICENSE in the top level + * directory for more details. + */ + +#include +#include "embUnit.h" +#include "tests-matstat.h" + +#include "matstat.h" + +#define ENABLE_DEBUG (0) +#include "debug.h" + +/* White box testing of matstat library */ + +static void test_matstat_basic(void) +{ + /* nothing special, only verifying the basic functionality */ + matstat_state_t state = MATSTAT_STATE_INIT; + matstat_add(&state, 10); + TEST_ASSERT_EQUAL_INT(10, state.min); + TEST_ASSERT_EQUAL_INT(10, state.max); + TEST_ASSERT_EQUAL_INT(1, state.count); + matstat_add(&state, 20); + TEST_ASSERT_EQUAL_INT(10, state.min); + TEST_ASSERT_EQUAL_INT(20, state.max); + TEST_ASSERT_EQUAL_INT(2, state.count); + matstat_add(&state, 30); + TEST_ASSERT_EQUAL_INT(10, state.min); + TEST_ASSERT_EQUAL_INT(30, state.max); + TEST_ASSERT_EQUAL_INT(3, state.count); + matstat_add(&state, 40); + TEST_ASSERT_EQUAL_INT(10, state.min); + TEST_ASSERT_EQUAL_INT(40, state.max); + TEST_ASSERT_EQUAL_INT(4, state.count); + int32_t mean = matstat_mean(&state); + TEST_ASSERT_EQUAL_INT(25, mean); + uint64_t var = matstat_variance(&state); + TEST_ASSERT_EQUAL_INT(166, var); + matstat_clear(&state); + TEST_ASSERT_EQUAL_INT(0, state.count); +} + +static void test_matstat_var_stability(void) +{ + /* This test is designed to detect stability errors where the values are + * located very close together, which should yield a very low variance. */ + /* The initial implementation of the variance algorithm resulted in a very + * large variance in this test, due to cancellation problems */ + matstat_state_t state = MATSTAT_STATE_INIT; + matstat_add(&state, 999999); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + matstat_add(&state, 1000000); + int32_t mean = matstat_mean(&state); + TEST_ASSERT(mean >= 999999); + TEST_ASSERT(mean <= 1000000); + uint64_t var = matstat_variance(&state); + TEST_ASSERT(var <= 1); +} + +static void test_matstat_negative_variance(void) +{ + /* This is a regression test for two related problems where the truncation + * in the mean computation (integer division) causes the sum_sq value to become + * negative, or the variance itself to become negative */ + matstat_state_t state = MATSTAT_STATE_INIT; + matstat_add(&state, -1); + matstat_add(&state, 0); + uint64_t var = matstat_variance(&state); + TEST_ASSERT_EQUAL_INT(0, var); + matstat_clear(&state); + matstat_add(&state, 1); + matstat_add(&state, 0); + matstat_add(&state, 0); + matstat_add(&state, 0); + var = matstat_variance(&state); + TEST_ASSERT_EQUAL_INT(0, var); + matstat_clear(&state); + matstat_add(&state, 1234567); + for (unsigned int k = 0; k < 9999; ++k) { + matstat_add(&state, 1234567); + matstat_add(&state, 1234566); + } + var = matstat_variance(&state); + TEST_ASSERT_EQUAL_INT(0, var); +} + +static void test_matstat_merge_basic(void) +{ + /* This is a basic test of the merging functionality without any "special" cases */ + matstat_state_t state1 = MATSTAT_STATE_INIT; + matstat_state_t state2 = MATSTAT_STATE_INIT; + matstat_state_t state_ref = MATSTAT_STATE_INIT; + matstat_add(&state1, 2000); + matstat_add(&state_ref, 2000); + matstat_add(&state1, 1000); + matstat_add(&state_ref, 1000); + matstat_add(&state1, 2000); + matstat_add(&state_ref, 2000); + matstat_add(&state2, 2000); + matstat_add(&state_ref, 2000); + matstat_add(&state2, 2456); + matstat_add(&state_ref, 2456); + matstat_add(&state2, 1234); + matstat_add(&state_ref, 1234); + matstat_add(&state2, 5678); + matstat_add(&state_ref, 5678); + matstat_add(&state2, 9999); + matstat_add(&state_ref, 9999); + matstat_merge(&state1, &state2); + TEST_ASSERT_EQUAL_INT(state_ref.min, state1.min); + TEST_ASSERT_EQUAL_INT(state_ref.max, state1.max); + TEST_ASSERT_EQUAL_INT(state_ref.count, state1.count); + TEST_ASSERT_EQUAL_INT(state_ref.sum, state1.sum); + int32_t mean = matstat_mean(&state1); + int32_t mean_ref = matstat_mean(&state_ref); + TEST_ASSERT_EQUAL_INT(mean_ref, mean); +} + +static void test_matstat_merge_empty(void) +{ + /* Testing merging with one or more empty states */ + matstat_state_t state1 = MATSTAT_STATE_INIT; + matstat_state_t state2 = MATSTAT_STATE_INIT; + matstat_merge(&state1, &state2); + TEST_ASSERT_EQUAL_INT(0, state1.count); + TEST_ASSERT_EQUAL_INT(0, state2.count); + TEST_ASSERT_EQUAL_INT(0, state1.sum); + TEST_ASSERT_EQUAL_INT(0, state2.sum); + TEST_ASSERT_EQUAL_INT(0, state1.sum_sq); + TEST_ASSERT_EQUAL_INT(0, state2.sum_sq); + matstat_add(&state1, 2000); + matstat_add(&state1, 1000); + matstat_add(&state1, 2000); + matstat_merge(&state1, &state2); + TEST_ASSERT_EQUAL_INT(1000, state1.min); + TEST_ASSERT_EQUAL_INT(2000, state1.max); + TEST_ASSERT_EQUAL_INT(3, state1.count); + TEST_ASSERT_EQUAL_INT(0, state2.count); + matstat_clear(&state1); + TEST_ASSERT_EQUAL_INT(0, state1.count); + matstat_add(&state2, 2000); + matstat_add(&state2, 1000); + matstat_add(&state2, 2000); + TEST_ASSERT_EQUAL_INT(3, state2.count); + matstat_merge(&state1, &state2); + TEST_ASSERT_EQUAL_INT(1000, state1.min); + TEST_ASSERT_EQUAL_INT(2000, state1.max); + TEST_ASSERT_EQUAL_INT(3, state1.count); +} + +static void test_matstat_merge_variance(void) +{ + /* This test should ensure that merging states from separate sequences will + * yield correct results for the variance computation */ + matstat_state_t state1 = MATSTAT_STATE_INIT; + matstat_state_t state2 = MATSTAT_STATE_INIT; + matstat_state_t state_ref = MATSTAT_STATE_INIT; + matstat_add(&state1, 2000); + matstat_add(&state_ref, 2000); + matstat_add(&state1, 1000); + matstat_add(&state_ref, 1000); + matstat_add(&state1, 2000); + matstat_add(&state_ref, 2000); + matstat_add(&state2, 9999); + matstat_add(&state_ref, 9999); + matstat_add(&state2, 2456); + matstat_add(&state_ref, 2456); + matstat_add(&state2, 1234); + matstat_add(&state_ref, 1234); + matstat_add(&state2, 5678); + matstat_add(&state_ref, 5678); + matstat_add(&state2, 9999); + matstat_add(&state_ref, 9999); + matstat_merge(&state1, &state2); + uint64_t var = matstat_variance(&state1); + uint64_t var_ref = matstat_variance(&state_ref); + int64_t var_diff = var - var_ref; + /* There will invariably be some loss of accuracy because of the integer + * operations involved in the variance computation. */ + TEST_ASSERT(var_diff < 1000); + TEST_ASSERT(var_diff > -1000); + TEST_ASSERT_EQUAL_INT(state_ref.mean, state1.mean); +} + +static void test_matstat_merge_variance_regr1(void) +{ + /* This is a regression check for an issue where the sum_sq variable became + * negative after merging a sequence of states with different means, and + * small but non-zero sum_sq values. */ + /* Numbers were taken from a stats dump from the bench_timers application */ + matstat_state_t inputs[] = { + { .count = 2686, .sum = 5414, .sum_sq = 1380, .min = 1, .max = 3, .mean = 2 }, + { .count = 2643, .sum = 5272, .sum_sq = 3263, .min = 1, .max = 3, .mean = 1 }, + { .count = 2650, .sum = 5328, .sum_sq = 719, .min = 1, .max = 3, .mean = 2 }, + { .count = 2562, .sum = 5117, .sum_sq = 2756, .min = 1, .max = 3, .mean = 1 }, + { .count = 2579, .sum = 5157, .sum_sq = 635, .min = 1, .max = 3, .mean = 1 }, + { .count = 2533, .sum = 5050, .sum_sq = 2944, .min = 1, .max = 3, .mean = 1 }, + { .count = 2630, .sum = 5276, .sum_sq = 1078, .min = 1, .max = 3, .mean = 2 }, + { .count = 2667, .sum = 5333, .sum_sq = 974, .min = 1, .max = 3, .mean = 1 }, + { .count = 2414, .sum = 4859, .sum_sq = 1074, .min = 1, .max = 3, .mean = 2 }, + }; + matstat_state_t merged = MATSTAT_STATE_INIT; + for (unsigned k = 0; k < sizeof(inputs) / sizeof(inputs[0]); ++k) { + matstat_merge(&merged, &inputs[k]); + } + int64_t var = (int64_t)matstat_variance(&merged); + /* Expected variance for this input is 0, because of integer truncation of the result. + * The bug gave the following result instead: + * count = 23364, sum = 46806, sum_sq = 18446744073709540510, mean = 2, var = 789570863061659 + */ + /* Left here for debugging test case failures: */ + /* printf("\nmerged: count = %" PRIu32 ", sum = %" PRId64 ", sum_sq = %" PRIu64 ", " + "mean = %" PRId32 ", var = %" PRIu64 "\n", merged.count, merged.sum, + merged.sum_sq, merged.mean, var); */ + TEST_ASSERT((int64_t)merged.sum_sq > 0); + TEST_ASSERT(var >= 0); +} + +static void test_matstat_accuracy(void) +{ + /* This test verifies that the numeric accuracy is "good enough" */ + matstat_state_t state = MATSTAT_STATE_INIT; + /* + * The test values below were sampled from a normal distribution with + * mean = 12345 + * standard deviation = 10000 => variance = 100000000 + * + * The sample distribution, when computed with double precision floating + * point values, is: + * sample variance = 115969073.207895 + * sample mean = 12293.05 + */ + /* This test will fail unless the library adaptively adjusts the offset to + * reduce the error in the variance */ + matstat_add(&state, -9228); + matstat_add(&state, 6225); + matstat_add(&state, 15935); + matstat_add(&state, 1171); + matstat_add(&state, 9500); + matstat_add(&state, 22805); + matstat_add(&state, 6484); + matstat_add(&state, 10157); + matstat_add(&state, 23870); + matstat_add(&state, 9010); + matstat_add(&state, 16093); + matstat_add(&state, 20969); + matstat_add(&state, 18077); + matstat_add(&state, 9202); + matstat_add(&state, 20074); + matstat_add(&state, 19236); + matstat_add(&state, 32276); + matstat_add(&state, 6342); + matstat_add(&state, 18759); + matstat_add(&state, -11096); + int32_t mean = matstat_mean(&state); + uint64_t var = matstat_variance(&state); + int64_t var_diff = var - 115969073; + TEST_ASSERT(var_diff < 10000); + TEST_ASSERT(var_diff > -10000); + TEST_ASSERT_EQUAL_INT(12293, mean); +} + +Test *tests_matstat_tests(void) +{ + EMB_UNIT_TESTFIXTURES(fixtures) { + new_TestFixture(test_matstat_basic), + new_TestFixture(test_matstat_var_stability), + new_TestFixture(test_matstat_merge_basic), + new_TestFixture(test_matstat_merge_empty), + new_TestFixture(test_matstat_merge_variance), + new_TestFixture(test_matstat_merge_variance_regr1), + new_TestFixture(test_matstat_accuracy), + new_TestFixture(test_matstat_negative_variance), + }; + + EMB_UNIT_TESTCALLER(matstat_tests, NULL, NULL, fixtures); + + return (Test *)&matstat_tests; +} + +void tests_matstat(void) +{ + TESTS_RUN(tests_matstat_tests()); +} diff --git a/tests/unittests/tests-matstat/tests-matstat.h b/tests/unittests/tests-matstat/tests-matstat.h new file mode 100644 index 0000000000..b37904ba24 --- /dev/null +++ b/tests/unittests/tests-matstat/tests-matstat.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2018 Eistec AB + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License v2.1. See the file LICENSE in the top level + * directory for more details. + */ + +/** + * @addtogroup unittests + * @{ + * + * @file + * @brief Unittests for the Matstat library + * + * @author Joakim Nohlgård + */ +#ifndef TESTS_MATSTAT_H +#define TESTS_MATSTAT_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* @brief The entry point of this test suite. +*/ +void tests_matstat(void); + +/** + * @brief Generates tests for matstat + * + * @return embUnit tests if successful, NULL if not. + */ +Test *tests_matstat_tests(void); + +#ifdef __cplusplus +} +#endif + +#endif /* TESTS_MATSTAT_H */ +/** @} */