diff --git a/tests/bench_sys_atomic_utils/Makefile b/tests/bench_sys_atomic_utils/Makefile new file mode 100644 index 0000000000..dc9d10e24a --- /dev/null +++ b/tests/bench_sys_atomic_utils/Makefile @@ -0,0 +1,7 @@ +include ../Makefile.tests_common + +USEMODULE += xtimer +USEMODULE += atomic_utils +USEMODULE += test_utils_interactive_sync + +include $(RIOTBASE)/Makefile.include diff --git a/tests/bench_sys_atomic_utils/Makefile.ci b/tests/bench_sys_atomic_utils/Makefile.ci new file mode 100644 index 0000000000..518b330a9e --- /dev/null +++ b/tests/bench_sys_atomic_utils/Makefile.ci @@ -0,0 +1,3 @@ +BOARD_INSUFFICIENT_MEMORY := \ + stm32f030f4-demo \ + # diff --git a/tests/bench_sys_atomic_utils/README.md b/tests/bench_sys_atomic_utils/README.md new file mode 100644 index 0000000000..3c29cdd6d8 --- /dev/null +++ b/tests/bench_sys_atomic_utils/README.md @@ -0,0 +1,18 @@ +# Benchmark for `sys/atomic_utils` + +This application will perform 100.000 repetitions (or 1.000.000 on +Cortex-M7 and ESP32) for each atomic operation and will print the total time it +took in a table. For comparison, the speed of C11 atomics and plain `volatile` +accesses is also printed. + +## Expectations + +Lower is better! + +Plain `volatile` accesses are not atomic, and therefore should perform faster +than actual atomic operations. If atomic operations turn out to be faster +(by more than rounding errors and noise), something is odd. + +The `atomic_utils` aim to be at least as fast as C11 atomics and often faster. +If the `atomic_utils` implementation performs slower than C11 atomics, you have +found potential for further optimization. 
diff --git a/tests/bench_sys_atomic_utils/main.c b/tests/bench_sys_atomic_utils/main.c new file mode 100644 index 0000000000..e15d4fb231 --- /dev/null +++ b/tests/bench_sys_atomic_utils/main.c @@ -0,0 +1,504 @@ +/* + * Copyright (C) 2020 Otto-von-Guericke-Universität Magdeburg + * + * This file is subject to the terms and conditions of the GNU Lesser + * General Public License v2.1. See the file LICENSE in the top level + * directory for more details. + */ + +/** + * @ingroup tests + * @{ + * + * @file + * @brief Atomic util benchmark + * + * @author Marian Buschsieweke + * + * @} + */ + +#include +#include +#include + +#include "atomic_utils.h" +#include "xtimer.h" + +/* On fast CPUs: 1.000.000 loops */ +#if defined(CPU_CORE_CORTEX_M7) || defined(CPU_ESP32) +#define LOOPS 1000000 +#else +/* Else 100.000 loops */ +#define LOOPS 100000 +#endif + +#define CONCAT(a, b) a ## b +#define CONCAT3(a, b, c) a ## b ## c +#define CONCAT4(a, b, c, d) a ## b ## c ## d + +enum { + IMPL_VOLATILE, + IMPL_ATOMIC_UTIL, + IMPL_C11_ATOMIC, + IMPL_NUMOF +}; + +#define BENCH_ATOMIC_STORE(name, type, c11type) \ + static void CONCAT(bench_atomic_store_, name)(uint32_t *result_us) \ + { \ + uint32_t start, stop; \ + \ + { \ + volatile type val; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + val = 42; \ + } \ + (void)val; \ + stop = xtimer_now_usec(); \ + result_us[IMPL_VOLATILE] = stop - start; \ + } \ + \ + { \ + type val; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT(atomic_store_, name)(&val, 42); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_ATOMIC_UTIL] = stop - start; \ + } \ + \ + { \ + c11type val; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + atomic_store(&val, 42); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_C11_ATOMIC] = stop - start; \ + } \ + } +BENCH_ATOMIC_STORE(u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_STORE(u16, uint16_t, atomic_uint_least16_t) 
+BENCH_ATOMIC_STORE(u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_STORE(u64, uint64_t, atomic_uint_least64_t) + +#define BENCH_ATOMIC_LOAD(name, type, c11type) \ + static void CONCAT(bench_atomic_load_, name)(uint32_t *result_us) \ + { \ + uint32_t start, stop; \ + \ + { \ + volatile type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + type tmp = val; \ + (void)tmp; \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_VOLATILE] = stop - start; \ + } \ + \ + { \ + type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + type tmp = CONCAT(atomic_load_, name)(&val); \ + (void)tmp; \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_ATOMIC_UTIL] = stop - start; \ + } \ + \ + { \ + c11type val = ATOMIC_VAR_INIT(0); \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + type tmp = atomic_load(&val); \ + (void)tmp; \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_C11_ATOMIC] = stop - start; \ + } \ + } +BENCH_ATOMIC_LOAD(u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_LOAD(u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_LOAD(u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_LOAD(u64, uint64_t, atomic_uint_least64_t) + +#define BENCH_ATOMIC_FETCH_OP(opname, op, name, type, c11type) \ + static void CONCAT4(bench_atomic_fetch_, opname, _, name)(uint32_t *result_us) \ + { \ + uint32_t start, stop; \ + \ + { \ + volatile type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + val = val op 1; \ + } \ + (void)val; \ + stop = xtimer_now_usec(); \ + result_us[IMPL_VOLATILE] = stop - start; \ + } \ + \ + { \ + type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT4(atomic_fetch_, opname, _, name)(&val, 1); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_ATOMIC_UTIL] = stop - start; \ + } \ + \ + { \ + c11type val = ATOMIC_VAR_INIT(0); \ + start = xtimer_now_usec(); \ + for 
(uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT(atomic_fetch_, opname)(&val, 1); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_C11_ATOMIC] = stop - start; \ + } \ + } +BENCH_ATOMIC_FETCH_OP(add, +, u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_FETCH_OP(add, +, u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_FETCH_OP(add, +, u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_FETCH_OP(add, +, u64, uint64_t, atomic_uint_least64_t) +BENCH_ATOMIC_FETCH_OP(sub, -, u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_FETCH_OP(sub, -, u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_FETCH_OP(sub, -, u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_FETCH_OP(sub, -, u64, uint64_t, atomic_uint_least64_t) +BENCH_ATOMIC_FETCH_OP(or, |, u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_FETCH_OP(or, |, u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_FETCH_OP(or, |, u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_FETCH_OP(or, |, u64, uint64_t, atomic_uint_least64_t) +BENCH_ATOMIC_FETCH_OP(xor, ^, u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_FETCH_OP(xor, ^, u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_FETCH_OP(xor, ^, u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_FETCH_OP(xor, ^, u64, uint64_t, atomic_uint_least64_t) +BENCH_ATOMIC_FETCH_OP(and, &, u8, uint8_t, atomic_uint_least8_t) +BENCH_ATOMIC_FETCH_OP(and, &, u16, uint16_t, atomic_uint_least16_t) +BENCH_ATOMIC_FETCH_OP(and, &, u32, uint32_t, atomic_uint_least32_t) +BENCH_ATOMIC_FETCH_OP(and, &, u64, uint64_t, atomic_uint_least64_t) + +#define BENCH_ATOMIC_SET_CLEAR_BIT(name, type, c11type, opname, set_or_clear) \ + static void CONCAT4(bench_atomic_, opname, _bit_, name)(uint32_t *result_us) \ + { \ + uint32_t start, stop; \ + static const uint8_t _bit = 5; \ + type mask = ((type)1) << _bit; \ + \ + { \ + volatile type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + if (set_or_clear) { \ + val |= mask; \ + } \ + else { \ + val &= ~(mask); \ + } \ + } 
\ + (void)val; \ + stop = xtimer_now_usec(); \ + result_us[IMPL_VOLATILE] = stop - start; \ + } \ + \ + { \ + static type val = 0; \ + CONCAT3(atomic_bit_, name, _t) bit = \ + CONCAT(atomic_bit_, name)(&val, _bit); \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT4(atomic_, opname, _bit_, name)(bit); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_ATOMIC_UTIL] = stop - start; \ + } \ + \ + { \ + c11type val = ATOMIC_VAR_INIT(0); \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + if (set_or_clear) { \ + atomic_fetch_or(&val, mask); \ + } \ + else { \ + atomic_fetch_and(&val, ~(mask)); \ + } \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_C11_ATOMIC] = stop - start; \ + } \ + } +BENCH_ATOMIC_SET_CLEAR_BIT(u8, uint8_t, atomic_uint_least8_t, set, 1) +BENCH_ATOMIC_SET_CLEAR_BIT(u16, uint16_t, atomic_uint_least16_t, set, 1) +BENCH_ATOMIC_SET_CLEAR_BIT(u32, uint32_t, atomic_uint_least32_t, set, 1) +BENCH_ATOMIC_SET_CLEAR_BIT(u64, uint64_t, atomic_uint_least64_t, set, 1) +BENCH_ATOMIC_SET_CLEAR_BIT(u8, uint8_t, atomic_uint_least8_t, clear, 0) +BENCH_ATOMIC_SET_CLEAR_BIT(u16, uint16_t, atomic_uint_least16_t, clear, 0) +BENCH_ATOMIC_SET_CLEAR_BIT(u32, uint32_t, atomic_uint_least32_t, clear, 0) +BENCH_ATOMIC_SET_CLEAR_BIT(u64, uint64_t, atomic_uint_least64_t, clear, 0) + +#define BENCH_SEMI_ATOMIC_FETCH_OP(opname, op, name, type, c11type) \ + static void CONCAT4(bench_semi_atomic_fetch_, opname, _, name)(uint32_t *result_us) \ + { \ + uint32_t start, stop; \ + \ + { \ + volatile type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + val = val op 1; \ + } \ + (void)val; \ + stop = xtimer_now_usec(); \ + result_us[IMPL_VOLATILE] = stop - start; \ + } \ + \ + { \ + type val = 0; \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT4(semi_atomic_fetch_, opname, _, name)(&val, 1); \ + } \ + stop = xtimer_now_usec(); \ + 
result_us[IMPL_ATOMIC_UTIL] = stop - start; \ + } \ + \ + { \ + c11type val = ATOMIC_VAR_INIT(0); \ + start = xtimer_now_usec(); \ + for (uint32_t i = 0; i < LOOPS; i++) { \ + CONCAT(atomic_fetch_, opname)(&val, 1); \ + } \ + stop = xtimer_now_usec(); \ + result_us[IMPL_C11_ATOMIC] = stop - start; \ + } \ + } +BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u8, uint8_t, atomic_uint_least8_t) +BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u16, uint16_t, atomic_uint_least16_t) +BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u32, uint32_t, atomic_uint_least32_t) +BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u64, uint64_t, atomic_uint_least64_t) +BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u8, uint8_t, atomic_uint_least8_t) +BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u16, uint16_t, atomic_uint_least16_t) +BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u32, uint32_t, atomic_uint_least32_t) +BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u64, uint64_t, atomic_uint_least64_t) +BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u8, uint8_t, atomic_uint_least8_t) +BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u16, uint16_t, atomic_uint_least16_t) +BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u32, uint32_t, atomic_uint_least32_t) +BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u64, uint64_t, atomic_uint_least64_t) +BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u8, uint8_t, atomic_uint_least8_t) +BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u16, uint16_t, atomic_uint_least16_t) +BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u32, uint32_t, atomic_uint_least32_t) +BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u64, uint64_t, atomic_uint_least64_t) +BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u8, uint8_t, atomic_uint_least8_t) +BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u16, uint16_t, atomic_uint_least16_t) +BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u32, uint32_t, atomic_uint_least32_t) +BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u64, uint64_t, atomic_uint_least64_t) + +#define LINE "+------+----------+------+------------------+" \ + "------------------+------------------+" +#define FMT "| %4s | %8s | %4u | %13" PRIu32 " µs | %13" PRIu32 " µs | " \ + "%13" PRIu32 " µs |\n" +int main(void) 
+{ + uint32_t results[IMPL_NUMOF]; + + puts("Note: LOWER IS BETTER!\n"); + puts(LINE); + printf("| %4s | %8s | %4s | %16s | %16s | %16s |\n", + "mode", "op", "bits", "volatile", "atomic util", "c11 atomic"); + puts(LINE); + + bench_atomic_store_u8(results); + printf(FMT, "atom", "store", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_store_u16(results); + printf(FMT, "atom", "store", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_store_u32(results); + printf(FMT, "atom", "store", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_store_u64(results); + printf(FMT, "atom", "store", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_load_u8(results); + printf(FMT, "atom", "load", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_load_u16(results); + printf(FMT, "atom", "load", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_load_u32(results); + printf(FMT, "atom", "load", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_load_u64(results); + printf(FMT, "atom", "load", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + /* atomic read-modify-write operations */ + bench_atomic_fetch_add_u8(results); + printf(FMT, "atom", "add", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_add_u16(results); + printf(FMT, "atom", "add", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_add_u32(results); + printf(FMT, "atom", "add", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_add_u64(results); + printf(FMT, "atom", "add", 64, 
results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_fetch_sub_u8(results); + printf(FMT, "atom", "sub", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_sub_u16(results); + printf(FMT, "atom", "sub", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_sub_u32(results); + printf(FMT, "atom", "sub", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_sub_u64(results); + printf(FMT, "atom", "sub", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_fetch_or_u8(results); + printf(FMT, "atom", "or", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_or_u16(results); + printf(FMT, "atom", "or", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_or_u32(results); + printf(FMT, "atom", "or", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_or_u64(results); + printf(FMT, "atom", "or", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_fetch_xor_u8(results); + printf(FMT, "atom", "xor", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_xor_u16(results); + printf(FMT, "atom", "xor", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_xor_u32(results); + printf(FMT, "atom", "xor", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_xor_u64(results); + printf(FMT, "atom", "xor", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_fetch_and_u8(results); + printf(FMT, "atom", "and", 8, results[IMPL_VOLATILE], + 
results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_and_u16(results); + printf(FMT, "atom", "and", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_and_u32(results); + printf(FMT, "atom", "and", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_fetch_and_u64(results); + printf(FMT, "atom", "and", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + /* atomic bit setting and clearing */ + bench_atomic_set_bit_u8(results); + printf(FMT, "atom", "set", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_set_bit_u16(results); + printf(FMT, "atom", "set", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_set_bit_u32(results); + printf(FMT, "atom", "set", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_set_bit_u64(results); + printf(FMT, "atom", "set", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_atomic_clear_bit_u8(results); + printf(FMT, "atom", "clear", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_clear_bit_u16(results); + printf(FMT, "atom", "clear", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_clear_bit_u32(results); + printf(FMT, "atom", "clear", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_atomic_clear_bit_u64(results); + printf(FMT, "atom", "clear", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + /* semi-atomic read-modify-write operations */ + bench_semi_atomic_fetch_add_u8(results); + printf(FMT, "semi", "add", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + 
bench_semi_atomic_fetch_add_u16(results); + printf(FMT, "semi", "add", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_add_u32(results); + printf(FMT, "semi", "add", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_add_u64(results); + printf(FMT, "semi", "add", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_semi_atomic_fetch_sub_u8(results); + printf(FMT, "semi", "sub", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_sub_u16(results); + printf(FMT, "semi", "sub", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_sub_u32(results); + printf(FMT, "semi", "sub", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_sub_u64(results); + printf(FMT, "semi", "sub", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_semi_atomic_fetch_or_u8(results); + printf(FMT, "semi", "or", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_or_u16(results); + printf(FMT, "semi", "or", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_or_u32(results); + printf(FMT, "semi", "or", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_or_u64(results); + printf(FMT, "semi", "or", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_semi_atomic_fetch_xor_u8(results); + printf(FMT, "semi", "xor", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_xor_u16(results); + printf(FMT, "semi", "xor", 16, results[IMPL_VOLATILE], + 
results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_xor_u32(results); + printf(FMT, "semi", "xor", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_xor_u64(results); + printf(FMT, "semi", "xor", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + + bench_semi_atomic_fetch_and_u8(results); + printf(FMT, "semi", "and", 8, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_and_u16(results); + printf(FMT, "semi", "and", 16, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_and_u32(results); + printf(FMT, "semi", "and", 32, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + bench_semi_atomic_fetch_and_u64(results); + printf(FMT, "semi", "and", 64, results[IMPL_VOLATILE], + results[IMPL_ATOMIC_UTIL], results[IMPL_C11_ATOMIC]); + puts(LINE); + return 0; +}