tests: Add benchmark for sys/atomic_utils

This commit is contained in:
Marian Buschsieweke 2020-06-22 22:16:42 +02:00
parent 56a54a773e
commit 23c91db0b5
No known key found for this signature in database
GPG Key ID: 61F64C6599B1539F
4 changed files with 532 additions and 0 deletions

View File

@ -0,0 +1,7 @@
# Common build rules shared by all RIOT test applications
include ../Makefile.tests_common

# Timer backend used to measure the benchmark durations
USEMODULE += xtimer
# The module under benchmark
USEMODULE += atomic_utils
# Synchronize with the test runner before emitting output
USEMODULE += test_utils_interactive_sync

include $(RIOTBASE)/Makefile.include

View File

@ -0,0 +1,3 @@
# Boards whose flash/RAM is too small to build this benchmark in CI
BOARD_INSUFFICIENT_MEMORY := \
stm32f030f4-demo \
#
# (the trailing '#' terminates the backslash-continued list)

View File

@ -0,0 +1,18 @@
# Benchmark for `sys/atomic_utils`
This application will perform 100,000 repetitions (or 1,000,000 on
Cortex-M7 and ESP32) for each atomic operation and will print the total time it
took in a table. For comparison, the speed of C11 atomics and plain `volatile`
accesses are also printed.
## Expectations
Lower is better!
Plain `volatile` accesses are not atomic, and therefore should perform faster
than actual atomic operations. If atomic operations turn out to be faster
(by more than rounding errors and noise), something is odd.
The `atomic_utils` aim to be at least as fast as C11 atomics and often faster.
If the `atomic_utils` implementation performs slower than C11 atomics, you have
found potential for further optimization.

View File

@ -0,0 +1,504 @@
/*
* Copyright (C) 2020 Otto-von-Guericke-Universität Magdeburg
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
/**
* @ingroup tests
* @{
*
* @file
* @brief Atomic util benchmark
*
* @author Marian Buschsieweke <marian.buschsieweke@ovgu.de>
*
* @}
*/
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#include "atomic_utils.h"
#include "xtimer.h"
/* Number of repetitions per benchmarked operation. Faster CPUs get more
 * iterations so the per-operation cost still dominates measurement noise. */
/* On fast CPUs: 1,000,000 loops */
#if defined(CPU_CORE_CORTEX_M7) || defined(CPU_ESP32)
#define LOOPS 1000000
#else
/* Else 100,000 loops */
#define LOOPS 100000
#endif

/* Token-pasting helpers used to build function and type names from the
 * benchmark macro parameters, e.g. CONCAT(atomic_store_, u8) expands to
 * atomic_store_u8 */
#define CONCAT(a, b) a ## b
#define CONCAT3(a, b, c) a ## b ## c
#define CONCAT4(a, b, c, d) a ## b ## c ## d

/* Indices into a benchmark's result array: one slot per compared
 * implementation */
enum {
    IMPL_VOLATILE,      /* plain volatile access (NOT atomic) — the baseline */
    IMPL_ATOMIC_UTIL,   /* atomic_utils implementation under test */
    IMPL_C11_ATOMIC,    /* C11 <stdatomic.h> implementation */
    IMPL_NUMOF          /* number of implementations, i.e. array size */
};
/**
 * @brief   Generate a benchmark function for atomic stores of a given width
 *
 * The generated function bench_atomic_store_<name>() performs LOOPS stores
 * with each of the three implementations (plain volatile, atomic_utils, C11
 * atomics) and writes the elapsed time in µs per implementation into
 * result_us, indexed by the IMPL_* enum.
 *
 * @param   name        function/type suffix, e.g. u8
 * @param   type        plain integer type, e.g. uint8_t
 * @param   c11type     matching C11 atomic type, e.g. atomic_uint_least8_t
 */
#define BENCH_ATOMIC_STORE(name, type, c11type) \
    static void CONCAT(bench_atomic_store_, name)(uint32_t *result_us) \
    { \
        uint32_t start, stop; \
        \
        { \
            volatile type val; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                val = 42; \
            } \
            (void)val; \
            stop = xtimer_now_usec(); \
            result_us[IMPL_VOLATILE] = stop - start; \
        } \
        \
        { \
            type val; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT(atomic_store_, name)(&val, 42); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_ATOMIC_UTIL] = stop - start; \
        } \
        \
        { \
            c11type val; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                atomic_store(&val, 42); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_C11_ATOMIC] = stop - start; \
        } \
    }

/* Generate bench_atomic_store_{u8,u16,u32,u64}() */
BENCH_ATOMIC_STORE(u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_STORE(u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_STORE(u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_STORE(u64, uint64_t, atomic_uint_least64_t)
/**
 * @brief   Generate a benchmark function for atomic loads of a given width
 *
 * The generated function bench_atomic_load_<name>() performs LOOPS loads with
 * each of the three implementations and writes the elapsed time in µs per
 * implementation into result_us, indexed by the IMPL_* enum. Each loaded
 * value is read into a local that is immediately discarded via (void)tmp.
 *
 * @param   name        function/type suffix, e.g. u8
 * @param   type        plain integer type, e.g. uint8_t
 * @param   c11type     matching C11 atomic type, e.g. atomic_uint_least8_t
 */
#define BENCH_ATOMIC_LOAD(name, type, c11type) \
    static void CONCAT(bench_atomic_load_, name)(uint32_t *result_us) \
    { \
        uint32_t start, stop; \
        \
        { \
            volatile type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                type tmp = val; \
                (void)tmp; \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_VOLATILE] = stop - start; \
        } \
        \
        { \
            type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                type tmp = CONCAT(atomic_load_, name)(&val); \
                (void)tmp; \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_ATOMIC_UTIL] = stop - start; \
        } \
        \
        { \
            c11type val = ATOMIC_VAR_INIT(0); \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                type tmp = atomic_load(&val); \
                (void)tmp; \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_C11_ATOMIC] = stop - start; \
        } \
    }

/* Generate bench_atomic_load_{u8,u16,u32,u64}() */
BENCH_ATOMIC_LOAD(u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_LOAD(u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_LOAD(u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_LOAD(u64, uint64_t, atomic_uint_least64_t)
/**
 * @brief   Generate a benchmark for an atomic fetch-<op> of a given width
 *
 * The generated function bench_atomic_fetch_<opname>_<name>() performs LOOPS
 * read-modify-write operations with each of the three implementations and
 * writes the elapsed time in µs per implementation into result_us, indexed by
 * the IMPL_* enum. Note that the volatile baseline is a plain (non-atomic)
 * read-modify-write.
 *
 * @param   opname      operation name; must match a C11 atomic_fetch_<opname>
 *                      generic function (add, sub, or, xor, and)
 * @param   op          the corresponding C operator (+, -, |, ^, &)
 * @param   name        function/type suffix, e.g. u8
 * @param   type        plain integer type, e.g. uint8_t
 * @param   c11type     matching C11 atomic type, e.g. atomic_uint_least8_t
 */
#define BENCH_ATOMIC_FETCH_OP(opname, op, name, type, c11type) \
    static void CONCAT4(bench_atomic_fetch_, opname, _, name)(uint32_t *result_us) \
    { \
        uint32_t start, stop; \
        \
        { \
            volatile type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                val = val op 1; \
            } \
            (void)val; \
            stop = xtimer_now_usec(); \
            result_us[IMPL_VOLATILE] = stop - start; \
        } \
        \
        { \
            type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT4(atomic_fetch_, opname, _, name)(&val, 1); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_ATOMIC_UTIL] = stop - start; \
        } \
        \
        { \
            c11type val = ATOMIC_VAR_INIT(0); \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT(atomic_fetch_, opname)(&val, 1); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_C11_ATOMIC] = stop - start; \
        } \
    }

/* Generate bench_atomic_fetch_{add,sub,or,xor,and}_{u8,u16,u32,u64}() */
BENCH_ATOMIC_FETCH_OP(add, +, u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_FETCH_OP(add, +, u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_FETCH_OP(add, +, u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_FETCH_OP(add, +, u64, uint64_t, atomic_uint_least64_t)
BENCH_ATOMIC_FETCH_OP(sub, -, u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_FETCH_OP(sub, -, u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_FETCH_OP(sub, -, u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_FETCH_OP(sub, -, u64, uint64_t, atomic_uint_least64_t)
BENCH_ATOMIC_FETCH_OP(or, |, u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_FETCH_OP(or, |, u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_FETCH_OP(or, |, u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_FETCH_OP(or, |, u64, uint64_t, atomic_uint_least64_t)
BENCH_ATOMIC_FETCH_OP(xor, ^, u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_FETCH_OP(xor, ^, u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_FETCH_OP(xor, ^, u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_FETCH_OP(xor, ^, u64, uint64_t, atomic_uint_least64_t)
BENCH_ATOMIC_FETCH_OP(and, &, u8, uint8_t, atomic_uint_least8_t)
BENCH_ATOMIC_FETCH_OP(and, &, u16, uint16_t, atomic_uint_least16_t)
BENCH_ATOMIC_FETCH_OP(and, &, u32, uint32_t, atomic_uint_least32_t)
BENCH_ATOMIC_FETCH_OP(and, &, u64, uint64_t, atomic_uint_least64_t)
/**
 * @brief   Generate a benchmark for atomic single-bit set/clear operations
 *
 * The generated function bench_atomic_<opname>_bit_<name>() sets (or clears)
 * bit 5 of a variable LOOPS times with each of the three implementations and
 * writes the elapsed time in µs per implementation into result_us, indexed by
 * the IMPL_* enum. Since set_or_clear is a compile-time constant, the
 * if/else in the loops is expected to be resolved at compile time.
 *
 * @param   name            function/type suffix, e.g. u8
 * @param   type            plain integer type, e.g. uint8_t
 * @param   c11type         matching C11 atomic type
 * @param   opname          "set" or "clear"; must match the
 *                          atomic_<opname>_bit_<name>() helper to call
 * @param   set_or_clear    1 to benchmark setting the bit, 0 for clearing
 */
#define BENCH_ATOMIC_SET_CLEAR_BIT(name, type, c11type, opname, set_or_clear) \
    static void CONCAT4(bench_atomic_, opname, _bit_, name)(uint32_t *result_us) \
    { \
        uint32_t start, stop; \
        static const uint8_t _bit = 5; \
        type mask = ((type)1) << _bit; \
        \
        { \
            volatile type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                if (set_or_clear) { \
                    val |= mask; \
                } \
                else { \
                    val &= ~(mask); \
                } \
            } \
            (void)val; \
            stop = xtimer_now_usec(); \
            result_us[IMPL_VOLATILE] = stop - start; \
        } \
        \
        { \
            static type val = 0; \
            CONCAT3(atomic_bit_, name, _t) bit = \
                CONCAT(atomic_bit_, name)(&val, _bit); \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT4(atomic_, opname, _bit_, name)(bit); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_ATOMIC_UTIL] = stop - start; \
        } \
        \
        { \
            c11type val = ATOMIC_VAR_INIT(0); \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                if (set_or_clear) { \
                    atomic_fetch_or(&val, mask); \
                } \
                else { \
                    atomic_fetch_and(&val, ~(mask)); \
                } \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_C11_ATOMIC] = stop - start; \
        } \
    }

/* Generate bench_atomic_{set,clear}_bit_{u8,u16,u32,u64}().
 * NOTE(review): the atomic_utils section uses a static val — presumably
 * because the atomic_bit_<name>_t handle keeps a pointer to it; confirm
 * against the atomic_utils API docs. */
BENCH_ATOMIC_SET_CLEAR_BIT(u8, uint8_t, atomic_uint_least8_t, set, 1)
BENCH_ATOMIC_SET_CLEAR_BIT(u16, uint16_t, atomic_uint_least16_t, set, 1)
BENCH_ATOMIC_SET_CLEAR_BIT(u32, uint32_t, atomic_uint_least32_t, set, 1)
BENCH_ATOMIC_SET_CLEAR_BIT(u64, uint64_t, atomic_uint_least64_t, set, 1)
BENCH_ATOMIC_SET_CLEAR_BIT(u8, uint8_t, atomic_uint_least8_t, clear, 0)
BENCH_ATOMIC_SET_CLEAR_BIT(u16, uint16_t, atomic_uint_least16_t, clear, 0)
BENCH_ATOMIC_SET_CLEAR_BIT(u32, uint32_t, atomic_uint_least32_t, clear, 0)
BENCH_ATOMIC_SET_CLEAR_BIT(u64, uint64_t, atomic_uint_least64_t, clear, 0)
/**
 * @brief   Generate a benchmark for a semi-atomic fetch-<op> of a given width
 *
 * Identical in structure to BENCH_ATOMIC_FETCH_OP(), but the atomic_utils
 * section calls the semi_atomic_fetch_<opname>_<name>() variants. The C11
 * comparison column still uses the fully atomic atomic_fetch_<opname>(), as
 * C11 offers no semi-atomic equivalent.
 *
 * @param   opname      operation name; must match a C11 atomic_fetch_<opname>
 *                      generic function (add, sub, or, xor, and)
 * @param   op          the corresponding C operator (+, -, |, ^, &)
 * @param   name        function/type suffix, e.g. u8
 * @param   type        plain integer type, e.g. uint8_t
 * @param   c11type     matching C11 atomic type, e.g. atomic_uint_least8_t
 */
#define BENCH_SEMI_ATOMIC_FETCH_OP(opname, op, name, type, c11type) \
    static void CONCAT4(bench_semi_atomic_fetch_, opname, _, name)(uint32_t *result_us) \
    { \
        uint32_t start, stop; \
        \
        { \
            volatile type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                val = val op 1; \
            } \
            (void)val; \
            stop = xtimer_now_usec(); \
            result_us[IMPL_VOLATILE] = stop - start; \
        } \
        \
        { \
            type val = 0; \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT4(semi_atomic_fetch_, opname, _, name)(&val, 1); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_ATOMIC_UTIL] = stop - start; \
        } \
        \
        { \
            c11type val = ATOMIC_VAR_INIT(0); \
            start = xtimer_now_usec(); \
            for (uint32_t i = 0; i < LOOPS; i++) { \
                CONCAT(atomic_fetch_, opname)(&val, 1); \
            } \
            stop = xtimer_now_usec(); \
            result_us[IMPL_C11_ATOMIC] = stop - start; \
        } \
    }

/* Generate bench_semi_atomic_fetch_{add,sub,or,xor,and}_{u8,u16,u32,u64}() */
BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u8, uint8_t, atomic_uint_least8_t)
BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u16, uint16_t, atomic_uint_least16_t)
BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u32, uint32_t, atomic_uint_least32_t)
BENCH_SEMI_ATOMIC_FETCH_OP(add, +, u64, uint64_t, atomic_uint_least64_t)
BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u8, uint8_t, atomic_uint_least8_t)
BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u16, uint16_t, atomic_uint_least16_t)
BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u32, uint32_t, atomic_uint_least32_t)
BENCH_SEMI_ATOMIC_FETCH_OP(sub, -, u64, uint64_t, atomic_uint_least64_t)
BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u8, uint8_t, atomic_uint_least8_t)
BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u16, uint16_t, atomic_uint_least16_t)
BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u32, uint32_t, atomic_uint_least32_t)
BENCH_SEMI_ATOMIC_FETCH_OP(or, |, u64, uint64_t, atomic_uint_least64_t)
BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u8, uint8_t, atomic_uint_least8_t)
BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u16, uint16_t, atomic_uint_least16_t)
BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u32, uint32_t, atomic_uint_least32_t)
BENCH_SEMI_ATOMIC_FETCH_OP(xor, ^, u64, uint64_t, atomic_uint_least64_t)
BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u8, uint8_t, atomic_uint_least8_t)
BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u16, uint16_t, atomic_uint_least16_t)
BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u32, uint32_t, atomic_uint_least32_t)
BENCH_SEMI_ATOMIC_FETCH_OP(and, &, u64, uint64_t, atomic_uint_least64_t)
/* Horizontal border row of the result table */
#define LINE "+------+----------+------+------------------+" \
             "------------------+------------------+"
/* One result row: mode | op | bit width | three timings in µs.
 * NOTE(review): PRIu32 is declared in <inttypes.h>, which this file does not
 * include directly — verify it is pulled in transitively (e.g. via xtimer.h)
 * or add the include. */
#define FMT "| %4s | %8s | %4u | %13" PRIu32 " µs | %13" PRIu32 " µs | " \
            "%13" PRIu32 " µs |\n"
/**
 * @brief   Print one row of the benchmark result table
 *
 * @param   mode    "atom" for fully atomic, "semi" for semi-atomic benchmarks
 * @param   op      name of the benchmarked operation, e.g. "store"
 * @param   bits    width of the operand type in bits
 * @param   res     per-implementation timings in µs, indexed by IMPL_*
 */
static void print_row(const char *mode, const char *op, unsigned bits,
                      const uint32_t *res)
{
    printf(FMT, mode, op, bits, res[IMPL_VOLATILE],
           res[IMPL_ATOMIC_UTIL], res[IMPL_C11_ATOMIC]);
}

/**
 * @brief   Run every generated benchmark and print the results as a table
 *
 * Each bench_*() call fills @c res with one timing per implementation, which
 * print_row() then formats as a single table row.
 */
int main(void)
{
    uint32_t res[IMPL_NUMOF];

    puts("Note: LOWER IS BETTER!\n");
    puts(LINE);
    printf("| %4s | %8s | %4s | %16s | %16s | %16s |\n",
           "mode", "op", "bits", "volatile", "atomic util", "c11 atomic");
    puts(LINE);

    /* atomic stores */
    bench_atomic_store_u8(res);
    print_row("atom", "store", 8, res);
    bench_atomic_store_u16(res);
    print_row("atom", "store", 16, res);
    bench_atomic_store_u32(res);
    print_row("atom", "store", 32, res);
    bench_atomic_store_u64(res);
    print_row("atom", "store", 64, res);

    /* atomic loads */
    bench_atomic_load_u8(res);
    print_row("atom", "load", 8, res);
    bench_atomic_load_u16(res);
    print_row("atom", "load", 16, res);
    bench_atomic_load_u32(res);
    print_row("atom", "load", 32, res);
    bench_atomic_load_u64(res);
    print_row("atom", "load", 64, res);

    /* atomic read-modify-write operations */
    bench_atomic_fetch_add_u8(res);
    print_row("atom", "add", 8, res);
    bench_atomic_fetch_add_u16(res);
    print_row("atom", "add", 16, res);
    bench_atomic_fetch_add_u32(res);
    print_row("atom", "add", 32, res);
    bench_atomic_fetch_add_u64(res);
    print_row("atom", "add", 64, res);
    bench_atomic_fetch_sub_u8(res);
    print_row("atom", "sub", 8, res);
    bench_atomic_fetch_sub_u16(res);
    print_row("atom", "sub", 16, res);
    bench_atomic_fetch_sub_u32(res);
    print_row("atom", "sub", 32, res);
    bench_atomic_fetch_sub_u64(res);
    print_row("atom", "sub", 64, res);
    bench_atomic_fetch_or_u8(res);
    print_row("atom", "or", 8, res);
    bench_atomic_fetch_or_u16(res);
    print_row("atom", "or", 16, res);
    bench_atomic_fetch_or_u32(res);
    print_row("atom", "or", 32, res);
    bench_atomic_fetch_or_u64(res);
    print_row("atom", "or", 64, res);
    bench_atomic_fetch_xor_u8(res);
    print_row("atom", "xor", 8, res);
    bench_atomic_fetch_xor_u16(res);
    print_row("atom", "xor", 16, res);
    bench_atomic_fetch_xor_u32(res);
    print_row("atom", "xor", 32, res);
    bench_atomic_fetch_xor_u64(res);
    print_row("atom", "xor", 64, res);
    bench_atomic_fetch_and_u8(res);
    print_row("atom", "and", 8, res);
    bench_atomic_fetch_and_u16(res);
    print_row("atom", "and", 16, res);
    bench_atomic_fetch_and_u32(res);
    print_row("atom", "and", 32, res);
    bench_atomic_fetch_and_u64(res);
    print_row("atom", "and", 64, res);

    /* atomic bit setting and clearing */
    bench_atomic_set_bit_u8(res);
    print_row("atom", "set", 8, res);
    bench_atomic_set_bit_u16(res);
    print_row("atom", "set", 16, res);
    bench_atomic_set_bit_u32(res);
    print_row("atom", "set", 32, res);
    bench_atomic_set_bit_u64(res);
    print_row("atom", "set", 64, res);
    bench_atomic_clear_bit_u8(res);
    print_row("atom", "clear", 8, res);
    bench_atomic_clear_bit_u16(res);
    print_row("atom", "clear", 16, res);
    bench_atomic_clear_bit_u32(res);
    print_row("atom", "clear", 32, res);
    bench_atomic_clear_bit_u64(res);
    print_row("atom", "clear", 64, res);

    /* semi-atomic read-modify-write operations */
    bench_semi_atomic_fetch_add_u8(res);
    print_row("semi", "add", 8, res);
    bench_semi_atomic_fetch_add_u16(res);
    print_row("semi", "add", 16, res);
    bench_semi_atomic_fetch_add_u32(res);
    print_row("semi", "add", 32, res);
    bench_semi_atomic_fetch_add_u64(res);
    print_row("semi", "add", 64, res);
    bench_semi_atomic_fetch_sub_u8(res);
    print_row("semi", "sub", 8, res);
    bench_semi_atomic_fetch_sub_u16(res);
    print_row("semi", "sub", 16, res);
    bench_semi_atomic_fetch_sub_u32(res);
    print_row("semi", "sub", 32, res);
    bench_semi_atomic_fetch_sub_u64(res);
    print_row("semi", "sub", 64, res);
    bench_semi_atomic_fetch_or_u8(res);
    print_row("semi", "or", 8, res);
    bench_semi_atomic_fetch_or_u16(res);
    print_row("semi", "or", 16, res);
    bench_semi_atomic_fetch_or_u32(res);
    print_row("semi", "or", 32, res);
    bench_semi_atomic_fetch_or_u64(res);
    print_row("semi", "or", 64, res);
    bench_semi_atomic_fetch_xor_u8(res);
    print_row("semi", "xor", 8, res);
    bench_semi_atomic_fetch_xor_u16(res);
    print_row("semi", "xor", 16, res);
    bench_semi_atomic_fetch_xor_u32(res);
    print_row("semi", "xor", 32, res);
    bench_semi_atomic_fetch_xor_u64(res);
    print_row("semi", "xor", 64, res);
    bench_semi_atomic_fetch_and_u8(res);
    print_row("semi", "and", 8, res);
    bench_semi_atomic_fetch_and_u16(res);
    print_row("semi", "and", 16, res);
    bench_semi_atomic_fetch_and_u32(res);
    print_row("semi", "and", 32, res);
    bench_semi_atomic_fetch_and_u64(res);
    print_row("semi", "and", 64, res);

    puts(LINE);
    return 0;
}