// ©2013-2015 Cameron Desrochers. // Distributed under the simplified BSD license (see the LICENSE file that // should have come with this file). // Benchmarks for moodycamel::ReaderWriterQueue. #if defined(_MSC_VER) && _MSC_VER < 1700 #define NO_FOLLY_SUPPORT #endif #if !defined(__amd64__) && !defined(_M_X64) && !defined(__x86_64__) && !defined(_M_IX86) && !defined(__i386__) #define NO_SPSC_SUPPORT // SPSC implementation is for x86 only #endif #include "ext/1024cores/spscqueue.h" // Dmitry's (on Intel site) #ifndef NO_FOLLY_SUPPORT #include "ext/folly/ProducerConsumerQueue.h" // Facebook's folly (GitHub) #endif #include "../readerwriterqueue.h" // Mine #include "systemtime.h" #include "../tests/common/simplethread.h" #include #include #include // For std::accumulate #include #include #include #ifndef UNUSED #define UNUSED(x) ((void)x); #endif using namespace moodycamel; #ifndef NO_FOLLY_SUPPORT using namespace folly; #endif typedef std::minstd_rand RNG_t; enum BenchmarkType { bench_raw_add, bench_raw_remove, bench_empty_remove, bench_single_threaded, bench_mostly_add, bench_mostly_remove, bench_heavy_concurrent, bench_random_concurrent, BENCHMARK_COUNT }; // Returns the number of seconds elapsed (high-precision), and the number of enqueue/dequeue // operations performed (in the out_Ops parameter) template double runBenchmark(BenchmarkType benchmark, unsigned int randomSeed, double& out_Ops); const int BENCHMARK_NAME_MAX = 17; // Not including null terminator const char* benchmarkName(BenchmarkType benchmark); int main(int argc, char** argv) { #ifdef NDEBUG const int TEST_COUNT = 25; #else const int TEST_COUNT = 2; #endif assert(TEST_COUNT >= 2); const double FASTEST_PERCENT_CONSIDERED = 20; // Consider only the fastest runs in the top 20% double rwqResults[BENCHMARK_COUNT][TEST_COUNT]; double spscResults[BENCHMARK_COUNT][TEST_COUNT]; double follyResults[BENCHMARK_COUNT][TEST_COUNT]; // Also calculate a rough heuristic of "ops/s" (across all runs, not just fastest) double rwqOps[BENCHMARK_COUNT][TEST_COUNT]; double spscOps[BENCHMARK_COUNT][TEST_COUNT]; double follyOps[BENCHMARK_COUNT][TEST_COUNT]; // Make sure the randomness of each benchmark run is identical unsigned int randSeeds[BENCHMARK_COUNT]; for (unsigned int i = 0; i != BENCHMARK_COUNT; ++i) { randSeeds[i] = ((unsigned int)time(NULL)) * i; } // Run benchmarks for (int benchmark = 0; benchmark < BENCHMARK_COUNT; ++benchmark) { for (int i = 0; i < TEST_COUNT; ++i) { rwqResults[benchmark][i] = runBenchmark>((BenchmarkType)benchmark, randSeeds[benchmark], rwqOps[benchmark][i]); } #ifndef NO_SPSC_SUPPORT for (int i = 0; i < TEST_COUNT; ++i) { spscResults[benchmark][i] = runBenchmark>((BenchmarkType)benchmark, randSeeds[benchmark], spscOps[benchmark][i]); } #else for (int i = 0; i < TEST_COUNT; ++i) { spscResults[benchmark][i] = 0; spscOps[benchmark][i] = 0; } #endif #ifndef NO_FOLLY_SUPPORT for (int i = 0; i < TEST_COUNT; ++i) { follyResults[benchmark][i] = runBenchmark>((BenchmarkType)benchmark, randSeeds[benchmark], follyOps[benchmark][i]); } #else for (int i = 0; i < TEST_COUNT; ++i) { follyResults[benchmark][i] = 0; follyOps[benchmark][i] = 0; } #endif } // Sort results for (int benchmark = 0; benchmark < BENCHMARK_COUNT; ++benchmark) { std::sort(&rwqResults[benchmark][0], &rwqResults[benchmark][0] + TEST_COUNT); std::sort(&spscResults[benchmark][0], &spscResults[benchmark][0] + TEST_COUNT); std::sort(&follyResults[benchmark][0], &follyResults[benchmark][0] + TEST_COUNT); } // Display results int max = std::max(2, (int)(TEST_COUNT * FASTEST_PERCENT_CONSIDERED / 100)); assert(max > 0); #ifdef NO_SPSC_SUPPORT std::cout << "Note: SPSC queue not supported on this platform, discount its timings" << std::endl; #endif #ifdef NO_FOLLY_SUPPORT std::cout << "Note: Folly queue not supported by this compiler, discount its timings" << std::endl; #endif std::cout << std::setw(BENCHMARK_NAME_MAX) << " " << " |----------- Min ------------|------------ Max ------------|------------ Avg ------------|\n"; std::cout << std::left << std::setw(BENCHMARK_NAME_MAX) << "Benchmark" << " | RWQ | SPSC | Folly | RWQ | SPSC | Folly | RWQ | SPSC | Folly | xSPSC | xFolly\n"; std::cout.fill('-'); std::cout << std::setw(BENCHMARK_NAME_MAX) << "---------" << "-+---------+---------+---------+---------+---------+---------+---------+---------+---------+-------+-------\n"; std::cout.fill(' '); double rwqOpsPerSec = 0, spscOpsPerSec = 0, follyOpsPerSec = 0; int opTimedBenchmarks = 0; for (int benchmark = 0; benchmark < BENCHMARK_COUNT; ++benchmark) { double rwqMin = rwqResults[benchmark][0], rwqMax = rwqResults[benchmark][max - 1]; double spscMin = spscResults[benchmark][0], spscMax = spscResults[benchmark][max - 1]; double follyMin = follyResults[benchmark][0], follyMax = follyResults[benchmark][max - 1]; double rwqAvg = std::accumulate(&rwqResults[benchmark][0], &rwqResults[benchmark][0] + max, 0.0) / max; double spscAvg = std::accumulate(&spscResults[benchmark][0], &spscResults[benchmark][0] + max, 0.0) / max; double follyAvg = std::accumulate(&follyResults[benchmark][0], &follyResults[benchmark][0] + max, 0.0) / max; double spscMult = rwqAvg < 0.00001 ? 0 : spscAvg / rwqAvg; double follyMult = follyAvg < 0.00001 ? 0 : follyAvg / rwqAvg; if (rwqResults[benchmark][0] != -1) { double rwqTotalAvg = std::accumulate(&rwqResults[benchmark][0], &rwqResults[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT; double spscTotalAvg = std::accumulate(&spscResults[benchmark][0], &spscResults[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT; double follyTotalAvg = std::accumulate(&follyResults[benchmark][0], &follyResults[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT; rwqOpsPerSec += rwqTotalAvg == 0 ? 0 : std::accumulate(&rwqOps[benchmark][0], &rwqOps[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT / rwqTotalAvg; spscOpsPerSec += spscTotalAvg == 0 ? 0 : std::accumulate(&spscOps[benchmark][0], &spscOps[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT / spscTotalAvg; follyOpsPerSec += follyTotalAvg == 0 ? 0 : std::accumulate(&follyOps[benchmark][0], &follyOps[benchmark][0] + TEST_COUNT, 0.0) / TEST_COUNT / follyTotalAvg; ++opTimedBenchmarks; } std::cout << std::left << std::setw(BENCHMARK_NAME_MAX) << benchmarkName((BenchmarkType)benchmark) << " | " << std::fixed << std::setprecision(4) << rwqMin << "s | " << std::fixed << std::setprecision(4) << spscMin << "s | " << std::fixed << std::setprecision(4) << follyMin << "s | " << std::fixed << std::setprecision(4) << rwqMax << "s | " << std::fixed << std::setprecision(4) << spscMax << "s | " << std::fixed << std::setprecision(4) << follyMax << "s | " << std::fixed << std::setprecision(4) << rwqAvg << "s | " << std::fixed << std::setprecision(4) << spscAvg << "s | " << std::fixed << std::setprecision(4) << follyAvg << "s | " << std::fixed << std::setprecision(2) << spscMult << "x | " << std::fixed << std::setprecision(2) << follyMult << "x" << "\n" ; } rwqOpsPerSec /= opTimedBenchmarks; spscOpsPerSec /= opTimedBenchmarks; follyOpsPerSec /= opTimedBenchmarks; std::cout << "\nAverage ops/s:\n" << " ReaderWriterQueue: " << std::fixed << std::setprecision(2) << rwqOpsPerSec / 1000000 << " million\n" << " SPSC queue: " << std::fixed << std::setprecision(2) << spscOpsPerSec / 1000000 << " million\n" << " Folly queue: " << std::fixed << std::setprecision(2) << follyOpsPerSec / 1000000 << " million\n" ; std::cout << std::endl; return 0; } template double runBenchmark(BenchmarkType benchmark, unsigned int randomSeed, double& out_Ops) { typedef unsigned long long counter_t; SystemTime start; double result = 0; volatile int forceNoOptimizeDummy; switch (benchmark) { case bench_raw_add: { const counter_t MAX = 100 * 1000; out_Ops = MAX; TQueue q(MAX); int num = 0; start = getSystemTime(); for (counter_t i = 0; i != MAX; ++i) { q.enqueue(num); ++num; } result = getTimeDelta(start); int temp = -1; q.try_dequeue(temp); forceNoOptimizeDummy = temp; } break; case bench_raw_remove: { const counter_t MAX = 100 * 1000; out_Ops = MAX; TQueue q(MAX); int num = 0; for (counter_t i = 0; i != MAX; ++i) { q.enqueue(num); ++num; } int element = -1; int total = 0; num = 0; start = getSystemTime(); for (counter_t i = 0; i != MAX; ++i) { bool success = q.try_dequeue(element); assert(success && num++ == element); UNUSED(success); total += element; } result = getTimeDelta(start); assert(!q.try_dequeue(element)); forceNoOptimizeDummy = total; } break; case bench_empty_remove: { const counter_t MAX = 2000 * 1000; out_Ops = MAX; TQueue q(MAX); int total = 0; start = getSystemTime(); SimpleThread consumer([&]() { int element; for (counter_t i = 0; i != MAX; ++i) { if (q.try_dequeue(element)) { total += element; } } }); SimpleThread producer([&]() { int num = 0; for (counter_t i = 0; i != MAX / 2; ++i) { if ((i & 32767) == 0) { // Just to make sure the loops aren't optimized out entirely q.enqueue(num); ++num; } } }); producer.join(); consumer.join(); result = getTimeDelta(start); forceNoOptimizeDummy = total; } break; case bench_single_threaded: { const counter_t MAX = 200 * 1000; out_Ops = MAX; RNG_t rng(randomSeed); std::uniform_int_distribution rand(0, 1); TQueue q(MAX); int num = 0; int element = -1; start = getSystemTime(); for (counter_t i = 0; i != MAX; ++i) { if (rand(rng) == 1) { q.enqueue(num); ++num; } else { q.try_dequeue(element); } } result = getTimeDelta(start); forceNoOptimizeDummy = (int)(q.try_dequeue(element)); } break; case bench_mostly_add: { const counter_t MAX = 1200 * 1000; out_Ops = MAX; int readOps = 0; RNG_t rng(randomSeed); std::uniform_int_distribution rand(0, 3); TQueue q(MAX); int element = -1; start = getSystemTime(); SimpleThread consumer([&]() { for (counter_t i = 0; i != MAX / 10; ++i) { if (rand(rng) == 0) { q.try_dequeue(element); ++readOps; } } }); SimpleThread producer([&]() { int num = 0; for (counter_t i = 0; i != MAX; ++i) { q.enqueue(num); ++num; } }); producer.join(); consumer.join(); result = getTimeDelta(start); forceNoOptimizeDummy = (int)(q.try_dequeue(element)); out_Ops += readOps; } break; case bench_mostly_remove: { const counter_t MAX = 1200 * 1000; out_Ops = MAX; int writeOps = 0; RNG_t rng(randomSeed); std::uniform_int_distribution rand(0, 3); TQueue q(MAX); int element = -1; start = getSystemTime(); SimpleThread consumer([&]() { for (counter_t i = 0; i != MAX; ++i) { q.try_dequeue(element); } }); SimpleThread producer([&]() { int num = 0; for (counter_t i = 0; i != MAX / 10; ++i) { if (rand(rng) == 0) { q.enqueue(num); ++num; } } writeOps = num; }); producer.join(); consumer.join(); result = getTimeDelta(start); forceNoOptimizeDummy = (int)(q.try_dequeue(element)); out_Ops += writeOps; } break; case bench_heavy_concurrent: { const counter_t MAX = 1000 * 1000; out_Ops = MAX * 2; TQueue q(MAX); int element = -1; start = getSystemTime(); SimpleThread consumer([&]() { for (counter_t i = 0; i != MAX; ++i) { q.try_dequeue(element); } }); SimpleThread producer([&]() { int num = 0; for (counter_t i = 0; i != MAX; ++i) { q.enqueue(num); ++num; } }); producer.join(); consumer.join(); result = getTimeDelta(start); forceNoOptimizeDummy = (int)(q.try_dequeue(element)); } break; case bench_random_concurrent: { const counter_t MAX = 800 * 1000; int readOps = 0, writeOps = 0; TQueue q(MAX); int element = -1; start = getSystemTime(); SimpleThread consumer([&]() { RNG_t rng(randomSeed); std::uniform_int_distribution rand(0, 15); for (counter_t i = 0; i != MAX; ++i) { if (rand(rng) == 0) { q.try_dequeue(element); ++readOps; } } }); SimpleThread producer([&]() { RNG_t rng(randomSeed * 3 - 1); std::uniform_int_distribution rand(0, 15); int num = 0; for (counter_t i = 0; i != MAX; ++i) { if (rand(rng) == 0) { q.enqueue(num); ++num; } } writeOps = num; }); producer.join(); consumer.join(); result = getTimeDelta(start); forceNoOptimizeDummy = (int)(q.try_dequeue(element)); out_Ops = readOps + writeOps; } break; default: assert(false); out_Ops = 0; return 0; } UNUSED(forceNoOptimizeDummy); return result / 1000.0; } const char* benchmarkName(BenchmarkType benchmark) { switch (benchmark) { case bench_raw_add: return "Raw add"; case bench_raw_remove: return "Raw remove"; case bench_empty_remove: return "Raw empty remove"; case bench_single_threaded: return "Single-threaded"; case bench_mostly_add: return "Mostly add"; case bench_mostly_remove: return "Mostly remove"; case bench_heavy_concurrent: return "Heavy concurrent"; case bench_random_concurrent: return "Random concurrent"; default: return ""; } }