// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
||
|
#include <cstdint>
|
||
|
#include <mutex> // NOLINT(build/c++11)
|
||
|
#include <vector>
|
||
|
|
||
|
#include "absl/base/internal/cycleclock.h"
|
||
|
#include "absl/base/internal/spinlock.h"
|
||
|
#include "absl/synchronization/blocking_counter.h"
|
||
|
#include "absl/synchronization/internal/thread_pool.h"
|
||
|
#include "absl/synchronization/mutex.h"
|
||
|
#include "benchmark/benchmark.h"
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
void BM_Mutex(benchmark::State& state) {
|
||
|
static absl::Mutex* mu = new absl::Mutex;
|
||
|
for (auto _ : state) {
|
||
|
absl::MutexLock lock(mu);
|
||
|
}
|
||
|
}
|
||
|
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();
|
||
|
|
||
|
static void DelayNs(int64_t ns, int* data) {
|
||
|
int64_t end = absl::base_internal::CycleClock::Now() +
|
||
|
ns * absl::base_internal::CycleClock::Frequency() / 1e9;
|
||
|
while (absl::base_internal::CycleClock::Now() < end) {
|
||
|
++(*data);
|
||
|
benchmark::DoNotOptimize(*data);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Scoped lock holder used by the contended benchmarks: acquires `*mu` in the
// constructor and releases it in the destructor.
//
// The primary template works with any mutex type that exposes `Lock()` and
// `Unlock()` member functions. `mu` must be non-null and must outlive the
// locker.
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }

  // Not copyable: a copy would call Unlock() a second time on the same mutex
  // when it is destroyed.
  RaiiLocker(const RaiiLocker&) = delete;
  RaiiLocker& operator=(const RaiiLocker&) = delete;

 private:
  MutexType* const mu_;
};

// Specialization for std::mutex, whose member functions are spelled
// `lock()`/`unlock()` rather than `Lock()`/`Unlock()`.
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

  // Not copyable: a copy would call unlock() a second time on the same mutex
  // when it is destroyed.
  RaiiLocker(const RaiiLocker&) = delete;
  RaiiLocker& operator=(const RaiiLocker&) = delete;

 private:
  std::mutex* const mu_;
};
|
||
|
|
||
|
template <typename MutexType>
|
||
|
void BM_Contended(benchmark::State& state) {
|
||
|
struct Shared {
|
||
|
MutexType mu;
|
||
|
int data = 0;
|
||
|
};
|
||
|
static auto* shared = new Shared;
|
||
|
int local = 0;
|
||
|
for (auto _ : state) {
|
||
|
// Here we model both local work outside of the critical section as well as
|
||
|
// some work inside of the critical section. The idea is to capture some
|
||
|
// more or less realisitic contention levels.
|
||
|
// If contention is too low, the benchmark won't measure anything useful.
|
||
|
// If contention is unrealistically high, the benchmark will favor
|
||
|
// bad mutex implementations that block and otherwise distract threads
|
||
|
// from the mutex and shared state for as much as possible.
|
||
|
// To achieve this amount of local work is multiplied by number of threads
|
||
|
// to keep ratio between local work and critical section approximately
|
||
|
// equal regardless of number of threads.
|
||
|
DelayNs(100 * state.threads, &local);
|
||
|
RaiiLocker<MutexType> locker(&shared->mu);
|
||
|
DelayNs(state.range(0), &shared->data);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
|
||
|
->UseRealTime()
|
||
|
// ThreadPerCpu poorly handles non-power-of-two CPU counts.
|
||
|
->Threads(1)
|
||
|
->Threads(2)
|
||
|
->Threads(4)
|
||
|
->Threads(6)
|
||
|
->Threads(8)
|
||
|
->Threads(12)
|
||
|
->Threads(16)
|
||
|
->Threads(24)
|
||
|
->Threads(32)
|
||
|
->Threads(48)
|
||
|
->Threads(64)
|
||
|
->Threads(96)
|
||
|
->Threads(128)
|
||
|
->Threads(192)
|
||
|
->Threads(256)
|
||
|
// Some empirically chosen amounts of work in critical section.
|
||
|
// 1 is low contention, 200 is high contention and few values in between.
|
||
|
->Arg(1)
|
||
|
->Arg(20)
|
||
|
->Arg(50)
|
||
|
->Arg(200);
|
||
|
|
||
|
BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
|
||
|
->UseRealTime()
|
||
|
// ThreadPerCpu poorly handles non-power-of-two CPU counts.
|
||
|
->Threads(1)
|
||
|
->Threads(2)
|
||
|
->Threads(4)
|
||
|
->Threads(6)
|
||
|
->Threads(8)
|
||
|
->Threads(12)
|
||
|
->Threads(16)
|
||
|
->Threads(24)
|
||
|
->Threads(32)
|
||
|
->Threads(48)
|
||
|
->Threads(64)
|
||
|
->Threads(96)
|
||
|
->Threads(128)
|
||
|
->Threads(192)
|
||
|
->Threads(256)
|
||
|
// Some empirically chosen amounts of work in critical section.
|
||
|
// 1 is low contention, 200 is high contention and few values in between.
|
||
|
->Arg(1)
|
||
|
->Arg(20)
|
||
|
->Arg(50)
|
||
|
->Arg(200);
|
||
|
|
||
|
BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
|
||
|
->UseRealTime()
|
||
|
// ThreadPerCpu poorly handles non-power-of-two CPU counts.
|
||
|
->Threads(1)
|
||
|
->Threads(2)
|
||
|
->Threads(4)
|
||
|
->Threads(6)
|
||
|
->Threads(8)
|
||
|
->Threads(12)
|
||
|
->Threads(16)
|
||
|
->Threads(24)
|
||
|
->Threads(32)
|
||
|
->Threads(48)
|
||
|
->Threads(64)
|
||
|
->Threads(96)
|
||
|
->Threads(128)
|
||
|
->Threads(192)
|
||
|
->Threads(256)
|
||
|
// Some empirically chosen amounts of work in critical section.
|
||
|
// 1 is low contention, 200 is high contention and few values in between.
|
||
|
->Arg(1)
|
||
|
->Arg(20)
|
||
|
->Arg(50)
|
||
|
->Arg(200);
|
||
|
|
||
|
// Measure the overhead of conditions on mutex release (when they must be
|
||
|
// evaluated). Mutex has (some) support for equivalence classes allowing
|
||
|
// Conditions with the same function/argument to potentially not be multiply
|
||
|
// evaluated.
|
||
|
//
|
||
|
// num_classes==0 is used for the special case of every waiter being distinct.
|
||
|
void BM_ConditionWaiters(benchmark::State& state) {
|
||
|
int num_classes = state.range(0);
|
||
|
int num_waiters = state.range(1);
|
||
|
|
||
|
struct Helper {
|
||
|
static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
|
||
|
init->DecrementCount();
|
||
|
m->LockWhen(absl::Condition(
|
||
|
static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
|
||
|
m->Unlock();
|
||
|
}
|
||
|
};
|
||
|
|
||
|
if (num_classes == 0) {
|
||
|
// No equivalence classes.
|
||
|
num_classes = num_waiters;
|
||
|
}
|
||
|
|
||
|
absl::BlockingCounter init(num_waiters);
|
||
|
absl::Mutex mu;
|
||
|
std::vector<int> equivalence_classes(num_classes, 1);
|
||
|
|
||
|
// Must be declared last to be destroyed first.
|
||
|
absl::synchronization_internal::ThreadPool pool(num_waiters);
|
||
|
|
||
|
for (int i = 0; i < num_waiters; i++) {
|
||
|
// Mutex considers Conditions with the same function and argument
|
||
|
// to be equivalent.
|
||
|
pool.Schedule([&, i] {
|
||
|
Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
|
||
|
});
|
||
|
}
|
||
|
init.Wait();
|
||
|
|
||
|
for (auto _ : state) {
|
||
|
mu.Lock();
|
||
|
mu.Unlock(); // Each unlock requires Condition evaluation for our waiters.
|
||
|
}
|
||
|
|
||
|
mu.Lock();
|
||
|
for (int i = 0; i < num_classes; i++) {
|
||
|
equivalence_classes[i] = 0;
|
||
|
}
|
||
|
mu.Unlock();
|
||
|
}
|
||
|
|
||
|
// Some configurations have higher thread limits than others.
|
||
|
#if defined(__linux__) && !defined(THREAD_SANITIZER)
|
||
|
constexpr int kMaxConditionWaiters = 8192;
|
||
|
#else
|
||
|
constexpr int kMaxConditionWaiters = 1024;
|
||
|
#endif
|
||
|
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);
|
||
|
|
||
|
} // namespace
|