[ceph.git] / ceph / src / test / perf_local.cc

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
 * Copyright (c) 2011-2014 Stanford University
 * Copyright (c) 2011 Facebook
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

// This program contains a collection of low-level performance measurements
// for Ceph, which can be run either individually or altogether.  These
// tests measure performance in a single stand-alone process, not in a cluster
// with multiple servers.  Invoke the program like this:
//
//     Perf test1 test2 ...
//
// test1 and test2 are the names of individual performance measurements to
// run.  If no test names are provided then all of the performance tests
// are run.
//
// To add a new test:
// * Write a function that implements the test.  Use existing test functions
//   as a guideline, and be sure to generate output in the same form as
//   other tests.
// * Create a new entry for the test in the #tests table.
#include <vector>
#include <sched.h>

#include "acconfig.h"
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif

#include "include/buffer.h"
#include "include/encoding.h"
#include "include/ceph_hash.h"
#include "include/spinlock.h"
#include "common/ceph_argparse.h"
#include "common/Cycles.h"
#include "common/Cond.h"
#include "common/ceph_mutex.h"
#include "common/Thread.h"
#include "common/Timer.h"
#include "msg/async/Event.h"
#include "global/global_init.h"

#include "test/perf_helper.h"

#include <atomic>

using namespace std;
using namespace ceph;

/**
 * Request that the operating system restrict the calling thread to one CPU.
 * Nothing happens when HAVE_SCHED is not defined, and the result of the
 * affinity call is not checked, so pinning is best-effort.
 *
 * \param cpu
 *      CPU index that is placed (alone) into the affinity mask.
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t affinity;
  CPU_ZERO(&affinity);
  CPU_SET(cpu, &affinity);
  // A pid of 0 addresses the calling thread.
  sched_setaffinity(0, sizeof(affinity), &affinity);
#endif
}

/*
 * Pretend to consume a value. Benchmarks hand their results to this
 * function so the compiler cannot treat the measured computation as dead
 * code. The value is only printed if it happens to equal one specific
 * magic number.
 *
 * \param value
 *      Pointer to the data to "use"; it is read as an int and otherwise
 *      ignored.
 */
void discard(void* value) {
  const int observed = *static_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}

//----------------------------------------------------------------------
// Test functions start here
//----------------------------------------------------------------------

// Time a std::atomic compare-and-swap. The expected value is advanced in
// step with the atomic, so every exchange in the loop succeeds.
double atomic_int_cmp()
{
  const int iterations = 1000000;
  std::atomic<unsigned> target = { 11 };
  unsigned int expected = 11;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    target.compare_exchange_strong(expected, expected + 2);
    expected += 2;
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time a post-increment of a 64-bit std::atomic.
double atomic_int_inc()
{
  const int iterations = 1000000;
  std::atomic<int64_t> counter = { 11 };
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    counter.fetch_add(1);
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time a load from a 64-bit std::atomic. Each loaded value is added to a
// running sum so the loop body has a consumer.
double atomic_int_read()
{
  const int iterations = 1000000;
  std::atomic<int64_t> source = { 11 };
  int sum = 0;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    sum += source.load();
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time a store of a constant into a 64-bit std::atomic.
double atomic_int_set()
{
  const int iterations = 1000000;
  std::atomic<int64_t> sink = { 11 };
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    sink.store(88);
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Measure the cost of acquiring and releasing a mutex in the
// fast case where the mutex is free.
double mutex_nonblock()
{
  int count = 1000000;
  ceph::mutex m = ceph::make_mutex("mutex_nonblock::m");
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    m.lock();
    m.unlock();
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Measure the cost of allocating and deallocating a buffer, plus
// appending (logically) one ptr.
double buffer_basic()
{
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  bufferptr ptr("abcdefg", 7);
  for (int i = 0; i < count; i++) {
    bufferlist b;
    b.append(ptr, 0, 5);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Small encodable payload (four ints) used by the buffer encode/decode
// benchmarks below.
struct DummyBlock {
  int a = 1, b = 2, c = 3, d = 4;
  // Append the four fields, in declaration order, to bl inside an
  // ENCODE_START/ENCODE_FINISH envelope.
  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    encode(a, bl);
    encode(b, bl);
    encode(c, bl);
    encode(d, bl);
    ENCODE_FINISH(bl);
  }
  // Read the four fields back in the same order they were encoded.
  void decode(bufferlist::const_iterator &bl) {
    DECODE_START(1, bl);
    decode(a, bl);
    decode(b, bl);
    decode(c, bl);
    decode(d, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(DummyBlock)

// Measure the cost of encoding and decoding a buffer, plus
// allocating space for one chunk.
double buffer_encode_decode()
{
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    bufferlist b;
    DummyBlock dummy_block;
    encode(dummy_block, b);
    auto iter = b.cbegin();
    decode(dummy_block, iter);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Measure the cost of allocating and deallocating a buffer, plus
// copying in a small block.
double buffer_basic_copy()
{
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    bufferlist b;
    b.append("abcdefg", 6);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Measure the cost of making a copy of parts of two ptrs.
double buffer_copy()
{
  int count = 1000000;
  bufferlist b;
  b.append("abcde", 5);
  b.append("01234", 5);
  char copy[10];
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    b.cbegin(2).copy(6, copy);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Measure the cost of encoding a DummyBlock onto a bufferlist that already
// holds one encoded block. Each iteration builds a fresh list, performs one
// untimed encode, then times ten further encodes; the result is the average
// time per timed encode.
double buffer_encode()
{
  int count = 100000;
  uint64_t total = 0;
  for (int i = 0; i < count; i++) {
    bufferlist b;
    DummyBlock dummy_block;
    // Untimed: the first encode into the empty list is excluded.
    encode(dummy_block, b);
    uint64_t start = Cycles::rdtsc();
    // Ten hand-unrolled encodes form the timed region.
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    encode(dummy_block, b);
    total += Cycles::rdtsc() - start;
  }
  // Ten timed encodes per iteration.
  return Cycles::to_seconds(total)/(count*10);
}

// Measure the cost of creating a const iterator and walking it to the end
// of a bufferlist that was built from 5 appended ptr ranges.
double buffer_iterator()
{
  bufferlist b;
  const char s[] = "abcdefghijklmnopqrstuvwxyz";
  // sizeof(s) counts the terminating NUL as well.
  bufferptr ptr(s, sizeof(s));
  // Append five 5-byte ranges of the same ptr, starting at offsets 0..4.
  for (int i = 0; i < 5; i++) {
    b.append(ptr, i, 5);
  }
  int count = 100000;
  int sum = 0;
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    auto it = b.cbegin();
    while (!it.end()) {
      // Read one byte per step so the traversal has an observable result.
      sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
      ++it;
    }
  }
  uint64_t stop = Cycles::rdtsc();
  // Hand the sum to discard() so the reads above are not optimized away.
  discard(&sum);
  return Cycles::to_seconds(stop - start)/count;
}

// Implements the CondPingPong test: a producer (the calling thread) and a
// consumer (a helper thread) take turns incrementing their own counter,
// handing control back and forth through one mutex and one condition
// variable.
class CondPingPong {
  ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex");
  ceph::condition_variable cond;
  int prod = 0;                 // number of items produced so far
  int cons = 0;                 // number of items consumed so far
  const int count = 10000;      // number of items the consumer processes

  // Helper thread that runs CondPingPong::consume().
  class Consumer : public Thread {
    CondPingPong *p;
   public:
    explicit Consumer(CondPingPong *p): p(p) {}
    void* entry() override {
      p->consume();
      return 0;
    }
  } consumer;

 public:
  CondPingPong(): consumer(this) {}

  // Start the consumer thread, time produce() on the calling thread, join
  // the consumer, and return the average time per item in seconds.
  double run() {
    consumer.create("consumer");
    uint64_t start = Cycles::rdtsc();
    produce();
    uint64_t stop = Cycles::rdtsc();
    consumer.join();
    return Cycles::to_seconds(stop - start)/count;
  }

  // Producer side: wait until the consumer has caught up (cons >= prod),
  // then publish one more item and wake the other side.
  void produce() {
    std::unique_lock l{mutex};
    while (cons < count) {
      cond.wait(l, [this] { return cons >= prod; });
      ++prod;
      cond.notify_all();
    }
  }

  // Consumer side: wait until an item is outstanding (cons != prod), then
  // consume it and wake the other side.
  void consume() {
    std::unique_lock l{mutex};
    while (cons < count) {
      cond.wait(l, [this] { return cons != prod; });
      ++cons;
      cond.notify_all();
    }
  }
};

// Measure the cost of coordinating between threads using a condition variable.
double cond_ping_pong()
{
  return CondPingPong().run();
}

// Measure the cost of a 32-bit divide. Divides don't take a constant
// number of cycles. Values were chosen here semi-randomly to depict a
// fairly expensive scenario. Someone with fancy ALU knowledge could
// probably pick worse values.
// Returns -1 on architectures that have no implementation here.
double div32()
{
#if defined(__i386__) || defined(__x86_64__)
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  // NB: Expect an x86 processor exception if there's overflow.
  uint32_t numeratorHi = 0xa5a5a5a5U;
  uint32_t numeratorLo = 0x55aa55aaU;
  uint32_t divisor = 0xaa55aa55U;
  uint32_t quotient;
  uint32_t remainder;
  for (int i = 0; i < count; i++) {
    // Dividend is passed in edx:eax ("d"/"a" constraints); quotient and
    // remainder come back in eax and edx respectively.
    __asm__ __volatile__("div %4" :
                         "=a"(quotient), "=d"(remainder) :
                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
                         "cc");
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
#elif defined(__aarch64__)
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  uint64_t numerator = 0xa5a5a5a555aa55aaUL;
  uint32_t divisor = 0xaa55aa55U;
  uint32_t result;
  for (int i = 0; i < count; i++) {
    asm volatile("udiv %0, %1, %2" : "=r"(result) :
                  "r"(numerator), "r"(divisor));
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
#else
  return -1;
#endif
}

// Measure the cost of a 64-bit divide. Divides don't take a constant
// number of cycles. Values were chosen here semi-randomly to depict a
// fairly expensive scenario. Someone with fancy ALU knowledge could
// probably pick worse values.
// Only implemented for x86-64; returns -1 elsewhere.
double div64()
{
#if defined(__x86_64__) || defined(__amd64__)
  int count = 1000000;
  // NB: Expect an x86 processor exception if there's overflow.
  uint64_t start = Cycles::rdtsc();
  uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
  uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
  uint64_t divisor = 0xaa55aa55aa55aa55UL;
  uint64_t quotient;
  uint64_t remainder;
  for (int i = 0; i < count; i++) {
    // Dividend is passed in rdx:rax ("d"/"a" constraints); quotient and
    // remainder come back in rax and rdx respectively.
    __asm__ __volatile__("divq %4" :
                         "=a"(quotient), "=d"(remainder) :
                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
                         "cc");
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
#else
  return -1;
#endif
}

// Time a call to PerfHelper::plus_one, a function intended not to be
// inlined. Each call's result feeds the next call.
double function_call()
{
  const int iterations = 1000000;
  uint64_t value = 0;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    value = PerfHelper::plus_one(value);
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Measure the minimum cost of EventCenter::process_events, when there are no
// Pollers and no Timers.
double eventcenter_poll()
{
  int count = 1000000;
  EventCenter center(g_ceph_context);
  center.init(1000, 0, "posix");
  center.set_owner();
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    center.process_events(0);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Thread that owns an EventCenter and keeps processing its events until
// stop() is called. Used by eventcenter_dispatch as the receiving side.
class CenterWorker : public Thread {
  CephContext *cct;
  // Written by stop() on the controlling thread and polled by entry() on
  // the worker thread, so it must be atomic to avoid a data race.
  std::atomic<bool> done;

 public:
  EventCenter center;
  explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
    center.init(100, 0, "posix");
  }
  // Ask the worker loop to exit and wake the center so the request is
  // noticed without waiting for the poll timeout.
  void stop() {
    done = true;
    center.wakeup();
  }
  // Worker loop: claim ownership of the center, pin to CPU 2, then process
  // events until stop() has been called.
  void* entry() override {
    center.set_owner();
    bind_thread_to_cpu(2);
    while (!done)
      center.process_events(1000);
    return 0;
  }
};

// Event callback that decrements a caller-supplied atomic counter each time
// it is invoked; the id argument is ignored.
class CountEvent: public EventCallback {
  std::atomic<int64_t> *count;

 public:
  explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
  void do_request(uint64_t) override {
    count->fetch_sub(1);
  }
};

double eventcenter_dispatch()
{
  int count = 100000;

  CenterWorker worker(g_ceph_context);
  std::atomic<int64_t> flag = { 1 };
  worker.create("evt_center_disp");
  EventCallbackRef count_event(new CountEvent(&flag));

  worker.center.dispatch_event_external(count_event);
  // Start a new thread and wait for it to ready.
  while (flag)
    usleep(100);

  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    flag = 1;
    worker.center.dispatch_event_external(count_event);
    while (flag)
      ;
  }
  uint64_t stop = Cycles::rdtsc();
  worker.stop();
  worker.join();
  return Cycles::to_seconds(stop - start)/count;
}

// Measure the cost of copying a given number of bytes with memcpy.
//
// \param size
//      Number of bytes copied by each memcpy call.
// \return
//      Average time per memcpy call, in seconds.
double memcpy_shared(size_t size)
{
  int count = 1000000;
  // Runtime-sized arrays (VLAs) are not standard C++, so the buffers are
  // std::vectors. Both are zero-filled before the timed region.
  std::vector<char> src(size, 0);
  std::vector<char> dst(size, 0);

  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    memcpy(dst.data(), src.data(), size);
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Time memcpy on a 100-byte buffer.
double memcpy100()
{
  const size_t bytes = 100;
  return memcpy_shared(bytes);
}

// Time memcpy on a 1000-byte buffer.
double memcpy1000()
{
  const size_t bytes = 1000;
  return memcpy_shared(bytes);
}

// Time memcpy on a 10000-byte buffer.
double memcpy10000()
{
  const size_t bytes = 10000;
  return memcpy_shared(bytes);
}

// Benchmark rjenkins hashing performance on cached data.
//
// \tparam key_length
//      Number of bytes hashed per call.
// \return
//      Average time per hash call, in seconds.
template <int key_length>
double ceph_str_hash_rjenkins()
{
  int count = 100000;
  // Zero-initialize the key so the hash never reads indeterminate bytes.
  char buf[key_length] = {};

  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++)
    ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
  uint64_t stop = Cycles::rdtsc();

  return Cycles::to_seconds(stop - start)/count;
}

// Time a single read of the cycle counter via Cycles::rdtsc. The readings
// are summed so each call has a consumer.
double rdtsc_test()
{
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  uint64_t sum = 0;
  for (int n = 0; n < iterations; ++n) {
    sum += Cycles::rdtsc();
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time Cycles::to_seconds by converting the same fixed cycle count over and
// over, accumulating the results.
double perf_cycles_to_seconds()
{
  const int iterations = 1000000;
  double sum = 0;
  uint64_t sample = 994261;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    sum += Cycles::to_seconds(sample);
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time Cycles::to_nanoseconds by converting the same fixed cycle count over
// and over, accumulating the results.
double perf_cycles_to_nanoseconds()
{
  const int iterations = 1000000;
  uint64_t sum = 0;
  uint64_t sample = 994261;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    sum += Cycles::to_nanoseconds(sample);
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}


#ifdef HAVE_SSE
/**
 * Prefetch the cache lines containing [object, object + num_bytes) into the
 * processor's caches.
 * The best docs for this are in the Intel instruction set reference under
 * PREFETCH.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
    // Round the start address down to a 64-byte boundary, then issue one
    // prefetch per 64-byte step until the end of the region is covered.
    uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
    const char* p = reinterpret_cast<const char*>(object) - offset;
    for (uint64_t i = 0; i < offset + num_bytes; i += 64)
        _mm_prefetch(p + i, _MM_HINT_T0);
}
#elif defined(__aarch64__)
/**
 * AArch64 variant of prefetch(): same 64-byte stepping as the SSE version,
 * issuing a "prfm pldl1keep" instruction for each step.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
    uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
    const char* ptr = reinterpret_cast<const char*>(object) - offset;
    for (uint64_t i = 0; i < offset + num_bytes; i += 64, ptr += 64)
        asm volatile("prfm pldl1keep, %a0\n" : : "p" (ptr));
}
#endif

// Measure the cost of the prefetch instruction.
// Each of the 10 rounds flushes the cache, then times 16 prefetch() calls
// that touch 16 different 64-byte-spaced offsets of a 1 KiB stack buffer in
// a scrambled order. Returns the average time per prefetch() call, or -1
// when neither HAVE_SSE nor __aarch64__ is defined.
double perf_prefetch()
{
#if defined(HAVE_SSE) || defined(__aarch64__)
  uint64_t total_ticks = 0;
  int count = 10;
  char buf[16 * 64];

  for (int i = 0; i < count; i++) {
    // The cache flush happens outside the timed region.
    PerfHelper::flush_cache();
    uint64_t start = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0],   64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64],  64);
    prefetch(&buf[192], 64);
    uint64_t stop = Cycles::rdtsc();
    total_ticks += stop - start;
  }
  // 16 prefetch() calls per round.
  return Cycles::to_seconds(total_ticks) / count / 16;
#else
  return -1;
#endif
}

#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before any
 * instructions that appear before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution from
 * instructions supposed to be executing before the timer starts.
 *
 * Implemented by executing cpuid with eax = 1; the register outputs are
 * discarded.
 */
static inline void serialize() {
    uint32_t eax, ebx, ecx, edx;
    __asm volatile("cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (1U));
}
#endif

// Time one serialize() call (a cpuid instruction). Only implemented for
// x86-64; returns -1 elsewhere.
double perf_serialize() {
#if defined(__x86_64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    serialize();
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}

// Time a load fence: "lfence" when HAVE_SSE2 is defined, "dmb ishld" on
// AArch64. Returns -1 when neither is available.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishld" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}

// Time a store fence: "sfence" when HAVE_SSE is defined, "dmb ishst" on
// AArch64. Returns -1 when neither is available.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishst" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}

// Measure the cost of acquiring and releasing a SpinLock (assuming the
// lock is initially free).
double test_spinlock()
{
  int count = 1000000;
  ceph::spinlock lock;
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    lock.lock();
    lock.unlock();
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Thread used by spawn_thread. Its entry point deliberately does no work,
// so only thread creation and teardown are measured.
class ThreadHelper : public Thread {
  void *entry() override {
    return nullptr;
  }
};

// Measure the cost of start and joining with a thread.
double spawn_thread()
{
  int count = 10000;
  ThreadHelper thread;
  uint64_t start = Cycles::rdtsc();
  for (int i = 0; i < count; i++) {
    thread.create("thread_helper");
    thread.join();
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
}

// Context whose completion handler does nothing; gives perf_timer an event
// to schedule and cancel.
class FakeContext : public Context {
 public:
  void finish(int) override {}
};

// Measure the cost of starting and stopping a Dispatch::Timer.
double perf_timer()
{
  int count = 1000000;
  ceph::mutex lock = ceph::make_mutex("perf_timer::lock");
  SafeTimer timer(g_ceph_context, lock);
  FakeContext **c = new FakeContext*[count];
  for (int i = 0; i < count; i++) {
    c[i] = new FakeContext();
  }
  uint64_t start = Cycles::rdtsc();
  std::lock_guard l{lock};
  for (int i = 0; i < count; i++) {
    if (timer.add_event_after(12345, c[i])) {
      timer.cancel_event(c[i]);
    }
  }
  uint64_t stop = Cycles::rdtsc();
  delete[] c;
  return Cycles::to_seconds(stop - start)/count;
}

// Time throwing an int and catching it in the same function. An int is
// used as the thrown value because it is presumably the cheapest thing to
// throw.
double throw_int()
{
  const int iterations = 10000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    try {
      throw 0;
    } catch (int) { // NOLINT
      // nothing to do; only the throw/catch cost matters
    }
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time catching an int that is thrown from inside a called function
// (PerfHelper::throw_int).
double throw_int_call()
{
  const int iterations = 10000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    try {
      PerfHelper::throw_int();
    } catch (int) { // NOLINT
      // nothing to do; only the throw/catch cost matters
    }
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time throwing a buffer::end_of_buffer exception object and catching it in
// the same function. A real exception type may cost more than throw_int.
double throw_exception()
{
  const int iterations = 10000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    try {
      throw buffer::end_of_buffer();
    } catch (const buffer::end_of_buffer&) {
      // nothing to do; only the throw/catch cost matters
    }
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time catching a buffer::end_of_buffer exception that is thrown from inside
// a called function (PerfHelper::throw_end_of_buffer).
double throw_exception_call()
{
  const int iterations = 10000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    try {
      PerfHelper::throw_end_of_buffer();
    } catch (const buffer::end_of_buffer&) {
      // nothing to do; only the throw/catch cost matters
    }
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// Time std::vector push/pop traffic. Each iteration pushes three elements,
// then three times copies the last element into one of the first three
// slots and pops it. The result is the average per push/copy/pop triple.
double vector_push_pop()
{
  const int iterations = 100000;
  std::vector<int> values;
  values.push_back(1);
  values.push_back(2);
  values.push_back(3);
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    values.push_back(n);
    values.push_back(n + 1);
    values.push_back(n + 2);
    values[2] = values.back();
    values.pop_back();
    values[0] = values.back();
    values.pop_back();
    values[1] = values.back();
    values.pop_back();
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / (iterations * 3);
}

// Time one call to ceph_clock_now(); the returned time is not used.
double perf_ceph_clock_now()
{
  const int iterations = 100000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    ceph_clock_now();
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
}

// One row of the benchmark table: pairs a command-line name with the
// function that runs the benchmark and a human-readable description.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
// Table of every available benchmark. main() selects entries by exact name
// and runs only the first match, so each name must be unique; otherwise
// later entries with the same name can never be run individually.
TestInfo tests[] = {
  {"atomic_int_cmp", atomic_int_cmp,
    "atomic_t::compare_and_swap"},
  {"atomic_int_inc", atomic_int_inc,
    "atomic_t::inc"},
  {"atomic_int_read", atomic_int_read,
    "atomic_t::read"},
  {"atomic_int_set", atomic_int_set,
    "atomic_t::set"},
  {"mutex_nonblock", mutex_nonblock,
    "Mutex lock/unlock (no blocking)"},
  {"buffer_basic", buffer_basic,
    "buffer create, add one ptr, delete"},
  {"buffer_encode_decode", buffer_encode_decode,
    "buffer create, encode/decode object, delete"},
  {"buffer_basic_copy", buffer_basic_copy,
    "buffer create, copy small block, delete"},
  {"buffer_copy", buffer_copy,
    "copy out 2 small ptrs from buffer"},
  {"buffer_encode10", buffer_encode,
    "buffer encoding 10 structures onto existing ptr"},
  {"buffer_iterator", buffer_iterator,
    "iterate over buffer with 5 ptrs"},
  {"cond_ping_pong", cond_ping_pong,
    "condition variable round-trip"},
  {"div32", div32,
    "32-bit integer division instruction"},
  {"div64", div64,
    "64-bit integer division instruction"},
  {"function_call", function_call,
    "Call a function that has not been inlined"},
  {"eventcenter_poll", eventcenter_poll,
    "EventCenter::process_events (no timers or events)"},
  {"eventcenter_dispatch", eventcenter_dispatch,
    "EventCenter::dispatch_event_external latency"},
  {"memcpy100", memcpy100,
    "Copy 100 bytes with memcpy"},
  {"memcpy1000", memcpy1000,
    "Copy 1000 bytes with memcpy"},
  {"memcpy10000", memcpy10000,
    "Copy 10000 bytes with memcpy"},
  {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
    "rjenkins hash on 16 byte of data"},
  // Distinct name so the 256-byte variant can be selected on its own.
  {"ceph_str_hash_rjenkins256", ceph_str_hash_rjenkins<256>,
    "rjenkins hash on 256 bytes of data"},
  {"rdtsc", rdtsc_test,
    "Read the fine-grain cycle counter"},
  {"cycles_to_seconds", perf_cycles_to_seconds,
    "Convert a rdtsc result to (double) seconds"},
  {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds,
    "Convert a rdtsc result to (uint64_t) nanoseconds"},
  {"prefetch", perf_prefetch,
    "Prefetch instruction"},
  {"serialize", perf_serialize,
    "serialize instruction"},
  {"lfence", lfence,
    "Lfence instruction"},
  {"sfence", sfence,
    "Sfence instruction"},
  {"spin_lock", test_spinlock,
    "Acquire/release SpinLock"},
  {"spawn_thread", spawn_thread,
    "Start and stop a thread"},
  {"perf_timer", perf_timer,
    "Insert and cancel a SafeTimer"},
  {"throw_int", throw_int,
    "Throw an int"},
  {"throw_int_call", throw_int_call,
    "Throw an int in a function call"},
  {"throw_exception", throw_exception,
    "Throw an Exception"},
  {"throw_exception_call", throw_exception_call,
    "Throw an Exception in a function call"},
  {"vector_push_pop", vector_push_pop,
    "Push and pop a std::vector"},
  {"ceph_clock_now", perf_ceph_clock_now,
   "ceph_clock_now function"},
};

/**
 * Runs a particular test and prints a one-line result message of the form
 * "<name> <time><unit>   <description>". A result of -1 from the test is
 * reported as unsupported on this architecture.
 *
 * \param info
 *      Describes the test to run.
 */
void run_test(TestInfo& info)
{
  // Column reached by a typical row before the description: 25 characters
  // for the name field, 10 for the time, and 3 of padding.
  const int description_column = 38;

  double secs = info.func();
  int width = printf("%-24s ", info.name);
  if (secs == -1) {
    width += printf(" architecture nonsupport ");
  } else if (secs < 1.0e-06) {
    width += printf("%8.2fns", 1e09*secs);
  } else if (secs < 1.0e-03) {
    width += printf("%8.2fus", 1e06*secs);
  } else if (secs < 1.0) {
    width += printf("%8.2fms", 1e03*secs);
  } else {
    width += printf("%8.2fs", secs);
  }
  // Clamp the padding at zero: a negative "*" width is interpreted by
  // printf as left-justification with the absolute value, which would make
  // over-long rows get more padding instead of none.
  const int padding =
    width < description_column ? description_column - width : 0;
  printf("%*s %s\n", padding, "", info.description);
}

int main(int argc, char *argv[])
{
  auto args = argv_to_vec(argc, argv);

  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
			 CODE_ENVIRONMENT_UTILITY,
			 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
  common_init_finish(g_ceph_context);
  Cycles::init();

  bind_thread_to_cpu(3);
  if (argc == 1) {
    // No test names specified; run all tests.
    for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
      run_test(tests[i]);
    }
  } else {
    // Run only the tests that were specified on the command line.
    for (int i = 1; i < argc; i++) {
      bool found_test = false;
      for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
        if (strcmp(argv[i], tests[j].name) == 0) {
          found_test = true;
          run_test(tests[j]);
          break;
        }
      }
      if (!found_test) {
        int width = printf("%-24s ??", argv[i]);
        printf("%*s No such test\n", 32-width, "");
      }
    }
  }
}