ceph/src/test/perf_local.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
   4  * Copyright (c) 2011-2014 Stanford University
   5  * Copyright (c) 2011 Facebook
   6  *
   7  * Permission to use, copy, modify, and distribute this software for any
   8  * purpose with or without fee is hereby granted, provided that the above
   9  * copyright notice and this permission notice appear in all copies.
  10  *
  11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
  12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
  14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  18  */
  19
  20 // This program contains a collection of low-level performance measurements
  21 // for Ceph, which can be run either individually or altogether.  These
  22 // tests measure performance in a single stand-alone process, not in a cluster
  23 // with multiple servers.  Invoke the program like this:
  24 //
  25 //     Perf test1 test2 ...
  26 //
  27 // test1 and test2 are the names of individual performance measurements to
  28 // run.  If no test names are provided then all of the performance tests
  29 // are run.
  30 //
  31 // To add a new test:
  32 // * Write a function that implements the test.  Use existing test functions
  33 //   as a guideline, and be sure to generate output in the same form as
  34 //   other tests.
  35 // * Create a new entry for the test in the #tests table.
  36 #include <vector>
  37 #include <sched.h>
  38
  39 #include "acconfig.h"
  40 #ifdef HAVE_SSE
  41 #include <xmmintrin.h>
  42 #endif
  43
  44 #include "include/atomic.h"
  45 #include "include/buffer.h"
  46 #include "include/encoding.h"
  47 #include "include/ceph_hash.h"
  48 #include "include/Spinlock.h"
  49 #include "common/ceph_argparse.h"
  50 #include "common/Cycles.h"
  51 #include "common/Cond.h"
  52 #include "common/Mutex.h"
  53 #include "common/Thread.h"
  54 #include "common/Timer.h"
  55 #include "msg/async/Event.h"
  56 #include "global/global_init.h"
  57
  58 #include "test/perf_helper.h"
  59
  60 using namespace ceph;
  61
  62 /**
  63  * Ask the operating system to pin the current thread to a given CPU.
  64  *
  65  * \param cpu
  66  *      Indicates the desired CPU and hyperthread; low order 2 bits
  67  *      specify CPU, next bit specifies hyperthread.
  68  */
  69 void bind_thread_to_cpu(int cpu)
  70 {
  71 #ifdef HAVE_SCHED
  72   cpu_set_t set;
  73   CPU_ZERO(&set);
  74   CPU_SET(cpu, &set);
  75   sched_setaffinity(0, sizeof(set), &set);
  76 #endif
  77 }
  78
  79 /*
  80  * This function just discards its argument. It's used to make it
  81  * appear that data is used,  so that the compiler won't optimize
  82  * away the code we're trying to measure.
  83  *
  84  * \param value
  85  *      Pointer to arbitrary value; it's discarded.
  86  */
  87 void discard(void* value) {
  88   int x = *reinterpret_cast<int*>(value);
  89   if (x == 0x43924776) {
  90     printf("Value was 0x%x\n", x);
  91   }
  92 }
  93
  94 //----------------------------------------------------------------------
  95 // Test functions start here
  96 //----------------------------------------------------------------------
  97
  98 // Measure the cost of atomic_t::compare_and_swap
  99 double atomic_int_cmp()
 100 {
 101   int count = 1000000;
 102   atomic_t value(11);
 103   int test = 11;
 104   uint64_t start = Cycles::rdtsc();
 105   for (int i = 0; i < count; i++) {
 106     value.compare_and_swap(test, test+2);
 107     test += 2;
 108   }
 109   uint64_t stop = Cycles::rdtsc();
 110   // printf("Final value: %d\n", value.load());
 111   return Cycles::to_seconds(stop - start)/count;
 112 }
 113
 114 // Measure the cost of atomic_t::inc
 115 double atomic_int_inc()
 116 {
 117   int count = 1000000;
 118   atomic_t value(11);
 119   uint64_t start = Cycles::rdtsc();
 120   for (int i = 0; i < count; i++) {
 121     value.inc();
 122   }
 123   uint64_t stop = Cycles::rdtsc();
 124   // printf("Final value: %d\n", value.load());
 125   return Cycles::to_seconds(stop - start)/count;
 126 }
 127
 128 // Measure the cost of reading an atomic_t
 129 double atomic_int_read()
 130 {
 131   int count = 1000000;
 132   atomic_t value(11);
 133   int total = 0;
 134   uint64_t start = Cycles::rdtsc();
 135   for (int i = 0; i < count; i++) {
 136     total += value.read();
 137   }
 138   uint64_t stop = Cycles::rdtsc();
 139   // printf("Total: %d\n", total);
 140   return Cycles::to_seconds(stop - start)/count;
 141 }
 142
 143 // Measure the cost of storing a new value in a atomic_t
 144 double atomic_int_set()
 145 {
 146   int count = 1000000;
 147   atomic_t value(11);
 148   uint64_t start = Cycles::rdtsc();
 149   for (int i = 0; i < count; i++) {
 150     value.set(88);
 151   }
 152   uint64_t stop = Cycles::rdtsc();
 153   return Cycles::to_seconds(stop - start)/count;
 154 }
 155
 156 // Measure the cost of acquiring and releasing a mutex in the
 157 // fast case where the mutex is free.
 158 double mutex_nonblock()
 159 {
 160   int count = 1000000;
 161   Mutex m("mutex_nonblock::m");
 162   uint64_t start = Cycles::rdtsc();
 163   for (int i = 0; i < count; i++) {
 164     m.Lock();
 165     m.Unlock();
 166   }
 167   uint64_t stop = Cycles::rdtsc();
 168   return Cycles::to_seconds(stop - start)/count;
 169 }
 170
 171 // Measure the cost of allocating and deallocating a buffer, plus
 172 // appending (logically) one ptr.
 173 double buffer_basic()
 174 {
 175   int count = 1000000;
 176   uint64_t start = Cycles::rdtsc();
 177   bufferptr ptr("abcdefg", 7);
 178   for (int i = 0; i < count; i++) {
 179     bufferlist b;
 180     b.append(ptr, 0, 5);
 181   }
 182   uint64_t stop = Cycles::rdtsc();
 183   return Cycles::to_seconds(stop - start)/count;
 184 }
 185
// Small fixed payload used by the buffer encode/decode benchmarks: four
// ints that are serialized in declaration order.
struct DummyBlock {
  int a = 1, b = 2, c = 3, d = 4;
  // Append a, b, c and d to bl, wrapped in an ENCODE_START/ENCODE_FINISH
  // pair (both version arguments are 1).
  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    ::encode(a, bl);
    ::encode(b, bl);
    ::encode(c, bl);
    ::encode(d, bl);
    ENCODE_FINISH(bl);
  }
  // Read a, b, c and d back from bl in the order encode() wrote them.
  void decode(bufferlist::iterator &bl) {
    DECODE_START(1, bl);
    ::decode(a, bl);
    ::decode(b, bl);
    ::decode(c, bl);
    ::decode(d, bl);
    DECODE_FINISH(bl);
  }
};
// NOTE(review): presumably provides the free ::encode/::decode overloads
// that the benchmarks call on DummyBlock - confirm in include/encoding.h.
WRITE_CLASS_ENCODER(DummyBlock)
 206
 207 // Measure the cost of encoding and decoding a buffer, plus
 208 // allocating space for one chunk.
 209 double buffer_encode_decode()
 210 {
 211   int count = 1000000;
 212   uint64_t start = Cycles::rdtsc();
 213   for (int i = 0; i < count; i++) {
 214     bufferlist b;
 215     DummyBlock dummy_block;
 216     ::encode(dummy_block, b);
 217     bufferlist::iterator iter = b.begin();
 218     ::decode(dummy_block, iter);
 219   }
 220   uint64_t stop = Cycles::rdtsc();
 221   return Cycles::to_seconds(stop - start)/count;
 222 }
 223
 224 // Measure the cost of allocating and deallocating a buffer, plus
 225 // copying in a small block.
 226 double buffer_basic_copy()
 227 {
 228   int count = 1000000;
 229   uint64_t start = Cycles::rdtsc();
 230   for (int i = 0; i < count; i++) {
 231     bufferlist b;
 232     b.append("abcdefg", 6);
 233   }
 234   uint64_t stop = Cycles::rdtsc();
 235   return Cycles::to_seconds(stop - start)/count;
 236 }
 237
 238 // Measure the cost of making a copy of parts of two ptrs.
 239 double buffer_copy()
 240 {
 241   int count = 1000000;
 242   bufferlist b;
 243   b.append("abcde", 5);
 244   b.append("01234", 5);
 245   char copy[10];
 246   uint64_t start = Cycles::rdtsc();
 247   for (int i = 0; i < count; i++) {
 248     b.copy(2, 6, copy);
 249   }
 250   uint64_t stop = Cycles::rdtsc();
 251   return Cycles::to_seconds(stop - start)/count;
 252 }
 253
 254 // Measure the cost of allocating new space by extending the
 255 // bufferlist
 256 double buffer_encode()
 257 {
 258   int count = 100000;
 259   uint64_t total = 0;
 260   for (int i = 0; i < count; i++) {
 261     bufferlist b;
 262     DummyBlock dummy_block;
 263     ::encode(dummy_block, b);
 264     uint64_t start = Cycles::rdtsc();
 265     ::encode(dummy_block, b);
 266     ::encode(dummy_block, b);
 267     ::encode(dummy_block, b);
 268     ::encode(dummy_block, b);
 269     ::encode(dummy_block, b);
 270     ::encode(dummy_block, b);
 271     ::encode(dummy_block, b);
 272     ::encode(dummy_block, b);
 273     ::encode(dummy_block, b);
 274     ::encode(dummy_block, b);
 275     total += Cycles::rdtsc() - start;
 276   }
 277   return Cycles::to_seconds(total)/(count*10);
 278 }
 279
 280 // Measure the cost of retrieving an object from the beginning of a buffer.
 281 double buffer_get_contiguous()
 282 {
 283   int count = 1000000;
 284   int value = 11;
 285   bufferlist b;
 286   b.append((char*)&value, sizeof(value));
 287   int sum = 0;
 288   uint64_t start = Cycles::rdtsc();
 289   for (int i = 0; i < count; i++) {
 290     sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value)));
 291   }
 292   uint64_t stop = Cycles::rdtsc();
 293   return Cycles::to_seconds(stop - start)/count;
 294 }
 295
 296 // Measure the cost of creating an iterator and iterating over 10
 297 // chunks in a buffer.
 298 double buffer_iterator()
 299 {
 300   bufferlist b;
 301   const char s[] = "abcdefghijklmnopqrstuvwxyz";
 302   bufferptr ptr(s, sizeof(s));
 303   for (int i = 0; i < 5; i++) {
 304     b.append(ptr, i, 5);
 305   }
 306   int count = 100000;
 307   int sum = 0;
 308   uint64_t start = Cycles::rdtsc();
 309   for (int i = 0; i < count; i++) {
 310     bufferlist::iterator it = b.begin();
 311     while (!it.end()) {
 312       sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
 313       ++it;
 314     }
 315   }
 316   uint64_t stop = Cycles::rdtsc();
 317   discard(&sum);
 318   return Cycles::to_seconds(stop - start)/count;
 319 }
 320
// Implements the CondPingPong test: a producer (the calling thread) and a
// consumer thread hand control back and forth through one Mutex/Cond pair
// until the consumer has counted `count` hand-offs.
class CondPingPong {
  Mutex mutex;          // guards prod and cons
  Cond cond;            // signalled by both sides after every increment
  int prod;             // number of items produced so far
  int cons;             // number of items consumed so far
  const int count;      // hand-offs to perform (set to 10000 below)

  // Thread wrapper whose body runs CondPingPong::consume() on the owner.
  class Consumer : public Thread {
    CondPingPong *p;
   public:
    explicit Consumer(CondPingPong *p): p(p) {}
    void* entry() override {
      p->consume();
      return 0;
    }
  } consumer;

 public:
  CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}

  // Start the consumer thread, time produce(), then join the consumer.
  // Thread creation happens before the start timestamp and the join after
  // the stop timestamp. Returns the average seconds per hand-off.
  double run() {
    consumer.create("consumer");
    uint64_t start = Cycles::rdtsc();
    produce();
    uint64_t stop = Cycles::rdtsc();
    consumer.join();
    return Cycles::to_seconds(stop - start)/count;
  }

  // Producer side: holding the mutex, wait until the consumer has caught
  // up (cons == prod), then publish one more item and signal.
  void produce() {
    Mutex::Locker l(mutex);
    while (cons < count) {
      while (cons < prod)
        cond.Wait(mutex);
      ++prod;
      cond.Signal();
    }
  }

  // Consumer side: holding the mutex, wait until an item is outstanding
  // (cons != prod), then consume it and signal the producer.
  void consume() {
    Mutex::Locker l(mutex);
    while (cons < count) {
      while (cons == prod)
        cond.Wait(mutex);
      ++cons;
      cond.Signal();
    }
  }
};
 371
 372 // Measure the cost of coordinating between threads using a condition variable.
 373 double cond_ping_pong()
 374 {
 375   return CondPingPong().run();
 376 }
 377
// Measure the cost of a 32-bit divide. Divides don't take a constant
// number of cycles. Values were chosen here semi-randomly to depict a
// fairly expensive scenario. Someone with fancy ALU knowledge could
// probably pick worse values.
//
// Returns the average seconds per divide, or -1 when not compiled for
// i386/x86_64.
double div32()
{
#if defined(__i386__) || defined(__x86_64__)
  int count = 1000000;
  uint64_t start = Cycles::rdtsc();
  // NB: Expect an x86 processor exception if there's overflow.
  uint32_t numeratorHi = 0xa5a5a5a5U;
  uint32_t numeratorLo = 0x55aa55aaU;
  uint32_t divisor = 0xaa55aa55U;
  uint32_t quotient;
  uint32_t remainder;
  for (int i = 0; i < count; i++) {
    // The "a"/"d" constraints supply the low/high halves of the numerator
    // and receive quotient/remainder; operand %4 is the divisor in any
    // general register. __volatile__ keeps the instruction in place even
    // though its outputs are never read.
    __asm__ __volatile__("div %4" :
                         "=a"(quotient), "=d"(remainder) :
                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
                         "cc");
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
#else
  // Sentinel that run_test() reports as an unsupported architecture.
  return -1;
#endif
}
 405
// Measure the cost of a 64-bit divide. Divides don't take a constant
// number of cycles. Values were chosen here semi-randomly to depict a
// fairly expensive scenario. Someone with fancy ALU knowledge could
// probably pick worse values.
//
// Returns the average seconds per divide, or -1 when not compiled for
// x86_64/amd64.
double div64()
{
#if defined(__x86_64__) || defined(__amd64__)
  int count = 1000000;
  // NB: Expect an x86 processor exception if there's overflow.
  uint64_t start = Cycles::rdtsc();
  uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
  uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
  uint64_t divisor = 0xaa55aa55aa55aa55UL;
  uint64_t quotient;
  uint64_t remainder;
  for (int i = 0; i < count; i++) {
    // The "a"/"d" constraints supply the low/high halves of the numerator
    // and receive quotient/remainder; operand %4 is the divisor in any
    // general register. __volatile__ keeps the instruction in place even
    // though its outputs are never read.
    __asm__ __volatile__("divq %4" :
                         "=a"(quotient), "=d"(remainder) :
                         "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
                         "cc");
  }
  uint64_t stop = Cycles::rdtsc();
  return Cycles::to_seconds(stop - start)/count;
#else
  // Sentinel that run_test() reports as an unsupported architecture.
  return -1;
#endif
}
 433
 434 // Measure the cost of calling a non-inlined function.
 435 double function_call()
 436 {
 437   int count = 1000000;
 438   uint64_t x = 0;
 439   uint64_t start = Cycles::rdtsc();
 440   for (int i = 0; i < count; i++) {
 441     x = PerfHelper::plus_one(x);
 442   }
 443   uint64_t stop = Cycles::rdtsc();
 444   return Cycles::to_seconds(stop - start)/count;
 445 }
 446
 447 // Measure the minimum cost of EventCenter::process_events, when there are no
 448 // Pollers and no Timers.
 449 double eventcenter_poll()
 450 {
 451   int count = 1000000;
 452   EventCenter center(g_ceph_context);
 453   center.init(1000, 0, "posix");
 454   center.set_owner();
 455   uint64_t start = Cycles::rdtsc();
 456   for (int i = 0; i < count; i++) {
 457     center.process_events(0);
 458   }
 459   uint64_t stop = Cycles::rdtsc();
 460   return Cycles::to_seconds(stop - start)/count;
 461 }
 462
 463 class CenterWorker : public Thread {
 464   CephContext *cct;
 465   bool done;
 466
 467  public:
 468   EventCenter center;
 469   explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
 470     center.init(100, 0, "posix");
 471   }
 472   void stop() {
 473     done = true;
 474     center.wakeup();
 475   }
 476   void* entry() override {
 477     center.set_owner();
 478     bind_thread_to_cpu(2);
 479     while (!done)
 480       center.process_events(1000);
 481     return 0;
 482   }
 483 };
 484
 485 class CountEvent: public EventCallback {
 486   atomic_t *count;
 487
 488  public:
 489   explicit CountEvent(atomic_t *atomic): count(atomic) {}
 490   void do_request(int id) override {
 491     count->dec();
 492   }
 493 };
 494
 495 double eventcenter_dispatch()
 496 {
 497   int count = 100000;
 498
 499   CenterWorker worker(g_ceph_context);
 500   atomic_t flag(1);
 501   worker.create("evt_center_disp");
 502   EventCallbackRef count_event(new CountEvent(&flag));
 503
 504   worker.center.dispatch_event_external(count_event);
 505   // Start a new thread and wait for it to ready.
 506   while (flag.read())
 507     usleep(100);
 508
 509   uint64_t start = Cycles::rdtsc();
 510   for (int i = 0; i < count; i++) {
 511     flag.set(1);
 512     worker.center.dispatch_event_external(count_event);
 513     while (flag.read())
 514       ;
 515   }
 516   uint64_t stop = Cycles::rdtsc();
 517   worker.stop();
 518   worker.join();
 519   return Cycles::to_seconds(stop - start)/count;
 520 }
 521
 522 // Measure the cost of copying a given number of bytes with memcpy.
 523 double memcpy_shared(size_t size)
 524 {
 525   int count = 1000000;
 526   char src[size], dst[size];
 527
 528   memset(src, 0, sizeof(src));
 529
 530   uint64_t start = Cycles::rdtsc();
 531   for (int i = 0; i < count; i++) {
 532     memcpy(dst, src, size);
 533   }
 534   uint64_t stop = Cycles::rdtsc();
 535   return Cycles::to_seconds(stop - start)/count;
 536 }
 537
// Measure memcpy of 100 bytes; returns seconds per copy.
double memcpy100()
{
  return memcpy_shared(100);
}
 542
// Measure memcpy of 1000 bytes; returns seconds per copy.
double memcpy1000()
{
  return memcpy_shared(1000);
}
 547
// Measure memcpy of 10000 bytes; returns seconds per copy.
double memcpy10000()
{
  return memcpy_shared(10000);
}
 552
 553 // Benchmark rjenkins hashing performance on cached data.
 554 template <int key_length>
 555 double ceph_str_hash_rjenkins()
 556 {
 557   int count = 100000;
 558   char buf[key_length];
 559
 560   uint64_t start = Cycles::rdtsc();
 561   for (int i = 0; i < count; i++)
 562     ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
 563   uint64_t stop = Cycles::rdtsc();
 564
 565   return Cycles::to_seconds(stop - start)/count;
 566 }
 567
 568 // Measure the cost of reading the fine-grain cycle counter.
 569 double rdtsc_test()
 570 {
 571   int count = 1000000;
 572   uint64_t start = Cycles::rdtsc();
 573   uint64_t total = 0;
 574   for (int i = 0; i < count; i++) {
 575     total += Cycles::rdtsc();
 576   }
 577   uint64_t stop = Cycles::rdtsc();
 578   return Cycles::to_seconds(stop - start)/count;
 579 }
 580
 581 // Measure the cost of the Cycles::to_seconds method.
 582 double perf_cycles_to_seconds()
 583 {
 584   int count = 1000000;
 585   double total = 0;
 586   uint64_t cycles = 994261;
 587   uint64_t start = Cycles::rdtsc();
 588   for (int i = 0; i < count; i++) {
 589     total += Cycles::to_seconds(cycles);
 590   }
 591   uint64_t stop = Cycles::rdtsc();
 592   // printf("Result: %.4f\n", total/count);
 593   return Cycles::to_seconds(stop - start)/count;
 594 }
 595
 596 // Measure the cost of the Cylcles::toNanoseconds method.
 597 double perf_cycles_to_nanoseconds()
 598 {
 599   int count = 1000000;
 600   uint64_t total = 0;
 601   uint64_t cycles = 994261;
 602   uint64_t start = Cycles::rdtsc();
 603   for (int i = 0; i < count; i++) {
 604     total += Cycles::to_nanoseconds(cycles);
 605   }
 606   uint64_t stop = Cycles::rdtsc();
 607   // printf("Result: %lu\n", total/count);
 608   return Cycles::to_seconds(stop - start)/count;
 609 }
 610
 611
 612 #ifdef HAVE_SSE
 613 /**
 614  * Prefetch the cache lines containing [object, object + numBytes) into the
 615  * processor's caches.
 616  * The best docs for this are in the Intel instruction set reference under
 617  * PREFETCH.
 618  * \param object
 619  *      The start of the region of memory to prefetch.
 620  * \param num_bytes
 621  *      The size of the region of memory to prefetch.
 622  */
 623 static inline void prefetch(const void *object, uint64_t num_bytes)
 624 {
 625     uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
 626     const char* p = reinterpret_cast<const char*>(object) - offset;
 627     for (uint64_t i = 0; i < offset + num_bytes; i += 64)
 628         _mm_prefetch(p + i, _MM_HINT_T0);
 629 }
 630 #endif
 631
// Measure the cost of the prefetch instruction.
//
// Each of the 10 rounds calls PerfHelper::flush_cache() and then times 16
// prefetch() calls of 64 bytes each, covering the sixteen 64-byte slots of
// `buf` in a scrambled order. Returns the average seconds per prefetch()
// call, or -1 when built without HAVE_SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t total_ticks = 0;
  int count = 10;
  // Never initialised: it is only handed to prefetch(), never read here.
  char buf[16 * 64];
  uint64_t start, stop;

  for (int i = 0; i < count; i++) {
    // Flushing is outside the timed region.
    PerfHelper::flush_cache();
    start = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0],   64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64],  64);
    prefetch(&buf[192], 64);
    stop = Cycles::rdtsc();
    total_ticks += stop - start;
  }
  // Average over the rounds, then over the 16 calls made in each round.
  return Cycles::to_seconds(total_ticks) / count / 16;
#else
  // Sentinel that run_test() reports as an unsupported architecture.
  return -1;
#endif
}
 668
 669 #if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before any
 * instructions that appear before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution from
 * instructions supposed to be executing before the timer starts.
 *
 * Implemented by executing cpuid with eax = 1; the four result registers
 * are captured only to satisfy the asm constraints and are then ignored.
 */
static inline void serialize() {
    uint32_t eax, ebx, ecx, edx;
    __asm volatile("cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (1U));
}
 685 #endif
 686
 687 // Measure the cost of cpuid
 688 double perf_serialize() {
 689 #if defined(__x86_64__)
 690   int count = 1000000;
 691   uint64_t start = Cycles::rdtsc();
 692   for (int i = 0; i < count; i++) {
 693     serialize();
 694   }
 695   uint64_t stop = Cycles::rdtsc();
 696   return Cycles::to_seconds(stop - start)/count;
 697 #else
 698   return -1;
 699 #endif
 700 }
 701
 702 // Measure the cost of an lfence instruction.
 703 double lfence()
 704 {
 705 #ifdef HAVE_SSE2
 706   int count = 1000000;
 707   uint64_t start = Cycles::rdtsc();
 708   for (int i = 0; i < count; i++) {
 709     __asm__ __volatile__("lfence" ::: "memory");
 710   }
 711   uint64_t stop = Cycles::rdtsc();
 712   return Cycles::to_seconds(stop - start)/count;
 713 #else
 714   return -1;
 715 #endif
 716 }
 717
 718 // Measure the cost of an sfence instruction.
 719 double sfence()
 720 {
 721 #ifdef HAVE_SSE
 722   int count = 1000000;
 723   uint64_t start = Cycles::rdtsc();
 724   for (int i = 0; i < count; i++) {
 725     __asm__ __volatile__("sfence" ::: "memory");
 726   }
 727   uint64_t stop = Cycles::rdtsc();
 728   return Cycles::to_seconds(stop - start)/count;
 729 #else
 730   return -1;
 731 #endif
 732 }
 733
 734 // Measure the cost of acquiring and releasing a SpinLock (assuming the
 735 // lock is initially free).
 736 double test_spinlock()
 737 {
 738   int count = 1000000;
 739   Spinlock lock;
 740   uint64_t start = Cycles::rdtsc();
 741   for (int i = 0; i < count; i++) {
 742     lock.lock();
 743     lock.unlock();
 744   }
 745   uint64_t stop = Cycles::rdtsc();
 746   return Cycles::to_seconds(stop - start)/count;
 747 }
 748
 749 // Helper for spawn_thread. This is the main function that the thread executes
 750 // (intentionally empty).
 751 class ThreadHelper : public Thread {
 752   void *entry() override { return 0; }
 753 };
 754
 755 // Measure the cost of start and joining with a thread.
 756 double spawn_thread()
 757 {
 758   int count = 10000;
 759   ThreadHelper thread;
 760   uint64_t start = Cycles::rdtsc();
 761   for (int i = 0; i < count; i++) {
 762     thread.create("thread_helper");
 763     thread.join();
 764   }
 765   uint64_t stop = Cycles::rdtsc();
 766   return Cycles::to_seconds(stop - start)/count;
 767 }
 768
// Context whose completion handler does nothing; perf_timer schedules
// instances of it purely as placeholders.
class FakeContext : public Context {
 public:
  // Intentionally empty: the result code r is ignored.
  void finish(int r) override {}
};
 773
 774 // Measure the cost of starting and stopping a Dispatch::Timer.
 775 double perf_timer()
 776 {
 777   int count = 1000000;
 778   Mutex lock("perf_timer::lock");
 779   SafeTimer timer(g_ceph_context, lock);
 780   FakeContext **c = new FakeContext*[count];
 781   for (int i = 0; i < count; i++) {
 782     c[i] = new FakeContext();
 783   }
 784   uint64_t start = Cycles::rdtsc();
 785   Mutex::Locker l(lock);
 786   for (int i = 0; i < count; i++) {
 787     timer.add_event_after(12345, c[i]);
 788     timer.cancel_event(c[i]);
 789   }
 790   uint64_t stop = Cycles::rdtsc();
 791   delete[] c;
 792   return Cycles::to_seconds(stop - start)/count;
 793 }
 794
 795 // Measure the cost of throwing and catching an int. This uses an integer as
 796 // the value thrown, which is presumably as fast as possible.
 797 double throw_int()
 798 {
 799   int count = 10000;
 800   uint64_t start = Cycles::rdtsc();
 801   for (int i = 0; i < count; i++) {
 802     try {
 803       throw 0;
 804     } catch (int) { // NOLINT
 805       // pass
 806     }
 807   }
 808   uint64_t stop = Cycles::rdtsc();
 809   return Cycles::to_seconds(stop - start)/count;
 810 }
 811
 812 // Measure the cost of throwing and catching an int from a function call.
 813 double throw_int_call()
 814 {
 815   int count = 10000;
 816   uint64_t start = Cycles::rdtsc();
 817   for (int i = 0; i < count; i++) {
 818     try {
 819       PerfHelper::throw_int();
 820     } catch (int) { // NOLINT
 821       // pass
 822     }
 823   }
 824   uint64_t stop = Cycles::rdtsc();
 825   return Cycles::to_seconds(stop - start)/count;
 826 }
 827
 828 // Measure the cost of throwing and catching an Exception. This uses an actual
 829 // exception as the value thrown, which may be slower than throwInt.
 830 double throw_exception()
 831 {
 832   int count = 10000;
 833   uint64_t start = Cycles::rdtsc();
 834   for (int i = 0; i < count; i++) {
 835     try {
 836       throw buffer::end_of_buffer();
 837     } catch (const buffer::end_of_buffer&) {
 838       // pass
 839     }
 840   }
 841   uint64_t stop = Cycles::rdtsc();
 842   return Cycles::to_seconds(stop - start)/count;
 843 }
 844
 845 // Measure the cost of throwing and catching an Exception from a function call.
 846 double throw_exception_call()
 847 {
 848   int count = 10000;
 849   uint64_t start = Cycles::rdtsc();
 850   for (int i = 0; i < count; i++) {
 851     try {
 852       PerfHelper::throw_end_of_buffer();
 853     } catch (const buffer::end_of_buffer&) {
 854       // pass
 855     }
 856   }
 857   uint64_t stop = Cycles::rdtsc();
 858   return Cycles::to_seconds(stop - start)/count;
 859 }
 860
 861 // Measure the cost of pushing a new element on a std::vector, copying
 862 // from the end to an internal element, and popping the end element.
 863 double vector_push_pop()
 864 {
 865   int count = 100000;
 866   std::vector<int> vector;
 867   vector.push_back(1);
 868   vector.push_back(2);
 869   vector.push_back(3);
 870   uint64_t start = Cycles::rdtsc();
 871   for (int i = 0; i < count; i++) {
 872     vector.push_back(i);
 873     vector.push_back(i+1);
 874     vector.push_back(i+2);
 875     vector[2] = vector.back();
 876     vector.pop_back();
 877     vector[0] = vector.back();
 878     vector.pop_back();
 879     vector[1] = vector.back();
 880     vector.pop_back();
 881   }
 882   uint64_t stop = Cycles::rdtsc();
 883   return Cycles::to_seconds(stop - start)/(count*3);
 884 }
 885
 886 // Measure the cost of ceph_clock_now
 887 double perf_ceph_clock_now()
 888 {
 889   int count = 100000;
 890   uint64_t start = Cycles::rdtsc();
 891   for (int i = 0; i < count; i++) {
 892     ceph_clock_now();
 893   }
 894   uint64_t stop = Cycles::rdtsc();
 895   return Cycles::to_seconds(stop - start)/count;
 896 }
 897
// The following struct and table define each performance test in terms of
// a string name and a function that implements the test.
// A test function that returns -1 is reported by run_test() as not
// supported on this architecture.
struct TestInfo {
  const char* name;             // Name of the performance test; this is
                                // what gets typed on the command line to
                                // run the test.
  double (*func)();             // Function that implements the test;
                                // returns the time (in seconds) for each
                                // iteration of that test.
  const char *description;      // Short description of this test (not more
                                // than about 40 characters, so the entire
                                // test output fits on a single line).
};
 911 TestInfo tests[] = {
 912   {"atomic_int_cmp", atomic_int_cmp,
 913     "atomic_t::compare_and_swap"},
 914   {"atomic_int_inc", atomic_int_inc,
 915     "atomic_t::inc"},
 916   {"atomic_int_read", atomic_int_read,
 917     "atomic_t::read"},
 918   {"atomic_int_set", atomic_int_set,
 919     "atomic_t::set"},
 920   {"mutex_nonblock", mutex_nonblock,
 921     "Mutex lock/unlock (no blocking)"},
 922   {"buffer_basic", buffer_basic,
 923     "buffer create, add one ptr, delete"},
 924   {"buffer_encode_decode", buffer_encode_decode,
 925     "buffer create, encode/decode object, delete"},
 926   {"buffer_basic_copy", buffer_basic_copy,
 927     "buffer create, copy small block, delete"},
 928   {"buffer_copy", buffer_copy,
 929     "copy out 2 small ptrs from buffer"},
 930   {"buffer_encode10", buffer_encode,
 931     "buffer encoding 10 structures onto existing ptr"},
 932   {"buffer_get_contiguous", buffer_get_contiguous,
 933     "Buffer::get_contiguous"},
 934   {"buffer_iterator", buffer_iterator,
 935     "iterate over buffer with 5 ptrs"},
 936   {"cond_ping_pong", cond_ping_pong,
 937     "condition variable round-trip"},
 938   {"div32", div32,
 939     "32-bit integer division instruction"},
 940   {"div64", div64,
 941     "64-bit integer division instruction"},
 942   {"function_call", function_call,
 943     "Call a function that has not been inlined"},
 944   {"eventcenter_poll", eventcenter_poll,
 945     "EventCenter::process_events (no timers or events)"},
 946   {"eventcenter_dispatch", eventcenter_dispatch,
 947     "EventCenter::dispatch_event_external latency"},
 948   {"memcpy100", memcpy100,
 949     "Copy 100 bytes with memcpy"},
 950   {"memcpy1000", memcpy1000,
 951     "Copy 1000 bytes with memcpy"},
 952   {"memcpy10000", memcpy10000,
 953     "Copy 10000 bytes with memcpy"},
 954   {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
 955     "rjenkins hash on 16 byte of data"},
 956   {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
 957     "rjenkins hash on 256 bytes of data"},
 958   {"rdtsc", rdtsc_test,
 959     "Read the fine-grain cycle counter"},
 960   {"cycles_to_seconds", perf_cycles_to_seconds,
 961     "Convert a rdtsc result to (double) seconds"},
 962   {"cycles_to_seconds", perf_cycles_to_nanoseconds,
 963     "Convert a rdtsc result to (uint64_t) nanoseconds"},
 964   {"prefetch", perf_prefetch,
 965     "Prefetch instruction"},
 966   {"serialize", perf_serialize,
 967     "serialize instruction"},
 968   {"lfence", lfence,
 969     "Lfence instruction"},
 970   {"sfence", sfence,
 971     "Sfence instruction"},
 972   {"spin_lock", test_spinlock,
 973     "Acquire/release SpinLock"},
 974   {"spawn_thread", spawn_thread,
 975     "Start and stop a thread"},
 976   {"perf_timer", perf_timer,
 977     "Insert and cancel a SafeTimer"},
 978   {"throw_int", throw_int,
 979     "Throw an int"},
 980   {"throw_int_call", throw_int_call,
 981     "Throw an int in a function call"},
 982   {"throw_exception", throw_exception,
 983     "Throw an Exception"},
 984   {"throw_exception_call", throw_exception_call,
 985     "Throw an Exception in a function call"},
 986   {"vector_push_pop", vector_push_pop,
 987     "Push and pop a std::vector"},
 988   {"ceph_clock_now", perf_ceph_clock_now,
 989    "ceph_clock_now function"},
 990 };
 991
 992 /**
 993  * Runs a particular test and prints a one-line result message.
 994  *
 995  * \param info
 996  *      Describes the test to run.
 997  */
 998 void run_test(TestInfo& info)
 999 {
1000   double secs = info.func();
1001   int width = printf("%-24s ", info.name);
1002   if (secs == -1) {
1003     width += printf(" architecture nonsupport ");
1004   } else if (secs < 1.0e-06) {
1005     width += printf("%8.2fns", 1e09*secs);
1006   } else if (secs < 1.0e-03) {
1007     width += printf("%8.2fus", 1e06*secs);
1008   } else if (secs < 1.0) {
1009     width += printf("%8.2fms", 1e03*secs);
1010   } else {
1011     width += printf("%8.2fs", secs);
1012   }
1013   printf("%*s %s\n", 32-width, "", info.description);
1014 }
1015
1016 int main(int argc, char *argv[])
1017 {
1018   vector<const char*> args;
1019   argv_to_vec(argc, (const char **)argv, args);
1020
1021   auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1022                          CODE_ENVIRONMENT_UTILITY, 0);
1023   common_init_finish(g_ceph_context);
1024   Cycles::init();
1025
1026   bind_thread_to_cpu(3);
1027   if (argc == 1) {
1028     // No test names specified; run all tests.
1029     for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1030       run_test(tests[i]);
1031     }
1032   } else {
1033     // Run only the tests that were specified on the command line.
1034     for (int i = 1; i < argc; i++) {
1035       bool found_test = false;
1036       for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1037         if (strcmp(argv[i], tests[j].name) == 0) {
1038           found_test = true;
1039           run_test(tests[j]);
1040           break;
1041         }
1042       }
1043       if (!found_test) {
1044         int width = printf("%-24s ??", argv[i]);
1045         printf("%*s No such test\n", 32-width, "");
1046       }
1047     }
1048   }
1049 }