ceph/src/test/perf_local.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
   4  * Copyright (c) 2011-2014 Stanford University
   5  * Copyright (c) 2011 Facebook
   6  *
   7  * Permission to use, copy, modify, and distribute this software for any
   8  * purpose with or without fee is hereby granted, provided that the above
   9  * copyright notice and this permission notice appear in all copies.
  10  *
  11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
  12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
  14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  18  */
  19
  20 // This program contains a collection of low-level performance measurements
  21 // for Ceph, which can be run either individually or altogether.  These
  22 // tests measure performance in a single stand-alone process, not in a cluster
  23 // with multiple servers.  Invoke the program like this:
  24 //
  25 //     Perf test1 test2 ...
  26 //
  27 // test1 and test2 are the names of individual performance measurements to
  28 // run.  If no test names are provided then all of the performance tests
  29 // are run.
  30 //
  31 // To add a new test:
  32 // * Write a function that implements the test.  Use existing test functions
  33 //   as a guideline, and be sure to generate output in the same form as
  34 //   other tests.
  35 // * Create a new entry for the test in the #tests table.
  36 #include <vector>
  37 #include <sched.h>
  38
  39 #include "acconfig.h"
  40 #ifdef HAVE_SSE
  41 #include <xmmintrin.h>
  42 #endif
  43
  44 #include "include/buffer.h"
  45 #include "include/encoding.h"
  46 #include "include/ceph_hash.h"
  47 #include "include/Spinlock.h"
  48 #include "common/ceph_argparse.h"
  49 #include "common/Cycles.h"
  50 #include "common/Cond.h"
  51 #include "common/Mutex.h"
  52 #include "common/Thread.h"
  53 #include "common/Timer.h"
  54 #include "msg/async/Event.h"
  55 #include "global/global_init.h"
  56
  57 #include "test/perf_helper.h"
  58
  59 #include <atomic>
  60
  61 using namespace ceph;
  62
  63 /**
  64  * Ask the operating system to pin the current thread to a given CPU.
  65  *
  66  * \param cpu
  67  *      Indicates the desired CPU and hyperthread; low order 2 bits
  68  *      specify CPU, next bit specifies hyperthread.
  69  */
  70 void bind_thread_to_cpu(int cpu)
  71 {
  72 #ifdef HAVE_SCHED
  73   cpu_set_t set;
  74   CPU_ZERO(&set);
  75   CPU_SET(cpu, &set);
  76   sched_setaffinity(0, sizeof(set), &set);
  77 #endif
  78 }
  79
  80 /*
  81  * This function just discards its argument. It's used to make it
  82  * appear that data is used,  so that the compiler won't optimize
  83  * away the code we're trying to measure.
  84  *
  85  * \param value
  86  *      Pointer to arbitrary value; it's discarded.
  87  */
  88 void discard(void* value) {
  89   int x = *reinterpret_cast<int*>(value);
  90   if (x == 0x43924776) {
  91     printf("Value was 0x%x\n", x);
  92   }
  93 }
  94
  95 //----------------------------------------------------------------------
  96 // Test functions start here
  97 //----------------------------------------------------------------------
  98
  99 // Measure the cost of atomic compare-and-swap
 100 double atomic_int_cmp()
 101 {
 102   int count = 1000000;
 103   std::atomic<unsigned> value = { 11 };
 104   unsigned int test = 11;
 105   uint64_t start = Cycles::rdtsc();
 106   for (int i = 0; i < count; i++) {
 107     value.compare_exchange_strong(test, test+2);
 108     test += 2;
 109   }
 110   uint64_t stop = Cycles::rdtsc();
 111   // printf("Final value: %d\n", value.load());
 112   return Cycles::to_seconds(stop - start)/count;
 113 }
 114
 115 // Measure the cost of incrementing an atomic
 116 double atomic_int_inc()
 117 {
 118   int count = 1000000;
 119   std::atomic<int64_t> value = { 11 };
 120   uint64_t start = Cycles::rdtsc();
 121   for (int i = 0; i < count; i++) {
 122     value++;
 123   }
 124   uint64_t stop = Cycles::rdtsc();
 125   // printf("Final value: %d\n", value.load());
 126   return Cycles::to_seconds(stop - start)/count;
 127 }
 128
 129 // Measure the cost of reading an atomic
 130 double atomic_int_read()
 131 {
 132   int count = 1000000;
 133   std::atomic<int64_t> value = { 11 };
 134   int total = 0;
 135   uint64_t start = Cycles::rdtsc();
 136   for (int i = 0; i < count; i++) {
 137     total += value;
 138   }
 139   uint64_t stop = Cycles::rdtsc();
 140   // printf("Total: %d\n", total);
 141   return Cycles::to_seconds(stop - start)/count;
 142 }
 143
 144 // Measure the cost of storing a new value in an atomic
 145 double atomic_int_set()
 146 {
 147   int count = 1000000;
 148   std::atomic<int64_t> value = { 11 };
 149   uint64_t start = Cycles::rdtsc();
 150   for (int i = 0; i < count; i++) {
 151     value = 88;
 152   }
 153   uint64_t stop = Cycles::rdtsc();
 154   return Cycles::to_seconds(stop - start)/count;
 155 }
 156
 157 // Measure the cost of acquiring and releasing a mutex in the
 158 // fast case where the mutex is free.
 159 double mutex_nonblock()
 160 {
 161   int count = 1000000;
 162   Mutex m("mutex_nonblock::m");
 163   uint64_t start = Cycles::rdtsc();
 164   for (int i = 0; i < count; i++) {
 165     m.Lock();
 166     m.Unlock();
 167   }
 168   uint64_t stop = Cycles::rdtsc();
 169   return Cycles::to_seconds(stop - start)/count;
 170 }
 171
 172 // Measure the cost of allocating and deallocating a buffer, plus
 173 // appending (logically) one ptr.
 174 double buffer_basic()
 175 {
 176   int count = 1000000;
 177   uint64_t start = Cycles::rdtsc();
 178   bufferptr ptr("abcdefg", 7);
 179   for (int i = 0; i < count; i++) {
 180     bufferlist b;
 181     b.append(ptr, 0, 5);
 182   }
 183   uint64_t stop = Cycles::rdtsc();
 184   return Cycles::to_seconds(stop - start)/count;
 185 }
 186
 187 struct DummyBlock {
 188   int a = 1, b = 2, c = 3, d = 4;
 189   void encode(bufferlist &bl) const {
 190     ENCODE_START(1, 1, bl);
 191     ::encode(a, bl);
 192     ::encode(b, bl);
 193     ::encode(c, bl);
 194     ::encode(d, bl);
 195     ENCODE_FINISH(bl);
 196   }
 197   void decode(bufferlist::iterator &bl) {
 198     DECODE_START(1, bl);
 199     ::decode(a, bl);
 200     ::decode(b, bl);
 201     ::decode(c, bl);
 202     ::decode(d, bl);
 203     DECODE_FINISH(bl);
 204   }
 205 };
 206 WRITE_CLASS_ENCODER(DummyBlock)
 207
 208 // Measure the cost of encoding and decoding a buffer, plus
 209 // allocating space for one chunk.
 210 double buffer_encode_decode()
 211 {
 212   int count = 1000000;
 213   uint64_t start = Cycles::rdtsc();
 214   for (int i = 0; i < count; i++) {
 215     bufferlist b;
 216     DummyBlock dummy_block;
 217     ::encode(dummy_block, b);
 218     bufferlist::iterator iter = b.begin();
 219     ::decode(dummy_block, iter);
 220   }
 221   uint64_t stop = Cycles::rdtsc();
 222   return Cycles::to_seconds(stop - start)/count;
 223 }
 224
 225 // Measure the cost of allocating and deallocating a buffer, plus
 226 // copying in a small block.
 227 double buffer_basic_copy()
 228 {
 229   int count = 1000000;
 230   uint64_t start = Cycles::rdtsc();
 231   for (int i = 0; i < count; i++) {
 232     bufferlist b;
 233     b.append("abcdefg", 6);
 234   }
 235   uint64_t stop = Cycles::rdtsc();
 236   return Cycles::to_seconds(stop - start)/count;
 237 }
 238
 239 // Measure the cost of making a copy of parts of two ptrs.
 240 double buffer_copy()
 241 {
 242   int count = 1000000;
 243   bufferlist b;
 244   b.append("abcde", 5);
 245   b.append("01234", 5);
 246   char copy[10];
 247   uint64_t start = Cycles::rdtsc();
 248   for (int i = 0; i < count; i++) {
 249     b.copy(2, 6, copy);
 250   }
 251   uint64_t stop = Cycles::rdtsc();
 252   return Cycles::to_seconds(stop - start)/count;
 253 }
 254
 255 // Measure the cost of allocating new space by extending the
 256 // bufferlist
 257 double buffer_encode()
 258 {
 259   int count = 100000;
 260   uint64_t total = 0;
 261   for (int i = 0; i < count; i++) {
 262     bufferlist b;
 263     DummyBlock dummy_block;
 264     ::encode(dummy_block, b);
 265     uint64_t start = Cycles::rdtsc();
 266     ::encode(dummy_block, b);
 267     ::encode(dummy_block, b);
 268     ::encode(dummy_block, b);
 269     ::encode(dummy_block, b);
 270     ::encode(dummy_block, b);
 271     ::encode(dummy_block, b);
 272     ::encode(dummy_block, b);
 273     ::encode(dummy_block, b);
 274     ::encode(dummy_block, b);
 275     ::encode(dummy_block, b);
 276     total += Cycles::rdtsc() - start;
 277   }
 278   return Cycles::to_seconds(total)/(count*10);
 279 }
 280
 281 // Measure the cost of retrieving an object from the beginning of a buffer.
 282 double buffer_get_contiguous()
 283 {
 284   int count = 1000000;
 285   int value = 11;
 286   bufferlist b;
 287   b.append((char*)&value, sizeof(value));
 288   int sum = 0;
 289   uint64_t start = Cycles::rdtsc();
 290   for (int i = 0; i < count; i++) {
 291     sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value)));
 292   }
 293   uint64_t stop = Cycles::rdtsc();
 294   return Cycles::to_seconds(stop - start)/count;
 295 }
 296
 297 // Measure the cost of creating an iterator and iterating over 10
 298 // chunks in a buffer.
 299 double buffer_iterator()
 300 {
 301   bufferlist b;
 302   const char s[] = "abcdefghijklmnopqrstuvwxyz";
 303   bufferptr ptr(s, sizeof(s));
 304   for (int i = 0; i < 5; i++) {
 305     b.append(ptr, i, 5);
 306   }
 307   int count = 100000;
 308   int sum = 0;
 309   uint64_t start = Cycles::rdtsc();
 310   for (int i = 0; i < count; i++) {
 311     bufferlist::iterator it = b.begin();
 312     while (!it.end()) {
 313       sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
 314       ++it;
 315     }
 316   }
 317   uint64_t stop = Cycles::rdtsc();
 318   discard(&sum);
 319   return Cycles::to_seconds(stop - start)/count;
 320 }
 321
 322 // Implements the CondPingPong test.
 323 class CondPingPong {
 324   Mutex mutex;
 325   Cond cond;
 326   int prod;
 327   int cons;
 328   const int count;
 329
 330   class Consumer : public Thread {
 331     CondPingPong *p;
 332    public:
 333     explicit Consumer(CondPingPong *p): p(p) {}
 334     void* entry() override {
 335       p->consume();
 336       return 0;
 337     }
 338   } consumer;
 339
 340  public:
 341   CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}
 342
 343   double run() {
 344     consumer.create("consumer");
 345     uint64_t start = Cycles::rdtsc();
 346     produce();
 347     uint64_t stop = Cycles::rdtsc();
 348     consumer.join();
 349     return Cycles::to_seconds(stop - start)/count;
 350   }
 351
 352   void produce() {
 353     Mutex::Locker l(mutex);
 354     while (cons < count) {
 355       while (cons < prod)
 356         cond.Wait(mutex);
 357       ++prod;
 358       cond.Signal();
 359     }
 360   }
 361
 362   void consume() {
 363     Mutex::Locker l(mutex);
 364     while (cons < count) {
 365       while (cons == prod)
 366         cond.Wait(mutex);
 367       ++cons;
 368       cond.Signal();
 369     }
 370   }
 371 };
 372
 373 // Measure the cost of coordinating between threads using a condition variable.
 374 double cond_ping_pong()
 375 {
 376   return CondPingPong().run();
 377 }
 378
 379 // Measure the cost of a 32-bit divide. Divides don't take a constant
 380 // number of cycles. Values were chosen here semi-randomly to depict a
 381 // fairly expensive scenario. Someone with fancy ALU knowledge could
 382 // probably pick worse values.
 383 double div32()
 384 {
 385 #if defined(__i386__) || defined(__x86_64__)
 386   int count = 1000000;
 387   uint64_t start = Cycles::rdtsc();
 388   // NB: Expect an x86 processor exception is there's overflow.
 389   uint32_t numeratorHi = 0xa5a5a5a5U;
 390   uint32_t numeratorLo = 0x55aa55aaU;
 391   uint32_t divisor = 0xaa55aa55U;
 392   uint32_t quotient;
 393   uint32_t remainder;
 394   for (int i = 0; i < count; i++) {
 395     __asm__ __volatile__("div %4" :
 396                          "=a"(quotient), "=d"(remainder) :
 397                          "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
 398                          "cc");
 399   }
 400   uint64_t stop = Cycles::rdtsc();
 401   return Cycles::to_seconds(stop - start)/count;
 402 #else
 403   return -1;
 404 #endif
 405 }
 406
 407 // Measure the cost of a 64-bit divide. Divides don't take a constant
 408 // number of cycles. Values were chosen here semi-randomly to depict a
 409 // fairly expensive scenario. Someone with fancy ALU knowledge could
 410 // probably pick worse values.
 411 double div64()
 412 {
 413 #if defined(__x86_64__) || defined(__amd64__)
 414   int count = 1000000;
 415   // NB: Expect an x86 processor exception is there's overflow.
 416   uint64_t start = Cycles::rdtsc();
 417   uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
 418   uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
 419   uint64_t divisor = 0xaa55aa55aa55aa55UL;
 420   uint64_t quotient;
 421   uint64_t remainder;
 422   for (int i = 0; i < count; i++) {
 423     __asm__ __volatile__("divq %4" :
 424                          "=a"(quotient), "=d"(remainder) :
 425                          "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
 426                          "cc");
 427   }
 428   uint64_t stop = Cycles::rdtsc();
 429   return Cycles::to_seconds(stop - start)/count;
 430 #else
 431   return -1;
 432 #endif
 433 }
 434
 435 // Measure the cost of calling a non-inlined function.
 436 double function_call()
 437 {
 438   int count = 1000000;
 439   uint64_t x = 0;
 440   uint64_t start = Cycles::rdtsc();
 441   for (int i = 0; i < count; i++) {
 442     x = PerfHelper::plus_one(x);
 443   }
 444   uint64_t stop = Cycles::rdtsc();
 445   return Cycles::to_seconds(stop - start)/count;
 446 }
 447
 448 // Measure the minimum cost of EventCenter::process_events, when there are no
 449 // Pollers and no Timers.
 450 double eventcenter_poll()
 451 {
 452   int count = 1000000;
 453   EventCenter center(g_ceph_context);
 454   center.init(1000, 0, "posix");
 455   center.set_owner();
 456   uint64_t start = Cycles::rdtsc();
 457   for (int i = 0; i < count; i++) {
 458     center.process_events(0);
 459   }
 460   uint64_t stop = Cycles::rdtsc();
 461   return Cycles::to_seconds(stop - start)/count;
 462 }
 463
 464 class CenterWorker : public Thread {
 465   CephContext *cct;
 466   bool done;
 467
 468  public:
 469   EventCenter center;
 470   explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
 471     center.init(100, 0, "posix");
 472   }
 473   void stop() {
 474     done = true;
 475     center.wakeup();
 476   }
 477   void* entry() override {
 478     center.set_owner();
 479     bind_thread_to_cpu(2);
 480     while (!done)
 481       center.process_events(1000);
 482     return 0;
 483   }
 484 };
 485
 486 class CountEvent: public EventCallback {
 487   std::atomic<int64_t> *count;
 488
 489  public:
 490   explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
 491   void do_request(int id) override {
 492     (*count)--;
 493   }
 494 };
 495
 496 double eventcenter_dispatch()
 497 {
 498   int count = 100000;
 499
 500   CenterWorker worker(g_ceph_context);
 501   std::atomic<int64_t> flag = { 1 };
 502   worker.create("evt_center_disp");
 503   EventCallbackRef count_event(new CountEvent(&flag));
 504
 505   worker.center.dispatch_event_external(count_event);
 506   // Start a new thread and wait for it to ready.
 507   while (flag)
 508     usleep(100);
 509
 510   uint64_t start = Cycles::rdtsc();
 511   for (int i = 0; i < count; i++) {
 512     flag = 1;
 513     worker.center.dispatch_event_external(count_event);
 514     while (flag)
 515       ;
 516   }
 517   uint64_t stop = Cycles::rdtsc();
 518   worker.stop();
 519   worker.join();
 520   return Cycles::to_seconds(stop - start)/count;
 521 }
 522
 523 // Measure the cost of copying a given number of bytes with memcpy.
 524 double memcpy_shared(size_t size)
 525 {
 526   int count = 1000000;
 527   char src[size], dst[size];
 528
 529   memset(src, 0, sizeof(src));
 530
 531   uint64_t start = Cycles::rdtsc();
 532   for (int i = 0; i < count; i++) {
 533     memcpy(dst, src, size);
 534   }
 535   uint64_t stop = Cycles::rdtsc();
 536   return Cycles::to_seconds(stop - start)/count;
 537 }
 538
 539 double memcpy100()
 540 {
 541   return memcpy_shared(100);
 542 }
 543
 544 double memcpy1000()
 545 {
 546   return memcpy_shared(1000);
 547 }
 548
 549 double memcpy10000()
 550 {
 551   return memcpy_shared(10000);
 552 }
 553
 554 // Benchmark rjenkins hashing performance on cached data.
 555 template <int key_length>
 556 double ceph_str_hash_rjenkins()
 557 {
 558   int count = 100000;
 559   char buf[key_length];
 560
 561   uint64_t start = Cycles::rdtsc();
 562   for (int i = 0; i < count; i++)
 563     ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
 564   uint64_t stop = Cycles::rdtsc();
 565
 566   return Cycles::to_seconds(stop - start)/count;
 567 }
 568
 569 // Measure the cost of reading the fine-grain cycle counter.
 570 double rdtsc_test()
 571 {
 572   int count = 1000000;
 573   uint64_t start = Cycles::rdtsc();
 574   uint64_t total = 0;
 575   for (int i = 0; i < count; i++) {
 576     total += Cycles::rdtsc();
 577   }
 578   uint64_t stop = Cycles::rdtsc();
 579   return Cycles::to_seconds(stop - start)/count;
 580 }
 581
 582 // Measure the cost of the Cycles::to_seconds method.
 583 double perf_cycles_to_seconds()
 584 {
 585   int count = 1000000;
 586   double total = 0;
 587   uint64_t cycles = 994261;
 588   uint64_t start = Cycles::rdtsc();
 589   for (int i = 0; i < count; i++) {
 590     total += Cycles::to_seconds(cycles);
 591   }
 592   uint64_t stop = Cycles::rdtsc();
 593   // printf("Result: %.4f\n", total/count);
 594   return Cycles::to_seconds(stop - start)/count;
 595 }
 596
 597 // Measure the cost of the Cylcles::toNanoseconds method.
 598 double perf_cycles_to_nanoseconds()
 599 {
 600   int count = 1000000;
 601   uint64_t total = 0;
 602   uint64_t cycles = 994261;
 603   uint64_t start = Cycles::rdtsc();
 604   for (int i = 0; i < count; i++) {
 605     total += Cycles::to_nanoseconds(cycles);
 606   }
 607   uint64_t stop = Cycles::rdtsc();
 608   // printf("Result: %lu\n", total/count);
 609   return Cycles::to_seconds(stop - start)/count;
 610 }
 611
 612
 613 #ifdef HAVE_SSE
 614 /**
 615  * Prefetch the cache lines containing [object, object + numBytes) into the
 616  * processor's caches.
 617  * The best docs for this are in the Intel instruction set reference under
 618  * PREFETCH.
 619  * \param object
 620  *      The start of the region of memory to prefetch.
 621  * \param num_bytes
 622  *      The size of the region of memory to prefetch.
 623  */
 624 static inline void prefetch(const void *object, uint64_t num_bytes)
 625 {
 626     uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
 627     const char* p = reinterpret_cast<const char*>(object) - offset;
 628     for (uint64_t i = 0; i < offset + num_bytes; i += 64)
 629         _mm_prefetch(p + i, _MM_HINT_T0);
 630 }
 631 #endif
 632
 633 // Measure the cost of the prefetch instruction.
 634 double perf_prefetch()
 635 {
 636 #ifdef HAVE_SSE
 637   uint64_t total_ticks = 0;
 638   int count = 10;
 639   char buf[16 * 64];
 640   uint64_t start, stop;
 641
 642   for (int i = 0; i < count; i++) {
 643     PerfHelper::flush_cache();
 644     start = Cycles::rdtsc();
 645     prefetch(&buf[576], 64);
 646     prefetch(&buf[0],   64);
 647     prefetch(&buf[512], 64);
 648     prefetch(&buf[960], 64);
 649     prefetch(&buf[640], 64);
 650     prefetch(&buf[896], 64);
 651     prefetch(&buf[256], 64);
 652     prefetch(&buf[704], 64);
 653     prefetch(&buf[320], 64);
 654     prefetch(&buf[384], 64);
 655     prefetch(&buf[128], 64);
 656     prefetch(&buf[448], 64);
 657     prefetch(&buf[768], 64);
 658     prefetch(&buf[832], 64);
 659     prefetch(&buf[64],  64);
 660     prefetch(&buf[192], 64);
 661     stop = Cycles::rdtsc();
 662     total_ticks += stop - start;
 663   }
 664   return Cycles::to_seconds(total_ticks) / count / 16;
 665 #else
 666   return -1;
 667 #endif
 668 }
 669
 670 #if defined(__x86_64__)
 671 /**
 672  * This function is used to seralize machine instructions so that no
 673  * instructions that appear after it in the current thread can run before any
 674  * instructions that appear before it.
 675  *
 676  * It is useful for putting around rdpmc instructions (to pinpoint cache
 677  * misses) as well as before rdtsc instructions, to prevent time pollution from
 678  * instructions supposed to be executing before the timer starts.
 679  */
 680 static inline void serialize() {
 681     uint32_t eax, ebx, ecx, edx;
 682     __asm volatile("cpuid"
 683         : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
 684         : "a" (1U));
 685 }
 686 #endif
 687
 688 // Measure the cost of cpuid
 689 double perf_serialize() {
 690 #if defined(__x86_64__)
 691   int count = 1000000;
 692   uint64_t start = Cycles::rdtsc();
 693   for (int i = 0; i < count; i++) {
 694     serialize();
 695   }
 696   uint64_t stop = Cycles::rdtsc();
 697   return Cycles::to_seconds(stop - start)/count;
 698 #else
 699   return -1;
 700 #endif
 701 }
 702
 703 // Measure the cost of an lfence instruction.
 704 double lfence()
 705 {
 706 #ifdef HAVE_SSE2
 707   int count = 1000000;
 708   uint64_t start = Cycles::rdtsc();
 709   for (int i = 0; i < count; i++) {
 710     __asm__ __volatile__("lfence" ::: "memory");
 711   }
 712   uint64_t stop = Cycles::rdtsc();
 713   return Cycles::to_seconds(stop - start)/count;
 714 #else
 715   return -1;
 716 #endif
 717 }
 718
 719 // Measure the cost of an sfence instruction.
 720 double sfence()
 721 {
 722 #ifdef HAVE_SSE
 723   int count = 1000000;
 724   uint64_t start = Cycles::rdtsc();
 725   for (int i = 0; i < count; i++) {
 726     __asm__ __volatile__("sfence" ::: "memory");
 727   }
 728   uint64_t stop = Cycles::rdtsc();
 729   return Cycles::to_seconds(stop - start)/count;
 730 #else
 731   return -1;
 732 #endif
 733 }
 734
 735 // Measure the cost of acquiring and releasing a SpinLock (assuming the
 736 // lock is initially free).
 737 double test_spinlock()
 738 {
 739   int count = 1000000;
 740   Spinlock lock;
 741   uint64_t start = Cycles::rdtsc();
 742   for (int i = 0; i < count; i++) {
 743     lock.lock();
 744     lock.unlock();
 745   }
 746   uint64_t stop = Cycles::rdtsc();
 747   return Cycles::to_seconds(stop - start)/count;
 748 }
 749
 750 // Helper for spawn_thread. This is the main function that the thread executes
 751 // (intentionally empty).
 752 class ThreadHelper : public Thread {
 753   void *entry() override { return 0; }
 754 };
 755
 756 // Measure the cost of start and joining with a thread.
 757 double spawn_thread()
 758 {
 759   int count = 10000;
 760   ThreadHelper thread;
 761   uint64_t start = Cycles::rdtsc();
 762   for (int i = 0; i < count; i++) {
 763     thread.create("thread_helper");
 764     thread.join();
 765   }
 766   uint64_t stop = Cycles::rdtsc();
 767   return Cycles::to_seconds(stop - start)/count;
 768 }
 769
 770 class FakeContext : public Context {
 771  public:
 772   void finish(int r) override {}
 773 };
 774
 775 // Measure the cost of starting and stopping a Dispatch::Timer.
 776 double perf_timer()
 777 {
 778   int count = 1000000;
 779   Mutex lock("perf_timer::lock");
 780   SafeTimer timer(g_ceph_context, lock);
 781   FakeContext **c = new FakeContext*[count];
 782   for (int i = 0; i < count; i++) {
 783     c[i] = new FakeContext();
 784   }
 785   uint64_t start = Cycles::rdtsc();
 786   Mutex::Locker l(lock);
 787   for (int i = 0; i < count; i++) {
 788     if (timer.add_event_after(12345, c[i])) {
 789       timer.cancel_event(c[i]);
 790     }
 791   }
 792   uint64_t stop = Cycles::rdtsc();
 793   delete[] c;
 794   return Cycles::to_seconds(stop - start)/count;
 795 }
 796
 797 // Measure the cost of throwing and catching an int. This uses an integer as
 798 // the value thrown, which is presumably as fast as possible.
 799 double throw_int()
 800 {
 801   int count = 10000;
 802   uint64_t start = Cycles::rdtsc();
 803   for (int i = 0; i < count; i++) {
 804     try {
 805       throw 0;
 806     } catch (int) { // NOLINT
 807       // pass
 808     }
 809   }
 810   uint64_t stop = Cycles::rdtsc();
 811   return Cycles::to_seconds(stop - start)/count;
 812 }
 813
 814 // Measure the cost of throwing and catching an int from a function call.
 815 double throw_int_call()
 816 {
 817   int count = 10000;
 818   uint64_t start = Cycles::rdtsc();
 819   for (int i = 0; i < count; i++) {
 820     try {
 821       PerfHelper::throw_int();
 822     } catch (int) { // NOLINT
 823       // pass
 824     }
 825   }
 826   uint64_t stop = Cycles::rdtsc();
 827   return Cycles::to_seconds(stop - start)/count;
 828 }
 829
 830 // Measure the cost of throwing and catching an Exception. This uses an actual
 831 // exception as the value thrown, which may be slower than throwInt.
 832 double throw_exception()
 833 {
 834   int count = 10000;
 835   uint64_t start = Cycles::rdtsc();
 836   for (int i = 0; i < count; i++) {
 837     try {
 838       throw buffer::end_of_buffer();
 839     } catch (const buffer::end_of_buffer&) {
 840       // pass
 841     }
 842   }
 843   uint64_t stop = Cycles::rdtsc();
 844   return Cycles::to_seconds(stop - start)/count;
 845 }
 846
 847 // Measure the cost of throwing and catching an Exception from a function call.
 848 double throw_exception_call()
 849 {
 850   int count = 10000;
 851   uint64_t start = Cycles::rdtsc();
 852   for (int i = 0; i < count; i++) {
 853     try {
 854       PerfHelper::throw_end_of_buffer();
 855     } catch (const buffer::end_of_buffer&) {
 856       // pass
 857     }
 858   }
 859   uint64_t stop = Cycles::rdtsc();
 860   return Cycles::to_seconds(stop - start)/count;
 861 }
 862
 863 // Measure the cost of pushing a new element on a std::vector, copying
 864 // from the end to an internal element, and popping the end element.
 865 double vector_push_pop()
 866 {
 867   int count = 100000;
 868   std::vector<int> vector;
 869   vector.push_back(1);
 870   vector.push_back(2);
 871   vector.push_back(3);
 872   uint64_t start = Cycles::rdtsc();
 873   for (int i = 0; i < count; i++) {
 874     vector.push_back(i);
 875     vector.push_back(i+1);
 876     vector.push_back(i+2);
 877     vector[2] = vector.back();
 878     vector.pop_back();
 879     vector[0] = vector.back();
 880     vector.pop_back();
 881     vector[1] = vector.back();
 882     vector.pop_back();
 883   }
 884   uint64_t stop = Cycles::rdtsc();
 885   return Cycles::to_seconds(stop - start)/(count*3);
 886 }
 887
 888 // Measure the cost of ceph_clock_now
 889 double perf_ceph_clock_now()
 890 {
 891   int count = 100000;
 892   uint64_t start = Cycles::rdtsc();
 893   for (int i = 0; i < count; i++) {
 894     ceph_clock_now();
 895   }
 896   uint64_t stop = Cycles::rdtsc();
 897   return Cycles::to_seconds(stop - start)/count;
 898 }
 899
 900 // The following struct and table define each performance test in terms of
 901 // a string name and a function that implements the test.
 902 struct TestInfo {
 903   const char* name;             // Name of the performance test; this is
 904                                 // what gets typed on the command line to
 905                                 // run the test.
 906   double (*func)();             // Function that implements the test;
 907                                 // returns the time (in seconds) for each
 908                                 // iteration of that test.
 909   const char *description;      // Short description of this test (not more
 910                                 // than about 40 characters, so the entire
 911                                 // test output fits on a single line).
 912 };
 913 TestInfo tests[] = {
 914   {"atomic_int_cmp", atomic_int_cmp,
 915     "atomic_t::compare_and_swap"},
 916   {"atomic_int_inc", atomic_int_inc,
 917     "atomic_t::inc"},
 918   {"atomic_int_read", atomic_int_read,
 919     "atomic_t::read"},
 920   {"atomic_int_set", atomic_int_set,
 921     "atomic_t::set"},
 922   {"mutex_nonblock", mutex_nonblock,
 923     "Mutex lock/unlock (no blocking)"},
 924   {"buffer_basic", buffer_basic,
 925     "buffer create, add one ptr, delete"},
 926   {"buffer_encode_decode", buffer_encode_decode,
 927     "buffer create, encode/decode object, delete"},
 928   {"buffer_basic_copy", buffer_basic_copy,
 929     "buffer create, copy small block, delete"},
 930   {"buffer_copy", buffer_copy,
 931     "copy out 2 small ptrs from buffer"},
 932   {"buffer_encode10", buffer_encode,
 933     "buffer encoding 10 structures onto existing ptr"},
 934   {"buffer_get_contiguous", buffer_get_contiguous,
 935     "Buffer::get_contiguous"},
 936   {"buffer_iterator", buffer_iterator,
 937     "iterate over buffer with 5 ptrs"},
 938   {"cond_ping_pong", cond_ping_pong,
 939     "condition variable round-trip"},
 940   {"div32", div32,
 941     "32-bit integer division instruction"},
 942   {"div64", div64,
 943     "64-bit integer division instruction"},
 944   {"function_call", function_call,
 945     "Call a function that has not been inlined"},
 946   {"eventcenter_poll", eventcenter_poll,
 947     "EventCenter::process_events (no timers or events)"},
 948   {"eventcenter_dispatch", eventcenter_dispatch,
 949     "EventCenter::dispatch_event_external latency"},
 950   {"memcpy100", memcpy100,
 951     "Copy 100 bytes with memcpy"},
 952   {"memcpy1000", memcpy1000,
 953     "Copy 1000 bytes with memcpy"},
 954   {"memcpy10000", memcpy10000,
 955     "Copy 10000 bytes with memcpy"},
 956   {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
 957     "rjenkins hash on 16 byte of data"},
 958   {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
 959     "rjenkins hash on 256 bytes of data"},
 960   {"rdtsc", rdtsc_test,
 961     "Read the fine-grain cycle counter"},
 962   {"cycles_to_seconds", perf_cycles_to_seconds,
 963     "Convert a rdtsc result to (double) seconds"},
 964   {"cycles_to_seconds", perf_cycles_to_nanoseconds,
 965     "Convert a rdtsc result to (uint64_t) nanoseconds"},
 966   {"prefetch", perf_prefetch,
 967     "Prefetch instruction"},
 968   {"serialize", perf_serialize,
 969     "serialize instruction"},
 970   {"lfence", lfence,
 971     "Lfence instruction"},
 972   {"sfence", sfence,
 973     "Sfence instruction"},
 974   {"spin_lock", test_spinlock,
 975     "Acquire/release SpinLock"},
 976   {"spawn_thread", spawn_thread,
 977     "Start and stop a thread"},
 978   {"perf_timer", perf_timer,
 979     "Insert and cancel a SafeTimer"},
 980   {"throw_int", throw_int,
 981     "Throw an int"},
 982   {"throw_int_call", throw_int_call,
 983     "Throw an int in a function call"},
 984   {"throw_exception", throw_exception,
 985     "Throw an Exception"},
 986   {"throw_exception_call", throw_exception_call,
 987     "Throw an Exception in a function call"},
 988   {"vector_push_pop", vector_push_pop,
 989     "Push and pop a std::vector"},
 990   {"ceph_clock_now", perf_ceph_clock_now,
 991    "ceph_clock_now function"},
 992 };
 993
 994 /**
 995  * Runs a particular test and prints a one-line result message.
 996  *
 997  * \param info
 998  *      Describes the test to run.
 999  */
1000 void run_test(TestInfo& info)
1001 {
1002   double secs = info.func();
1003   int width = printf("%-24s ", info.name);
1004   if (secs == -1) {
1005     width += printf(" architecture nonsupport ");
1006   } else if (secs < 1.0e-06) {
1007     width += printf("%8.2fns", 1e09*secs);
1008   } else if (secs < 1.0e-03) {
1009     width += printf("%8.2fus", 1e06*secs);
1010   } else if (secs < 1.0) {
1011     width += printf("%8.2fms", 1e03*secs);
1012   } else {
1013     width += printf("%8.2fs", secs);
1014   }
1015   printf("%*s %s\n", 32-width, "", info.description);
1016 }
1017
1018 int main(int argc, char *argv[])
1019 {
1020   vector<const char*> args;
1021   argv_to_vec(argc, (const char **)argv, args);
1022
1023   auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1024                          CODE_ENVIRONMENT_UTILITY, 0);
1025   common_init_finish(g_ceph_context);
1026   Cycles::init();
1027
1028   bind_thread_to_cpu(3);
1029   if (argc == 1) {
1030     // No test names specified; run all tests.
1031     for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1032       run_test(tests[i]);
1033     }
1034   } else {
1035     // Run only the tests that were specified on the command line.
1036     for (int i = 1; i < argc; i++) {
1037       bool found_test = false;
1038       for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1039         if (strcmp(argv[i], tests[j].name) == 0) {
1040           found_test = true;
1041           run_test(tests[j]);
1042           break;
1043         }
1044       }
1045       if (!found_test) {
1046         int width = printf("%-24s ??", argv[i]);
1047         printf("%*s No such test\n", 32-width, "");
1048       }
1049     }
1050   }
1051 }