1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20 // This program contains a collection of low-level performance measurements
21 // for Ceph, which can be run either individually or all together. These
22 // tests measure performance in a single stand-alone process, not in a cluster
23 // with multiple servers. Invoke the program like this:
24 //
25 //     ceph_perf_local test1 test2 ...
26 //
27 // test1 and test2 are the names of individual performance measurements to
28 // run. If no test names are provided then all of the performance tests
29 // are run.
30 //
31 // To add a new test:
32 // * Write a function that implements the test. Use existing test functions
33 //      as a guideline, and be sure to return the average time per iteration
34 //      in seconds, as the other tests do.
35 // * Create a new entry for the test in the #tests table.
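//
// As a rough illustration (the test name, body, and description below are
// made up for this comment and are not part of the program), a new test
// usually pairs a timing function with a row in the tests[] table:
//
//   // Measure the cost of some operation.
//   double example_op()
//   {
//     int count = 1000000;
//     uint64_t start = Cycles::rdtsc();
//     for (int i = 0; i < count; i++) {
//       // ... the operation being measured ...
//     }
//     uint64_t stop = Cycles::rdtsc();
//     return Cycles::to_seconds(stop - start)/count;
//   }
//
//   {"example_op", example_op, "one-line description of example_op"},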
36 #include <vector>
37 #include <sched.h>
38
39 #include "acconfig.h"
40 #ifdef HAVE_SSE
41 #include <xmmintrin.h>
42 #endif
43
44 #include "include/buffer.h"
45 #include "include/encoding.h"
46 #include "include/ceph_hash.h"
47 #include "include/spinlock.h"
48 #include "common/ceph_argparse.h"
49 #include "common/Cycles.h"
50 #include "common/Cond.h"
51 #include "common/ceph_mutex.h"
52 #include "common/Thread.h"
53 #include "common/Timer.h"
54 #include "msg/async/Event.h"
55 #include "global/global_init.h"
56
57 #include "test/perf_helper.h"
58
59 #include <atomic>
60
61 using namespace ceph;
62
63 /**
64 * Ask the operating system to pin the current thread to a given CPU.
65 *
66 * \param cpu
67  *      The logical CPU that the current thread should be pinned to, as
68  *      passed to sched_setaffinity().
69 */
70 void bind_thread_to_cpu(int cpu)
71 {
72 #ifdef HAVE_SCHED
73 cpu_set_t set;
74 CPU_ZERO(&set);
75 CPU_SET(cpu, &set);
76 sched_setaffinity(0, sizeof(set), &set);
77 #endif
78 }
79
80 /*
81 * This function just discards its argument. It's used to make it
82 * appear that data is used, so that the compiler won't optimize
83 * away the code we're trying to measure.
84 *
85 * \param value
86 * Pointer to arbitrary value; it's discarded.
87 */
88 void discard(void* value) {
89 int x = *reinterpret_cast<int*>(value);
90 if (x == 0x43924776) {
91 printf("Value was 0x%x\n", x);
92 }
93 }
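
// Typical use of discard(): after a timing loop accumulates its work into a
// local variable (for example the `sum` in buffer_iterator below), pass the
// variable's address to discard() so the compiler cannot drop the loop as
// dead code.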
94
95 //----------------------------------------------------------------------
96 // Test functions start here
97 //----------------------------------------------------------------------
98
99 // Measure the cost of atomic compare-and-swap
100 double atomic_int_cmp()
101 {
102 int count = 1000000;
103 std::atomic<unsigned> value = { 11 };
104 unsigned int test = 11;
105 uint64_t start = Cycles::rdtsc();
106 for (int i = 0; i < count; i++) {
107 value.compare_exchange_strong(test, test+2);
108 test += 2;
109 }
110 uint64_t stop = Cycles::rdtsc();
111 // printf("Final value: %d\n", value.load());
112 return Cycles::to_seconds(stop - start)/count;
113 }
114
115 // Measure the cost of incrementing an atomic
116 double atomic_int_inc()
117 {
118 int count = 1000000;
119 std::atomic<int64_t> value = { 11 };
120 uint64_t start = Cycles::rdtsc();
121 for (int i = 0; i < count; i++) {
122 value++;
123 }
124 uint64_t stop = Cycles::rdtsc();
125 // printf("Final value: %d\n", value.load());
126 return Cycles::to_seconds(stop - start)/count;
127 }
128
129 // Measure the cost of reading an atomic
130 double atomic_int_read()
131 {
132 int count = 1000000;
133 std::atomic<int64_t> value = { 11 };
134 int total = 0;
135 uint64_t start = Cycles::rdtsc();
136 for (int i = 0; i < count; i++) {
137 total += value;
138 }
139 uint64_t stop = Cycles::rdtsc();
140 // printf("Total: %d\n", total);
141 return Cycles::to_seconds(stop - start)/count;
142 }
143
144 // Measure the cost of storing a new value in an atomic
145 double atomic_int_set()
146 {
147 int count = 1000000;
148 std::atomic<int64_t> value = { 11 };
149 uint64_t start = Cycles::rdtsc();
150 for (int i = 0; i < count; i++) {
151 value = 88;
152 }
153 uint64_t stop = Cycles::rdtsc();
154 return Cycles::to_seconds(stop - start)/count;
155 }
156
157 // Measure the cost of acquiring and releasing a mutex in the
158 // fast case where the mutex is free.
159 double mutex_nonblock()
160 {
161 int count = 1000000;
162 ceph::mutex m = ceph::make_mutex("mutex_nonblock::m");
163 uint64_t start = Cycles::rdtsc();
164 for (int i = 0; i < count; i++) {
165 m.lock();
166 m.unlock();
167 }
168 uint64_t stop = Cycles::rdtsc();
169 return Cycles::to_seconds(stop - start)/count;
170 }
171
172 // Measure the cost of allocating and deallocating a buffer, plus
173 // appending (logically) one ptr.
174 double buffer_basic()
175 {
176 int count = 1000000;
177 uint64_t start = Cycles::rdtsc();
178 bufferptr ptr("abcdefg", 7);
179 for (int i = 0; i < count; i++) {
180 bufferlist b;
181 b.append(ptr, 0, 5);
182 }
183 uint64_t stop = Cycles::rdtsc();
184 return Cycles::to_seconds(stop - start)/count;
185 }
186
187 struct DummyBlock {
188 int a = 1, b = 2, c = 3, d = 4;
189 void encode(bufferlist &bl) const {
190 ENCODE_START(1, 1, bl);
191 encode(a, bl);
192 encode(b, bl);
193 encode(c, bl);
194 encode(d, bl);
195 ENCODE_FINISH(bl);
196 }
197 void decode(bufferlist::const_iterator &bl) {
198 DECODE_START(1, bl);
199 decode(a, bl);
200 decode(b, bl);
201 decode(c, bl);
202 decode(d, bl);
203 DECODE_FINISH(bl);
204 }
205 };
206 WRITE_CLASS_ENCODER(DummyBlock)
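
// WRITE_CLASS_ENCODER generates the free-standing encode()/decode() wrapper
// functions for DummyBlock, which is what lets the tests below write a block
// with encode(dummy_block, b) and read it back through a const_iterator.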
207
208 // Measure the cost of encoding and decoding a buffer, plus
209 // allocating space for one chunk.
210 double buffer_encode_decode()
211 {
212 int count = 1000000;
213 uint64_t start = Cycles::rdtsc();
214 for (int i = 0; i < count; i++) {
215 bufferlist b;
216 DummyBlock dummy_block;
217 encode(dummy_block, b);
218 auto iter = b.cbegin();
219 decode(dummy_block, iter);
220 }
221 uint64_t stop = Cycles::rdtsc();
222 return Cycles::to_seconds(stop - start)/count;
223 }
224
225 // Measure the cost of allocating and deallocating a buffer, plus
226 // copying in a small block.
227 double buffer_basic_copy()
228 {
229 int count = 1000000;
230 uint64_t start = Cycles::rdtsc();
231 for (int i = 0; i < count; i++) {
232 bufferlist b;
233 b.append("abcdefg", 6);
234 }
235 uint64_t stop = Cycles::rdtsc();
236 return Cycles::to_seconds(stop - start)/count;
237 }
238
239 // Measure the cost of making a copy of parts of two ptrs.
240 double buffer_copy()
241 {
242 int count = 1000000;
243 bufferlist b;
244 b.append("abcde", 5);
245 b.append("01234", 5);
246 char copy[10];
247 uint64_t start = Cycles::rdtsc();
248 for (int i = 0; i < count; i++) {
249 b.cbegin(2).copy(6, copy);
250 }
251 uint64_t stop = Cycles::rdtsc();
252 return Cycles::to_seconds(stop - start)/count;
253 }
254
255 // Measure the cost of allocating new space by extending the
256 // bufferlist
257 double buffer_encode()
258 {
259 int count = 100000;
260 uint64_t total = 0;
261 for (int i = 0; i < count; i++) {
262 bufferlist b;
263 DummyBlock dummy_block;
264 encode(dummy_block, b);
265 uint64_t start = Cycles::rdtsc();
266 encode(dummy_block, b);
267 encode(dummy_block, b);
268 encode(dummy_block, b);
269 encode(dummy_block, b);
270 encode(dummy_block, b);
271 encode(dummy_block, b);
272 encode(dummy_block, b);
273 encode(dummy_block, b);
274 encode(dummy_block, b);
275 encode(dummy_block, b);
276 total += Cycles::rdtsc() - start;
277 }
278 return Cycles::to_seconds(total)/(count*10);
279 }
280
281 // Measure the cost of creating an iterator and iterating over 10
282 // chunks in a buffer.
283 double buffer_iterator()
284 {
285 bufferlist b;
286 const char s[] = "abcdefghijklmnopqrstuvwxyz";
287 bufferptr ptr(s, sizeof(s));
288 for (int i = 0; i < 5; i++) {
289 b.append(ptr, i, 5);
290 }
291 int count = 100000;
292 int sum = 0;
293 uint64_t start = Cycles::rdtsc();
294 for (int i = 0; i < count; i++) {
295 auto it = b.cbegin();
296 while (!it.end()) {
297 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
298 ++it;
299 }
300 }
301 uint64_t stop = Cycles::rdtsc();
302 discard(&sum);
303 return Cycles::to_seconds(stop - start)/count;
304 }
305
306 // Implements the CondPingPong test.
307 class CondPingPong {
308 ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex");
309 ceph::condition_variable cond;
310 int prod = 0;
311 int cons = 0;
312 const int count = 10000;
313
314 class Consumer : public Thread {
315 CondPingPong *p;
316 public:
317 explicit Consumer(CondPingPong *p): p(p) {}
318 void* entry() override {
319 p->consume();
320 return 0;
321 }
322 } consumer;
323
324 public:
325 CondPingPong(): consumer(this) {}
326
327 double run() {
328 consumer.create("consumer");
329 uint64_t start = Cycles::rdtsc();
330 produce();
331 uint64_t stop = Cycles::rdtsc();
332 consumer.join();
333 return Cycles::to_seconds(stop - start)/count;
334 }
335
336 void produce() {
337 std::unique_lock l{mutex};
338 while (cons < count) {
339 cond.wait(l, [this] { return cons >= prod; });
340 ++prod;
341 cond.notify_all();
342 }
343 }
344
345 void consume() {
346 std::unique_lock l{mutex};
347 while (cons < count) {
348 cond.wait(l, [this] { return cons != prod; });
349 ++cons;
350 cond.notify_all();
351 }
352 }
353 };
354
355 // Measure the cost of coordinating between threads using a condition variable.
356 double cond_ping_pong()
357 {
358 return CondPingPong().run();
359 }
360
361 // Measure the cost of a 32-bit divide. Divides don't take a constant
362 // number of cycles. Values were chosen here semi-randomly to depict a
363 // fairly expensive scenario. Someone with fancy ALU knowledge could
364 // probably pick worse values.
365 double div32()
366 {
367 #if defined(__i386__) || defined(__x86_64__)
368 int count = 1000000;
369 uint64_t start = Cycles::rdtsc();
370   // NB: Expect an x86 processor exception if there's overflow.
371 uint32_t numeratorHi = 0xa5a5a5a5U;
372 uint32_t numeratorLo = 0x55aa55aaU;
373 uint32_t divisor = 0xaa55aa55U;
374 uint32_t quotient;
375 uint32_t remainder;
376 for (int i = 0; i < count; i++) {
377 __asm__ __volatile__("div %4" :
378 "=a"(quotient), "=d"(remainder) :
379 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
380 "cc");
381 }
382 uint64_t stop = Cycles::rdtsc();
383 return Cycles::to_seconds(stop - start)/count;
384 #else
385 return -1;
386 #endif
387 }
388
389 // Measure the cost of a 64-bit divide. Divides don't take a constant
390 // number of cycles. Values were chosen here semi-randomly to depict a
391 // fairly expensive scenario. Someone with fancy ALU knowledge could
392 // probably pick worse values.
393 double div64()
394 {
395 #if defined(__x86_64__) || defined(__amd64__)
396 int count = 1000000;
397   // NB: Expect an x86 processor exception if there's overflow.
398 uint64_t start = Cycles::rdtsc();
399 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
400 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
401 uint64_t divisor = 0xaa55aa55aa55aa55UL;
402 uint64_t quotient;
403 uint64_t remainder;
404 for (int i = 0; i < count; i++) {
405 __asm__ __volatile__("divq %4" :
406 "=a"(quotient), "=d"(remainder) :
407 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
408 "cc");
409 }
410 uint64_t stop = Cycles::rdtsc();
411 return Cycles::to_seconds(stop - start)/count;
412 #else
413 return -1;
414 #endif
415 }
416
417 // Measure the cost of calling a non-inlined function.
418 double function_call()
419 {
420 int count = 1000000;
421 uint64_t x = 0;
422 uint64_t start = Cycles::rdtsc();
423 for (int i = 0; i < count; i++) {
424 x = PerfHelper::plus_one(x);
425 }
426 uint64_t stop = Cycles::rdtsc();
427 return Cycles::to_seconds(stop - start)/count;
428 }
429
430 // Measure the minimum cost of EventCenter::process_events, when there are no
431 // Pollers and no Timers.
432 double eventcenter_poll()
433 {
434 int count = 1000000;
435 EventCenter center(g_ceph_context);
436 center.init(1000, 0, "posix");
437 center.set_owner();
438 uint64_t start = Cycles::rdtsc();
439 for (int i = 0; i < count; i++) {
440 center.process_events(0);
441 }
442 uint64_t stop = Cycles::rdtsc();
443 return Cycles::to_seconds(stop - start)/count;
444 }
445
446 class CenterWorker : public Thread {
447 CephContext *cct;
448 bool done;
449
450 public:
451 EventCenter center;
452 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
453 center.init(100, 0, "posix");
454 }
455 void stop() {
456 done = true;
457 center.wakeup();
458 }
459 void* entry() override {
460 center.set_owner();
461 bind_thread_to_cpu(2);
462 while (!done)
463 center.process_events(1000);
464 return 0;
465 }
466 };
467
468 class CountEvent: public EventCallback {
469 std::atomic<int64_t> *count;
470
471 public:
472 explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
473 void do_request(uint64_t id) override {
474 (*count)--;
475 }
476 };
477
478 double eventcenter_dispatch()
479 {
480 int count = 100000;
481
482 CenterWorker worker(g_ceph_context);
483 std::atomic<int64_t> flag = { 1 };
484 worker.create("evt_center_disp");
485 EventCallbackRef count_event(new CountEvent(&flag));
486
487 worker.center.dispatch_event_external(count_event);
488   // Wait until the worker thread is up and has consumed the first event.
489 while (flag)
490 usleep(100);
491
492 uint64_t start = Cycles::rdtsc();
493 for (int i = 0; i < count; i++) {
494 flag = 1;
495 worker.center.dispatch_event_external(count_event);
496 while (flag)
497 ;
498 }
499 uint64_t stop = Cycles::rdtsc();
500 worker.stop();
501 worker.join();
502 return Cycles::to_seconds(stop - start)/count;
503 }
504
505 // Measure the cost of copying a given number of bytes with memcpy.
506 double memcpy_shared(size_t size)
507 {
508 int count = 1000000;
509 char src[size], dst[size];
510
511 memset(src, 0, sizeof(src));
512
513 uint64_t start = Cycles::rdtsc();
514 for (int i = 0; i < count; i++) {
515 memcpy(dst, src, size);
516 }
517 uint64_t stop = Cycles::rdtsc();
518 return Cycles::to_seconds(stop - start)/count;
519 }
520
521 double memcpy100()
522 {
523 return memcpy_shared(100);
524 }
525
526 double memcpy1000()
527 {
528 return memcpy_shared(1000);
529 }
530
531 double memcpy10000()
532 {
533 return memcpy_shared(10000);
534 }
535
536 // Benchmark rjenkins hashing performance on cached data.
537 template <int key_length>
538 double ceph_str_hash_rjenkins()
539 {
540 int count = 100000;
541 char buf[key_length];
542
543 uint64_t start = Cycles::rdtsc();
544 for (int i = 0; i < count; i++)
545 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
546 uint64_t stop = Cycles::rdtsc();
547
548 return Cycles::to_seconds(stop - start)/count;
549 }
550
551 // Measure the cost of reading the fine-grain cycle counter.
552 double rdtsc_test()
553 {
554 int count = 1000000;
555 uint64_t start = Cycles::rdtsc();
556 uint64_t total = 0;
557 for (int i = 0; i < count; i++) {
558 total += Cycles::rdtsc();
559 }
560 uint64_t stop = Cycles::rdtsc();
561 return Cycles::to_seconds(stop - start)/count;
562 }
563
564 // Measure the cost of the Cycles::to_seconds method.
565 double perf_cycles_to_seconds()
566 {
567 int count = 1000000;
568 double total = 0;
569 uint64_t cycles = 994261;
570 uint64_t start = Cycles::rdtsc();
571 for (int i = 0; i < count; i++) {
572 total += Cycles::to_seconds(cycles);
573 }
574 uint64_t stop = Cycles::rdtsc();
575 // printf("Result: %.4f\n", total/count);
576 return Cycles::to_seconds(stop - start)/count;
577 }
578
579 // Measure the cost of the Cycles::to_nanoseconds method.
580 double perf_cycles_to_nanoseconds()
581 {
582 int count = 1000000;
583 uint64_t total = 0;
584 uint64_t cycles = 994261;
585 uint64_t start = Cycles::rdtsc();
586 for (int i = 0; i < count; i++) {
587 total += Cycles::to_nanoseconds(cycles);
588 }
589 uint64_t stop = Cycles::rdtsc();
590 // printf("Result: %lu\n", total/count);
591 return Cycles::to_seconds(stop - start)/count;
592 }
593
594
595 #ifdef HAVE_SSE
596 /**
597 * Prefetch the cache lines containing [object, object + numBytes) into the
598 * processor's caches.
599 * The best docs for this are in the Intel instruction set reference under
600 * PREFETCH.
601 * \param object
602 * The start of the region of memory to prefetch.
603 * \param num_bytes
604 * The size of the region of memory to prefetch.
605 */
606 static inline void prefetch(const void *object, uint64_t num_bytes)
607 {
608 uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
609 const char* p = reinterpret_cast<const char*>(object) - offset;
610 for (uint64_t i = 0; i < offset + num_bytes; i += 64)
611 _mm_prefetch(p + i, _MM_HINT_T0);
612 }
613 #endif
614
615 // Measure the cost of the prefetch instruction.
616 double perf_prefetch()
617 {
618 #ifdef HAVE_SSE
619 uint64_t total_ticks = 0;
620 int count = 10;
621 char buf[16 * 64];
622
623 for (int i = 0; i < count; i++) {
624 PerfHelper::flush_cache();
625 uint64_t start = Cycles::rdtsc();
626 prefetch(&buf[576], 64);
627 prefetch(&buf[0], 64);
628 prefetch(&buf[512], 64);
629 prefetch(&buf[960], 64);
630 prefetch(&buf[640], 64);
631 prefetch(&buf[896], 64);
632 prefetch(&buf[256], 64);
633 prefetch(&buf[704], 64);
634 prefetch(&buf[320], 64);
635 prefetch(&buf[384], 64);
636 prefetch(&buf[128], 64);
637 prefetch(&buf[448], 64);
638 prefetch(&buf[768], 64);
639 prefetch(&buf[832], 64);
640 prefetch(&buf[64], 64);
641 prefetch(&buf[192], 64);
642 uint64_t stop = Cycles::rdtsc();
643 total_ticks += stop - start;
644 }
645 return Cycles::to_seconds(total_ticks) / count / 16;
646 #else
647 return -1;
648 #endif
649 }
650
651 #if defined(__x86_64__)
652 /**
653  * This function is used to serialize machine instructions so that no
654 * instructions that appear after it in the current thread can run before any
655 * instructions that appear before it.
656 *
657 * It is useful for putting around rdpmc instructions (to pinpoint cache
658  * misses) as well as before rdtsc instructions, so that instructions meant
659  * to complete before the timer starts cannot pollute the measurement.
660 */
661 static inline void serialize() {
662 uint32_t eax, ebx, ecx, edx;
663 __asm volatile("cpuid"
664 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
665 : "a" (1U));
666 }
667 #endif
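
// For illustration only (no test in this file uses this exact pattern), a
// measurement that wants to keep earlier instructions out of the timed
// region could bracket it like this:
//
//   serialize();
//   uint64_t start = Cycles::rdtsc();
//   // ... operation being measured ...
//   serialize();
//   uint64_t stop = Cycles::rdtsc();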
668
669 // Measure the cost of cpuid
670 double perf_serialize() {
671 #if defined(__x86_64__)
672 int count = 1000000;
673 uint64_t start = Cycles::rdtsc();
674 for (int i = 0; i < count; i++) {
675 serialize();
676 }
677 uint64_t stop = Cycles::rdtsc();
678 return Cycles::to_seconds(stop - start)/count;
679 #else
680 return -1;
681 #endif
682 }
683
684 // Measure the cost of an lfence instruction.
685 double lfence()
686 {
687 #ifdef HAVE_SSE2
688 int count = 1000000;
689 uint64_t start = Cycles::rdtsc();
690 for (int i = 0; i < count; i++) {
691 __asm__ __volatile__("lfence" ::: "memory");
692 }
693 uint64_t stop = Cycles::rdtsc();
694 return Cycles::to_seconds(stop - start)/count;
695 #else
696 return -1;
697 #endif
698 }
699
700 // Measure the cost of an sfence instruction.
701 double sfence()
702 {
703 #ifdef HAVE_SSE
704 int count = 1000000;
705 uint64_t start = Cycles::rdtsc();
706 for (int i = 0; i < count; i++) {
707 __asm__ __volatile__("sfence" ::: "memory");
708 }
709 uint64_t stop = Cycles::rdtsc();
710 return Cycles::to_seconds(stop - start)/count;
711 #else
712 return -1;
713 #endif
714 }
715
716 // Measure the cost of acquiring and releasing a SpinLock (assuming the
717 // lock is initially free).
718 double test_spinlock()
719 {
720 int count = 1000000;
721 ceph::spinlock lock;
722 uint64_t start = Cycles::rdtsc();
723 for (int i = 0; i < count; i++) {
724 lock.lock();
725 lock.unlock();
726 }
727 uint64_t stop = Cycles::rdtsc();
728 return Cycles::to_seconds(stop - start)/count;
729 }
730
731 // Helper for spawn_thread: a thread whose entry function is intentionally
732 // empty.
733 class ThreadHelper : public Thread {
734 void *entry() override { return 0; }
735 };
736
737 // Measure the cost of starting and joining a thread.
738 double spawn_thread()
739 {
740 int count = 10000;
741 ThreadHelper thread;
742 uint64_t start = Cycles::rdtsc();
743 for (int i = 0; i < count; i++) {
744 thread.create("thread_helper");
745 thread.join();
746 }
747 uint64_t stop = Cycles::rdtsc();
748 return Cycles::to_seconds(stop - start)/count;
749 }
750
751 class FakeContext : public Context {
752 public:
753 void finish(int r) override {}
754 };
755
756 // Measure the cost of scheduling and canceling a SafeTimer event.
757 double perf_timer()
758 {
759 int count = 1000000;
760 ceph::mutex lock = ceph::make_mutex("perf_timer::lock");
761 SafeTimer timer(g_ceph_context, lock);
762 FakeContext **c = new FakeContext*[count];
763 for (int i = 0; i < count; i++) {
764 c[i] = new FakeContext();
765 }
766 uint64_t start = Cycles::rdtsc();
767 std::lock_guard l{lock};
768 for (int i = 0; i < count; i++) {
769 if (timer.add_event_after(12345, c[i])) {
770 timer.cancel_event(c[i]);
771 }
772 }
773 uint64_t stop = Cycles::rdtsc();
774 delete[] c;
775 return Cycles::to_seconds(stop - start)/count;
776 }
777
778 // Measure the cost of throwing and catching an int. This uses an integer as
779 // the value thrown, which is presumably as fast as possible.
780 double throw_int()
781 {
782 int count = 10000;
783 uint64_t start = Cycles::rdtsc();
784 for (int i = 0; i < count; i++) {
785 try {
786 throw 0;
787 } catch (int) { // NOLINT
788 // pass
789 }
790 }
791 uint64_t stop = Cycles::rdtsc();
792 return Cycles::to_seconds(stop - start)/count;
793 }
794
795 // Measure the cost of throwing and catching an int from a function call.
796 double throw_int_call()
797 {
798 int count = 10000;
799 uint64_t start = Cycles::rdtsc();
800 for (int i = 0; i < count; i++) {
801 try {
802 PerfHelper::throw_int();
803 } catch (int) { // NOLINT
804 // pass
805 }
806 }
807 uint64_t stop = Cycles::rdtsc();
808 return Cycles::to_seconds(stop - start)/count;
809 }
810
811 // Measure the cost of throwing and catching an Exception. This uses an actual
812 // exception as the value thrown, which may be slower than throw_int.
813 double throw_exception()
814 {
815 int count = 10000;
816 uint64_t start = Cycles::rdtsc();
817 for (int i = 0; i < count; i++) {
818 try {
819 throw buffer::end_of_buffer();
820 } catch (const buffer::end_of_buffer&) {
821 // pass
822 }
823 }
824 uint64_t stop = Cycles::rdtsc();
825 return Cycles::to_seconds(stop - start)/count;
826 }
827
828 // Measure the cost of throwing and catching an Exception from a function call.
829 double throw_exception_call()
830 {
831 int count = 10000;
832 uint64_t start = Cycles::rdtsc();
833 for (int i = 0; i < count; i++) {
834 try {
835 PerfHelper::throw_end_of_buffer();
836 } catch (const buffer::end_of_buffer&) {
837 // pass
838 }
839 }
840 uint64_t stop = Cycles::rdtsc();
841 return Cycles::to_seconds(stop - start)/count;
842 }
843
844 // Measure the cost of pushing a new element on a std::vector, copying
845 // from the end to an internal element, and popping the end element.
846 double vector_push_pop()
847 {
848 int count = 100000;
849 std::vector<int> vector;
850 vector.push_back(1);
851 vector.push_back(2);
852 vector.push_back(3);
853 uint64_t start = Cycles::rdtsc();
854 for (int i = 0; i < count; i++) {
855 vector.push_back(i);
856 vector.push_back(i+1);
857 vector.push_back(i+2);
858 vector[2] = vector.back();
859 vector.pop_back();
860 vector[0] = vector.back();
861 vector.pop_back();
862 vector[1] = vector.back();
863 vector.pop_back();
864 }
865 uint64_t stop = Cycles::rdtsc();
866 return Cycles::to_seconds(stop - start)/(count*3);
867 }
868
869 // Measure the cost of ceph_clock_now
870 double perf_ceph_clock_now()
871 {
872 int count = 100000;
873 uint64_t start = Cycles::rdtsc();
874 for (int i = 0; i < count; i++) {
875 ceph_clock_now();
876 }
877 uint64_t stop = Cycles::rdtsc();
878 return Cycles::to_seconds(stop - start)/count;
879 }
880
881 // The following struct and table define each performance test in terms of
882 // a string name and a function that implements the test.
883 struct TestInfo {
884 const char* name; // Name of the performance test; this is
885 // what gets typed on the command line to
886 // run the test.
887 double (*func)(); // Function that implements the test;
888 // returns the time (in seconds) for each
889 // iteration of that test.
890 const char *description; // Short description of this test (not more
891 // than about 40 characters, so the entire
892 // test output fits on a single line).
893 };
894 TestInfo tests[] = {
895 {"atomic_int_cmp", atomic_int_cmp,
896 "atomic_t::compare_and_swap"},
897 {"atomic_int_inc", atomic_int_inc,
898 "atomic_t::inc"},
899 {"atomic_int_read", atomic_int_read,
900 "atomic_t::read"},
901 {"atomic_int_set", atomic_int_set,
902 "atomic_t::set"},
903 {"mutex_nonblock", mutex_nonblock,
904 "Mutex lock/unlock (no blocking)"},
905 {"buffer_basic", buffer_basic,
906 "buffer create, add one ptr, delete"},
907 {"buffer_encode_decode", buffer_encode_decode,
908 "buffer create, encode/decode object, delete"},
909 {"buffer_basic_copy", buffer_basic_copy,
910 "buffer create, copy small block, delete"},
911 {"buffer_copy", buffer_copy,
912 "copy out 2 small ptrs from buffer"},
913 {"buffer_encode10", buffer_encode,
914 "buffer encoding 10 structures onto existing ptr"},
915 {"buffer_iterator", buffer_iterator,
916 "iterate over buffer with 5 ptrs"},
917 {"cond_ping_pong", cond_ping_pong,
918 "condition variable round-trip"},
919 {"div32", div32,
920 "32-bit integer division instruction"},
921 {"div64", div64,
922 "64-bit integer division instruction"},
923 {"function_call", function_call,
924 "Call a function that has not been inlined"},
925 {"eventcenter_poll", eventcenter_poll,
926 "EventCenter::process_events (no timers or events)"},
927 {"eventcenter_dispatch", eventcenter_dispatch,
928 "EventCenter::dispatch_event_external latency"},
929 {"memcpy100", memcpy100,
930 "Copy 100 bytes with memcpy"},
931 {"memcpy1000", memcpy1000,
932 "Copy 1000 bytes with memcpy"},
933 {"memcpy10000", memcpy10000,
934 "Copy 10000 bytes with memcpy"},
935 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
936    "rjenkins hash on 16 bytes of data"},
937 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
938 "rjenkins hash on 256 bytes of data"},
939 {"rdtsc", rdtsc_test,
940 "Read the fine-grain cycle counter"},
941 {"cycles_to_seconds", perf_cycles_to_seconds,
942 "Convert a rdtsc result to (double) seconds"},
943   {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds,
944 "Convert a rdtsc result to (uint64_t) nanoseconds"},
945 {"prefetch", perf_prefetch,
946 "Prefetch instruction"},
947 {"serialize", perf_serialize,
948 "serialize instruction"},
949 {"lfence", lfence,
950 "Lfence instruction"},
951 {"sfence", sfence,
952 "Sfence instruction"},
953 {"spin_lock", test_spinlock,
954 "Acquire/release SpinLock"},
955 {"spawn_thread", spawn_thread,
956 "Start and stop a thread"},
957 {"perf_timer", perf_timer,
958 "Insert and cancel a SafeTimer"},
959 {"throw_int", throw_int,
960 "Throw an int"},
961 {"throw_int_call", throw_int_call,
962 "Throw an int in a function call"},
963 {"throw_exception", throw_exception,
964 "Throw an Exception"},
965 {"throw_exception_call", throw_exception_call,
966 "Throw an Exception in a function call"},
967 {"vector_push_pop", vector_push_pop,
968 "Push and pop a std::vector"},
969 {"ceph_clock_now", perf_ceph_clock_now,
970 "ceph_clock_now function"},
971 };
972
973 /**
974 * Runs a particular test and prints a one-line result message.
975 *
976 * \param info
977 * Describes the test to run.
978 */
979 void run_test(TestInfo& info)
980 {
981 double secs = info.func();
982 int width = printf("%-24s ", info.name);
983 if (secs == -1) {
984     width += printf(" architecture not supported ");
985 } else if (secs < 1.0e-06) {
986 width += printf("%8.2fns", 1e09*secs);
987 } else if (secs < 1.0e-03) {
988 width += printf("%8.2fus", 1e06*secs);
989 } else if (secs < 1.0) {
990 width += printf("%8.2fms", 1e03*secs);
991 } else {
992 width += printf("%8.2fs", secs);
993 }
994 printf("%*s %s\n", 32-width, "", info.description);
995 }
996
997 int main(int argc, char *argv[])
998 {
999 vector<const char*> args;
1000 argv_to_vec(argc, (const char **)argv, args);
1001
1002 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1003 CODE_ENVIRONMENT_UTILITY,
1004 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
1005 common_init_finish(g_ceph_context);
1006 Cycles::init();
1007
1008 bind_thread_to_cpu(3);
1009 if (argc == 1) {
1010 // No test names specified; run all tests.
1011 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1012 run_test(tests[i]);
1013 }
1014 } else {
1015 // Run only the tests that were specified on the command line.
1016 for (int i = 1; i < argc; i++) {
1017 bool found_test = false;
1018 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1019 if (strcmp(argv[i], tests[j].name) == 0) {
1020 found_test = true;
1021 run_test(tests[j]);
1022 break;
1023 }
1024 }
1025 if (!found_test) {
1026 int width = printf("%-24s ??", argv[i]);
1027 printf("%*s No such test\n", 32-width, "");
1028 }
1029 }
1030 }
1031 }