]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/perf_local.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / test / perf_local.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20// This program contains a collection of low-level performance measurements
21// for Ceph, which can be run either individually or altogether. These
22// tests measure performance in a single stand-alone process, not in a cluster
23// with multiple servers. Invoke the program like this:
24//
25// Perf test1 test2 ...
26//
27// test1 and test2 are the names of individual performance measurements to
28// run. If no test names are provided then all of the performance tests
29// are run.
30//
31// To add a new test:
32// * Write a function that implements the test. Use existing test functions
33// as a guideline, and be sure to generate output in the same form as
34// other tests.
35// * Create a new entry for the test in the #tests table.
36#include <vector>
37#include <sched.h>
38
39#include "acconfig.h"
40#ifdef HAVE_SSE
41#include <xmmintrin.h>
42#endif
43
7c673cae
FG
44#include "include/buffer.h"
45#include "include/encoding.h"
46#include "include/ceph_hash.h"
11fdf7f2 47#include "include/spinlock.h"
7c673cae
FG
48#include "common/ceph_argparse.h"
49#include "common/Cycles.h"
50#include "common/Cond.h"
9f95a23c 51#include "common/ceph_mutex.h"
7c673cae
FG
52#include "common/Thread.h"
53#include "common/Timer.h"
54#include "msg/async/Event.h"
55#include "global/global_init.h"
56
57#include "test/perf_helper.h"
58
31f18b77
FG
59#include <atomic>
60
20effc67 61using namespace std;
7c673cae
FG
62using namespace ceph;
63
/**
 * Ask the operating system to pin the current thread to a given CPU.
 *
 * Compiles to a no-op unless HAVE_SCHED is defined.  The result of
 * sched_setaffinity() is not inspected, so a rejected request goes
 * unnoticed.
 *
 * \param cpu
 *      Value handed unchanged to CPU_SET() as the only member of the
 *      affinity mask.
 *      NOTE(review): an earlier comment described this as "low order 2 bits
 *      specify CPU, next bit specifies hyperthread"; nothing in this
 *      function decodes such a layout -- confirm against the callers.
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t affinity;
  CPU_ZERO(&affinity);
  CPU_SET(cpu, &affinity);
  sched_setaffinity(0, sizeof(affinity), &affinity);
#endif
}
80
/*
 * Consume a value so that it appears to be used; this keeps the compiler
 * from optimizing away the code whose cost is being measured.
 *
 * \param value
 *      Pointer to arbitrary data.  Its leading bytes are read as an int;
 *      a line is printed only if that int equals 0x43924776.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
95
96//----------------------------------------------------------------------
97// Test functions start here
98//----------------------------------------------------------------------
99
31f18b77 100// Measure the cost of atomic compare-and-swap
7c673cae
FG
101double atomic_int_cmp()
102{
103 int count = 1000000;
31f18b77
FG
104 std::atomic<unsigned> value = { 11 };
105 unsigned int test = 11;
7c673cae
FG
106 uint64_t start = Cycles::rdtsc();
107 for (int i = 0; i < count; i++) {
31f18b77 108 value.compare_exchange_strong(test, test+2);
7c673cae
FG
109 test += 2;
110 }
111 uint64_t stop = Cycles::rdtsc();
112 // printf("Final value: %d\n", value.load());
113 return Cycles::to_seconds(stop - start)/count;
114}
115
31f18b77 116// Measure the cost of incrementing an atomic
7c673cae
FG
117double atomic_int_inc()
118{
119 int count = 1000000;
31f18b77 120 std::atomic<int64_t> value = { 11 };
7c673cae
FG
121 uint64_t start = Cycles::rdtsc();
122 for (int i = 0; i < count; i++) {
31f18b77 123 value++;
7c673cae
FG
124 }
125 uint64_t stop = Cycles::rdtsc();
126 // printf("Final value: %d\n", value.load());
127 return Cycles::to_seconds(stop - start)/count;
128}
129
31f18b77 130// Measure the cost of reading an atomic
7c673cae
FG
131double atomic_int_read()
132{
133 int count = 1000000;
31f18b77 134 std::atomic<int64_t> value = { 11 };
7c673cae
FG
135 int total = 0;
136 uint64_t start = Cycles::rdtsc();
137 for (int i = 0; i < count; i++) {
31f18b77 138 total += value;
7c673cae
FG
139 }
140 uint64_t stop = Cycles::rdtsc();
141 // printf("Total: %d\n", total);
142 return Cycles::to_seconds(stop - start)/count;
143}
144
31f18b77 145// Measure the cost of storing a new value in an atomic
7c673cae
FG
146double atomic_int_set()
147{
148 int count = 1000000;
31f18b77 149 std::atomic<int64_t> value = { 11 };
7c673cae
FG
150 uint64_t start = Cycles::rdtsc();
151 for (int i = 0; i < count; i++) {
31f18b77 152 value = 88;
7c673cae
FG
153 }
154 uint64_t stop = Cycles::rdtsc();
155 return Cycles::to_seconds(stop - start)/count;
156}
157
158// Measure the cost of acquiring and releasing a mutex in the
159// fast case where the mutex is free.
160double mutex_nonblock()
161{
162 int count = 1000000;
9f95a23c 163 ceph::mutex m = ceph::make_mutex("mutex_nonblock::m");
7c673cae
FG
164 uint64_t start = Cycles::rdtsc();
165 for (int i = 0; i < count; i++) {
9f95a23c
TL
166 m.lock();
167 m.unlock();
7c673cae
FG
168 }
169 uint64_t stop = Cycles::rdtsc();
170 return Cycles::to_seconds(stop - start)/count;
171}
172
173// Measure the cost of allocating and deallocating a buffer, plus
174// appending (logically) one ptr.
175double buffer_basic()
176{
177 int count = 1000000;
178 uint64_t start = Cycles::rdtsc();
179 bufferptr ptr("abcdefg", 7);
180 for (int i = 0; i < count; i++) {
181 bufferlist b;
182 b.append(ptr, 0, 5);
183 }
184 uint64_t stop = Cycles::rdtsc();
185 return Cycles::to_seconds(stop - start)/count;
186}
187
188struct DummyBlock {
189 int a = 1, b = 2, c = 3, d = 4;
190 void encode(bufferlist &bl) const {
191 ENCODE_START(1, 1, bl);
11fdf7f2
TL
192 encode(a, bl);
193 encode(b, bl);
194 encode(c, bl);
195 encode(d, bl);
7c673cae
FG
196 ENCODE_FINISH(bl);
197 }
11fdf7f2 198 void decode(bufferlist::const_iterator &bl) {
7c673cae 199 DECODE_START(1, bl);
11fdf7f2
TL
200 decode(a, bl);
201 decode(b, bl);
202 decode(c, bl);
203 decode(d, bl);
7c673cae
FG
204 DECODE_FINISH(bl);
205 }
206};
207WRITE_CLASS_ENCODER(DummyBlock)
208
209// Measure the cost of encoding and decoding a buffer, plus
210// allocating space for one chunk.
211double buffer_encode_decode()
212{
213 int count = 1000000;
214 uint64_t start = Cycles::rdtsc();
215 for (int i = 0; i < count; i++) {
216 bufferlist b;
217 DummyBlock dummy_block;
11fdf7f2
TL
218 encode(dummy_block, b);
219 auto iter = b.cbegin();
220 decode(dummy_block, iter);
7c673cae
FG
221 }
222 uint64_t stop = Cycles::rdtsc();
223 return Cycles::to_seconds(stop - start)/count;
224}
225
226// Measure the cost of allocating and deallocating a buffer, plus
227// copying in a small block.
228double buffer_basic_copy()
229{
230 int count = 1000000;
231 uint64_t start = Cycles::rdtsc();
232 for (int i = 0; i < count; i++) {
233 bufferlist b;
234 b.append("abcdefg", 6);
235 }
236 uint64_t stop = Cycles::rdtsc();
237 return Cycles::to_seconds(stop - start)/count;
238}
239
240// Measure the cost of making a copy of parts of two ptrs.
241double buffer_copy()
242{
243 int count = 1000000;
244 bufferlist b;
245 b.append("abcde", 5);
246 b.append("01234", 5);
247 char copy[10];
248 uint64_t start = Cycles::rdtsc();
249 for (int i = 0; i < count; i++) {
9f95a23c 250 b.cbegin(2).copy(6, copy);
7c673cae
FG
251 }
252 uint64_t stop = Cycles::rdtsc();
253 return Cycles::to_seconds(stop - start)/count;
254}
255
256// Measure the cost of allocating new space by extending the
257// bufferlist
258double buffer_encode()
259{
260 int count = 100000;
261 uint64_t total = 0;
262 for (int i = 0; i < count; i++) {
263 bufferlist b;
264 DummyBlock dummy_block;
11fdf7f2 265 encode(dummy_block, b);
7c673cae 266 uint64_t start = Cycles::rdtsc();
11fdf7f2
TL
267 encode(dummy_block, b);
268 encode(dummy_block, b);
269 encode(dummy_block, b);
270 encode(dummy_block, b);
271 encode(dummy_block, b);
272 encode(dummy_block, b);
273 encode(dummy_block, b);
274 encode(dummy_block, b);
275 encode(dummy_block, b);
276 encode(dummy_block, b);
7c673cae
FG
277 total += Cycles::rdtsc() - start;
278 }
279 return Cycles::to_seconds(total)/(count*10);
280}
281
7c673cae
FG
282// Measure the cost of creating an iterator and iterating over 10
283// chunks in a buffer.
284double buffer_iterator()
285{
286 bufferlist b;
287 const char s[] = "abcdefghijklmnopqrstuvwxyz";
288 bufferptr ptr(s, sizeof(s));
289 for (int i = 0; i < 5; i++) {
290 b.append(ptr, i, 5);
291 }
292 int count = 100000;
293 int sum = 0;
294 uint64_t start = Cycles::rdtsc();
295 for (int i = 0; i < count; i++) {
11fdf7f2 296 auto it = b.cbegin();
7c673cae
FG
297 while (!it.end()) {
298 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
299 ++it;
300 }
301 }
302 uint64_t stop = Cycles::rdtsc();
303 discard(&sum);
304 return Cycles::to_seconds(stop - start)/count;
305}
306
307// Implements the CondPingPong test.
308class CondPingPong {
9f95a23c
TL
309 ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex");
310 ceph::condition_variable cond;
311 int prod = 0;
312 int cons = 0;
313 const int count = 10000;
7c673cae
FG
314
315 class Consumer : public Thread {
316 CondPingPong *p;
317 public:
318 explicit Consumer(CondPingPong *p): p(p) {}
319 void* entry() override {
320 p->consume();
321 return 0;
322 }
323 } consumer;
324
325 public:
9f95a23c 326 CondPingPong(): consumer(this) {}
7c673cae
FG
327
328 double run() {
329 consumer.create("consumer");
330 uint64_t start = Cycles::rdtsc();
331 produce();
332 uint64_t stop = Cycles::rdtsc();
333 consumer.join();
334 return Cycles::to_seconds(stop - start)/count;
335 }
336
337 void produce() {
9f95a23c 338 std::unique_lock l{mutex};
7c673cae 339 while (cons < count) {
9f95a23c 340 cond.wait(l, [this] { return cons >= prod; });
7c673cae 341 ++prod;
9f95a23c 342 cond.notify_all();
7c673cae
FG
343 }
344 }
345
346 void consume() {
9f95a23c 347 std::unique_lock l{mutex};
7c673cae 348 while (cons < count) {
9f95a23c 349 cond.wait(l, [this] { return cons != prod; });
7c673cae 350 ++cons;
9f95a23c 351 cond.notify_all();
7c673cae
FG
352 }
353 }
354};
355
356// Measure the cost of coordinating between threads using a condition variable.
357double cond_ping_pong()
358{
359 return CondPingPong().run();
360}
361
362// Measure the cost of a 32-bit divide. Divides don't take a constant
363// number of cycles. Values were chosen here semi-randomly to depict a
364// fairly expensive scenario. Someone with fancy ALU knowledge could
365// probably pick worse values.
366double div32()
367{
368#if defined(__i386__) || defined(__x86_64__)
369 int count = 1000000;
370 uint64_t start = Cycles::rdtsc();
371 // NB: Expect an x86 processor exception is there's overflow.
372 uint32_t numeratorHi = 0xa5a5a5a5U;
373 uint32_t numeratorLo = 0x55aa55aaU;
374 uint32_t divisor = 0xaa55aa55U;
375 uint32_t quotient;
376 uint32_t remainder;
377 for (int i = 0; i < count; i++) {
378 __asm__ __volatile__("div %4" :
379 "=a"(quotient), "=d"(remainder) :
380 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
381 "cc");
382 }
383 uint64_t stop = Cycles::rdtsc();
384 return Cycles::to_seconds(stop - start)/count;
f67539c2
TL
385#elif defined(__aarch64__)
386 int count = 1000000;
387 uint64_t start = Cycles::rdtsc();
388 uint64_t numerator = 0xa5a5a5a555aa55aaUL;
389 uint32_t divisor = 0xaa55aa55U;
390 uint32_t result;
391 for (int i = 0; i < count; i++) {
392 asm volatile("udiv %0, %1, %2" : "=r"(result) :
393 "r"(numerator), "r"(divisor));
394 }
395 uint64_t stop = Cycles::rdtsc();
396 return Cycles::to_seconds(stop - start)/count;
7c673cae
FG
397#else
398 return -1;
399#endif
400}
401
402// Measure the cost of a 64-bit divide. Divides don't take a constant
403// number of cycles. Values were chosen here semi-randomly to depict a
404// fairly expensive scenario. Someone with fancy ALU knowledge could
405// probably pick worse values.
406double div64()
407{
408#if defined(__x86_64__) || defined(__amd64__)
409 int count = 1000000;
410 // NB: Expect an x86 processor exception is there's overflow.
411 uint64_t start = Cycles::rdtsc();
412 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
413 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
414 uint64_t divisor = 0xaa55aa55aa55aa55UL;
415 uint64_t quotient;
416 uint64_t remainder;
417 for (int i = 0; i < count; i++) {
418 __asm__ __volatile__("divq %4" :
419 "=a"(quotient), "=d"(remainder) :
420 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
421 "cc");
422 }
423 uint64_t stop = Cycles::rdtsc();
424 return Cycles::to_seconds(stop - start)/count;
425#else
426 return -1;
427#endif
428}
429
430// Measure the cost of calling a non-inlined function.
431double function_call()
432{
433 int count = 1000000;
434 uint64_t x = 0;
435 uint64_t start = Cycles::rdtsc();
436 for (int i = 0; i < count; i++) {
437 x = PerfHelper::plus_one(x);
438 }
439 uint64_t stop = Cycles::rdtsc();
440 return Cycles::to_seconds(stop - start)/count;
441}
442
443// Measure the minimum cost of EventCenter::process_events, when there are no
444// Pollers and no Timers.
445double eventcenter_poll()
446{
447 int count = 1000000;
448 EventCenter center(g_ceph_context);
449 center.init(1000, 0, "posix");
450 center.set_owner();
451 uint64_t start = Cycles::rdtsc();
452 for (int i = 0; i < count; i++) {
453 center.process_events(0);
454 }
455 uint64_t stop = Cycles::rdtsc();
456 return Cycles::to_seconds(stop - start)/count;
457}
458
459class CenterWorker : public Thread {
460 CephContext *cct;
461 bool done;
462
463 public:
464 EventCenter center;
465 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
466 center.init(100, 0, "posix");
467 }
468 void stop() {
469 done = true;
470 center.wakeup();
471 }
472 void* entry() override {
473 center.set_owner();
474 bind_thread_to_cpu(2);
475 while (!done)
476 center.process_events(1000);
477 return 0;
478 }
479};
480
481class CountEvent: public EventCallback {
31f18b77 482 std::atomic<int64_t> *count;
7c673cae
FG
483
484 public:
31f18b77 485 explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
11fdf7f2 486 void do_request(uint64_t id) override {
31f18b77 487 (*count)--;
7c673cae
FG
488 }
489};
490
491double eventcenter_dispatch()
492{
493 int count = 100000;
494
495 CenterWorker worker(g_ceph_context);
31f18b77 496 std::atomic<int64_t> flag = { 1 };
7c673cae
FG
497 worker.create("evt_center_disp");
498 EventCallbackRef count_event(new CountEvent(&flag));
499
500 worker.center.dispatch_event_external(count_event);
501 // Start a new thread and wait for it to ready.
31f18b77 502 while (flag)
7c673cae
FG
503 usleep(100);
504
505 uint64_t start = Cycles::rdtsc();
506 for (int i = 0; i < count; i++) {
31f18b77 507 flag = 1;
7c673cae 508 worker.center.dispatch_event_external(count_event);
31f18b77 509 while (flag)
7c673cae
FG
510 ;
511 }
512 uint64_t stop = Cycles::rdtsc();
513 worker.stop();
514 worker.join();
515 return Cycles::to_seconds(stop - start)/count;
516}
517
518// Measure the cost of copying a given number of bytes with memcpy.
519double memcpy_shared(size_t size)
520{
521 int count = 1000000;
522 char src[size], dst[size];
523
524 memset(src, 0, sizeof(src));
525
526 uint64_t start = Cycles::rdtsc();
527 for (int i = 0; i < count; i++) {
528 memcpy(dst, src, size);
529 }
530 uint64_t stop = Cycles::rdtsc();
531 return Cycles::to_seconds(stop - start)/count;
532}
533
534double memcpy100()
535{
536 return memcpy_shared(100);
537}
538
539double memcpy1000()
540{
541 return memcpy_shared(1000);
542}
543
544double memcpy10000()
545{
546 return memcpy_shared(10000);
547}
548
549// Benchmark rjenkins hashing performance on cached data.
550template <int key_length>
551double ceph_str_hash_rjenkins()
552{
553 int count = 100000;
554 char buf[key_length];
555
556 uint64_t start = Cycles::rdtsc();
557 for (int i = 0; i < count; i++)
558 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
559 uint64_t stop = Cycles::rdtsc();
560
561 return Cycles::to_seconds(stop - start)/count;
562}
563
564// Measure the cost of reading the fine-grain cycle counter.
565double rdtsc_test()
566{
567 int count = 1000000;
568 uint64_t start = Cycles::rdtsc();
569 uint64_t total = 0;
570 for (int i = 0; i < count; i++) {
571 total += Cycles::rdtsc();
572 }
573 uint64_t stop = Cycles::rdtsc();
574 return Cycles::to_seconds(stop - start)/count;
575}
576
577// Measure the cost of the Cycles::to_seconds method.
578double perf_cycles_to_seconds()
579{
580 int count = 1000000;
581 double total = 0;
582 uint64_t cycles = 994261;
583 uint64_t start = Cycles::rdtsc();
584 for (int i = 0; i < count; i++) {
585 total += Cycles::to_seconds(cycles);
586 }
587 uint64_t stop = Cycles::rdtsc();
588 // printf("Result: %.4f\n", total/count);
589 return Cycles::to_seconds(stop - start)/count;
590}
591
592// Measure the cost of the Cylcles::toNanoseconds method.
593double perf_cycles_to_nanoseconds()
594{
595 int count = 1000000;
596 uint64_t total = 0;
597 uint64_t cycles = 994261;
598 uint64_t start = Cycles::rdtsc();
599 for (int i = 0; i < count; i++) {
600 total += Cycles::to_nanoseconds(cycles);
601 }
602 uint64_t stop = Cycles::rdtsc();
603 // printf("Result: %lu\n", total/count);
604 return Cycles::to_seconds(stop - start)/count;
605}
606
607
608#ifdef HAVE_SSE
609/**
610 * Prefetch the cache lines containing [object, object + numBytes) into the
611 * processor's caches.
612 * The best docs for this are in the Intel instruction set reference under
613 * PREFETCH.
614 * \param object
615 * The start of the region of memory to prefetch.
616 * \param num_bytes
617 * The size of the region of memory to prefetch.
618 */
619static inline void prefetch(const void *object, uint64_t num_bytes)
620{
621 uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
622 const char* p = reinterpret_cast<const char*>(object) - offset;
623 for (uint64_t i = 0; i < offset + num_bytes; i += 64)
624 _mm_prefetch(p + i, _MM_HINT_T0);
625}
f67539c2
TL
626#elif defined(__aarch64__)
// aarch64 variant: issue a "prfm pldl1keep" for each 64-byte step covering
// [object, object + num_bytes), starting from the enclosing 64-byte
// boundary.
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64, line += 64)
    asm volatile("prfm pldl1keep, %a0\n" : : "p" (line));
}
7c673cae
FG
634#endif
635
// Measure the cost of the prefetch instruction.  Each round flushes the
// cache (untimed) and then times 16 prefetches, one per 64-byte slice of a
// 1 KiB stack buffer, issued in a scrambled order.
// Returns seconds per prefetch, or -1 when no prefetch() is available.
double perf_prefetch()
{
#if defined(HAVE_SSE) || defined(__aarch64__)
  uint64_t elapsed = 0;
  const int rounds = 10;
  char lines[16 * 64];

  for (int r = 0; r != rounds; ++r) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    prefetch(&lines[576], 64);
    prefetch(&lines[0],   64);
    prefetch(&lines[512], 64);
    prefetch(&lines[960], 64);
    prefetch(&lines[640], 64);
    prefetch(&lines[896], 64);
    prefetch(&lines[256], 64);
    prefetch(&lines[704], 64);
    prefetch(&lines[320], 64);
    prefetch(&lines[384], 64);
    prefetch(&lines[128], 64);
    prefetch(&lines[448], 64);
    prefetch(&lines[768], 64);
    prefetch(&lines[832], 64);
    prefetch(&lines[64],  64);
    prefetch(&lines[192], 64);
    const uint64_t end = Cycles::rdtsc();
    elapsed += end - begin;
  }
  // 16 prefetch calls per round.
  return Cycles::to_seconds(elapsed) / rounds / 16;
#else
  return -1;
#endif
}
671
#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before any
 * instructions that appear before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution from
 * instructions supposed to be executing before the timer starts.
 *
 * Implemented by executing cpuid with eax = 1; the four result registers
 * are captured only to satisfy the asm constraints and are not used.
 */
static inline void serialize() {
  uint32_t a, b, c, d;
  __asm volatile("cpuid"
                 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                 : "a" (1U));
}
#endif
689
690// Measure the cost of cpuid
691double perf_serialize() {
692#if defined(__x86_64__)
693 int count = 1000000;
694 uint64_t start = Cycles::rdtsc();
695 for (int i = 0; i < count; i++) {
696 serialize();
697 }
698 uint64_t stop = Cycles::rdtsc();
699 return Cycles::to_seconds(stop - start)/count;
700#else
701 return -1;
702#endif
703}
704
// Measure the cost of an lfence instruction (x86 with SSE2), or of
// "dmb ishld" on aarch64.
// Returns seconds per instruction, or -1 on other architectures.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n != iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n != iterations; ++n) {
    asm volatile("dmb ishld" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
728
// Measure the cost of an sfence instruction (x86 with SSE), or of
// "dmb ishst" on aarch64.
// Returns seconds per instruction, or -1 on other architectures.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n != iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n != iterations; ++n) {
    asm volatile("dmb ishst" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
752
753// Measure the cost of acquiring and releasing a SpinLock (assuming the
754// lock is initially free).
755double test_spinlock()
756{
757 int count = 1000000;
11fdf7f2 758 ceph::spinlock lock;
7c673cae
FG
759 uint64_t start = Cycles::rdtsc();
760 for (int i = 0; i < count; i++) {
761 lock.lock();
762 lock.unlock();
763 }
764 uint64_t stop = Cycles::rdtsc();
765 return Cycles::to_seconds(stop - start)/count;
766}
767
768// Helper for spawn_thread. This is the main function that the thread executes
769// (intentionally empty).
770class ThreadHelper : public Thread {
771 void *entry() override { return 0; }
772};
773
774// Measure the cost of start and joining with a thread.
775double spawn_thread()
776{
777 int count = 10000;
778 ThreadHelper thread;
779 uint64_t start = Cycles::rdtsc();
780 for (int i = 0; i < count; i++) {
781 thread.create("thread_helper");
782 thread.join();
783 }
784 uint64_t stop = Cycles::rdtsc();
785 return Cycles::to_seconds(stop - start)/count;
786}
787
788class FakeContext : public Context {
789 public:
790 void finish(int r) override {}
791};
792
793// Measure the cost of starting and stopping a Dispatch::Timer.
794double perf_timer()
795{
796 int count = 1000000;
9f95a23c 797 ceph::mutex lock = ceph::make_mutex("perf_timer::lock");
7c673cae
FG
798 SafeTimer timer(g_ceph_context, lock);
799 FakeContext **c = new FakeContext*[count];
800 for (int i = 0; i < count; i++) {
801 c[i] = new FakeContext();
802 }
803 uint64_t start = Cycles::rdtsc();
9f95a23c 804 std::lock_guard l{lock};
7c673cae 805 for (int i = 0; i < count; i++) {
3efd9988
FG
806 if (timer.add_event_after(12345, c[i])) {
807 timer.cancel_event(c[i]);
808 }
7c673cae
FG
809 }
810 uint64_t stop = Cycles::rdtsc();
811 delete[] c;
812 return Cycles::to_seconds(stop - start)/count;
813}
814
815// Measure the cost of throwing and catching an int. This uses an integer as
816// the value thrown, which is presumably as fast as possible.
817double throw_int()
818{
819 int count = 10000;
820 uint64_t start = Cycles::rdtsc();
821 for (int i = 0; i < count; i++) {
822 try {
823 throw 0;
824 } catch (int) { // NOLINT
825 // pass
826 }
827 }
828 uint64_t stop = Cycles::rdtsc();
829 return Cycles::to_seconds(stop - start)/count;
830}
831
832// Measure the cost of throwing and catching an int from a function call.
833double throw_int_call()
834{
835 int count = 10000;
836 uint64_t start = Cycles::rdtsc();
837 for (int i = 0; i < count; i++) {
838 try {
839 PerfHelper::throw_int();
840 } catch (int) { // NOLINT
841 // pass
842 }
843 }
844 uint64_t stop = Cycles::rdtsc();
845 return Cycles::to_seconds(stop - start)/count;
846}
847
848// Measure the cost of throwing and catching an Exception. This uses an actual
849// exception as the value thrown, which may be slower than throwInt.
850double throw_exception()
851{
852 int count = 10000;
853 uint64_t start = Cycles::rdtsc();
854 for (int i = 0; i < count; i++) {
855 try {
856 throw buffer::end_of_buffer();
857 } catch (const buffer::end_of_buffer&) {
858 // pass
859 }
860 }
861 uint64_t stop = Cycles::rdtsc();
862 return Cycles::to_seconds(stop - start)/count;
863}
864
865// Measure the cost of throwing and catching an Exception from a function call.
866double throw_exception_call()
867{
868 int count = 10000;
869 uint64_t start = Cycles::rdtsc();
870 for (int i = 0; i < count; i++) {
871 try {
872 PerfHelper::throw_end_of_buffer();
873 } catch (const buffer::end_of_buffer&) {
874 // pass
875 }
876 }
877 uint64_t stop = Cycles::rdtsc();
878 return Cycles::to_seconds(stop - start)/count;
879}
880
881// Measure the cost of pushing a new element on a std::vector, copying
882// from the end to an internal element, and popping the end element.
883double vector_push_pop()
884{
885 int count = 100000;
886 std::vector<int> vector;
887 vector.push_back(1);
888 vector.push_back(2);
889 vector.push_back(3);
890 uint64_t start = Cycles::rdtsc();
891 for (int i = 0; i < count; i++) {
892 vector.push_back(i);
893 vector.push_back(i+1);
894 vector.push_back(i+2);
895 vector[2] = vector.back();
896 vector.pop_back();
897 vector[0] = vector.back();
898 vector.pop_back();
899 vector[1] = vector.back();
900 vector.pop_back();
901 }
902 uint64_t stop = Cycles::rdtsc();
903 return Cycles::to_seconds(stop - start)/(count*3);
904}
905
906// Measure the cost of ceph_clock_now
907double perf_ceph_clock_now()
908{
909 int count = 100000;
910 uint64_t start = Cycles::rdtsc();
911 for (int i = 0; i < count; i++) {
912 ceph_clock_now();
913 }
914 uint64_t stop = Cycles::rdtsc();
915 return Cycles::to_seconds(stop - start)/count;
916}
917
// The following struct and table define each performance test in terms of
// a string name and a function that implements the test.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
931TestInfo tests[] = {
932 {"atomic_int_cmp", atomic_int_cmp,
933 "atomic_t::compare_and_swap"},
934 {"atomic_int_inc", atomic_int_inc,
935 "atomic_t::inc"},
936 {"atomic_int_read", atomic_int_read,
937 "atomic_t::read"},
938 {"atomic_int_set", atomic_int_set,
939 "atomic_t::set"},
940 {"mutex_nonblock", mutex_nonblock,
941 "Mutex lock/unlock (no blocking)"},
942 {"buffer_basic", buffer_basic,
943 "buffer create, add one ptr, delete"},
944 {"buffer_encode_decode", buffer_encode_decode,
945 "buffer create, encode/decode object, delete"},
946 {"buffer_basic_copy", buffer_basic_copy,
947 "buffer create, copy small block, delete"},
948 {"buffer_copy", buffer_copy,
949 "copy out 2 small ptrs from buffer"},
950 {"buffer_encode10", buffer_encode,
951 "buffer encoding 10 structures onto existing ptr"},
7c673cae
FG
952 {"buffer_iterator", buffer_iterator,
953 "iterate over buffer with 5 ptrs"},
954 {"cond_ping_pong", cond_ping_pong,
955 "condition variable round-trip"},
956 {"div32", div32,
957 "32-bit integer division instruction"},
958 {"div64", div64,
959 "64-bit integer division instruction"},
960 {"function_call", function_call,
961 "Call a function that has not been inlined"},
962 {"eventcenter_poll", eventcenter_poll,
963 "EventCenter::process_events (no timers or events)"},
964 {"eventcenter_dispatch", eventcenter_dispatch,
965 "EventCenter::dispatch_event_external latency"},
966 {"memcpy100", memcpy100,
967 "Copy 100 bytes with memcpy"},
968 {"memcpy1000", memcpy1000,
969 "Copy 1000 bytes with memcpy"},
970 {"memcpy10000", memcpy10000,
971 "Copy 10000 bytes with memcpy"},
972 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
973 "rjenkins hash on 16 byte of data"},
974 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
975 "rjenkins hash on 256 bytes of data"},
976 {"rdtsc", rdtsc_test,
977 "Read the fine-grain cycle counter"},
978 {"cycles_to_seconds", perf_cycles_to_seconds,
979 "Convert a rdtsc result to (double) seconds"},
980 {"cycles_to_seconds", perf_cycles_to_nanoseconds,
981 "Convert a rdtsc result to (uint64_t) nanoseconds"},
982 {"prefetch", perf_prefetch,
983 "Prefetch instruction"},
984 {"serialize", perf_serialize,
985 "serialize instruction"},
986 {"lfence", lfence,
987 "Lfence instruction"},
988 {"sfence", sfence,
989 "Sfence instruction"},
990 {"spin_lock", test_spinlock,
991 "Acquire/release SpinLock"},
992 {"spawn_thread", spawn_thread,
993 "Start and stop a thread"},
994 {"perf_timer", perf_timer,
995 "Insert and cancel a SafeTimer"},
996 {"throw_int", throw_int,
997 "Throw an int"},
998 {"throw_int_call", throw_int_call,
999 "Throw an int in a function call"},
1000 {"throw_exception", throw_exception,
1001 "Throw an Exception"},
1002 {"throw_exception_call", throw_exception_call,
1003 "Throw an Exception in a function call"},
1004 {"vector_push_pop", vector_push_pop,
1005 "Push and pop a std::vector"},
1006 {"ceph_clock_now", perf_ceph_clock_now,
1007 "ceph_clock_now function"},
1008};
1009
1010/**
1011 * Runs a particular test and prints a one-line result message.
1012 *
1013 * \param info
1014 * Describes the test to run.
1015 */
1016void run_test(TestInfo& info)
1017{
1018 double secs = info.func();
1019 int width = printf("%-24s ", info.name);
1020 if (secs == -1) {
1021 width += printf(" architecture nonsupport ");
1022 } else if (secs < 1.0e-06) {
1023 width += printf("%8.2fns", 1e09*secs);
1024 } else if (secs < 1.0e-03) {
1025 width += printf("%8.2fus", 1e06*secs);
1026 } else if (secs < 1.0) {
1027 width += printf("%8.2fms", 1e03*secs);
1028 } else {
1029 width += printf("%8.2fs", secs);
1030 }
1031 printf("%*s %s\n", 32-width, "", info.description);
1032}
1033
1034int main(int argc, char *argv[])
1035{
20effc67 1036 auto args = argv_to_vec(argc, argv);
7c673cae
FG
1037
1038 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
11fdf7f2
TL
1039 CODE_ENVIRONMENT_UTILITY,
1040 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
7c673cae
FG
1041 common_init_finish(g_ceph_context);
1042 Cycles::init();
1043
1044 bind_thread_to_cpu(3);
1045 if (argc == 1) {
1046 // No test names specified; run all tests.
1047 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1048 run_test(tests[i]);
1049 }
1050 } else {
1051 // Run only the tests that were specified on the command line.
1052 for (int i = 1; i < argc; i++) {
1053 bool found_test = false;
1054 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1055 if (strcmp(argv[i], tests[j].name) == 0) {
1056 found_test = true;
1057 run_test(tests[j]);
1058 break;
1059 }
1060 }
1061 if (!found_test) {
1062 int width = printf("%-24s ??", argv[i]);
1063 printf("%*s No such test\n", 32-width, "");
1064 }
1065 }
1066 }
1067}