]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/perf_local.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / test / perf_local.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20// This program contains a collection of low-level performance measurements
21// for Ceph, which can be run either individually or altogether. These
22// tests measure performance in a single stand-alone process, not in a cluster
23// with multiple servers. Invoke the program like this:
24//
25// Perf test1 test2 ...
26//
27// test1 and test2 are the names of individual performance measurements to
28// run. If no test names are provided then all of the performance tests
29// are run.
30//
31// To add a new test:
32// * Write a function that implements the test. Use existing test functions
33// as a guideline, and be sure to generate output in the same form as
34// other tests.
35// * Create a new entry for the test in the #tests table.
36#include <vector>
37#include <sched.h>
38
39#include "acconfig.h"
40#ifdef HAVE_SSE
41#include <xmmintrin.h>
42#endif
43
7c673cae
FG
44#include "include/buffer.h"
45#include "include/encoding.h"
46#include "include/ceph_hash.h"
11fdf7f2 47#include "include/spinlock.h"
7c673cae
FG
48#include "common/ceph_argparse.h"
49#include "common/Cycles.h"
50#include "common/Cond.h"
9f95a23c 51#include "common/ceph_mutex.h"
7c673cae
FG
52#include "common/Thread.h"
53#include "common/Timer.h"
54#include "msg/async/Event.h"
55#include "global/global_init.h"
56
57#include "test/perf_helper.h"
58
31f18b77
FG
59#include <atomic>
60
7c673cae
FG
61using namespace ceph;
62
/**
 * Ask the operating system to pin the calling thread to a given CPU.
 *
 * Pinning is best-effort. When the build does not define HAVE_SCHED this
 * function does nothing. When the affinity request is rejected, a diagnostic
 * is written to stderr via perror() and the thread keeps running with
 * whatever affinity it had before.
 *
 * \param cpu
 *      Index of the CPU to run on, passed unchanged to CPU_SET().
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  // A pid of 0 applies the mask to the calling thread.
  if (sched_setaffinity(0, sizeof(set), &set) != 0) {
    // Not fatal: the measurements still run, they are just not pinned.
    perror("sched_setaffinity");
  }
#else
  (void)cpu;
#endif
}
79
/*
 * Pretend to consume a value so that the compiler cannot treat the
 * computation that produced it as dead code.
 *
 * The pointee is read as an int and compared against an arbitrary constant;
 * something is printed only if the two happen to match.
 *
 * \param value
 *      Pointer to the data to "use"; it must be readable as an int.
 */
void discard(void* value) {
  const int observed = *static_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
94
95//----------------------------------------------------------------------
96// Test functions start here
97//----------------------------------------------------------------------
98
31f18b77 99// Measure the cost of atomic compare-and-swap
7c673cae
FG
100double atomic_int_cmp()
101{
102 int count = 1000000;
31f18b77
FG
103 std::atomic<unsigned> value = { 11 };
104 unsigned int test = 11;
7c673cae
FG
105 uint64_t start = Cycles::rdtsc();
106 for (int i = 0; i < count; i++) {
31f18b77 107 value.compare_exchange_strong(test, test+2);
7c673cae
FG
108 test += 2;
109 }
110 uint64_t stop = Cycles::rdtsc();
111 // printf("Final value: %d\n", value.load());
112 return Cycles::to_seconds(stop - start)/count;
113}
114
31f18b77 115// Measure the cost of incrementing an atomic
7c673cae
FG
116double atomic_int_inc()
117{
118 int count = 1000000;
31f18b77 119 std::atomic<int64_t> value = { 11 };
7c673cae
FG
120 uint64_t start = Cycles::rdtsc();
121 for (int i = 0; i < count; i++) {
31f18b77 122 value++;
7c673cae
FG
123 }
124 uint64_t stop = Cycles::rdtsc();
125 // printf("Final value: %d\n", value.load());
126 return Cycles::to_seconds(stop - start)/count;
127}
128
31f18b77 129// Measure the cost of reading an atomic
7c673cae
FG
130double atomic_int_read()
131{
132 int count = 1000000;
31f18b77 133 std::atomic<int64_t> value = { 11 };
7c673cae
FG
134 int total = 0;
135 uint64_t start = Cycles::rdtsc();
136 for (int i = 0; i < count; i++) {
31f18b77 137 total += value;
7c673cae
FG
138 }
139 uint64_t stop = Cycles::rdtsc();
140 // printf("Total: %d\n", total);
141 return Cycles::to_seconds(stop - start)/count;
142}
143
31f18b77 144// Measure the cost of storing a new value in an atomic
7c673cae
FG
145double atomic_int_set()
146{
147 int count = 1000000;
31f18b77 148 std::atomic<int64_t> value = { 11 };
7c673cae
FG
149 uint64_t start = Cycles::rdtsc();
150 for (int i = 0; i < count; i++) {
31f18b77 151 value = 88;
7c673cae
FG
152 }
153 uint64_t stop = Cycles::rdtsc();
154 return Cycles::to_seconds(stop - start)/count;
155}
156
157// Measure the cost of acquiring and releasing a mutex in the
158// fast case where the mutex is free.
159double mutex_nonblock()
160{
161 int count = 1000000;
9f95a23c 162 ceph::mutex m = ceph::make_mutex("mutex_nonblock::m");
7c673cae
FG
163 uint64_t start = Cycles::rdtsc();
164 for (int i = 0; i < count; i++) {
9f95a23c
TL
165 m.lock();
166 m.unlock();
7c673cae
FG
167 }
168 uint64_t stop = Cycles::rdtsc();
169 return Cycles::to_seconds(stop - start)/count;
170}
171
172// Measure the cost of allocating and deallocating a buffer, plus
173// appending (logically) one ptr.
174double buffer_basic()
175{
176 int count = 1000000;
177 uint64_t start = Cycles::rdtsc();
178 bufferptr ptr("abcdefg", 7);
179 for (int i = 0; i < count; i++) {
180 bufferlist b;
181 b.append(ptr, 0, 5);
182 }
183 uint64_t stop = Cycles::rdtsc();
184 return Cycles::to_seconds(stop - start)/count;
185}
186
187struct DummyBlock {
188 int a = 1, b = 2, c = 3, d = 4;
189 void encode(bufferlist &bl) const {
190 ENCODE_START(1, 1, bl);
11fdf7f2
TL
191 encode(a, bl);
192 encode(b, bl);
193 encode(c, bl);
194 encode(d, bl);
7c673cae
FG
195 ENCODE_FINISH(bl);
196 }
11fdf7f2 197 void decode(bufferlist::const_iterator &bl) {
7c673cae 198 DECODE_START(1, bl);
11fdf7f2
TL
199 decode(a, bl);
200 decode(b, bl);
201 decode(c, bl);
202 decode(d, bl);
7c673cae
FG
203 DECODE_FINISH(bl);
204 }
205};
206WRITE_CLASS_ENCODER(DummyBlock)
207
208// Measure the cost of encoding and decoding a buffer, plus
209// allocating space for one chunk.
210double buffer_encode_decode()
211{
212 int count = 1000000;
213 uint64_t start = Cycles::rdtsc();
214 for (int i = 0; i < count; i++) {
215 bufferlist b;
216 DummyBlock dummy_block;
11fdf7f2
TL
217 encode(dummy_block, b);
218 auto iter = b.cbegin();
219 decode(dummy_block, iter);
7c673cae
FG
220 }
221 uint64_t stop = Cycles::rdtsc();
222 return Cycles::to_seconds(stop - start)/count;
223}
224
225// Measure the cost of allocating and deallocating a buffer, plus
226// copying in a small block.
227double buffer_basic_copy()
228{
229 int count = 1000000;
230 uint64_t start = Cycles::rdtsc();
231 for (int i = 0; i < count; i++) {
232 bufferlist b;
233 b.append("abcdefg", 6);
234 }
235 uint64_t stop = Cycles::rdtsc();
236 return Cycles::to_seconds(stop - start)/count;
237}
238
239// Measure the cost of making a copy of parts of two ptrs.
240double buffer_copy()
241{
242 int count = 1000000;
243 bufferlist b;
244 b.append("abcde", 5);
245 b.append("01234", 5);
246 char copy[10];
247 uint64_t start = Cycles::rdtsc();
248 for (int i = 0; i < count; i++) {
9f95a23c 249 b.cbegin(2).copy(6, copy);
7c673cae
FG
250 }
251 uint64_t stop = Cycles::rdtsc();
252 return Cycles::to_seconds(stop - start)/count;
253}
254
255// Measure the cost of allocating new space by extending the
256// bufferlist
257double buffer_encode()
258{
259 int count = 100000;
260 uint64_t total = 0;
261 for (int i = 0; i < count; i++) {
262 bufferlist b;
263 DummyBlock dummy_block;
11fdf7f2 264 encode(dummy_block, b);
7c673cae 265 uint64_t start = Cycles::rdtsc();
11fdf7f2
TL
266 encode(dummy_block, b);
267 encode(dummy_block, b);
268 encode(dummy_block, b);
269 encode(dummy_block, b);
270 encode(dummy_block, b);
271 encode(dummy_block, b);
272 encode(dummy_block, b);
273 encode(dummy_block, b);
274 encode(dummy_block, b);
275 encode(dummy_block, b);
7c673cae
FG
276 total += Cycles::rdtsc() - start;
277 }
278 return Cycles::to_seconds(total)/(count*10);
279}
280
7c673cae
FG
281// Measure the cost of creating an iterator and iterating over 10
282// chunks in a buffer.
283double buffer_iterator()
284{
285 bufferlist b;
286 const char s[] = "abcdefghijklmnopqrstuvwxyz";
287 bufferptr ptr(s, sizeof(s));
288 for (int i = 0; i < 5; i++) {
289 b.append(ptr, i, 5);
290 }
291 int count = 100000;
292 int sum = 0;
293 uint64_t start = Cycles::rdtsc();
294 for (int i = 0; i < count; i++) {
11fdf7f2 295 auto it = b.cbegin();
7c673cae
FG
296 while (!it.end()) {
297 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
298 ++it;
299 }
300 }
301 uint64_t stop = Cycles::rdtsc();
302 discard(&sum);
303 return Cycles::to_seconds(stop - start)/count;
304}
305
306// Implements the CondPingPong test.
307class CondPingPong {
9f95a23c
TL
308 ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex");
309 ceph::condition_variable cond;
310 int prod = 0;
311 int cons = 0;
312 const int count = 10000;
7c673cae
FG
313
314 class Consumer : public Thread {
315 CondPingPong *p;
316 public:
317 explicit Consumer(CondPingPong *p): p(p) {}
318 void* entry() override {
319 p->consume();
320 return 0;
321 }
322 } consumer;
323
324 public:
9f95a23c 325 CondPingPong(): consumer(this) {}
7c673cae
FG
326
327 double run() {
328 consumer.create("consumer");
329 uint64_t start = Cycles::rdtsc();
330 produce();
331 uint64_t stop = Cycles::rdtsc();
332 consumer.join();
333 return Cycles::to_seconds(stop - start)/count;
334 }
335
336 void produce() {
9f95a23c 337 std::unique_lock l{mutex};
7c673cae 338 while (cons < count) {
9f95a23c 339 cond.wait(l, [this] { return cons >= prod; });
7c673cae 340 ++prod;
9f95a23c 341 cond.notify_all();
7c673cae
FG
342 }
343 }
344
345 void consume() {
9f95a23c 346 std::unique_lock l{mutex};
7c673cae 347 while (cons < count) {
9f95a23c 348 cond.wait(l, [this] { return cons != prod; });
7c673cae 349 ++cons;
9f95a23c 350 cond.notify_all();
7c673cae
FG
351 }
352 }
353};
354
355// Measure the cost of coordinating between threads using a condition variable.
356double cond_ping_pong()
357{
358 return CondPingPong().run();
359}
360
361// Measure the cost of a 32-bit divide. Divides don't take a constant
362// number of cycles. Values were chosen here semi-randomly to depict a
363// fairly expensive scenario. Someone with fancy ALU knowledge could
364// probably pick worse values.
365double div32()
366{
367#if defined(__i386__) || defined(__x86_64__)
368 int count = 1000000;
369 uint64_t start = Cycles::rdtsc();
370 // NB: Expect an x86 processor exception is there's overflow.
371 uint32_t numeratorHi = 0xa5a5a5a5U;
372 uint32_t numeratorLo = 0x55aa55aaU;
373 uint32_t divisor = 0xaa55aa55U;
374 uint32_t quotient;
375 uint32_t remainder;
376 for (int i = 0; i < count; i++) {
377 __asm__ __volatile__("div %4" :
378 "=a"(quotient), "=d"(remainder) :
379 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
380 "cc");
381 }
382 uint64_t stop = Cycles::rdtsc();
383 return Cycles::to_seconds(stop - start)/count;
f67539c2
TL
384#elif defined(__aarch64__)
385 int count = 1000000;
386 uint64_t start = Cycles::rdtsc();
387 uint64_t numerator = 0xa5a5a5a555aa55aaUL;
388 uint32_t divisor = 0xaa55aa55U;
389 uint32_t result;
390 for (int i = 0; i < count; i++) {
391 asm volatile("udiv %0, %1, %2" : "=r"(result) :
392 "r"(numerator), "r"(divisor));
393 }
394 uint64_t stop = Cycles::rdtsc();
395 return Cycles::to_seconds(stop - start)/count;
7c673cae
FG
396#else
397 return -1;
398#endif
399}
400
401// Measure the cost of a 64-bit divide. Divides don't take a constant
402// number of cycles. Values were chosen here semi-randomly to depict a
403// fairly expensive scenario. Someone with fancy ALU knowledge could
404// probably pick worse values.
405double div64()
406{
407#if defined(__x86_64__) || defined(__amd64__)
408 int count = 1000000;
409 // NB: Expect an x86 processor exception is there's overflow.
410 uint64_t start = Cycles::rdtsc();
411 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
412 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
413 uint64_t divisor = 0xaa55aa55aa55aa55UL;
414 uint64_t quotient;
415 uint64_t remainder;
416 for (int i = 0; i < count; i++) {
417 __asm__ __volatile__("divq %4" :
418 "=a"(quotient), "=d"(remainder) :
419 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
420 "cc");
421 }
422 uint64_t stop = Cycles::rdtsc();
423 return Cycles::to_seconds(stop - start)/count;
424#else
425 return -1;
426#endif
427}
428
429// Measure the cost of calling a non-inlined function.
430double function_call()
431{
432 int count = 1000000;
433 uint64_t x = 0;
434 uint64_t start = Cycles::rdtsc();
435 for (int i = 0; i < count; i++) {
436 x = PerfHelper::plus_one(x);
437 }
438 uint64_t stop = Cycles::rdtsc();
439 return Cycles::to_seconds(stop - start)/count;
440}
441
442// Measure the minimum cost of EventCenter::process_events, when there are no
443// Pollers and no Timers.
444double eventcenter_poll()
445{
446 int count = 1000000;
447 EventCenter center(g_ceph_context);
448 center.init(1000, 0, "posix");
449 center.set_owner();
450 uint64_t start = Cycles::rdtsc();
451 for (int i = 0; i < count; i++) {
452 center.process_events(0);
453 }
454 uint64_t stop = Cycles::rdtsc();
455 return Cycles::to_seconds(stop - start)/count;
456}
457
458class CenterWorker : public Thread {
459 CephContext *cct;
460 bool done;
461
462 public:
463 EventCenter center;
464 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
465 center.init(100, 0, "posix");
466 }
467 void stop() {
468 done = true;
469 center.wakeup();
470 }
471 void* entry() override {
472 center.set_owner();
473 bind_thread_to_cpu(2);
474 while (!done)
475 center.process_events(1000);
476 return 0;
477 }
478};
479
480class CountEvent: public EventCallback {
31f18b77 481 std::atomic<int64_t> *count;
7c673cae
FG
482
483 public:
31f18b77 484 explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
11fdf7f2 485 void do_request(uint64_t id) override {
31f18b77 486 (*count)--;
7c673cae
FG
487 }
488};
489
490double eventcenter_dispatch()
491{
492 int count = 100000;
493
494 CenterWorker worker(g_ceph_context);
31f18b77 495 std::atomic<int64_t> flag = { 1 };
7c673cae
FG
496 worker.create("evt_center_disp");
497 EventCallbackRef count_event(new CountEvent(&flag));
498
499 worker.center.dispatch_event_external(count_event);
500 // Start a new thread and wait for it to ready.
31f18b77 501 while (flag)
7c673cae
FG
502 usleep(100);
503
504 uint64_t start = Cycles::rdtsc();
505 for (int i = 0; i < count; i++) {
31f18b77 506 flag = 1;
7c673cae 507 worker.center.dispatch_event_external(count_event);
31f18b77 508 while (flag)
7c673cae
FG
509 ;
510 }
511 uint64_t stop = Cycles::rdtsc();
512 worker.stop();
513 worker.join();
514 return Cycles::to_seconds(stop - start)/count;
515}
516
517// Measure the cost of copying a given number of bytes with memcpy.
518double memcpy_shared(size_t size)
519{
520 int count = 1000000;
521 char src[size], dst[size];
522
523 memset(src, 0, sizeof(src));
524
525 uint64_t start = Cycles::rdtsc();
526 for (int i = 0; i < count; i++) {
527 memcpy(dst, src, size);
528 }
529 uint64_t stop = Cycles::rdtsc();
530 return Cycles::to_seconds(stop - start)/count;
531}
532
533double memcpy100()
534{
535 return memcpy_shared(100);
536}
537
538double memcpy1000()
539{
540 return memcpy_shared(1000);
541}
542
543double memcpy10000()
544{
545 return memcpy_shared(10000);
546}
547
548// Benchmark rjenkins hashing performance on cached data.
549template <int key_length>
550double ceph_str_hash_rjenkins()
551{
552 int count = 100000;
553 char buf[key_length];
554
555 uint64_t start = Cycles::rdtsc();
556 for (int i = 0; i < count; i++)
557 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
558 uint64_t stop = Cycles::rdtsc();
559
560 return Cycles::to_seconds(stop - start)/count;
561}
562
563// Measure the cost of reading the fine-grain cycle counter.
564double rdtsc_test()
565{
566 int count = 1000000;
567 uint64_t start = Cycles::rdtsc();
568 uint64_t total = 0;
569 for (int i = 0; i < count; i++) {
570 total += Cycles::rdtsc();
571 }
572 uint64_t stop = Cycles::rdtsc();
573 return Cycles::to_seconds(stop - start)/count;
574}
575
576// Measure the cost of the Cycles::to_seconds method.
577double perf_cycles_to_seconds()
578{
579 int count = 1000000;
580 double total = 0;
581 uint64_t cycles = 994261;
582 uint64_t start = Cycles::rdtsc();
583 for (int i = 0; i < count; i++) {
584 total += Cycles::to_seconds(cycles);
585 }
586 uint64_t stop = Cycles::rdtsc();
587 // printf("Result: %.4f\n", total/count);
588 return Cycles::to_seconds(stop - start)/count;
589}
590
591// Measure the cost of the Cylcles::toNanoseconds method.
592double perf_cycles_to_nanoseconds()
593{
594 int count = 1000000;
595 uint64_t total = 0;
596 uint64_t cycles = 994261;
597 uint64_t start = Cycles::rdtsc();
598 for (int i = 0; i < count; i++) {
599 total += Cycles::to_nanoseconds(cycles);
600 }
601 uint64_t stop = Cycles::rdtsc();
602 // printf("Result: %lu\n", total/count);
603 return Cycles::to_seconds(stop - start)/count;
604}
605
606
#ifdef HAVE_SSE
/**
 * Issue a prefetch for each 64-byte-aligned line that overlaps
 * [object, object + num_bytes).
 *
 * The start address is rounded down to a 64-byte boundary, and one
 * _mm_prefetch with the T0 hint is issued per 64-byte step.
 *
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of object from the start of its 64-byte line.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* first_line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64)
    _mm_prefetch(first_line + done, _MM_HINT_T0);
}
#elif defined(__aarch64__)
/**
 * aarch64 counterpart of the SSE prefetch helper: the same 64-byte stepping
 * over [object, object + num_bytes), using the "prfm pldl1keep"
 * instruction for each step.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64, line += 64)
    asm volatile("prfm pldl1keep, %a0\n" : : "p" (line));
}
#endif
634
// Measure the cost of the prefetch instruction.
//
// Each round calls PerfHelper::flush_cache() and then times 16 prefetches,
// one per 64-byte slice of a 1 KiB stack buffer, issued in a scrambled
// (non-sequential) order. The result is the average time per prefetch, or -1
// when neither HAVE_SSE nor __aarch64__ is defined.
double perf_prefetch()
{
#if defined(HAVE_SSE) || defined(__aarch64__)
  uint64_t elapsed = 0;
  const int rounds = 10;
  char lines[16 * 64];

  for (int n = 0; n < rounds; ++n) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    // Deliberately unrolled and out of address order.
    prefetch(&lines[576], 64);
    prefetch(&lines[0], 64);
    prefetch(&lines[512], 64);
    prefetch(&lines[960], 64);
    prefetch(&lines[640], 64);
    prefetch(&lines[896], 64);
    prefetch(&lines[256], 64);
    prefetch(&lines[704], 64);
    prefetch(&lines[320], 64);
    prefetch(&lines[384], 64);
    prefetch(&lines[128], 64);
    prefetch(&lines[448], 64);
    prefetch(&lines[768], 64);
    prefetch(&lines[832], 64);
    prefetch(&lines[64], 64);
    prefetch(&lines[192], 64);
    const uint64_t end = Cycles::rdtsc();
    elapsed += end - begin;
  }
  return Cycles::to_seconds(elapsed) / rounds / 16;
#else
  return -1;
#endif
}
670
#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before any
 * instructions that appear before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution from
 * instructions supposed to be executing before the timer starts.
 *
 * Implemented by executing cpuid with eax = 1; the register outputs are
 * discarded.
 */
static inline void serialize() {
  uint32_t out_a, out_b, out_c, out_d;
  __asm volatile("cpuid"
                 : "=a" (out_a), "=b" (out_b), "=c" (out_c), "=d" (out_d)
                 : "a" (1U));
}
#endif
688
689// Measure the cost of cpuid
690double perf_serialize() {
691#if defined(__x86_64__)
692 int count = 1000000;
693 uint64_t start = Cycles::rdtsc();
694 for (int i = 0; i < count; i++) {
695 serialize();
696 }
697 uint64_t stop = Cycles::rdtsc();
698 return Cycles::to_seconds(stop - start)/count;
699#else
700 return -1;
701#endif
702}
703
// Measure the cost of an lfence instruction (x86 builds with HAVE_SSE2), or
// of "dmb ishld" on aarch64. Returns -1 when neither applies.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishld" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
727
// Measure the cost of an sfence instruction (x86 builds with HAVE_SSE), or
// of "dmb ishst" on aarch64. Returns -1 when neither applies.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishst" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
751
752// Measure the cost of acquiring and releasing a SpinLock (assuming the
753// lock is initially free).
754double test_spinlock()
755{
756 int count = 1000000;
11fdf7f2 757 ceph::spinlock lock;
7c673cae
FG
758 uint64_t start = Cycles::rdtsc();
759 for (int i = 0; i < count; i++) {
760 lock.lock();
761 lock.unlock();
762 }
763 uint64_t stop = Cycles::rdtsc();
764 return Cycles::to_seconds(stop - start)/count;
765}
766
767// Helper for spawn_thread. This is the main function that the thread executes
768// (intentionally empty).
769class ThreadHelper : public Thread {
770 void *entry() override { return 0; }
771};
772
773// Measure the cost of start and joining with a thread.
774double spawn_thread()
775{
776 int count = 10000;
777 ThreadHelper thread;
778 uint64_t start = Cycles::rdtsc();
779 for (int i = 0; i < count; i++) {
780 thread.create("thread_helper");
781 thread.join();
782 }
783 uint64_t stop = Cycles::rdtsc();
784 return Cycles::to_seconds(stop - start)/count;
785}
786
787class FakeContext : public Context {
788 public:
789 void finish(int r) override {}
790};
791
792// Measure the cost of starting and stopping a Dispatch::Timer.
793double perf_timer()
794{
795 int count = 1000000;
9f95a23c 796 ceph::mutex lock = ceph::make_mutex("perf_timer::lock");
7c673cae
FG
797 SafeTimer timer(g_ceph_context, lock);
798 FakeContext **c = new FakeContext*[count];
799 for (int i = 0; i < count; i++) {
800 c[i] = new FakeContext();
801 }
802 uint64_t start = Cycles::rdtsc();
9f95a23c 803 std::lock_guard l{lock};
7c673cae 804 for (int i = 0; i < count; i++) {
3efd9988
FG
805 if (timer.add_event_after(12345, c[i])) {
806 timer.cancel_event(c[i]);
807 }
7c673cae
FG
808 }
809 uint64_t stop = Cycles::rdtsc();
810 delete[] c;
811 return Cycles::to_seconds(stop - start)/count;
812}
813
814// Measure the cost of throwing and catching an int. This uses an integer as
815// the value thrown, which is presumably as fast as possible.
816double throw_int()
817{
818 int count = 10000;
819 uint64_t start = Cycles::rdtsc();
820 for (int i = 0; i < count; i++) {
821 try {
822 throw 0;
823 } catch (int) { // NOLINT
824 // pass
825 }
826 }
827 uint64_t stop = Cycles::rdtsc();
828 return Cycles::to_seconds(stop - start)/count;
829}
830
831// Measure the cost of throwing and catching an int from a function call.
832double throw_int_call()
833{
834 int count = 10000;
835 uint64_t start = Cycles::rdtsc();
836 for (int i = 0; i < count; i++) {
837 try {
838 PerfHelper::throw_int();
839 } catch (int) { // NOLINT
840 // pass
841 }
842 }
843 uint64_t stop = Cycles::rdtsc();
844 return Cycles::to_seconds(stop - start)/count;
845}
846
847// Measure the cost of throwing and catching an Exception. This uses an actual
848// exception as the value thrown, which may be slower than throwInt.
849double throw_exception()
850{
851 int count = 10000;
852 uint64_t start = Cycles::rdtsc();
853 for (int i = 0; i < count; i++) {
854 try {
855 throw buffer::end_of_buffer();
856 } catch (const buffer::end_of_buffer&) {
857 // pass
858 }
859 }
860 uint64_t stop = Cycles::rdtsc();
861 return Cycles::to_seconds(stop - start)/count;
862}
863
864// Measure the cost of throwing and catching an Exception from a function call.
865double throw_exception_call()
866{
867 int count = 10000;
868 uint64_t start = Cycles::rdtsc();
869 for (int i = 0; i < count; i++) {
870 try {
871 PerfHelper::throw_end_of_buffer();
872 } catch (const buffer::end_of_buffer&) {
873 // pass
874 }
875 }
876 uint64_t stop = Cycles::rdtsc();
877 return Cycles::to_seconds(stop - start)/count;
878}
879
880// Measure the cost of pushing a new element on a std::vector, copying
881// from the end to an internal element, and popping the end element.
882double vector_push_pop()
883{
884 int count = 100000;
885 std::vector<int> vector;
886 vector.push_back(1);
887 vector.push_back(2);
888 vector.push_back(3);
889 uint64_t start = Cycles::rdtsc();
890 for (int i = 0; i < count; i++) {
891 vector.push_back(i);
892 vector.push_back(i+1);
893 vector.push_back(i+2);
894 vector[2] = vector.back();
895 vector.pop_back();
896 vector[0] = vector.back();
897 vector.pop_back();
898 vector[1] = vector.back();
899 vector.pop_back();
900 }
901 uint64_t stop = Cycles::rdtsc();
902 return Cycles::to_seconds(stop - start)/(count*3);
903}
904
905// Measure the cost of ceph_clock_now
906double perf_ceph_clock_now()
907{
908 int count = 100000;
909 uint64_t start = Cycles::rdtsc();
910 for (int i = 0; i < count; i++) {
911 ceph_clock_now();
912 }
913 uint64_t stop = Cycles::rdtsc();
914 return Cycles::to_seconds(stop - start)/count;
915}
916
// One row of the table of performance tests: the name a user types to
// select the test, the function that runs it, and a description printed
// next to the result.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
930TestInfo tests[] = {
931 {"atomic_int_cmp", atomic_int_cmp,
932 "atomic_t::compare_and_swap"},
933 {"atomic_int_inc", atomic_int_inc,
934 "atomic_t::inc"},
935 {"atomic_int_read", atomic_int_read,
936 "atomic_t::read"},
937 {"atomic_int_set", atomic_int_set,
938 "atomic_t::set"},
939 {"mutex_nonblock", mutex_nonblock,
940 "Mutex lock/unlock (no blocking)"},
941 {"buffer_basic", buffer_basic,
942 "buffer create, add one ptr, delete"},
943 {"buffer_encode_decode", buffer_encode_decode,
944 "buffer create, encode/decode object, delete"},
945 {"buffer_basic_copy", buffer_basic_copy,
946 "buffer create, copy small block, delete"},
947 {"buffer_copy", buffer_copy,
948 "copy out 2 small ptrs from buffer"},
949 {"buffer_encode10", buffer_encode,
950 "buffer encoding 10 structures onto existing ptr"},
7c673cae
FG
951 {"buffer_iterator", buffer_iterator,
952 "iterate over buffer with 5 ptrs"},
953 {"cond_ping_pong", cond_ping_pong,
954 "condition variable round-trip"},
955 {"div32", div32,
956 "32-bit integer division instruction"},
957 {"div64", div64,
958 "64-bit integer division instruction"},
959 {"function_call", function_call,
960 "Call a function that has not been inlined"},
961 {"eventcenter_poll", eventcenter_poll,
962 "EventCenter::process_events (no timers or events)"},
963 {"eventcenter_dispatch", eventcenter_dispatch,
964 "EventCenter::dispatch_event_external latency"},
965 {"memcpy100", memcpy100,
966 "Copy 100 bytes with memcpy"},
967 {"memcpy1000", memcpy1000,
968 "Copy 1000 bytes with memcpy"},
969 {"memcpy10000", memcpy10000,
970 "Copy 10000 bytes with memcpy"},
971 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
972 "rjenkins hash on 16 byte of data"},
973 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
974 "rjenkins hash on 256 bytes of data"},
975 {"rdtsc", rdtsc_test,
976 "Read the fine-grain cycle counter"},
977 {"cycles_to_seconds", perf_cycles_to_seconds,
978 "Convert a rdtsc result to (double) seconds"},
979 {"cycles_to_seconds", perf_cycles_to_nanoseconds,
980 "Convert a rdtsc result to (uint64_t) nanoseconds"},
981 {"prefetch", perf_prefetch,
982 "Prefetch instruction"},
983 {"serialize", perf_serialize,
984 "serialize instruction"},
985 {"lfence", lfence,
986 "Lfence instruction"},
987 {"sfence", sfence,
988 "Sfence instruction"},
989 {"spin_lock", test_spinlock,
990 "Acquire/release SpinLock"},
991 {"spawn_thread", spawn_thread,
992 "Start and stop a thread"},
993 {"perf_timer", perf_timer,
994 "Insert and cancel a SafeTimer"},
995 {"throw_int", throw_int,
996 "Throw an int"},
997 {"throw_int_call", throw_int_call,
998 "Throw an int in a function call"},
999 {"throw_exception", throw_exception,
1000 "Throw an Exception"},
1001 {"throw_exception_call", throw_exception_call,
1002 "Throw an Exception in a function call"},
1003 {"vector_push_pop", vector_push_pop,
1004 "Push and pop a std::vector"},
1005 {"ceph_clock_now", perf_ceph_clock_now,
1006 "ceph_clock_now function"},
1007};
1008
1009/**
1010 * Runs a particular test and prints a one-line result message.
1011 *
1012 * \param info
1013 * Describes the test to run.
1014 */
1015void run_test(TestInfo& info)
1016{
1017 double secs = info.func();
1018 int width = printf("%-24s ", info.name);
1019 if (secs == -1) {
1020 width += printf(" architecture nonsupport ");
1021 } else if (secs < 1.0e-06) {
1022 width += printf("%8.2fns", 1e09*secs);
1023 } else if (secs < 1.0e-03) {
1024 width += printf("%8.2fus", 1e06*secs);
1025 } else if (secs < 1.0) {
1026 width += printf("%8.2fms", 1e03*secs);
1027 } else {
1028 width += printf("%8.2fs", secs);
1029 }
1030 printf("%*s %s\n", 32-width, "", info.description);
1031}
1032
1033int main(int argc, char *argv[])
1034{
1035 vector<const char*> args;
1036 argv_to_vec(argc, (const char **)argv, args);
1037
1038 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
11fdf7f2
TL
1039 CODE_ENVIRONMENT_UTILITY,
1040 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
7c673cae
FG
1041 common_init_finish(g_ceph_context);
1042 Cycles::init();
1043
1044 bind_thread_to_cpu(3);
1045 if (argc == 1) {
1046 // No test names specified; run all tests.
1047 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1048 run_test(tests[i]);
1049 }
1050 } else {
1051 // Run only the tests that were specified on the command line.
1052 for (int i = 1; i < argc; i++) {
1053 bool found_test = false;
1054 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1055 if (strcmp(argv[i], tests[j].name) == 0) {
1056 found_test = true;
1057 run_test(tests[j]);
1058 break;
1059 }
1060 }
1061 if (!found_test) {
1062 int width = printf("%-24s ??", argv[i]);
1063 printf("%*s No such test\n", 32-width, "");
1064 }
1065 }
1066 }
1067}