]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/perf_local.cc
update sources to v12.1.0
[ceph.git] / ceph / src / test / perf_local.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20// This program contains a collection of low-level performance measurements
21// for Ceph, which can be run either individually or altogether. These
22// tests measure performance in a single stand-alone process, not in a cluster
23// with multiple servers. Invoke the program like this:
24//
25// Perf test1 test2 ...
26//
27// test1 and test2 are the names of individual performance measurements to
28// run. If no test names are provided then all of the performance tests
29// are run.
30//
31// To add a new test:
32// * Write a function that implements the test. Use existing test functions
33// as a guideline, and be sure to generate output in the same form as
34// other tests.
35// * Create a new entry for the test in the #tests table.
36#include <vector>
37#include <sched.h>
38
39#include "acconfig.h"
40#ifdef HAVE_SSE
41#include <xmmintrin.h>
42#endif
43
7c673cae
FG
44#include "include/buffer.h"
45#include "include/encoding.h"
46#include "include/ceph_hash.h"
47#include "include/Spinlock.h"
48#include "common/ceph_argparse.h"
49#include "common/Cycles.h"
50#include "common/Cond.h"
51#include "common/Mutex.h"
52#include "common/Thread.h"
53#include "common/Timer.h"
54#include "msg/async/Event.h"
55#include "global/global_init.h"
56
57#include "test/perf_helper.h"
58
31f18b77
FG
59#include <atomic>
60
7c673cae
FG
61using namespace ceph;
62
/**
 * Restrict the calling thread to a single CPU using sched_setaffinity().
 *
 * When HAVE_SCHED is not defined this function does nothing.
 *
 * \param cpu
 *      CPU index; it is handed unchanged to CPU_SET.
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t affinity;
  CPU_ZERO(&affinity);
  CPU_SET(cpu, &affinity);
  // Best effort: the return value is intentionally not inspected.
  sched_setaffinity(0, sizeof(affinity), &affinity);
#endif
}
79
/*
 * Pretend to consume a value so that the compiler cannot prove the
 * computation that produced it is dead and remove the code being measured.
 *
 * \param value
 *      Pointer to the data to "use". Its first int is read; nothing is
 *      printed unless that int equals one specific magic constant.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
94
95//----------------------------------------------------------------------
96// Test functions start here
97//----------------------------------------------------------------------
98
31f18b77 99// Measure the cost of atomic compare-and-swap
7c673cae
FG
100double atomic_int_cmp()
101{
102 int count = 1000000;
31f18b77
FG
103 std::atomic<unsigned> value = { 11 };
104 unsigned int test = 11;
7c673cae
FG
105 uint64_t start = Cycles::rdtsc();
106 for (int i = 0; i < count; i++) {
31f18b77 107 value.compare_exchange_strong(test, test+2);
7c673cae
FG
108 test += 2;
109 }
110 uint64_t stop = Cycles::rdtsc();
111 // printf("Final value: %d\n", value.load());
112 return Cycles::to_seconds(stop - start)/count;
113}
114
31f18b77 115// Measure the cost of incrementing an atomic
7c673cae
FG
116double atomic_int_inc()
117{
118 int count = 1000000;
31f18b77 119 std::atomic<int64_t> value = { 11 };
7c673cae
FG
120 uint64_t start = Cycles::rdtsc();
121 for (int i = 0; i < count; i++) {
31f18b77 122 value++;
7c673cae
FG
123 }
124 uint64_t stop = Cycles::rdtsc();
125 // printf("Final value: %d\n", value.load());
126 return Cycles::to_seconds(stop - start)/count;
127}
128
31f18b77 129// Measure the cost of reading an atomic
7c673cae
FG
130double atomic_int_read()
131{
132 int count = 1000000;
31f18b77 133 std::atomic<int64_t> value = { 11 };
7c673cae
FG
134 int total = 0;
135 uint64_t start = Cycles::rdtsc();
136 for (int i = 0; i < count; i++) {
31f18b77 137 total += value;
7c673cae
FG
138 }
139 uint64_t stop = Cycles::rdtsc();
140 // printf("Total: %d\n", total);
141 return Cycles::to_seconds(stop - start)/count;
142}
143
31f18b77 144// Measure the cost of storing a new value in an atomic
7c673cae
FG
145double atomic_int_set()
146{
147 int count = 1000000;
31f18b77 148 std::atomic<int64_t> value = { 11 };
7c673cae
FG
149 uint64_t start = Cycles::rdtsc();
150 for (int i = 0; i < count; i++) {
31f18b77 151 value = 88;
7c673cae
FG
152 }
153 uint64_t stop = Cycles::rdtsc();
154 return Cycles::to_seconds(stop - start)/count;
155}
156
157// Measure the cost of acquiring and releasing a mutex in the
158// fast case where the mutex is free.
159double mutex_nonblock()
160{
161 int count = 1000000;
162 Mutex m("mutex_nonblock::m");
163 uint64_t start = Cycles::rdtsc();
164 for (int i = 0; i < count; i++) {
165 m.Lock();
166 m.Unlock();
167 }
168 uint64_t stop = Cycles::rdtsc();
169 return Cycles::to_seconds(stop - start)/count;
170}
171
172// Measure the cost of allocating and deallocating a buffer, plus
173// appending (logically) one ptr.
174double buffer_basic()
175{
176 int count = 1000000;
177 uint64_t start = Cycles::rdtsc();
178 bufferptr ptr("abcdefg", 7);
179 for (int i = 0; i < count; i++) {
180 bufferlist b;
181 b.append(ptr, 0, 5);
182 }
183 uint64_t stop = Cycles::rdtsc();
184 return Cycles::to_seconds(stop - start)/count;
185}
186
187struct DummyBlock {
188 int a = 1, b = 2, c = 3, d = 4;
189 void encode(bufferlist &bl) const {
190 ENCODE_START(1, 1, bl);
191 ::encode(a, bl);
192 ::encode(b, bl);
193 ::encode(c, bl);
194 ::encode(d, bl);
195 ENCODE_FINISH(bl);
196 }
197 void decode(bufferlist::iterator &bl) {
198 DECODE_START(1, bl);
199 ::decode(a, bl);
200 ::decode(b, bl);
201 ::decode(c, bl);
202 ::decode(d, bl);
203 DECODE_FINISH(bl);
204 }
205};
206WRITE_CLASS_ENCODER(DummyBlock)
207
208// Measure the cost of encoding and decoding a buffer, plus
209// allocating space for one chunk.
210double buffer_encode_decode()
211{
212 int count = 1000000;
213 uint64_t start = Cycles::rdtsc();
214 for (int i = 0; i < count; i++) {
215 bufferlist b;
216 DummyBlock dummy_block;
217 ::encode(dummy_block, b);
218 bufferlist::iterator iter = b.begin();
219 ::decode(dummy_block, iter);
220 }
221 uint64_t stop = Cycles::rdtsc();
222 return Cycles::to_seconds(stop - start)/count;
223}
224
225// Measure the cost of allocating and deallocating a buffer, plus
226// copying in a small block.
227double buffer_basic_copy()
228{
229 int count = 1000000;
230 uint64_t start = Cycles::rdtsc();
231 for (int i = 0; i < count; i++) {
232 bufferlist b;
233 b.append("abcdefg", 6);
234 }
235 uint64_t stop = Cycles::rdtsc();
236 return Cycles::to_seconds(stop - start)/count;
237}
238
239// Measure the cost of making a copy of parts of two ptrs.
240double buffer_copy()
241{
242 int count = 1000000;
243 bufferlist b;
244 b.append("abcde", 5);
245 b.append("01234", 5);
246 char copy[10];
247 uint64_t start = Cycles::rdtsc();
248 for (int i = 0; i < count; i++) {
249 b.copy(2, 6, copy);
250 }
251 uint64_t stop = Cycles::rdtsc();
252 return Cycles::to_seconds(stop - start)/count;
253}
254
255// Measure the cost of allocating new space by extending the
256// bufferlist
257double buffer_encode()
258{
259 int count = 100000;
260 uint64_t total = 0;
261 for (int i = 0; i < count; i++) {
262 bufferlist b;
263 DummyBlock dummy_block;
264 ::encode(dummy_block, b);
265 uint64_t start = Cycles::rdtsc();
266 ::encode(dummy_block, b);
267 ::encode(dummy_block, b);
268 ::encode(dummy_block, b);
269 ::encode(dummy_block, b);
270 ::encode(dummy_block, b);
271 ::encode(dummy_block, b);
272 ::encode(dummy_block, b);
273 ::encode(dummy_block, b);
274 ::encode(dummy_block, b);
275 ::encode(dummy_block, b);
276 total += Cycles::rdtsc() - start;
277 }
278 return Cycles::to_seconds(total)/(count*10);
279}
280
281// Measure the cost of retrieving an object from the beginning of a buffer.
282double buffer_get_contiguous()
283{
284 int count = 1000000;
285 int value = 11;
286 bufferlist b;
287 b.append((char*)&value, sizeof(value));
288 int sum = 0;
289 uint64_t start = Cycles::rdtsc();
290 for (int i = 0; i < count; i++) {
291 sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value)));
292 }
293 uint64_t stop = Cycles::rdtsc();
294 return Cycles::to_seconds(stop - start)/count;
295}
296
297// Measure the cost of creating an iterator and iterating over 10
298// chunks in a buffer.
299double buffer_iterator()
300{
301 bufferlist b;
302 const char s[] = "abcdefghijklmnopqrstuvwxyz";
303 bufferptr ptr(s, sizeof(s));
304 for (int i = 0; i < 5; i++) {
305 b.append(ptr, i, 5);
306 }
307 int count = 100000;
308 int sum = 0;
309 uint64_t start = Cycles::rdtsc();
310 for (int i = 0; i < count; i++) {
311 bufferlist::iterator it = b.begin();
312 while (!it.end()) {
313 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
314 ++it;
315 }
316 }
317 uint64_t stop = Cycles::rdtsc();
318 discard(&sum);
319 return Cycles::to_seconds(stop - start)/count;
320}
321
322// Implements the CondPingPong test.
323class CondPingPong {
324 Mutex mutex;
325 Cond cond;
326 int prod;
327 int cons;
328 const int count;
329
330 class Consumer : public Thread {
331 CondPingPong *p;
332 public:
333 explicit Consumer(CondPingPong *p): p(p) {}
334 void* entry() override {
335 p->consume();
336 return 0;
337 }
338 } consumer;
339
340 public:
341 CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}
342
343 double run() {
344 consumer.create("consumer");
345 uint64_t start = Cycles::rdtsc();
346 produce();
347 uint64_t stop = Cycles::rdtsc();
348 consumer.join();
349 return Cycles::to_seconds(stop - start)/count;
350 }
351
352 void produce() {
353 Mutex::Locker l(mutex);
354 while (cons < count) {
355 while (cons < prod)
356 cond.Wait(mutex);
357 ++prod;
358 cond.Signal();
359 }
360 }
361
362 void consume() {
363 Mutex::Locker l(mutex);
364 while (cons < count) {
365 while (cons == prod)
366 cond.Wait(mutex);
367 ++cons;
368 cond.Signal();
369 }
370 }
371};
372
373// Measure the cost of coordinating between threads using a condition variable.
374double cond_ping_pong()
375{
376 return CondPingPong().run();
377}
378
379// Measure the cost of a 32-bit divide. Divides don't take a constant
380// number of cycles. Values were chosen here semi-randomly to depict a
381// fairly expensive scenario. Someone with fancy ALU knowledge could
382// probably pick worse values.
383double div32()
384{
385#if defined(__i386__) || defined(__x86_64__)
386 int count = 1000000;
387 uint64_t start = Cycles::rdtsc();
388 // NB: Expect an x86 processor exception is there's overflow.
389 uint32_t numeratorHi = 0xa5a5a5a5U;
390 uint32_t numeratorLo = 0x55aa55aaU;
391 uint32_t divisor = 0xaa55aa55U;
392 uint32_t quotient;
393 uint32_t remainder;
394 for (int i = 0; i < count; i++) {
395 __asm__ __volatile__("div %4" :
396 "=a"(quotient), "=d"(remainder) :
397 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
398 "cc");
399 }
400 uint64_t stop = Cycles::rdtsc();
401 return Cycles::to_seconds(stop - start)/count;
402#else
403 return -1;
404#endif
405}
406
407// Measure the cost of a 64-bit divide. Divides don't take a constant
408// number of cycles. Values were chosen here semi-randomly to depict a
409// fairly expensive scenario. Someone with fancy ALU knowledge could
410// probably pick worse values.
411double div64()
412{
413#if defined(__x86_64__) || defined(__amd64__)
414 int count = 1000000;
415 // NB: Expect an x86 processor exception is there's overflow.
416 uint64_t start = Cycles::rdtsc();
417 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
418 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
419 uint64_t divisor = 0xaa55aa55aa55aa55UL;
420 uint64_t quotient;
421 uint64_t remainder;
422 for (int i = 0; i < count; i++) {
423 __asm__ __volatile__("divq %4" :
424 "=a"(quotient), "=d"(remainder) :
425 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
426 "cc");
427 }
428 uint64_t stop = Cycles::rdtsc();
429 return Cycles::to_seconds(stop - start)/count;
430#else
431 return -1;
432#endif
433}
434
435// Measure the cost of calling a non-inlined function.
436double function_call()
437{
438 int count = 1000000;
439 uint64_t x = 0;
440 uint64_t start = Cycles::rdtsc();
441 for (int i = 0; i < count; i++) {
442 x = PerfHelper::plus_one(x);
443 }
444 uint64_t stop = Cycles::rdtsc();
445 return Cycles::to_seconds(stop - start)/count;
446}
447
448// Measure the minimum cost of EventCenter::process_events, when there are no
449// Pollers and no Timers.
450double eventcenter_poll()
451{
452 int count = 1000000;
453 EventCenter center(g_ceph_context);
454 center.init(1000, 0, "posix");
455 center.set_owner();
456 uint64_t start = Cycles::rdtsc();
457 for (int i = 0; i < count; i++) {
458 center.process_events(0);
459 }
460 uint64_t stop = Cycles::rdtsc();
461 return Cycles::to_seconds(stop - start)/count;
462}
463
464class CenterWorker : public Thread {
465 CephContext *cct;
466 bool done;
467
468 public:
469 EventCenter center;
470 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
471 center.init(100, 0, "posix");
472 }
473 void stop() {
474 done = true;
475 center.wakeup();
476 }
477 void* entry() override {
478 center.set_owner();
479 bind_thread_to_cpu(2);
480 while (!done)
481 center.process_events(1000);
482 return 0;
483 }
484};
485
486class CountEvent: public EventCallback {
31f18b77 487 std::atomic<int64_t> *count;
7c673cae
FG
488
489 public:
31f18b77 490 explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
7c673cae 491 void do_request(int id) override {
31f18b77 492 (*count)--;
7c673cae
FG
493 }
494};
495
496double eventcenter_dispatch()
497{
498 int count = 100000;
499
500 CenterWorker worker(g_ceph_context);
31f18b77 501 std::atomic<int64_t> flag = { 1 };
7c673cae
FG
502 worker.create("evt_center_disp");
503 EventCallbackRef count_event(new CountEvent(&flag));
504
505 worker.center.dispatch_event_external(count_event);
506 // Start a new thread and wait for it to ready.
31f18b77 507 while (flag)
7c673cae
FG
508 usleep(100);
509
510 uint64_t start = Cycles::rdtsc();
511 for (int i = 0; i < count; i++) {
31f18b77 512 flag = 1;
7c673cae 513 worker.center.dispatch_event_external(count_event);
31f18b77 514 while (flag)
7c673cae
FG
515 ;
516 }
517 uint64_t stop = Cycles::rdtsc();
518 worker.stop();
519 worker.join();
520 return Cycles::to_seconds(stop - start)/count;
521}
522
523// Measure the cost of copying a given number of bytes with memcpy.
524double memcpy_shared(size_t size)
525{
526 int count = 1000000;
527 char src[size], dst[size];
528
529 memset(src, 0, sizeof(src));
530
531 uint64_t start = Cycles::rdtsc();
532 for (int i = 0; i < count; i++) {
533 memcpy(dst, src, size);
534 }
535 uint64_t stop = Cycles::rdtsc();
536 return Cycles::to_seconds(stop - start)/count;
537}
538
539double memcpy100()
540{
541 return memcpy_shared(100);
542}
543
544double memcpy1000()
545{
546 return memcpy_shared(1000);
547}
548
549double memcpy10000()
550{
551 return memcpy_shared(10000);
552}
553
554// Benchmark rjenkins hashing performance on cached data.
555template <int key_length>
556double ceph_str_hash_rjenkins()
557{
558 int count = 100000;
559 char buf[key_length];
560
561 uint64_t start = Cycles::rdtsc();
562 for (int i = 0; i < count; i++)
563 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
564 uint64_t stop = Cycles::rdtsc();
565
566 return Cycles::to_seconds(stop - start)/count;
567}
568
569// Measure the cost of reading the fine-grain cycle counter.
570double rdtsc_test()
571{
572 int count = 1000000;
573 uint64_t start = Cycles::rdtsc();
574 uint64_t total = 0;
575 for (int i = 0; i < count; i++) {
576 total += Cycles::rdtsc();
577 }
578 uint64_t stop = Cycles::rdtsc();
579 return Cycles::to_seconds(stop - start)/count;
580}
581
582// Measure the cost of the Cycles::to_seconds method.
583double perf_cycles_to_seconds()
584{
585 int count = 1000000;
586 double total = 0;
587 uint64_t cycles = 994261;
588 uint64_t start = Cycles::rdtsc();
589 for (int i = 0; i < count; i++) {
590 total += Cycles::to_seconds(cycles);
591 }
592 uint64_t stop = Cycles::rdtsc();
593 // printf("Result: %.4f\n", total/count);
594 return Cycles::to_seconds(stop - start)/count;
595}
596
597// Measure the cost of the Cylcles::toNanoseconds method.
598double perf_cycles_to_nanoseconds()
599{
600 int count = 1000000;
601 uint64_t total = 0;
602 uint64_t cycles = 994261;
603 uint64_t start = Cycles::rdtsc();
604 for (int i = 0; i < count; i++) {
605 total += Cycles::to_nanoseconds(cycles);
606 }
607 uint64_t stop = Cycles::rdtsc();
608 // printf("Result: %lu\n", total/count);
609 return Cycles::to_seconds(stop - start)/count;
610}
611
612
#ifdef HAVE_SSE
/**
 * Issue a prefetch (hint T0) for every 64-byte line that overlaps
 * [object, object + num_bytes).
 *
 * \param object
 *      Start of the memory region to prefetch.
 * \param num_bytes
 *      Length of the region in bytes.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of `object` from the preceding 64-byte boundary.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* first_line = static_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t covered = 0; covered < span; covered += 64) {
    _mm_prefetch(first_line + covered, _MM_HINT_T0);
  }
}
#endif
632
// Measure the cost of the prefetch helper: after each cache flush, sixteen
// distinct 64-byte slices of a 1 KiB stack buffer are prefetched in a
// scattered order. Returns the average time per prefetch call in seconds,
// or -1 when built without HAVE_SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t elapsed = 0;
  const int rounds = 10;
  char buf[16 * 64];

  for (int i = 0; i < rounds; ++i) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0], 64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64], 64);
    prefetch(&buf[192], 64);
    const uint64_t end = Cycles::rdtsc();
    elapsed += end - begin;
  }
  return Cycles::to_seconds(elapsed) / rounds / 16;
#else
  return -1;
#endif
}
669
#if defined(__x86_64__)
/**
 * Execute a cpuid instruction (leaf 1) and throw away its results.
 *
 * It is used as a serializing point in the instruction stream, so that
 * instructions after it in the current thread do not run ahead of
 * instructions before it, e.g. around rdpmc or before rdtsc so that
 * earlier work does not pollute a timed region.
 */
static inline void serialize() {
  uint32_t eax, ebx, ecx, edx;
  __asm volatile("cpuid"
                 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                 : "a" (1U));
}
#endif
687
688// Measure the cost of cpuid
689double perf_serialize() {
690#if defined(__x86_64__)
691 int count = 1000000;
692 uint64_t start = Cycles::rdtsc();
693 for (int i = 0; i < count; i++) {
694 serialize();
695 }
696 uint64_t stop = Cycles::rdtsc();
697 return Cycles::to_seconds(stop - start)/count;
698#else
699 return -1;
700#endif
701}
702
// Measure the cost of one lfence instruction.
// Returns -1 when built without HAVE_SSE2.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
718
// Measure the cost of one sfence instruction.
// Returns -1 when built without HAVE_SSE.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
734
735// Measure the cost of acquiring and releasing a SpinLock (assuming the
736// lock is initially free).
737double test_spinlock()
738{
739 int count = 1000000;
740 Spinlock lock;
741 uint64_t start = Cycles::rdtsc();
742 for (int i = 0; i < count; i++) {
743 lock.lock();
744 lock.unlock();
745 }
746 uint64_t stop = Cycles::rdtsc();
747 return Cycles::to_seconds(stop - start)/count;
748}
749
750// Helper for spawn_thread. This is the main function that the thread executes
751// (intentionally empty).
752class ThreadHelper : public Thread {
753 void *entry() override { return 0; }
754};
755
756// Measure the cost of start and joining with a thread.
757double spawn_thread()
758{
759 int count = 10000;
760 ThreadHelper thread;
761 uint64_t start = Cycles::rdtsc();
762 for (int i = 0; i < count; i++) {
763 thread.create("thread_helper");
764 thread.join();
765 }
766 uint64_t stop = Cycles::rdtsc();
767 return Cycles::to_seconds(stop - start)/count;
768}
769
770class FakeContext : public Context {
771 public:
772 void finish(int r) override {}
773};
774
775// Measure the cost of starting and stopping a Dispatch::Timer.
776double perf_timer()
777{
778 int count = 1000000;
779 Mutex lock("perf_timer::lock");
780 SafeTimer timer(g_ceph_context, lock);
781 FakeContext **c = new FakeContext*[count];
782 for (int i = 0; i < count; i++) {
783 c[i] = new FakeContext();
784 }
785 uint64_t start = Cycles::rdtsc();
786 Mutex::Locker l(lock);
787 for (int i = 0; i < count; i++) {
788 timer.add_event_after(12345, c[i]);
789 timer.cancel_event(c[i]);
790 }
791 uint64_t stop = Cycles::rdtsc();
792 delete[] c;
793 return Cycles::to_seconds(stop - start)/count;
794}
795
796// Measure the cost of throwing and catching an int. This uses an integer as
797// the value thrown, which is presumably as fast as possible.
798double throw_int()
799{
800 int count = 10000;
801 uint64_t start = Cycles::rdtsc();
802 for (int i = 0; i < count; i++) {
803 try {
804 throw 0;
805 } catch (int) { // NOLINT
806 // pass
807 }
808 }
809 uint64_t stop = Cycles::rdtsc();
810 return Cycles::to_seconds(stop - start)/count;
811}
812
813// Measure the cost of throwing and catching an int from a function call.
814double throw_int_call()
815{
816 int count = 10000;
817 uint64_t start = Cycles::rdtsc();
818 for (int i = 0; i < count; i++) {
819 try {
820 PerfHelper::throw_int();
821 } catch (int) { // NOLINT
822 // pass
823 }
824 }
825 uint64_t stop = Cycles::rdtsc();
826 return Cycles::to_seconds(stop - start)/count;
827}
828
829// Measure the cost of throwing and catching an Exception. This uses an actual
830// exception as the value thrown, which may be slower than throwInt.
831double throw_exception()
832{
833 int count = 10000;
834 uint64_t start = Cycles::rdtsc();
835 for (int i = 0; i < count; i++) {
836 try {
837 throw buffer::end_of_buffer();
838 } catch (const buffer::end_of_buffer&) {
839 // pass
840 }
841 }
842 uint64_t stop = Cycles::rdtsc();
843 return Cycles::to_seconds(stop - start)/count;
844}
845
846// Measure the cost of throwing and catching an Exception from a function call.
847double throw_exception_call()
848{
849 int count = 10000;
850 uint64_t start = Cycles::rdtsc();
851 for (int i = 0; i < count; i++) {
852 try {
853 PerfHelper::throw_end_of_buffer();
854 } catch (const buffer::end_of_buffer&) {
855 // pass
856 }
857 }
858 uint64_t stop = Cycles::rdtsc();
859 return Cycles::to_seconds(stop - start)/count;
860}
861
862// Measure the cost of pushing a new element on a std::vector, copying
863// from the end to an internal element, and popping the end element.
864double vector_push_pop()
865{
866 int count = 100000;
867 std::vector<int> vector;
868 vector.push_back(1);
869 vector.push_back(2);
870 vector.push_back(3);
871 uint64_t start = Cycles::rdtsc();
872 for (int i = 0; i < count; i++) {
873 vector.push_back(i);
874 vector.push_back(i+1);
875 vector.push_back(i+2);
876 vector[2] = vector.back();
877 vector.pop_back();
878 vector[0] = vector.back();
879 vector.pop_back();
880 vector[1] = vector.back();
881 vector.pop_back();
882 }
883 uint64_t stop = Cycles::rdtsc();
884 return Cycles::to_seconds(stop - start)/(count*3);
885}
886
887// Measure the cost of ceph_clock_now
888double perf_ceph_clock_now()
889{
890 int count = 100000;
891 uint64_t start = Cycles::rdtsc();
892 for (int i = 0; i < count; i++) {
893 ceph_clock_now();
894 }
895 uint64_t stop = Cycles::rdtsc();
896 return Cycles::to_seconds(stop - start)/count;
897}
898
// One entry of the performance-test table: a name, the function that runs
// the test, and a short human-readable description.
struct TestInfo {
  // Signature shared by every test function: no arguments, returns the
  // time in seconds for one iteration of the test.
  using Func = double (*)();

  // Name of the test; this is what gets typed on the command line to
  // select it.
  const char* name;

  // Function that implements the test.
  Func func;

  // Short description (about 40 characters at most, so that a whole
  // result fits on a single output line).
  const char *description;
};
912TestInfo tests[] = {
913 {"atomic_int_cmp", atomic_int_cmp,
914 "atomic_t::compare_and_swap"},
915 {"atomic_int_inc", atomic_int_inc,
916 "atomic_t::inc"},
917 {"atomic_int_read", atomic_int_read,
918 "atomic_t::read"},
919 {"atomic_int_set", atomic_int_set,
920 "atomic_t::set"},
921 {"mutex_nonblock", mutex_nonblock,
922 "Mutex lock/unlock (no blocking)"},
923 {"buffer_basic", buffer_basic,
924 "buffer create, add one ptr, delete"},
925 {"buffer_encode_decode", buffer_encode_decode,
926 "buffer create, encode/decode object, delete"},
927 {"buffer_basic_copy", buffer_basic_copy,
928 "buffer create, copy small block, delete"},
929 {"buffer_copy", buffer_copy,
930 "copy out 2 small ptrs from buffer"},
931 {"buffer_encode10", buffer_encode,
932 "buffer encoding 10 structures onto existing ptr"},
933 {"buffer_get_contiguous", buffer_get_contiguous,
934 "Buffer::get_contiguous"},
935 {"buffer_iterator", buffer_iterator,
936 "iterate over buffer with 5 ptrs"},
937 {"cond_ping_pong", cond_ping_pong,
938 "condition variable round-trip"},
939 {"div32", div32,
940 "32-bit integer division instruction"},
941 {"div64", div64,
942 "64-bit integer division instruction"},
943 {"function_call", function_call,
944 "Call a function that has not been inlined"},
945 {"eventcenter_poll", eventcenter_poll,
946 "EventCenter::process_events (no timers or events)"},
947 {"eventcenter_dispatch", eventcenter_dispatch,
948 "EventCenter::dispatch_event_external latency"},
949 {"memcpy100", memcpy100,
950 "Copy 100 bytes with memcpy"},
951 {"memcpy1000", memcpy1000,
952 "Copy 1000 bytes with memcpy"},
953 {"memcpy10000", memcpy10000,
954 "Copy 10000 bytes with memcpy"},
955 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
956 "rjenkins hash on 16 byte of data"},
957 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
958 "rjenkins hash on 256 bytes of data"},
959 {"rdtsc", rdtsc_test,
960 "Read the fine-grain cycle counter"},
961 {"cycles_to_seconds", perf_cycles_to_seconds,
962 "Convert a rdtsc result to (double) seconds"},
963 {"cycles_to_seconds", perf_cycles_to_nanoseconds,
964 "Convert a rdtsc result to (uint64_t) nanoseconds"},
965 {"prefetch", perf_prefetch,
966 "Prefetch instruction"},
967 {"serialize", perf_serialize,
968 "serialize instruction"},
969 {"lfence", lfence,
970 "Lfence instruction"},
971 {"sfence", sfence,
972 "Sfence instruction"},
973 {"spin_lock", test_spinlock,
974 "Acquire/release SpinLock"},
975 {"spawn_thread", spawn_thread,
976 "Start and stop a thread"},
977 {"perf_timer", perf_timer,
978 "Insert and cancel a SafeTimer"},
979 {"throw_int", throw_int,
980 "Throw an int"},
981 {"throw_int_call", throw_int_call,
982 "Throw an int in a function call"},
983 {"throw_exception", throw_exception,
984 "Throw an Exception"},
985 {"throw_exception_call", throw_exception_call,
986 "Throw an Exception in a function call"},
987 {"vector_push_pop", vector_push_pop,
988 "Push and pop a std::vector"},
989 {"ceph_clock_now", perf_ceph_clock_now,
990 "ceph_clock_now function"},
991};
992
993/**
994 * Runs a particular test and prints a one-line result message.
995 *
996 * \param info
997 * Describes the test to run.
998 */
999void run_test(TestInfo& info)
1000{
1001 double secs = info.func();
1002 int width = printf("%-24s ", info.name);
1003 if (secs == -1) {
1004 width += printf(" architecture nonsupport ");
1005 } else if (secs < 1.0e-06) {
1006 width += printf("%8.2fns", 1e09*secs);
1007 } else if (secs < 1.0e-03) {
1008 width += printf("%8.2fus", 1e06*secs);
1009 } else if (secs < 1.0) {
1010 width += printf("%8.2fms", 1e03*secs);
1011 } else {
1012 width += printf("%8.2fs", secs);
1013 }
1014 printf("%*s %s\n", 32-width, "", info.description);
1015}
1016
1017int main(int argc, char *argv[])
1018{
1019 vector<const char*> args;
1020 argv_to_vec(argc, (const char **)argv, args);
1021
1022 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1023 CODE_ENVIRONMENT_UTILITY, 0);
1024 common_init_finish(g_ceph_context);
1025 Cycles::init();
1026
1027 bind_thread_to_cpu(3);
1028 if (argc == 1) {
1029 // No test names specified; run all tests.
1030 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1031 run_test(tests[i]);
1032 }
1033 } else {
1034 // Run only the tests that were specified on the command line.
1035 for (int i = 1; i < argc; i++) {
1036 bool found_test = false;
1037 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1038 if (strcmp(argv[i], tests[j].name) == 0) {
1039 found_test = true;
1040 run_test(tests[j]);
1041 break;
1042 }
1043 }
1044 if (!found_test) {
1045 int width = printf("%-24s ??", argv[i]);
1046 printf("%*s No such test\n", 32-width, "");
1047 }
1048 }
1049 }
1050}