]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/perf_local.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / test / perf_local.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20 // This program contains a collection of low-level performance measurements
21 // for Ceph, which can be run either individually or altogether. These
22 // tests measure performance in a single stand-alone process, not in a cluster
23 // with multiple servers. Invoke the program like this:
24 //
25 // Perf test1 test2 ...
26 //
27 // test1 and test2 are the names of individual performance measurements to
28 // run. If no test names are provided then all of the performance tests
29 // are run.
30 //
31 // To add a new test:
32 // * Write a function that implements the test. Use existing test functions
33 // as a guideline, and be sure to generate output in the same form as
34 // other tests.
35 // * Create a new entry for the test in the #tests table.
36 #include <vector>
37 #include <sched.h>
38
39 #include "acconfig.h"
40 #ifdef HAVE_SSE
41 #include <xmmintrin.h>
42 #endif
43
44 #include "include/atomic.h"
45 #include "include/buffer.h"
46 #include "include/encoding.h"
47 #include "include/ceph_hash.h"
48 #include "include/Spinlock.h"
49 #include "common/ceph_argparse.h"
50 #include "common/Cycles.h"
51 #include "common/Cond.h"
52 #include "common/Mutex.h"
53 #include "common/Thread.h"
54 #include "common/Timer.h"
55 #include "msg/async/Event.h"
56 #include "global/global_init.h"
57
58 #include "test/perf_helper.h"
59
60 using namespace ceph;
61
/**
 * Request that the operating system pin the calling thread to one CPU.
 *
 * The body is compiled only when HAVE_SCHED is defined; otherwise the
 * function does nothing. The result of sched_setaffinity() is not checked,
 * so a rejected request goes unnoticed.
 *
 * \param cpu
 *      Index of the CPU to pin to; it is handed to CPU_SET() unchanged.
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t affinity;
  CPU_ZERO(&affinity);
  CPU_SET(cpu, &affinity);
  // A pid of 0 addresses the calling thread.
  sched_setaffinity(0, sizeof(affinity), &affinity);
#endif
}
78
/*
 * Consume a value so that the compiler has to assume it is used and
 * therefore cannot optimize away the code that produced it.
 *
 * The first int behind the pointer is read and compared against an
 * arbitrary constant; only on a match is anything printed.
 *
 * \param value
 *      Pointer to arbitrary data (at least sizeof(int) bytes are read).
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
93
94 //----------------------------------------------------------------------
95 // Test functions start here
96 //----------------------------------------------------------------------
97
98 // Measure the cost of atomic_t::compare_and_swap
99 double atomic_int_cmp()
100 {
101 int count = 1000000;
102 atomic_t value(11);
103 int test = 11;
104 uint64_t start = Cycles::rdtsc();
105 for (int i = 0; i < count; i++) {
106 value.compare_and_swap(test, test+2);
107 test += 2;
108 }
109 uint64_t stop = Cycles::rdtsc();
110 // printf("Final value: %d\n", value.load());
111 return Cycles::to_seconds(stop - start)/count;
112 }
113
114 // Measure the cost of atomic_t::inc
115 double atomic_int_inc()
116 {
117 int count = 1000000;
118 atomic_t value(11);
119 uint64_t start = Cycles::rdtsc();
120 for (int i = 0; i < count; i++) {
121 value.inc();
122 }
123 uint64_t stop = Cycles::rdtsc();
124 // printf("Final value: %d\n", value.load());
125 return Cycles::to_seconds(stop - start)/count;
126 }
127
128 // Measure the cost of reading an atomic_t
129 double atomic_int_read()
130 {
131 int count = 1000000;
132 atomic_t value(11);
133 int total = 0;
134 uint64_t start = Cycles::rdtsc();
135 for (int i = 0; i < count; i++) {
136 total += value.read();
137 }
138 uint64_t stop = Cycles::rdtsc();
139 // printf("Total: %d\n", total);
140 return Cycles::to_seconds(stop - start)/count;
141 }
142
143 // Measure the cost of storing a new value in a atomic_t
144 double atomic_int_set()
145 {
146 int count = 1000000;
147 atomic_t value(11);
148 uint64_t start = Cycles::rdtsc();
149 for (int i = 0; i < count; i++) {
150 value.set(88);
151 }
152 uint64_t stop = Cycles::rdtsc();
153 return Cycles::to_seconds(stop - start)/count;
154 }
155
156 // Measure the cost of acquiring and releasing a mutex in the
157 // fast case where the mutex is free.
158 double mutex_nonblock()
159 {
160 int count = 1000000;
161 Mutex m("mutex_nonblock::m");
162 uint64_t start = Cycles::rdtsc();
163 for (int i = 0; i < count; i++) {
164 m.Lock();
165 m.Unlock();
166 }
167 uint64_t stop = Cycles::rdtsc();
168 return Cycles::to_seconds(stop - start)/count;
169 }
170
171 // Measure the cost of allocating and deallocating a buffer, plus
172 // appending (logically) one ptr.
173 double buffer_basic()
174 {
175 int count = 1000000;
176 uint64_t start = Cycles::rdtsc();
177 bufferptr ptr("abcdefg", 7);
178 for (int i = 0; i < count; i++) {
179 bufferlist b;
180 b.append(ptr, 0, 5);
181 }
182 uint64_t stop = Cycles::rdtsc();
183 return Cycles::to_seconds(stop - start)/count;
184 }
185
186 struct DummyBlock {
187 int a = 1, b = 2, c = 3, d = 4;
188 void encode(bufferlist &bl) const {
189 ENCODE_START(1, 1, bl);
190 ::encode(a, bl);
191 ::encode(b, bl);
192 ::encode(c, bl);
193 ::encode(d, bl);
194 ENCODE_FINISH(bl);
195 }
196 void decode(bufferlist::iterator &bl) {
197 DECODE_START(1, bl);
198 ::decode(a, bl);
199 ::decode(b, bl);
200 ::decode(c, bl);
201 ::decode(d, bl);
202 DECODE_FINISH(bl);
203 }
204 };
205 WRITE_CLASS_ENCODER(DummyBlock)
206
207 // Measure the cost of encoding and decoding a buffer, plus
208 // allocating space for one chunk.
209 double buffer_encode_decode()
210 {
211 int count = 1000000;
212 uint64_t start = Cycles::rdtsc();
213 for (int i = 0; i < count; i++) {
214 bufferlist b;
215 DummyBlock dummy_block;
216 ::encode(dummy_block, b);
217 bufferlist::iterator iter = b.begin();
218 ::decode(dummy_block, iter);
219 }
220 uint64_t stop = Cycles::rdtsc();
221 return Cycles::to_seconds(stop - start)/count;
222 }
223
224 // Measure the cost of allocating and deallocating a buffer, plus
225 // copying in a small block.
226 double buffer_basic_copy()
227 {
228 int count = 1000000;
229 uint64_t start = Cycles::rdtsc();
230 for (int i = 0; i < count; i++) {
231 bufferlist b;
232 b.append("abcdefg", 6);
233 }
234 uint64_t stop = Cycles::rdtsc();
235 return Cycles::to_seconds(stop - start)/count;
236 }
237
238 // Measure the cost of making a copy of parts of two ptrs.
239 double buffer_copy()
240 {
241 int count = 1000000;
242 bufferlist b;
243 b.append("abcde", 5);
244 b.append("01234", 5);
245 char copy[10];
246 uint64_t start = Cycles::rdtsc();
247 for (int i = 0; i < count; i++) {
248 b.copy(2, 6, copy);
249 }
250 uint64_t stop = Cycles::rdtsc();
251 return Cycles::to_seconds(stop - start)/count;
252 }
253
254 // Measure the cost of allocating new space by extending the
255 // bufferlist
256 double buffer_encode()
257 {
258 int count = 100000;
259 uint64_t total = 0;
260 for (int i = 0; i < count; i++) {
261 bufferlist b;
262 DummyBlock dummy_block;
263 ::encode(dummy_block, b);
264 uint64_t start = Cycles::rdtsc();
265 ::encode(dummy_block, b);
266 ::encode(dummy_block, b);
267 ::encode(dummy_block, b);
268 ::encode(dummy_block, b);
269 ::encode(dummy_block, b);
270 ::encode(dummy_block, b);
271 ::encode(dummy_block, b);
272 ::encode(dummy_block, b);
273 ::encode(dummy_block, b);
274 ::encode(dummy_block, b);
275 total += Cycles::rdtsc() - start;
276 }
277 return Cycles::to_seconds(total)/(count*10);
278 }
279
280 // Measure the cost of retrieving an object from the beginning of a buffer.
281 double buffer_get_contiguous()
282 {
283 int count = 1000000;
284 int value = 11;
285 bufferlist b;
286 b.append((char*)&value, sizeof(value));
287 int sum = 0;
288 uint64_t start = Cycles::rdtsc();
289 for (int i = 0; i < count; i++) {
290 sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value)));
291 }
292 uint64_t stop = Cycles::rdtsc();
293 return Cycles::to_seconds(stop - start)/count;
294 }
295
296 // Measure the cost of creating an iterator and iterating over 10
297 // chunks in a buffer.
298 double buffer_iterator()
299 {
300 bufferlist b;
301 const char s[] = "abcdefghijklmnopqrstuvwxyz";
302 bufferptr ptr(s, sizeof(s));
303 for (int i = 0; i < 5; i++) {
304 b.append(ptr, i, 5);
305 }
306 int count = 100000;
307 int sum = 0;
308 uint64_t start = Cycles::rdtsc();
309 for (int i = 0; i < count; i++) {
310 bufferlist::iterator it = b.begin();
311 while (!it.end()) {
312 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
313 ++it;
314 }
315 }
316 uint64_t stop = Cycles::rdtsc();
317 discard(&sum);
318 return Cycles::to_seconds(stop - start)/count;
319 }
320
321 // Implements the CondPingPong test.
322 class CondPingPong {
323 Mutex mutex;
324 Cond cond;
325 int prod;
326 int cons;
327 const int count;
328
329 class Consumer : public Thread {
330 CondPingPong *p;
331 public:
332 explicit Consumer(CondPingPong *p): p(p) {}
333 void* entry() override {
334 p->consume();
335 return 0;
336 }
337 } consumer;
338
339 public:
340 CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}
341
342 double run() {
343 consumer.create("consumer");
344 uint64_t start = Cycles::rdtsc();
345 produce();
346 uint64_t stop = Cycles::rdtsc();
347 consumer.join();
348 return Cycles::to_seconds(stop - start)/count;
349 }
350
351 void produce() {
352 Mutex::Locker l(mutex);
353 while (cons < count) {
354 while (cons < prod)
355 cond.Wait(mutex);
356 ++prod;
357 cond.Signal();
358 }
359 }
360
361 void consume() {
362 Mutex::Locker l(mutex);
363 while (cons < count) {
364 while (cons == prod)
365 cond.Wait(mutex);
366 ++cons;
367 cond.Signal();
368 }
369 }
370 };
371
372 // Measure the cost of coordinating between threads using a condition variable.
373 double cond_ping_pong()
374 {
375 return CondPingPong().run();
376 }
377
378 // Measure the cost of a 32-bit divide. Divides don't take a constant
379 // number of cycles. Values were chosen here semi-randomly to depict a
380 // fairly expensive scenario. Someone with fancy ALU knowledge could
381 // probably pick worse values.
382 double div32()
383 {
384 #if defined(__i386__) || defined(__x86_64__)
385 int count = 1000000;
386 uint64_t start = Cycles::rdtsc();
387 // NB: Expect an x86 processor exception is there's overflow.
388 uint32_t numeratorHi = 0xa5a5a5a5U;
389 uint32_t numeratorLo = 0x55aa55aaU;
390 uint32_t divisor = 0xaa55aa55U;
391 uint32_t quotient;
392 uint32_t remainder;
393 for (int i = 0; i < count; i++) {
394 __asm__ __volatile__("div %4" :
395 "=a"(quotient), "=d"(remainder) :
396 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
397 "cc");
398 }
399 uint64_t stop = Cycles::rdtsc();
400 return Cycles::to_seconds(stop - start)/count;
401 #else
402 return -1;
403 #endif
404 }
405
406 // Measure the cost of a 64-bit divide. Divides don't take a constant
407 // number of cycles. Values were chosen here semi-randomly to depict a
408 // fairly expensive scenario. Someone with fancy ALU knowledge could
409 // probably pick worse values.
410 double div64()
411 {
412 #if defined(__x86_64__) || defined(__amd64__)
413 int count = 1000000;
414 // NB: Expect an x86 processor exception is there's overflow.
415 uint64_t start = Cycles::rdtsc();
416 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
417 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
418 uint64_t divisor = 0xaa55aa55aa55aa55UL;
419 uint64_t quotient;
420 uint64_t remainder;
421 for (int i = 0; i < count; i++) {
422 __asm__ __volatile__("divq %4" :
423 "=a"(quotient), "=d"(remainder) :
424 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
425 "cc");
426 }
427 uint64_t stop = Cycles::rdtsc();
428 return Cycles::to_seconds(stop - start)/count;
429 #else
430 return -1;
431 #endif
432 }
433
434 // Measure the cost of calling a non-inlined function.
435 double function_call()
436 {
437 int count = 1000000;
438 uint64_t x = 0;
439 uint64_t start = Cycles::rdtsc();
440 for (int i = 0; i < count; i++) {
441 x = PerfHelper::plus_one(x);
442 }
443 uint64_t stop = Cycles::rdtsc();
444 return Cycles::to_seconds(stop - start)/count;
445 }
446
447 // Measure the minimum cost of EventCenter::process_events, when there are no
448 // Pollers and no Timers.
449 double eventcenter_poll()
450 {
451 int count = 1000000;
452 EventCenter center(g_ceph_context);
453 center.init(1000, 0, "posix");
454 center.set_owner();
455 uint64_t start = Cycles::rdtsc();
456 for (int i = 0; i < count; i++) {
457 center.process_events(0);
458 }
459 uint64_t stop = Cycles::rdtsc();
460 return Cycles::to_seconds(stop - start)/count;
461 }
462
463 class CenterWorker : public Thread {
464 CephContext *cct;
465 bool done;
466
467 public:
468 EventCenter center;
469 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
470 center.init(100, 0, "posix");
471 }
472 void stop() {
473 done = true;
474 center.wakeup();
475 }
476 void* entry() override {
477 center.set_owner();
478 bind_thread_to_cpu(2);
479 while (!done)
480 center.process_events(1000);
481 return 0;
482 }
483 };
484
485 class CountEvent: public EventCallback {
486 atomic_t *count;
487
488 public:
489 explicit CountEvent(atomic_t *atomic): count(atomic) {}
490 void do_request(int id) override {
491 count->dec();
492 }
493 };
494
495 double eventcenter_dispatch()
496 {
497 int count = 100000;
498
499 CenterWorker worker(g_ceph_context);
500 atomic_t flag(1);
501 worker.create("evt_center_disp");
502 EventCallbackRef count_event(new CountEvent(&flag));
503
504 worker.center.dispatch_event_external(count_event);
505 // Start a new thread and wait for it to ready.
506 while (flag.read())
507 usleep(100);
508
509 uint64_t start = Cycles::rdtsc();
510 for (int i = 0; i < count; i++) {
511 flag.set(1);
512 worker.center.dispatch_event_external(count_event);
513 while (flag.read())
514 ;
515 }
516 uint64_t stop = Cycles::rdtsc();
517 worker.stop();
518 worker.join();
519 return Cycles::to_seconds(stop - start)/count;
520 }
521
522 // Measure the cost of copying a given number of bytes with memcpy.
523 double memcpy_shared(size_t size)
524 {
525 int count = 1000000;
526 char src[size], dst[size];
527
528 memset(src, 0, sizeof(src));
529
530 uint64_t start = Cycles::rdtsc();
531 for (int i = 0; i < count; i++) {
532 memcpy(dst, src, size);
533 }
534 uint64_t stop = Cycles::rdtsc();
535 return Cycles::to_seconds(stop - start)/count;
536 }
537
538 double memcpy100()
539 {
540 return memcpy_shared(100);
541 }
542
543 double memcpy1000()
544 {
545 return memcpy_shared(1000);
546 }
547
548 double memcpy10000()
549 {
550 return memcpy_shared(10000);
551 }
552
553 // Benchmark rjenkins hashing performance on cached data.
554 template <int key_length>
555 double ceph_str_hash_rjenkins()
556 {
557 int count = 100000;
558 char buf[key_length];
559
560 uint64_t start = Cycles::rdtsc();
561 for (int i = 0; i < count; i++)
562 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
563 uint64_t stop = Cycles::rdtsc();
564
565 return Cycles::to_seconds(stop - start)/count;
566 }
567
568 // Measure the cost of reading the fine-grain cycle counter.
569 double rdtsc_test()
570 {
571 int count = 1000000;
572 uint64_t start = Cycles::rdtsc();
573 uint64_t total = 0;
574 for (int i = 0; i < count; i++) {
575 total += Cycles::rdtsc();
576 }
577 uint64_t stop = Cycles::rdtsc();
578 return Cycles::to_seconds(stop - start)/count;
579 }
580
581 // Measure the cost of the Cycles::to_seconds method.
582 double perf_cycles_to_seconds()
583 {
584 int count = 1000000;
585 double total = 0;
586 uint64_t cycles = 994261;
587 uint64_t start = Cycles::rdtsc();
588 for (int i = 0; i < count; i++) {
589 total += Cycles::to_seconds(cycles);
590 }
591 uint64_t stop = Cycles::rdtsc();
592 // printf("Result: %.4f\n", total/count);
593 return Cycles::to_seconds(stop - start)/count;
594 }
595
596 // Measure the cost of the Cylcles::toNanoseconds method.
597 double perf_cycles_to_nanoseconds()
598 {
599 int count = 1000000;
600 uint64_t total = 0;
601 uint64_t cycles = 994261;
602 uint64_t start = Cycles::rdtsc();
603 for (int i = 0; i < count; i++) {
604 total += Cycles::to_nanoseconds(cycles);
605 }
606 uint64_t stop = Cycles::rdtsc();
607 // printf("Result: %lu\n", total/count);
608 return Cycles::to_seconds(stop - start)/count;
609 }
610
611
#ifdef HAVE_SSE
/**
 * Issue prefetch hints for every 64-byte line that overlaps
 * [object, object + num_bytes).
 * The best docs for this are in the Intel instruction set reference under
 * PREFETCH.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of `object` from the preceding 64-byte boundary.
  const uint64_t misalignment = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalignment;
  const uint64_t span = misalignment + num_bytes;
  for (uint64_t covered = 0; covered < span; covered += 64) {
    _mm_prefetch(line + covered, _MM_HINT_T0);
  }
}
#endif
631
// Measure the cost of the prefetch instruction.
// Each of the 10 rounds flushes the cache via PerfHelper::flush_cache()
// and then times 16 prefetches, one per 64-byte slot of a 1 KiB stack
// buffer, issued in a scrambled order.
// Returns the average time per prefetch in seconds, or -1 when built
// without HAVE_SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t elapsed_cycles = 0;
  const int rounds = 10;
  char lines[16 * 64];
  uint64_t begin, end;

  for (int r = 0; r < rounds; ++r) {
    PerfHelper::flush_cache();
    begin = Cycles::rdtsc();
    // Kept unrolled so that no loop overhead sits between the prefetches.
    prefetch(&lines[576], 64);
    prefetch(&lines[0], 64);
    prefetch(&lines[512], 64);
    prefetch(&lines[960], 64);
    prefetch(&lines[640], 64);
    prefetch(&lines[896], 64);
    prefetch(&lines[256], 64);
    prefetch(&lines[704], 64);
    prefetch(&lines[320], 64);
    prefetch(&lines[384], 64);
    prefetch(&lines[128], 64);
    prefetch(&lines[448], 64);
    prefetch(&lines[768], 64);
    prefetch(&lines[832], 64);
    prefetch(&lines[64], 64);
    prefetch(&lines[192], 64);
    end = Cycles::rdtsc();
    elapsed_cycles += end - begin;
  }
  return Cycles::to_seconds(elapsed_cycles) / rounds / 16;
#else
  return -1;
#endif
}
668
#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before
 * any instructions that appear before it. It does so by executing cpuid
 * (leaf 1) and throwing the results away.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution
 * from instructions supposed to be executing before the timer starts.
 */
static inline void serialize() {
  uint32_t out_a, out_b, out_c, out_d;
  __asm volatile("cpuid"
                 : "=a" (out_a), "=b" (out_b), "=c" (out_c), "=d" (out_d)
                 : "a" (1U));
}
#endif
686
687 // Measure the cost of cpuid
688 double perf_serialize() {
689 #if defined(__x86_64__)
690 int count = 1000000;
691 uint64_t start = Cycles::rdtsc();
692 for (int i = 0; i < count; i++) {
693 serialize();
694 }
695 uint64_t stop = Cycles::rdtsc();
696 return Cycles::to_seconds(stop - start)/count;
697 #else
698 return -1;
699 #endif
700 }
701
// Measure the cost of an lfence instruction.
// Returns the average time per instruction in seconds, or -1 when built
// without HAVE_SSE2.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
717
// Measure the cost of an sfence instruction.
// Returns the average time per instruction in seconds, or -1 when built
// without HAVE_SSE.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
733
734 // Measure the cost of acquiring and releasing a SpinLock (assuming the
735 // lock is initially free).
736 double test_spinlock()
737 {
738 int count = 1000000;
739 Spinlock lock;
740 uint64_t start = Cycles::rdtsc();
741 for (int i = 0; i < count; i++) {
742 lock.lock();
743 lock.unlock();
744 }
745 uint64_t stop = Cycles::rdtsc();
746 return Cycles::to_seconds(stop - start)/count;
747 }
748
749 // Helper for spawn_thread. This is the main function that the thread executes
750 // (intentionally empty).
751 class ThreadHelper : public Thread {
752 void *entry() override { return 0; }
753 };
754
755 // Measure the cost of start and joining with a thread.
756 double spawn_thread()
757 {
758 int count = 10000;
759 ThreadHelper thread;
760 uint64_t start = Cycles::rdtsc();
761 for (int i = 0; i < count; i++) {
762 thread.create("thread_helper");
763 thread.join();
764 }
765 uint64_t stop = Cycles::rdtsc();
766 return Cycles::to_seconds(stop - start)/count;
767 }
768
769 class FakeContext : public Context {
770 public:
771 void finish(int r) override {}
772 };
773
774 // Measure the cost of starting and stopping a Dispatch::Timer.
775 double perf_timer()
776 {
777 int count = 1000000;
778 Mutex lock("perf_timer::lock");
779 SafeTimer timer(g_ceph_context, lock);
780 FakeContext **c = new FakeContext*[count];
781 for (int i = 0; i < count; i++) {
782 c[i] = new FakeContext();
783 }
784 uint64_t start = Cycles::rdtsc();
785 Mutex::Locker l(lock);
786 for (int i = 0; i < count; i++) {
787 timer.add_event_after(12345, c[i]);
788 timer.cancel_event(c[i]);
789 }
790 uint64_t stop = Cycles::rdtsc();
791 delete[] c;
792 return Cycles::to_seconds(stop - start)/count;
793 }
794
795 // Measure the cost of throwing and catching an int. This uses an integer as
796 // the value thrown, which is presumably as fast as possible.
797 double throw_int()
798 {
799 int count = 10000;
800 uint64_t start = Cycles::rdtsc();
801 for (int i = 0; i < count; i++) {
802 try {
803 throw 0;
804 } catch (int) { // NOLINT
805 // pass
806 }
807 }
808 uint64_t stop = Cycles::rdtsc();
809 return Cycles::to_seconds(stop - start)/count;
810 }
811
812 // Measure the cost of throwing and catching an int from a function call.
813 double throw_int_call()
814 {
815 int count = 10000;
816 uint64_t start = Cycles::rdtsc();
817 for (int i = 0; i < count; i++) {
818 try {
819 PerfHelper::throw_int();
820 } catch (int) { // NOLINT
821 // pass
822 }
823 }
824 uint64_t stop = Cycles::rdtsc();
825 return Cycles::to_seconds(stop - start)/count;
826 }
827
828 // Measure the cost of throwing and catching an Exception. This uses an actual
829 // exception as the value thrown, which may be slower than throwInt.
830 double throw_exception()
831 {
832 int count = 10000;
833 uint64_t start = Cycles::rdtsc();
834 for (int i = 0; i < count; i++) {
835 try {
836 throw buffer::end_of_buffer();
837 } catch (const buffer::end_of_buffer&) {
838 // pass
839 }
840 }
841 uint64_t stop = Cycles::rdtsc();
842 return Cycles::to_seconds(stop - start)/count;
843 }
844
845 // Measure the cost of throwing and catching an Exception from a function call.
846 double throw_exception_call()
847 {
848 int count = 10000;
849 uint64_t start = Cycles::rdtsc();
850 for (int i = 0; i < count; i++) {
851 try {
852 PerfHelper::throw_end_of_buffer();
853 } catch (const buffer::end_of_buffer&) {
854 // pass
855 }
856 }
857 uint64_t stop = Cycles::rdtsc();
858 return Cycles::to_seconds(stop - start)/count;
859 }
860
861 // Measure the cost of pushing a new element on a std::vector, copying
862 // from the end to an internal element, and popping the end element.
863 double vector_push_pop()
864 {
865 int count = 100000;
866 std::vector<int> vector;
867 vector.push_back(1);
868 vector.push_back(2);
869 vector.push_back(3);
870 uint64_t start = Cycles::rdtsc();
871 for (int i = 0; i < count; i++) {
872 vector.push_back(i);
873 vector.push_back(i+1);
874 vector.push_back(i+2);
875 vector[2] = vector.back();
876 vector.pop_back();
877 vector[0] = vector.back();
878 vector.pop_back();
879 vector[1] = vector.back();
880 vector.pop_back();
881 }
882 uint64_t stop = Cycles::rdtsc();
883 return Cycles::to_seconds(stop - start)/(count*3);
884 }
885
886 // Measure the cost of ceph_clock_now
887 double perf_ceph_clock_now()
888 {
889 int count = 100000;
890 uint64_t start = Cycles::rdtsc();
891 for (int i = 0; i < count; i++) {
892 ceph_clock_now();
893 }
894 uint64_t stop = Cycles::rdtsc();
895 return Cycles::to_seconds(stop - start)/count;
896 }
897
// The following struct and table define each performance test in terms of
// a string name and a function that implements the test.
struct TestInfo {
  // Signature shared by all test functions: no arguments, returns the
  // time (in seconds) for each iteration of the test.
  typedef double (*Func)();

  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test.
  Func func;

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char* description;
};
911 TestInfo tests[] = {
912 {"atomic_int_cmp", atomic_int_cmp,
913 "atomic_t::compare_and_swap"},
914 {"atomic_int_inc", atomic_int_inc,
915 "atomic_t::inc"},
916 {"atomic_int_read", atomic_int_read,
917 "atomic_t::read"},
918 {"atomic_int_set", atomic_int_set,
919 "atomic_t::set"},
920 {"mutex_nonblock", mutex_nonblock,
921 "Mutex lock/unlock (no blocking)"},
922 {"buffer_basic", buffer_basic,
923 "buffer create, add one ptr, delete"},
924 {"buffer_encode_decode", buffer_encode_decode,
925 "buffer create, encode/decode object, delete"},
926 {"buffer_basic_copy", buffer_basic_copy,
927 "buffer create, copy small block, delete"},
928 {"buffer_copy", buffer_copy,
929 "copy out 2 small ptrs from buffer"},
930 {"buffer_encode10", buffer_encode,
931 "buffer encoding 10 structures onto existing ptr"},
932 {"buffer_get_contiguous", buffer_get_contiguous,
933 "Buffer::get_contiguous"},
934 {"buffer_iterator", buffer_iterator,
935 "iterate over buffer with 5 ptrs"},
936 {"cond_ping_pong", cond_ping_pong,
937 "condition variable round-trip"},
938 {"div32", div32,
939 "32-bit integer division instruction"},
940 {"div64", div64,
941 "64-bit integer division instruction"},
942 {"function_call", function_call,
943 "Call a function that has not been inlined"},
944 {"eventcenter_poll", eventcenter_poll,
945 "EventCenter::process_events (no timers or events)"},
946 {"eventcenter_dispatch", eventcenter_dispatch,
947 "EventCenter::dispatch_event_external latency"},
948 {"memcpy100", memcpy100,
949 "Copy 100 bytes with memcpy"},
950 {"memcpy1000", memcpy1000,
951 "Copy 1000 bytes with memcpy"},
952 {"memcpy10000", memcpy10000,
953 "Copy 10000 bytes with memcpy"},
954 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
955 "rjenkins hash on 16 byte of data"},
956 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
957 "rjenkins hash on 256 bytes of data"},
958 {"rdtsc", rdtsc_test,
959 "Read the fine-grain cycle counter"},
960 {"cycles_to_seconds", perf_cycles_to_seconds,
961 "Convert a rdtsc result to (double) seconds"},
962 {"cycles_to_seconds", perf_cycles_to_nanoseconds,
963 "Convert a rdtsc result to (uint64_t) nanoseconds"},
964 {"prefetch", perf_prefetch,
965 "Prefetch instruction"},
966 {"serialize", perf_serialize,
967 "serialize instruction"},
968 {"lfence", lfence,
969 "Lfence instruction"},
970 {"sfence", sfence,
971 "Sfence instruction"},
972 {"spin_lock", test_spinlock,
973 "Acquire/release SpinLock"},
974 {"spawn_thread", spawn_thread,
975 "Start and stop a thread"},
976 {"perf_timer", perf_timer,
977 "Insert and cancel a SafeTimer"},
978 {"throw_int", throw_int,
979 "Throw an int"},
980 {"throw_int_call", throw_int_call,
981 "Throw an int in a function call"},
982 {"throw_exception", throw_exception,
983 "Throw an Exception"},
984 {"throw_exception_call", throw_exception_call,
985 "Throw an Exception in a function call"},
986 {"vector_push_pop", vector_push_pop,
987 "Push and pop a std::vector"},
988 {"ceph_clock_now", perf_ceph_clock_now,
989 "ceph_clock_now function"},
990 };
991
992 /**
993 * Runs a particular test and prints a one-line result message.
994 *
995 * \param info
996 * Describes the test to run.
997 */
998 void run_test(TestInfo& info)
999 {
1000 double secs = info.func();
1001 int width = printf("%-24s ", info.name);
1002 if (secs == -1) {
1003 width += printf(" architecture nonsupport ");
1004 } else if (secs < 1.0e-06) {
1005 width += printf("%8.2fns", 1e09*secs);
1006 } else if (secs < 1.0e-03) {
1007 width += printf("%8.2fus", 1e06*secs);
1008 } else if (secs < 1.0) {
1009 width += printf("%8.2fms", 1e03*secs);
1010 } else {
1011 width += printf("%8.2fs", secs);
1012 }
1013 printf("%*s %s\n", 32-width, "", info.description);
1014 }
1015
1016 int main(int argc, char *argv[])
1017 {
1018 vector<const char*> args;
1019 argv_to_vec(argc, (const char **)argv, args);
1020
1021 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1022 CODE_ENVIRONMENT_UTILITY, 0);
1023 common_init_finish(g_ceph_context);
1024 Cycles::init();
1025
1026 bind_thread_to_cpu(3);
1027 if (argc == 1) {
1028 // No test names specified; run all tests.
1029 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1030 run_test(tests[i]);
1031 }
1032 } else {
1033 // Run only the tests that were specified on the command line.
1034 for (int i = 1; i < argc; i++) {
1035 bool found_test = false;
1036 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1037 if (strcmp(argv[i], tests[j].name) == 0) {
1038 found_test = true;
1039 run_test(tests[j]);
1040 break;
1041 }
1042 }
1043 if (!found_test) {
1044 int width = printf("%-24s ??", argv[i]);
1045 printf("%*s No such test\n", 32-width, "");
1046 }
1047 }
1048 }
1049 }