1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20 // This program contains a collection of low-level performance measurements
21 // for Ceph, which can be run either individually or altogether. These
22 // tests measure performance in a single stand-alone process, not in a cluster
23 // with multiple servers. Invoke the program like this:
25 // Perf test1 test2 ...
27 // test1 and test2 are the names of individual performance measurements to
28 // run. If no test names are provided then all of the performance tests
32 // * Write a function that implements the test. Use existing test functions
33 // as a guideline, and be sure to generate output in the same form as
35 // * Create a new entry for the test in the #tests table.
41 #include <xmmintrin.h>
44 #include "include/buffer.h"
45 #include "include/encoding.h"
46 #include "include/ceph_hash.h"
47 #include "include/spinlock.h"
48 #include "common/ceph_argparse.h"
49 #include "common/Cycles.h"
50 #include "common/Cond.h"
51 #include "common/ceph_mutex.h"
52 #include "common/Thread.h"
53 #include "common/Timer.h"
54 #include "msg/async/Event.h"
55 #include "global/global_init.h"
57 #include "test/perf_helper.h"
65 * Ask the operating system to pin the current thread to a given CPU.
68 * Indicates the desired CPU and hyperthread; low order 2 bits
69 * specify CPU, next bit specifies hyperthread.
71 void bind_thread_to_cpu(int cpu
)
77 sched_setaffinity(0, sizeof(set
), &set
);
82 * This function just discards its argument. It's used to make it
83 * appear that data is used, so that the compiler won't optimize
84 * away the code we're trying to measure.
87 * Pointer to arbitrary value; it's discarded.
// Consume `value` so that the computation which produced it cannot be
// treated as dead code by the optimizer.
89 void discard(void* value
) {
// Load the leading int-sized bytes that `value` points at.
90 int x
= *reinterpret_cast<int*>(value
);
// Compare against an arbitrary constant: the conditional printf gives the
// load an observable use, so the compiler cannot prove it is unused.
91 if (x
== 0x43924776) {
92 printf("Value was 0x%x\n", x
);
96 //----------------------------------------------------------------------
97 // Test functions start here
98 //----------------------------------------------------------------------
100 // Measure the cost of atomic compare-and-swap
// Returns the mean time per loop iteration, in seconds.
101 double atomic_int_cmp()
// The atomic and the expected operand both start at 11.
104 std::atomic
<unsigned> value
= { 11 };
105 unsigned int test
= 11;
106 uint64_t start
= Cycles::rdtsc();
107 for (int i
= 0; i
< count
; i
++) {
// `test` is passed by reference as the expected value: when the comparison
// fails, compare_exchange_strong overwrites it with the atomic's current
// value. The desired value (test+2) is computed before the call.
108 value
.compare_exchange_strong(test
, test
+2);
111 uint64_t stop
= Cycles::rdtsc();
112 // printf("Final value: %d\n", value.load());
// Elapsed cycles converted to seconds, averaged over the iterations.
113 return Cycles::to_seconds(stop
- start
)/count
;
116 // Measure the cost of incrementing an atomic
117 double atomic_int_inc()
120 std::atomic
<int64_t> value
= { 11 };
121 uint64_t start
= Cycles::rdtsc();
122 for (int i
= 0; i
< count
; i
++) {
125 uint64_t stop
= Cycles::rdtsc();
126 // printf("Final value: %d\n", value.load());
127 return Cycles::to_seconds(stop
- start
)/count
;
130 // Measure the cost of reading an atomic
131 double atomic_int_read()
134 std::atomic
<int64_t> value
= { 11 };
135 [[maybe_unused
]] int total
= 0;
136 uint64_t start
= Cycles::rdtsc();
137 for (int i
= 0; i
< count
; i
++) {
140 uint64_t stop
= Cycles::rdtsc();
141 // printf("Total: %d\n", total);
142 return Cycles::to_seconds(stop
- start
)/count
;
145 // Measure the cost of storing a new value in an atomic
146 double atomic_int_set()
149 std::atomic
<int64_t> value
= { 11 };
150 uint64_t start
= Cycles::rdtsc();
151 for (int i
= 0; i
< count
; i
++) {
154 uint64_t stop
= Cycles::rdtsc();
155 return Cycles::to_seconds(stop
- start
)/count
;
158 // Measure the cost of acquiring and releasing a mutex in the
159 // fast case where the mutex is free.
160 double mutex_nonblock()
163 ceph::mutex m
= ceph::make_mutex("mutex_nonblock::m");
164 uint64_t start
= Cycles::rdtsc();
165 for (int i
= 0; i
< count
; i
++) {
169 uint64_t stop
= Cycles::rdtsc();
170 return Cycles::to_seconds(stop
- start
)/count
;
173 // Measure the cost of allocating and deallocating a buffer, plus
174 // appending (logically) one ptr.
175 double buffer_basic()
178 uint64_t start
= Cycles::rdtsc();
179 bufferptr
ptr("abcdefg", 7);
180 for (int i
= 0; i
< count
; i
++) {
184 uint64_t stop
= Cycles::rdtsc();
185 return Cycles::to_seconds(stop
- start
)/count
;
189 int a
= 1, b
= 2, c
= 3, d
= 4;
190 void encode(bufferlist
&bl
) const {
191 ENCODE_START(1, 1, bl
);
198 void decode(bufferlist::const_iterator
&bl
) {
207 WRITE_CLASS_ENCODER(DummyBlock
)
209 // Measure the cost of encoding and decoding a buffer, plus
210 // allocating space for one chunk.
211 double buffer_encode_decode()
214 uint64_t start
= Cycles::rdtsc();
215 for (int i
= 0; i
< count
; i
++) {
217 DummyBlock dummy_block
;
218 encode(dummy_block
, b
);
219 auto iter
= b
.cbegin();
220 decode(dummy_block
, iter
);
222 uint64_t stop
= Cycles::rdtsc();
223 return Cycles::to_seconds(stop
- start
)/count
;
226 // Measure the cost of allocating and deallocating a buffer, plus
227 // copying in a small block.
228 double buffer_basic_copy()
231 uint64_t start
= Cycles::rdtsc();
232 for (int i
= 0; i
< count
; i
++) {
234 b
.append("abcdefg", 6);
236 uint64_t stop
= Cycles::rdtsc();
237 return Cycles::to_seconds(stop
- start
)/count
;
240 // Measure the cost of making a copy of parts of two ptrs.
245 b
.append("abcde", 5);
246 b
.append("01234", 5);
248 uint64_t start
= Cycles::rdtsc();
249 for (int i
= 0; i
< count
; i
++) {
250 b
.cbegin(2).copy(6, copy
);
252 uint64_t stop
= Cycles::rdtsc();
253 return Cycles::to_seconds(stop
- start
)/count
;
256 // Measure the cost of allocating new space by extending the
// Returns the mean time of a single timed encode() call, in seconds.
258 double buffer_encode()
262 for (int i
= 0; i
< count
; i
++) {
264 DummyBlock dummy_block
;
// Untimed encode, issued before the cycle counter is sampled. Presumably
// this makes the timed encodes append to space that already exists (the
// test table describes this test as encoding "onto existing ptr").
265 encode(dummy_block
, b
);
266 uint64_t start
= Cycles::rdtsc();
// Ten back-to-back encodes form the timed region of each iteration.
267 encode(dummy_block
, b
);
268 encode(dummy_block
, b
);
269 encode(dummy_block
, b
);
270 encode(dummy_block
, b
);
271 encode(dummy_block
, b
);
272 encode(dummy_block
, b
);
273 encode(dummy_block
, b
);
274 encode(dummy_block
, b
);
275 encode(dummy_block
, b
);
276 encode(dummy_block
, b
);
// Accumulate only the cycles spent inside the timed region.
277 total
+= Cycles::rdtsc() - start
;
// Each iteration timed ten encodes, hence the division by count*10.
279 return Cycles::to_seconds(total
)/(count
*10);
282 // Measure the cost of creating an iterator and iterating over the
283 // chunks in a buffer. The setup loop below runs 5 times and the test
// table describes this test as "iterate over buffer with 5 ptrs".
// Returns the mean time per timed iteration, in seconds.
284 double buffer_iterator()
287 const char s
[] = "abcdefghijklmnopqrstuvwxyz";
// sizeof(s) counts the terminating NUL, so the ptr holds 27 bytes.
288 bufferptr
ptr(s
, sizeof(s
));
// Setup loop (untimed): runs before the cycle counter is sampled.
289 for (int i
= 0; i
< 5; i
++) {
294 uint64_t start
= Cycles::rdtsc();
295 for (int i
= 0; i
< count
; i
++) {
// A fresh iterator is created on every timed iteration.
296 auto it
= b
.cbegin();
// Read one byte through the iterator's current ptr and fold it into `sum`
// so that the iteration has an observable result.
298 sum
+= (static_cast<const char*>(it
.get_current_ptr().c_str()))[it
.get_remaining()-1];
302 uint64_t stop
= Cycles::rdtsc();
304 return Cycles::to_seconds(stop
- start
)/count
;
307 // Implements the CondPingPong test.
309 ceph::mutex mutex
= ceph::make_mutex("CondPingPong::mutex");
310 ceph::condition_variable cond
;
313 const int count
= 10000;
315 class Consumer
: public Thread
{
318 explicit Consumer(CondPingPong
*p
): p(p
) {}
319 void* entry() override
{
326 CondPingPong(): consumer(this) {}
329 consumer
.create("consumer");
330 uint64_t start
= Cycles::rdtsc();
332 uint64_t stop
= Cycles::rdtsc();
334 return Cycles::to_seconds(stop
- start
)/count
;
338 std::unique_lock l
{mutex
};
339 while (cons
< count
) {
340 cond
.wait(l
, [this] { return cons
>= prod
; });
347 std::unique_lock l
{mutex
};
348 while (cons
< count
) {
349 cond
.wait(l
, [this] { return cons
!= prod
; });
356 // Measure the cost of coordinating between threads using a condition variable.
357 double cond_ping_pong()
359 return CondPingPong().run();
362 // Measure the cost of a 32-bit divide. Divides don't take a constant
363 // number of cycles. Values were chosen here semi-randomly to depict a
364 // fairly expensive scenario. Someone with fancy ALU knowledge could
365 // probably pick worse values.
368 #if defined(__i386__) || defined(__x86_64__)
370 uint64_t start
= Cycles::rdtsc();
371 // NB: Expect an x86 processor exception if there's overflow.
372 uint32_t numeratorHi
= 0xa5a5a5a5U
;
373 uint32_t numeratorLo
= 0x55aa55aaU
;
374 uint32_t divisor
= 0xaa55aa55U
;
377 for (int i
= 0; i
< count
; i
++) {
378 __asm__
__volatile__("div %4" :
379 "=a"(quotient
), "=d"(remainder
) :
380 "a"(numeratorLo
), "d"(numeratorHi
), "r"(divisor
) :
383 uint64_t stop
= Cycles::rdtsc();
384 return Cycles::to_seconds(stop
- start
)/count
;
385 #elif defined(__aarch64__)
387 uint64_t start
= Cycles::rdtsc();
388 uint64_t numerator
= 0xa5a5a5a555aa55aaUL
;
389 uint32_t divisor
= 0xaa55aa55U
;
391 for (int i
= 0; i
< count
; i
++) {
392 asm volatile("udiv %0, %1, %2" : "=r"(result
) :
393 "r"(numerator
), "r"(divisor
));
395 uint64_t stop
= Cycles::rdtsc();
396 return Cycles::to_seconds(stop
- start
)/count
;
402 // Measure the cost of a 64-bit divide. Divides don't take a constant
403 // number of cycles. Values were chosen here semi-randomly to depict a
404 // fairly expensive scenario. Someone with fancy ALU knowledge could
405 // probably pick worse values.
408 #if defined(__x86_64__) || defined(__amd64__)
410 // NB: Expect an x86 processor exception if there's overflow.
411 uint64_t start
= Cycles::rdtsc();
412 uint64_t numeratorHi
= 0x5a5a5a5a5a5UL
;
413 uint64_t numeratorLo
= 0x55aa55aa55aa55aaUL
;
414 uint64_t divisor
= 0xaa55aa55aa55aa55UL
;
417 for (int i
= 0; i
< count
; i
++) {
418 __asm__
__volatile__("divq %4" :
419 "=a"(quotient
), "=d"(remainder
) :
420 "a"(numeratorLo
), "d"(numeratorHi
), "r"(divisor
) :
423 uint64_t stop
= Cycles::rdtsc();
424 return Cycles::to_seconds(stop
- start
)/count
;
430 // Measure the cost of calling a non-inlined function.
431 double function_call()
435 uint64_t start
= Cycles::rdtsc();
436 for (int i
= 0; i
< count
; i
++) {
437 x
= PerfHelper::plus_one(x
);
439 uint64_t stop
= Cycles::rdtsc();
440 return Cycles::to_seconds(stop
- start
)/count
;
443 // Measure the minimum cost of EventCenter::process_events, when there are no
444 // Pollers and no Timers.
445 double eventcenter_poll()
448 EventCenter
center(g_ceph_context
);
449 center
.init(1000, 0, "posix");
451 uint64_t start
= Cycles::rdtsc();
452 for (int i
= 0; i
< count
; i
++) {
453 center
.process_events(0);
455 uint64_t stop
= Cycles::rdtsc();
456 return Cycles::to_seconds(stop
- start
)/count
;
459 class CenterWorker
: public Thread
{
465 explicit CenterWorker(CephContext
*c
): cct(c
), done(false), center(c
) {
466 center
.init(100, 0, "posix");
472 void* entry() override
{
474 bind_thread_to_cpu(2);
476 center
.process_events(1000);
481 class CountEvent
: public EventCallback
{
482 std::atomic
<int64_t> *count
;
485 explicit CountEvent(std::atomic
<int64_t> *atomic
): count(atomic
) {}
486 void do_request(uint64_t id
) override
{
// Measure the cost of handing an event to an EventCenter that is driven by
// a separate worker thread. Returns the mean time per timed iteration, in
// seconds.
491 double eventcenter_dispatch()
495 CenterWorker
worker(g_ceph_context
);
// `flag` is shared with the CountEvent callback through a raw pointer.
496 std::atomic
<int64_t> flag
= { 1 };
497 worker
.create("evt_center_disp");
498 EventCallbackRef
count_event(new CountEvent(&flag
));
// One dispatch is issued before the timed region begins.
500 worker
.center
.dispatch_event_external(count_event
);
501 // Start a new thread and wait for it to be ready.
505 uint64_t start
= Cycles::rdtsc();
506 for (int i
= 0; i
< count
; i
++) {
// The same callback object is re-dispatched on every iteration.
508 worker
.center
.dispatch_event_external(count_event
);
512 uint64_t stop
= Cycles::rdtsc();
515 return Cycles::to_seconds(stop
- start
)/count
;
518 // Measure the cost of copying a given number of bytes with memcpy.
// `size` is the number of bytes copied per iteration. Returns the mean time
// per memcpy call, in seconds.
519 double memcpy_shared(size_t size
)
// Both arrays are sized by the runtime argument, i.e. they are
// variable-length arrays (a compiler extension in C++) with automatic
// storage, so a large `size` consumes twice that much of it.
522 char src
[size
], dst
[size
];
// Give the source defined contents before it is read.
524 memset(src
, 0, sizeof(src
));
526 uint64_t start
= Cycles::rdtsc();
527 for (int i
= 0; i
< count
; i
++) {
528 memcpy(dst
, src
, size
);
530 uint64_t stop
= Cycles::rdtsc();
531 return Cycles::to_seconds(stop
- start
)/count
;
536 return memcpy_shared(100);
541 return memcpy_shared(1000);
546 return memcpy_shared(10000);
549 // Benchmark rjenkins hashing performance on cached data.
// `key_length` is the number of bytes hashed per call. Returns the mean time
// per ceph_str_hash call, in seconds.
550 template <int key_length
>
551 double ceph_str_hash_rjenkins()
// NOTE(review): no initialization of `buf` is visible in this view; if there
// is none, the hash reads indeterminate bytes -- TODO confirm.
554 char buf
[key_length
];
556 uint64_t start
= Cycles::rdtsc();
557 for (int i
= 0; i
< count
; i
++)
// The same buffer is hashed every iteration and the result is not stored.
558 ceph_str_hash(CEPH_STR_HASH_RJENKINS
, buf
, sizeof(buf
));
559 uint64_t stop
= Cycles::rdtsc();
561 return Cycles::to_seconds(stop
- start
)/count
;
564 // Measure the cost of reading the fine-grain cycle counter.
568 uint64_t start
= Cycles::rdtsc();
569 [[maybe_unused
]] uint64_t total
= 0;
570 for (int i
= 0; i
< count
; i
++) {
571 total
+= Cycles::rdtsc();
573 uint64_t stop
= Cycles::rdtsc();
574 return Cycles::to_seconds(stop
- start
)/count
;
577 // Measure the cost of the Cycles::to_seconds method.
578 double perf_cycles_to_seconds()
581 [[maybe_unused
]] double total
= 0;
582 uint64_t cycles
= 994261;
583 uint64_t start
= Cycles::rdtsc();
584 for (int i
= 0; i
< count
; i
++) {
585 total
+= Cycles::to_seconds(cycles
);
587 uint64_t stop
= Cycles::rdtsc();
588 // printf("Result: %.4f\n", total/count);
589 return Cycles::to_seconds(stop
- start
)/count
;
592 // Measure the cost of the Cycles::to_nanoseconds method.
// Returns the mean time per call, in seconds.
593 double perf_cycles_to_nanoseconds()
// The results are summed into `total` so that each call has a consumer;
// [[maybe_unused]] because the only reader is the commented-out printf.
596 [[maybe_unused
]] uint64_t total
= 0;
// Fixed cycle count used as the conversion input on every iteration.
597 uint64_t cycles
= 994261;
598 uint64_t start
= Cycles::rdtsc();
599 for (int i
= 0; i
< count
; i
++) {
600 total
+= Cycles::to_nanoseconds(cycles
);
602 uint64_t stop
= Cycles::rdtsc();
603 // printf("Result: %lu\n", total/count);
604 return Cycles::to_seconds(stop
- start
)/count
;
610 * Prefetch the cache lines containing [object, object + numBytes) into the
611 * processor's caches.
612 * The best docs for this are in the Intel instruction set reference under
615 * The start of the region of memory to prefetch.
617 * The size of the region of memory to prefetch.
// Issue one prefetch per 64-byte step over the region that starts at
// `object` and spans `num_bytes` bytes.
619 static inline void prefetch(const void *object
, uint64_t num_bytes
)
// Low six address bits: the distance of `object` from the preceding
// 64-byte boundary.
621 uint64_t offset
= reinterpret_cast<uint64_t>(object
) & 0x3fUL
;
// Round the start address down to that 64-byte boundary.
622 const char* p
= reinterpret_cast<const char*>(object
) - offset
;
// Walk from the aligned start up to object + num_bytes in 64-byte strides,
// so a region that straddles a boundary gets a prefetch on each side.
623 for (uint64_t i
= 0; i
< offset
+ num_bytes
; i
+= 64)
624 _mm_prefetch(p
+ i
, _MM_HINT_T0
);
626 #elif defined(__aarch64__)
// AArch64 variant: issue one PRFM per 64-byte step over the region that
// starts at `object` and spans `num_bytes` bytes.
627 static inline void prefetch(const void *object
, uint64_t num_bytes
)
// Low six address bits: the distance of `object` from the preceding
// 64-byte boundary.
629 uint64_t offset
= reinterpret_cast<uint64_t>(object
) & 0x3fUL
;
// Round the start address down to that 64-byte boundary.
630 const char* ptr
= reinterpret_cast<const char*>(object
) - offset
;
// `i` only bounds the loop; `ptr` advances by 64 bytes alongside it and is
// the address handed to the instruction.
631 for (uint64_t i
= 0; i
< offset
+ num_bytes
; i
+= 64, ptr
+= 64)
632 asm volatile("prfm pldl1keep, %a0\n" : : "p" (ptr
));
636 // Measure the cost of the prefetch instruction.
// Returns the mean time of one prefetch() call, in seconds.
637 double perf_prefetch()
// The measurement is only compiled when SSE or AArch64 is available.
639 #if defined(HAVE_SSE) || defined(__aarch64__)
640 uint64_t total_ticks
= 0;
644 for (int i
= 0; i
< count
; i
++) {
// The cache flush happens before the start timestamp, so it is not part of
// the timed region.
645 PerfHelper::flush_cache();
646 uint64_t start
= Cycles::rdtsc();
// Sixteen prefetches of 64 bytes each, one for every 64-byte multiple from
// 0 to 960 of `buf`, issued in a non-sequential order.
647 prefetch(&buf
[576], 64);
648 prefetch(&buf
[0], 64);
649 prefetch(&buf
[512], 64);
650 prefetch(&buf
[960], 64);
651 prefetch(&buf
[640], 64);
652 prefetch(&buf
[896], 64);
653 prefetch(&buf
[256], 64);
654 prefetch(&buf
[704], 64);
655 prefetch(&buf
[320], 64);
656 prefetch(&buf
[384], 64);
657 prefetch(&buf
[128], 64);
658 prefetch(&buf
[448], 64);
659 prefetch(&buf
[768], 64);
660 prefetch(&buf
[832], 64);
661 prefetch(&buf
[64], 64);
662 prefetch(&buf
[192], 64);
663 uint64_t stop
= Cycles::rdtsc();
// Accumulate only the cycles spent between the two timestamps.
664 total_ticks
+= stop
- start
;
// Divide by the iteration count and by the 16 prefetches per iteration.
666 return Cycles::to_seconds(total_ticks
) / count
/ 16;
672 #if defined(__x86_64__)
674 * This function is used to serialize machine instructions so that no
675 * instructions that appear after it in the current thread can run before any
676 * instructions that appear before it.
678 * It is useful for putting around rdpmc instructions (to pinpoint cache
679 * misses) as well as before rdtsc instructions, to prevent time pollution from
680 * instructions supposed to be executing before the timer starts.
// Execute the x86 `cpuid` instruction for its serializing effect.
682 static inline void serialize() {
// Locals bound to the instruction's four output registers; nothing in the
// visible lines reads them afterwards.
683 uint32_t eax
, ebx
, ecx
, edx
;
// `volatile` keeps the compiler from removing or merging the asm statement.
684 __asm
volatile("cpuid"
685 : "=a" (eax
), "=b" (ebx
), "=c" (ecx
), "=d" (edx
)
690 // Measure the cost of cpuid
691 double perf_serialize() {
692 #if defined(__x86_64__)
694 uint64_t start
= Cycles::rdtsc();
695 for (int i
= 0; i
< count
; i
++) {
698 uint64_t stop
= Cycles::rdtsc();
699 return Cycles::to_seconds(stop
- start
)/count
;
705 // Measure the cost of an lfence instruction.
710 uint64_t start
= Cycles::rdtsc();
711 for (int i
= 0; i
< count
; i
++) {
712 __asm__
__volatile__("lfence" ::: "memory");
714 uint64_t stop
= Cycles::rdtsc();
715 return Cycles::to_seconds(stop
- start
)/count
;
716 #elif defined(__aarch64__)
718 uint64_t start
= Cycles::rdtsc();
719 for (int i
= 0; i
< count
; i
++) {
720 asm volatile("dmb ishld" ::: "memory");
722 uint64_t stop
= Cycles::rdtsc();
723 return Cycles::to_seconds(stop
- start
)/count
;
729 // Measure the cost of an sfence instruction.
734 uint64_t start
= Cycles::rdtsc();
735 for (int i
= 0; i
< count
; i
++) {
736 __asm__
__volatile__("sfence" ::: "memory");
738 uint64_t stop
= Cycles::rdtsc();
739 return Cycles::to_seconds(stop
- start
)/count
;
740 #elif defined(__aarch64__)
742 uint64_t start
= Cycles::rdtsc();
743 for (int i
= 0; i
< count
; i
++) {
744 asm volatile("dmb ishst" ::: "memory");
746 uint64_t stop
= Cycles::rdtsc();
747 return Cycles::to_seconds(stop
- start
)/count
;
753 // Measure the cost of acquiring and releasing a SpinLock (assuming the
754 // lock is initially free).
755 double test_spinlock()
759 uint64_t start
= Cycles::rdtsc();
760 for (int i
= 0; i
< count
; i
++) {
764 uint64_t stop
= Cycles::rdtsc();
765 return Cycles::to_seconds(stop
- start
)/count
;
768 // Helper for spawn_thread. This is the main function that the thread executes
769 // (intentionally empty).
770 class ThreadHelper
: public Thread
{
771 void *entry() override
{ return 0; }
774 // Measure the cost of starting and joining with a thread.
775 double spawn_thread()
779 uint64_t start
= Cycles::rdtsc();
780 for (int i
= 0; i
< count
; i
++) {
781 thread
.create("thread_helper");
784 uint64_t stop
= Cycles::rdtsc();
785 return Cycles::to_seconds(stop
- start
)/count
;
788 class FakeContext
: public Context
{
790 void finish(int r
) override
{}
793 // Measure the cost of adding and cancelling an event on a SafeTimer.
797 ceph::mutex lock
= ceph::make_mutex("perf_timer::lock");
798 SafeTimer
timer(g_ceph_context
, lock
);
799 FakeContext
**c
= new FakeContext
*[count
];
800 for (int i
= 0; i
< count
; i
++) {
801 c
[i
] = new FakeContext();
803 uint64_t start
= Cycles::rdtsc();
804 std::lock_guard l
{lock
};
805 for (int i
= 0; i
< count
; i
++) {
806 if (timer
.add_event_after(12345, c
[i
])) {
807 timer
.cancel_event(c
[i
]);
810 uint64_t stop
= Cycles::rdtsc();
812 return Cycles::to_seconds(stop
- start
)/count
;
815 // Measure the cost of throwing and catching an int. This uses an integer as
816 // the value thrown, which is presumably as fast as possible.
820 uint64_t start
= Cycles::rdtsc();
821 for (int i
= 0; i
< count
; i
++) {
824 } catch (int) { // NOLINT
828 uint64_t stop
= Cycles::rdtsc();
829 return Cycles::to_seconds(stop
- start
)/count
;
832 // Measure the cost of throwing and catching an int from a function call.
833 double throw_int_call()
836 uint64_t start
= Cycles::rdtsc();
837 for (int i
= 0; i
< count
; i
++) {
839 PerfHelper::throw_int();
840 } catch (int) { // NOLINT
844 uint64_t stop
= Cycles::rdtsc();
845 return Cycles::to_seconds(stop
- start
)/count
;
848 // Measure the cost of throwing and catching an Exception. This uses an actual
849 // exception as the value thrown, which may be slower than throw_int.
850 double throw_exception()
853 uint64_t start
= Cycles::rdtsc();
854 for (int i
= 0; i
< count
; i
++) {
856 throw buffer::end_of_buffer();
857 } catch (const buffer::end_of_buffer
&) {
861 uint64_t stop
= Cycles::rdtsc();
862 return Cycles::to_seconds(stop
- start
)/count
;
865 // Measure the cost of throwing and catching an Exception from a function call.
866 double throw_exception_call()
869 uint64_t start
= Cycles::rdtsc();
870 for (int i
= 0; i
< count
; i
++) {
872 PerfHelper::throw_end_of_buffer();
873 } catch (const buffer::end_of_buffer
&) {
877 uint64_t stop
= Cycles::rdtsc();
878 return Cycles::to_seconds(stop
- start
)/count
;
881 // Measure the cost of pushing a new element on a std::vector, copying
882 // from the end to an internal element, and popping the end element.
// Returns the elapsed time divided by count*3, in seconds.
883 double vector_push_pop()
886 std::vector
<int> vector
;
890 uint64_t start
= Cycles::rdtsc();
891 for (int i
= 0; i
< count
; i
++) {
893 vector
.push_back(i
+1);
894 vector
.push_back(i
+2);
// Indices 0..2 are assigned below, so the vector must already hold at least
// three elements when the loop starts; the lines that establish that are
// not visible in this view -- TODO confirm.
895 vector
[2] = vector
.back();
897 vector
[0] = vector
.back();
899 vector
[1] = vector
.back();
902 uint64_t stop
= Cycles::rdtsc();
// Three copy-from-back steps are visible per iteration, hence count*3.
903 return Cycles::to_seconds(stop
- start
)/(count
*3);
906 // Measure the cost of ceph_clock_now
907 double perf_ceph_clock_now()
910 uint64_t start
= Cycles::rdtsc();
911 for (int i
= 0; i
< count
; i
++) {
914 uint64_t stop
= Cycles::rdtsc();
915 return Cycles::to_seconds(stop
- start
)/count
;
918 // The following struct and table define each performance test in terms of
919 // a string name and a function that implements the test.
921 const char* name
; // Name of the performance test; this is
922 // what gets typed on the command line to
924 double (*func
)(); // Function that implements the test;
925 // returns the time (in seconds) for each
926 // iteration of that test.
927 const char *description
; // Short description of this test (not more
928 // than about 40 characters, so the entire
929 // test output fits on a single line).
932 {"atomic_int_cmp", atomic_int_cmp
,
933 "atomic_t::compare_and_swap"},
934 {"atomic_int_inc", atomic_int_inc
,
936 {"atomic_int_read", atomic_int_read
,
938 {"atomic_int_set", atomic_int_set
,
940 {"mutex_nonblock", mutex_nonblock
,
941 "Mutex lock/unlock (no blocking)"},
942 {"buffer_basic", buffer_basic
,
943 "buffer create, add one ptr, delete"},
944 {"buffer_encode_decode", buffer_encode_decode
,
945 "buffer create, encode/decode object, delete"},
946 {"buffer_basic_copy", buffer_basic_copy
,
947 "buffer create, copy small block, delete"},
948 {"buffer_copy", buffer_copy
,
949 "copy out 2 small ptrs from buffer"},
950 {"buffer_encode10", buffer_encode
,
951 "buffer encoding 10 structures onto existing ptr"},
952 {"buffer_iterator", buffer_iterator
,
953 "iterate over buffer with 5 ptrs"},
954 {"cond_ping_pong", cond_ping_pong
,
955 "condition variable round-trip"},
957 "32-bit integer division instruction"},
959 "64-bit integer division instruction"},
960 {"function_call", function_call
,
961 "Call a function that has not been inlined"},
962 {"eventcenter_poll", eventcenter_poll
,
963 "EventCenter::process_events (no timers or events)"},
964 {"eventcenter_dispatch", eventcenter_dispatch
,
965 "EventCenter::dispatch_event_external latency"},
966 {"memcpy100", memcpy100
,
967 "Copy 100 bytes with memcpy"},
968 {"memcpy1000", memcpy1000
,
969 "Copy 1000 bytes with memcpy"},
970 {"memcpy10000", memcpy10000
,
971 "Copy 10000 bytes with memcpy"},
972 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins
<16>,
973 "rjenkins hash on 16 byte of data"},
974 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins
<256>,
975 "rjenkins hash on 256 bytes of data"},
976 {"rdtsc", rdtsc_test
,
977 "Read the fine-grain cycle counter"},
978 {"cycles_to_seconds", perf_cycles_to_seconds
,
979 "Convert a rdtsc result to (double) seconds"},
// Each entry needs its own name: the name is what gets matched against the
// command line and printed in the first output column, so this entry must
// not share "cycles_to_seconds" with the (double) seconds test above.
980 {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds
,
981 "Convert a rdtsc result to (uint64_t) nanoseconds"},
982 {"prefetch", perf_prefetch
,
983 "Prefetch instruction"},
984 {"serialize", perf_serialize
,
985 "serialize instruction"},
987 "Lfence instruction"},
989 "Sfence instruction"},
990 {"spin_lock", test_spinlock
,
991 "Acquire/release SpinLock"},
992 {"spawn_thread", spawn_thread
,
993 "Start and stop a thread"},
994 {"perf_timer", perf_timer
,
995 "Insert and cancel a SafeTimer"},
996 {"throw_int", throw_int
,
998 {"throw_int_call", throw_int_call
,
999 "Throw an int in a function call"},
1000 {"throw_exception", throw_exception
,
1001 "Throw an Exception"},
1002 {"throw_exception_call", throw_exception_call
,
1003 "Throw an Exception in a function call"},
1004 {"vector_push_pop", vector_push_pop
,
1005 "Push and pop a std::vector"},
1006 {"ceph_clock_now", perf_ceph_clock_now
,
1007 "ceph_clock_now function"},
1011 * Runs a particular test and prints a one-line result message.
1014 * Describes the test to run.
// Run the test described by `info` and print one result line: the name, the
// per-iteration time in a human-scaled unit, then the description.
1016 void run_test(TestInfo
& info
)
// The test function returns seconds per iteration.
1018 double secs
= info
.func();
// printf returns the number of characters written; `width` tracks how much
// of the line has been used so the description can be aligned.
1019 int width
= printf("%-24s ", info
.name
);
1021 width
+= printf(" architecture nonsupport ");
// Pick the unit from the magnitude: below 1us -> ns, below 1ms -> us,
// below 1s -> ms, otherwise plain seconds.
1022 } else if (secs
< 1.0e-06) {
1023 width
+= printf("%8.2fns", 1e09
*secs
);
1024 } else if (secs
< 1.0e-03) {
1025 width
+= printf("%8.2fus", 1e06
*secs
);
1026 } else if (secs
< 1.0) {
1027 width
+= printf("%8.2fms", 1e03
*secs
);
1029 width
+= printf("%8.2fs", secs
);
// Pad with an empty string of field width 32-width, then the description.
// NOTE(review): when width exceeds 32 the field width is negative, which
// printf treats as a '-' flag plus the absolute value, so over-long lines
// get extra padding rather than none.
1031 printf("%*s %s\n", 32-width
, "", info
.description
);
// Entry point: initialize the Ceph context, pin this thread to a CPU, then
// run either every test in `tests` or only those named on the command line.
1034 int main(int argc
, char *argv
[])
1036 auto args
= argv_to_vec(argc
, argv
);
// Initialize as a client-type utility without loading a default config file.
1038 auto cct
= global_init(NULL
, args
, CEPH_ENTITY_TYPE_CLIENT
,
1039 CODE_ENVIRONMENT_UTILITY
,
1040 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE
);
1041 common_init_finish(g_ceph_context
);
// The main thread is pinned to CPU 3; CenterWorker::entry pins its own
// thread to CPU 2.
1044 bind_thread_to_cpu(3);
1046 // No test names specified; run all tests.
// sizeof(tests)/sizeof(TestInfo) is the number of entries in the table.
1047 for (size_t i
= 0; i
< sizeof(tests
)/sizeof(TestInfo
); ++i
) {
1051 // Run only the tests that were specified on the command line.
// Names are taken from the raw argv (starting at index 1), not from `args`.
1052 for (int i
= 1; i
< argc
; i
++) {
1053 bool found_test
= false;
1054 for (size_t j
= 0; j
< sizeof(tests
)/sizeof(TestInfo
); ++j
) {
// Exact, case-sensitive match against the table's name field.
1055 if (strcmp(argv
[i
], tests
[j
].name
) == 0) {
// Report an unknown name in the same column layout that run_test uses.
1062 int width
= printf("%-24s ??", argv
[i
]);
1063 printf("%*s No such test\n", 32-width
, "");