1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20 // This program contains a collection of low-level performance measurements
21 // for Ceph, which can be run either individually or altogether. These
22 // tests measure performance in a single stand-alone process, not in a cluster
23 // with multiple servers. Invoke the program like this:
25 // Perf test1 test2 ...
27 // test1 and test2 are the names of individual performance measurements to
28 // run. If no test names are provided then all of the performance tests
32 // * Write a function that implements the test. Use existing test functions
33 // as a guideline, and be sure to generate output in the same form as
35 // * Create a new entry for the test in the #tests table.
41 #include <xmmintrin.h>
44 #include "include/atomic.h"
45 #include "include/buffer.h"
46 #include "include/encoding.h"
47 #include "include/ceph_hash.h"
48 #include "include/Spinlock.h"
49 #include "common/ceph_argparse.h"
50 #include "common/Cycles.h"
51 #include "common/Cond.h"
52 #include "common/Mutex.h"
53 #include "common/Thread.h"
54 #include "common/Timer.h"
55 #include "msg/async/Event.h"
56 #include "global/global_init.h"
58 #include "test/perf_helper.h"
63 * Ask the operating system to pin the current thread to a given CPU.
66 * Indicates the desired CPU and hyperthread; low order 2 bits
67 * specify CPU, next bit specifies hyperthread.
69 void bind_thread_to_cpu(int cpu
)
75 sched_setaffinity(0, sizeof(set
), &set
);
/**
 * Consume a value so that the compiler cannot treat the computation that
 * produced it as dead and optimize away the code we're trying to measure.
 *
 * The leading sizeof(int) bytes of the pointee are inspected and printed
 * only if they equal an arbitrary sentinel, so normally nothing is output.
 *
 * \param value
 *      Pointer to arbitrary value; it's discarded. It must reference at
 *      least sizeof(int) readable bytes. A null pointer is ignored.
 */
void discard(void* value) {
  if (value == nullptr) {
    return;
  }
  // The pointee can be of any type, so copy its bytes out with memcpy
  // rather than dereferencing a reinterpret_cast<int*>: reading a non-int
  // object through an int lvalue violates strict aliasing and may be
  // misaligned.
  int x;
  memcpy(&x, value, sizeof(x));
  if (x == 0x43924776) {
    printf("Value was 0x%x\n", x);
  }
}
94 //----------------------------------------------------------------------
95 // Test functions start here
96 //----------------------------------------------------------------------
98 // Measure the cost of atomic_t::compare_and_swap
99 double atomic_int_cmp()
104 uint64_t start
= Cycles::rdtsc();
105 for (int i
= 0; i
< count
; i
++) {
106 value
.compare_and_swap(test
, test
+2);
109 uint64_t stop
= Cycles::rdtsc();
110 // printf("Final value: %d\n", value.load());
111 return Cycles::to_seconds(stop
- start
)/count
;
114 // Measure the cost of atomic_t::inc
115 double atomic_int_inc()
119 uint64_t start
= Cycles::rdtsc();
120 for (int i
= 0; i
< count
; i
++) {
123 uint64_t stop
= Cycles::rdtsc();
124 // printf("Final value: %d\n", value.load());
125 return Cycles::to_seconds(stop
- start
)/count
;
128 // Measure the cost of reading an atomic_t
129 double atomic_int_read()
134 uint64_t start
= Cycles::rdtsc();
135 for (int i
= 0; i
< count
; i
++) {
136 total
+= value
.read();
138 uint64_t stop
= Cycles::rdtsc();
139 // printf("Total: %d\n", total);
140 return Cycles::to_seconds(stop
- start
)/count
;
143 // Measure the cost of storing a new value in an atomic_t
144 double atomic_int_set()
148 uint64_t start
= Cycles::rdtsc();
149 for (int i
= 0; i
< count
; i
++) {
152 uint64_t stop
= Cycles::rdtsc();
153 return Cycles::to_seconds(stop
- start
)/count
;
156 // Measure the cost of acquiring and releasing a mutex in the
157 // fast case where the mutex is free.
158 double mutex_nonblock()
161 Mutex
m("mutex_nonblock::m");
162 uint64_t start
= Cycles::rdtsc();
163 for (int i
= 0; i
< count
; i
++) {
167 uint64_t stop
= Cycles::rdtsc();
168 return Cycles::to_seconds(stop
- start
)/count
;
171 // Measure the cost of allocating and deallocating a buffer, plus
172 // appending (logically) one ptr.
173 double buffer_basic()
176 uint64_t start
= Cycles::rdtsc();
177 bufferptr
ptr("abcdefg", 7);
178 for (int i
= 0; i
< count
; i
++) {
182 uint64_t stop
= Cycles::rdtsc();
183 return Cycles::to_seconds(stop
- start
)/count
;
187 int a
= 1, b
= 2, c
= 3, d
= 4;
188 void encode(bufferlist
&bl
) const {
189 ENCODE_START(1, 1, bl
);
196 void decode(bufferlist::iterator
&bl
) {
205 WRITE_CLASS_ENCODER(DummyBlock
)
207 // Measure the cost of encoding and decoding a buffer, plus
208 // allocating space for one chunk.
209 double buffer_encode_decode()
212 uint64_t start
= Cycles::rdtsc();
213 for (int i
= 0; i
< count
; i
++) {
215 DummyBlock dummy_block
;
216 ::encode(dummy_block
, b
);
217 bufferlist::iterator iter
= b
.begin();
218 ::decode(dummy_block
, iter
);
220 uint64_t stop
= Cycles::rdtsc();
221 return Cycles::to_seconds(stop
- start
)/count
;
224 // Measure the cost of allocating and deallocating a buffer, plus
225 // copying in a small block.
226 double buffer_basic_copy()
229 uint64_t start
= Cycles::rdtsc();
230 for (int i
= 0; i
< count
; i
++) {
232 b
.append("abcdefg", 6);
234 uint64_t stop
= Cycles::rdtsc();
235 return Cycles::to_seconds(stop
- start
)/count
;
238 // Measure the cost of making a copy of parts of two ptrs.
243 b
.append("abcde", 5);
244 b
.append("01234", 5);
246 uint64_t start
= Cycles::rdtsc();
247 for (int i
= 0; i
< count
; i
++) {
250 uint64_t stop
= Cycles::rdtsc();
251 return Cycles::to_seconds(stop
- start
)/count
;
254 // Measure the cost of allocating new space by extending the
256 double buffer_encode()
260 for (int i
= 0; i
< count
; i
++) {
262 DummyBlock dummy_block
;
263 ::encode(dummy_block
, b
);
264 uint64_t start
= Cycles::rdtsc();
265 ::encode(dummy_block
, b
);
266 ::encode(dummy_block
, b
);
267 ::encode(dummy_block
, b
);
268 ::encode(dummy_block
, b
);
269 ::encode(dummy_block
, b
);
270 ::encode(dummy_block
, b
);
271 ::encode(dummy_block
, b
);
272 ::encode(dummy_block
, b
);
273 ::encode(dummy_block
, b
);
274 ::encode(dummy_block
, b
);
275 total
+= Cycles::rdtsc() - start
;
277 return Cycles::to_seconds(total
)/(count
*10);
280 // Measure the cost of retrieving an object from the beginning of a buffer.
281 double buffer_get_contiguous()
286 b
.append((char*)&value
, sizeof(value
));
288 uint64_t start
= Cycles::rdtsc();
289 for (int i
= 0; i
< count
; i
++) {
290 sum
+= *reinterpret_cast<int*>(b
.get_contiguous(0, sizeof(value
)));
292 uint64_t stop
= Cycles::rdtsc();
293 return Cycles::to_seconds(stop
- start
)/count
;
296 // Measure the cost of creating an iterator and iterating over 5
297 // chunks in a buffer.
298 double buffer_iterator()
301 const char s
[] = "abcdefghijklmnopqrstuvwxyz";
302 bufferptr
ptr(s
, sizeof(s
));
303 for (int i
= 0; i
< 5; i
++) {
308 uint64_t start
= Cycles::rdtsc();
309 for (int i
= 0; i
< count
; i
++) {
310 bufferlist::iterator it
= b
.begin();
312 sum
+= (static_cast<const char*>(it
.get_current_ptr().c_str()))[it
.get_remaining()-1];
316 uint64_t stop
= Cycles::rdtsc();
318 return Cycles::to_seconds(stop
- start
)/count
;
321 // Implements the CondPingPong test.
329 class Consumer
: public Thread
{
332 explicit Consumer(CondPingPong
*p
): p(p
) {}
333 void* entry() override
{
340 CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {}
343 consumer
.create("consumer");
344 uint64_t start
= Cycles::rdtsc();
346 uint64_t stop
= Cycles::rdtsc();
348 return Cycles::to_seconds(stop
- start
)/count
;
352 Mutex::Locker
l(mutex
);
353 while (cons
< count
) {
362 Mutex::Locker
l(mutex
);
363 while (cons
< count
) {
372 // Measure the cost of coordinating between threads using a condition variable.
373 double cond_ping_pong()
375 return CondPingPong().run();
378 // Measure the cost of a 32-bit divide. Divides don't take a constant
379 // number of cycles. Values were chosen here semi-randomly to depict a
380 // fairly expensive scenario. Someone with fancy ALU knowledge could
381 // probably pick worse values.
384 #if defined(__i386__) || defined(__x86_64__)
386 uint64_t start
= Cycles::rdtsc();
387 // NB: Expect an x86 processor exception if there's overflow.
388 uint32_t numeratorHi
= 0xa5a5a5a5U
;
389 uint32_t numeratorLo
= 0x55aa55aaU
;
390 uint32_t divisor
= 0xaa55aa55U
;
393 for (int i
= 0; i
< count
; i
++) {
394 __asm__
__volatile__("div %4" :
395 "=a"(quotient
), "=d"(remainder
) :
396 "a"(numeratorLo
), "d"(numeratorHi
), "r"(divisor
) :
399 uint64_t stop
= Cycles::rdtsc();
400 return Cycles::to_seconds(stop
- start
)/count
;
406 // Measure the cost of a 64-bit divide. Divides don't take a constant
407 // number of cycles. Values were chosen here semi-randomly to depict a
408 // fairly expensive scenario. Someone with fancy ALU knowledge could
409 // probably pick worse values.
412 #if defined(__x86_64__) || defined(__amd64__)
414 // NB: Expect an x86 processor exception if there's overflow.
415 uint64_t start
= Cycles::rdtsc();
416 uint64_t numeratorHi
= 0x5a5a5a5a5a5UL
;
417 uint64_t numeratorLo
= 0x55aa55aa55aa55aaUL
;
418 uint64_t divisor
= 0xaa55aa55aa55aa55UL
;
421 for (int i
= 0; i
< count
; i
++) {
422 __asm__
__volatile__("divq %4" :
423 "=a"(quotient
), "=d"(remainder
) :
424 "a"(numeratorLo
), "d"(numeratorHi
), "r"(divisor
) :
427 uint64_t stop
= Cycles::rdtsc();
428 return Cycles::to_seconds(stop
- start
)/count
;
434 // Measure the cost of calling a non-inlined function.
435 double function_call()
439 uint64_t start
= Cycles::rdtsc();
440 for (int i
= 0; i
< count
; i
++) {
441 x
= PerfHelper::plus_one(x
);
443 uint64_t stop
= Cycles::rdtsc();
444 return Cycles::to_seconds(stop
- start
)/count
;
447 // Measure the minimum cost of EventCenter::process_events, when there are no
448 // Pollers and no Timers.
449 double eventcenter_poll()
452 EventCenter
center(g_ceph_context
);
453 center
.init(1000, 0, "posix");
455 uint64_t start
= Cycles::rdtsc();
456 for (int i
= 0; i
< count
; i
++) {
457 center
.process_events(0);
459 uint64_t stop
= Cycles::rdtsc();
460 return Cycles::to_seconds(stop
- start
)/count
;
463 class CenterWorker
: public Thread
{
469 explicit CenterWorker(CephContext
*c
): cct(c
), done(false), center(c
) {
470 center
.init(100, 0, "posix");
476 void* entry() override
{
478 bind_thread_to_cpu(2);
480 center
.process_events(1000);
485 class CountEvent
: public EventCallback
{
489 explicit CountEvent(atomic_t
*atomic
): count(atomic
) {}
490 void do_request(int id
) override
{
495 double eventcenter_dispatch()
499 CenterWorker
worker(g_ceph_context
);
501 worker
.create("evt_center_disp");
502 EventCallbackRef
count_event(new CountEvent(&flag
));
504 worker
.center
.dispatch_event_external(count_event
);
505 // Start a new thread and wait for it to be ready.
509 uint64_t start
= Cycles::rdtsc();
510 for (int i
= 0; i
< count
; i
++) {
512 worker
.center
.dispatch_event_external(count_event
);
516 uint64_t stop
= Cycles::rdtsc();
519 return Cycles::to_seconds(stop
- start
)/count
;
522 // Measure the cost of copying a given number of bytes with memcpy.
523 double memcpy_shared(size_t size
)
526 char src
[size
], dst
[size
];
528 memset(src
, 0, sizeof(src
));
530 uint64_t start
= Cycles::rdtsc();
531 for (int i
= 0; i
< count
; i
++) {
532 memcpy(dst
, src
, size
);
534 uint64_t stop
= Cycles::rdtsc();
535 return Cycles::to_seconds(stop
- start
)/count
;
540 return memcpy_shared(100);
545 return memcpy_shared(1000);
550 return memcpy_shared(10000);
553 // Benchmark rjenkins hashing performance on cached data.
554 template <int key_length
>
555 double ceph_str_hash_rjenkins()
558 char buf
[key_length
];
560 uint64_t start
= Cycles::rdtsc();
561 for (int i
= 0; i
< count
; i
++)
562 ceph_str_hash(CEPH_STR_HASH_RJENKINS
, buf
, sizeof(buf
));
563 uint64_t stop
= Cycles::rdtsc();
565 return Cycles::to_seconds(stop
- start
)/count
;
568 // Measure the cost of reading the fine-grain cycle counter.
572 uint64_t start
= Cycles::rdtsc();
574 for (int i
= 0; i
< count
; i
++) {
575 total
+= Cycles::rdtsc();
577 uint64_t stop
= Cycles::rdtsc();
578 return Cycles::to_seconds(stop
- start
)/count
;
581 // Measure the cost of the Cycles::to_seconds method.
582 double perf_cycles_to_seconds()
586 uint64_t cycles
= 994261;
587 uint64_t start
= Cycles::rdtsc();
588 for (int i
= 0; i
< count
; i
++) {
589 total
+= Cycles::to_seconds(cycles
);
591 uint64_t stop
= Cycles::rdtsc();
592 // printf("Result: %.4f\n", total/count);
593 return Cycles::to_seconds(stop
- start
)/count
;
596 // Measure the cost of the Cycles::to_nanoseconds method.
597 double perf_cycles_to_nanoseconds()
601 uint64_t cycles
= 994261;
602 uint64_t start
= Cycles::rdtsc();
603 for (int i
= 0; i
< count
; i
++) {
604 total
+= Cycles::to_nanoseconds(cycles
);
606 uint64_t stop
= Cycles::rdtsc();
607 // printf("Result: %lu\n", total/count);
608 return Cycles::to_seconds(stop
- start
)/count
;
614 * Prefetch the cache lines containing [object, object + numBytes) into the
615 * processor's caches.
616 * The best docs for this are in the Intel instruction set reference under
619 * The start of the region of memory to prefetch.
621 * The size of the region of memory to prefetch.
623 static inline void prefetch(const void *object
, uint64_t num_bytes
)
625 uint64_t offset
= reinterpret_cast<uint64_t>(object
) & 0x3fUL
;
626 const char* p
= reinterpret_cast<const char*>(object
) - offset
;
627 for (uint64_t i
= 0; i
< offset
+ num_bytes
; i
+= 64)
628 _mm_prefetch(p
+ i
, _MM_HINT_T0
);
632 // Measure the cost of the prefetch instruction.
633 double perf_prefetch()
636 uint64_t total_ticks
= 0;
639 uint64_t start
, stop
;
641 for (int i
= 0; i
< count
; i
++) {
642 PerfHelper::flush_cache();
643 start
= Cycles::rdtsc();
644 prefetch(&buf
[576], 64);
645 prefetch(&buf
[0], 64);
646 prefetch(&buf
[512], 64);
647 prefetch(&buf
[960], 64);
648 prefetch(&buf
[640], 64);
649 prefetch(&buf
[896], 64);
650 prefetch(&buf
[256], 64);
651 prefetch(&buf
[704], 64);
652 prefetch(&buf
[320], 64);
653 prefetch(&buf
[384], 64);
654 prefetch(&buf
[128], 64);
655 prefetch(&buf
[448], 64);
656 prefetch(&buf
[768], 64);
657 prefetch(&buf
[832], 64);
658 prefetch(&buf
[64], 64);
659 prefetch(&buf
[192], 64);
660 stop
= Cycles::rdtsc();
661 total_ticks
+= stop
- start
;
663 return Cycles::to_seconds(total_ticks
) / count
/ 16;
669 #if defined(__x86_64__)
671 * This function is used to serialize machine instructions so that no
672 * instructions that appear after it in the current thread can run before any
673 * instructions that appear before it.
675 * It is useful for putting around rdpmc instructions (to pinpoint cache
676 * misses) as well as before rdtsc instructions, to prevent time pollution from
677 * instructions supposed to be executing before the timer starts.
679 static inline void serialize() {
680 uint32_t eax
, ebx
, ecx
, edx
;
681 __asm
volatile("cpuid"
682 : "=a" (eax
), "=b" (ebx
), "=c" (ecx
), "=d" (edx
)
687 // Measure the cost of cpuid
688 double perf_serialize() {
689 #if defined(__x86_64__)
691 uint64_t start
= Cycles::rdtsc();
692 for (int i
= 0; i
< count
; i
++) {
695 uint64_t stop
= Cycles::rdtsc();
696 return Cycles::to_seconds(stop
- start
)/count
;
702 // Measure the cost of an lfence instruction.
707 uint64_t start
= Cycles::rdtsc();
708 for (int i
= 0; i
< count
; i
++) {
709 __asm__
__volatile__("lfence" ::: "memory");
711 uint64_t stop
= Cycles::rdtsc();
712 return Cycles::to_seconds(stop
- start
)/count
;
718 // Measure the cost of an sfence instruction.
723 uint64_t start
= Cycles::rdtsc();
724 for (int i
= 0; i
< count
; i
++) {
725 __asm__
__volatile__("sfence" ::: "memory");
727 uint64_t stop
= Cycles::rdtsc();
728 return Cycles::to_seconds(stop
- start
)/count
;
734 // Measure the cost of acquiring and releasing a SpinLock (assuming the
735 // lock is initially free).
736 double test_spinlock()
740 uint64_t start
= Cycles::rdtsc();
741 for (int i
= 0; i
< count
; i
++) {
745 uint64_t stop
= Cycles::rdtsc();
746 return Cycles::to_seconds(stop
- start
)/count
;
749 // Helper for spawn_thread. This is the main function that the thread executes
750 // (intentionally empty).
751 class ThreadHelper
: public Thread
{
752 void *entry() override
{ return 0; }
755 // Measure the cost of starting and joining a thread.
756 double spawn_thread()
760 uint64_t start
= Cycles::rdtsc();
761 for (int i
= 0; i
< count
; i
++) {
762 thread
.create("thread_helper");
765 uint64_t stop
= Cycles::rdtsc();
766 return Cycles::to_seconds(stop
- start
)/count
;
769 class FakeContext
: public Context
{
771 void finish(int r
) override
{}
774 // Measure the cost of adding an event to a SafeTimer and cancelling it.
778 Mutex
lock("perf_timer::lock");
779 SafeTimer
timer(g_ceph_context
, lock
);
780 FakeContext
**c
= new FakeContext
*[count
];
781 for (int i
= 0; i
< count
; i
++) {
782 c
[i
] = new FakeContext();
784 uint64_t start
= Cycles::rdtsc();
785 Mutex::Locker
l(lock
);
786 for (int i
= 0; i
< count
; i
++) {
787 timer
.add_event_after(12345, c
[i
]);
788 timer
.cancel_event(c
[i
]);
790 uint64_t stop
= Cycles::rdtsc();
792 return Cycles::to_seconds(stop
- start
)/count
;
795 // Measure the cost of throwing and catching an int. This uses an integer as
796 // the value thrown, which is presumably as fast as possible.
800 uint64_t start
= Cycles::rdtsc();
801 for (int i
= 0; i
< count
; i
++) {
804 } catch (int) { // NOLINT
808 uint64_t stop
= Cycles::rdtsc();
809 return Cycles::to_seconds(stop
- start
)/count
;
812 // Measure the cost of throwing and catching an int from a function call.
813 double throw_int_call()
816 uint64_t start
= Cycles::rdtsc();
817 for (int i
= 0; i
< count
; i
++) {
819 PerfHelper::throw_int();
820 } catch (int) { // NOLINT
824 uint64_t stop
= Cycles::rdtsc();
825 return Cycles::to_seconds(stop
- start
)/count
;
828 // Measure the cost of throwing and catching an Exception. This uses an actual
829 // exception as the value thrown, which may be slower than throw_int.
830 double throw_exception()
833 uint64_t start
= Cycles::rdtsc();
834 for (int i
= 0; i
< count
; i
++) {
836 throw buffer::end_of_buffer();
837 } catch (const buffer::end_of_buffer
&) {
841 uint64_t stop
= Cycles::rdtsc();
842 return Cycles::to_seconds(stop
- start
)/count
;
845 // Measure the cost of throwing and catching an Exception from a function call.
846 double throw_exception_call()
849 uint64_t start
= Cycles::rdtsc();
850 for (int i
= 0; i
< count
; i
++) {
852 PerfHelper::throw_end_of_buffer();
853 } catch (const buffer::end_of_buffer
&) {
857 uint64_t stop
= Cycles::rdtsc();
858 return Cycles::to_seconds(stop
- start
)/count
;
861 // Measure the cost of pushing a new element on a std::vector, copying
862 // from the end to an internal element, and popping the end element.
863 double vector_push_pop()
866 std::vector
<int> vector
;
870 uint64_t start
= Cycles::rdtsc();
871 for (int i
= 0; i
< count
; i
++) {
873 vector
.push_back(i
+1);
874 vector
.push_back(i
+2);
875 vector
[2] = vector
.back();
877 vector
[0] = vector
.back();
879 vector
[1] = vector
.back();
882 uint64_t stop
= Cycles::rdtsc();
883 return Cycles::to_seconds(stop
- start
)/(count
*3);
886 // Measure the cost of ceph_clock_now
887 double perf_ceph_clock_now()
890 uint64_t start
= Cycles::rdtsc();
891 for (int i
= 0; i
< count
; i
++) {
894 uint64_t stop
= Cycles::rdtsc();
895 return Cycles::to_seconds(stop
- start
)/count
;
898 // The following struct and table define each performance test in terms of
899 // a string name and a function that implements the test.
901 const char* name
; // Name of the performance test; this is
902 // what gets typed on the command line to
904 double (*func
)(); // Function that implements the test;
905 // returns the time (in seconds) for each
906 // iteration of that test.
907 const char *description
; // Short description of this test (not more
908 // than about 40 characters, so the entire
909 // test output fits on a single line).
912 {"atomic_int_cmp", atomic_int_cmp
,
913 "atomic_t::compare_and_swap"},
914 {"atomic_int_inc", atomic_int_inc
,
916 {"atomic_int_read", atomic_int_read
,
918 {"atomic_int_set", atomic_int_set
,
920 {"mutex_nonblock", mutex_nonblock
,
921 "Mutex lock/unlock (no blocking)"},
922 {"buffer_basic", buffer_basic
,
923 "buffer create, add one ptr, delete"},
924 {"buffer_encode_decode", buffer_encode_decode
,
925 "buffer create, encode/decode object, delete"},
926 {"buffer_basic_copy", buffer_basic_copy
,
927 "buffer create, copy small block, delete"},
928 {"buffer_copy", buffer_copy
,
929 "copy out 2 small ptrs from buffer"},
930 {"buffer_encode10", buffer_encode
,
931 "buffer encoding 10 structures onto existing ptr"},
932 {"buffer_get_contiguous", buffer_get_contiguous
,
933 "Buffer::get_contiguous"},
934 {"buffer_iterator", buffer_iterator
,
935 "iterate over buffer with 5 ptrs"},
936 {"cond_ping_pong", cond_ping_pong
,
937 "condition variable round-trip"},
939 "32-bit integer division instruction"},
941 "64-bit integer division instruction"},
942 {"function_call", function_call
,
943 "Call a function that has not been inlined"},
944 {"eventcenter_poll", eventcenter_poll
,
945 "EventCenter::process_events (no timers or events)"},
946 {"eventcenter_dispatch", eventcenter_dispatch
,
947 "EventCenter::dispatch_event_external latency"},
948 {"memcpy100", memcpy100
,
949 "Copy 100 bytes with memcpy"},
950 {"memcpy1000", memcpy1000
,
951 "Copy 1000 bytes with memcpy"},
952 {"memcpy10000", memcpy10000
,
953 "Copy 10000 bytes with memcpy"},
954 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins
<16>,
955 "rjenkins hash on 16 byte of data"},
956 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins
<256>,
957 "rjenkins hash on 256 bytes of data"},
958 {"rdtsc", rdtsc_test
,
959 "Read the fine-grain cycle counter"},
960 {"cycles_to_seconds", perf_cycles_to_seconds
,
961 "Convert a rdtsc result to (double) seconds"},
// The name is what gets typed on the command line and printed in the
// result line, so it must identify the nanoseconds conversion test rather
// than repeat the "cycles_to_seconds" entry.
962 {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds
,
963 "Convert a rdtsc result to (uint64_t) nanoseconds"},
964 {"prefetch", perf_prefetch
,
965 "Prefetch instruction"},
966 {"serialize", perf_serialize
,
967 "serialize instruction"},
969 "Lfence instruction"},
971 "Sfence instruction"},
972 {"spin_lock", test_spinlock
,
973 "Acquire/release SpinLock"},
974 {"spawn_thread", spawn_thread
,
975 "Start and stop a thread"},
976 {"perf_timer", perf_timer
,
977 "Insert and cancel a SafeTimer"},
978 {"throw_int", throw_int
,
980 {"throw_int_call", throw_int_call
,
981 "Throw an int in a function call"},
982 {"throw_exception", throw_exception
,
983 "Throw an Exception"},
984 {"throw_exception_call", throw_exception_call
,
985 "Throw an Exception in a function call"},
986 {"vector_push_pop", vector_push_pop
,
987 "Push and pop a std::vector"},
988 {"ceph_clock_now", perf_ceph_clock_now
,
989 "ceph_clock_now function"},
993 * Runs a particular test and prints a one-line result message.
996 * Describes the test to run.
998 void run_test(TestInfo
& info
)
1000 double secs
= info
.func();
1001 int width
= printf("%-24s ", info
.name
);
1003 width
+= printf(" architecture nonsupport ");
1004 } else if (secs
< 1.0e-06) {
1005 width
+= printf("%8.2fns", 1e09
*secs
);
1006 } else if (secs
< 1.0e-03) {
1007 width
+= printf("%8.2fus", 1e06
*secs
);
1008 } else if (secs
< 1.0) {
1009 width
+= printf("%8.2fms", 1e03
*secs
);
1011 width
+= printf("%8.2fs", secs
);
1013 printf("%*s %s\n", 32-width
, "", info
.description
);
1016 int main(int argc
, char *argv
[])
1018 vector
<const char*> args
;
1019 argv_to_vec(argc
, (const char **)argv
, args
);
1021 auto cct
= global_init(NULL
, args
, CEPH_ENTITY_TYPE_CLIENT
,
1022 CODE_ENVIRONMENT_UTILITY
, 0);
1023 common_init_finish(g_ceph_context
);
1026 bind_thread_to_cpu(3);
1028 // No test names specified; run all tests.
1029 for (size_t i
= 0; i
< sizeof(tests
)/sizeof(TestInfo
); ++i
) {
1033 // Run only the tests that were specified on the command line.
1034 for (int i
= 1; i
< argc
; i
++) {
1035 bool found_test
= false;
1036 for (size_t j
= 0; j
< sizeof(tests
)/sizeof(TestInfo
); ++j
) {
1037 if (strcmp(argv
[i
], tests
[j
].name
) == 0) {
1044 int width
= printf("%-24s ??", argv
[i
]);
1045 printf("%*s No such test\n", 32-width
, "");