1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com>
4 * Copyright (c) 2011-2014 Stanford University
5 * Copyright (c) 2011 Facebook
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20 // This program contains a collection of low-level performance measurements
21 // for Ceph, which can be run either individually or all together. These
22 // tests measure performance in a single stand-alone process, not in a cluster
23 // with multiple servers. Invoke the program like this:
24 //
25 //     ceph_perf_local test1 test2 ...
26 //
27 // test1 and test2 are the names of individual performance measurements to
28 // run. If no test names are provided then all of the performance tests
29 // are run.
30 //
31 // To add a new test:
32 // * Write a function that implements the test. Use existing test functions
33 //      as a guideline, and be sure to return the average time per iteration
34 //      in seconds, as the other tests do.
35 // * Create a new entry for the test in the #tests table.
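//
// As a rough illustration (the test name, body, and description below are
// made up for this comment and are not part of the program), a new test
// usually pairs a timing function with a row in the tests[] table:
//
//   // Measure the cost of some operation.
//   double example_op()
//   {
//     int count = 1000000;
//     uint64_t start = Cycles::rdtsc();
//     for (int i = 0; i < count; i++) {
//       // ... the operation being measured ...
//     }
//     uint64_t stop = Cycles::rdtsc();
//     return Cycles::to_seconds(stop - start)/count;
//   }
//
//   {"example_op", example_op, "one-line description of example_op"},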
36 #include <vector>
37 #include <sched.h>
38
39 #include "acconfig.h"
40 #ifdef HAVE_SSE
41 #include <xmmintrin.h>
42 #endif
43
44 #include "include/buffer.h"
45 #include "include/encoding.h"
46 #include "include/ceph_hash.h"
47 #include "include/spinlock.h"
48 #include "common/ceph_argparse.h"
49 #include "common/Cycles.h"
50 #include "common/Cond.h"
51 #include "common/ceph_mutex.h"
52 #include "common/Thread.h"
53 #include "common/Timer.h"
54 #include "msg/async/Event.h"
55 #include "global/global_init.h"
56
57 #include "test/perf_helper.h"
58
59 #include <atomic>
60
61 using namespace ceph;
62
63 /**
64 * Ask the operating system to pin the current thread to a given CPU.
65 *
66 * \param cpu
67  *      The logical CPU that the current thread should be pinned to, as
68  *      passed to sched_setaffinity().
69 */
70 void bind_thread_to_cpu(int cpu)
71 {
72 #ifdef HAVE_SCHED
73 cpu_set_t set;
74 CPU_ZERO(&set);
75 CPU_SET(cpu, &set);
76 sched_setaffinity(0, sizeof(set), &set);
77 #endif
78 }
79
80 /*
81 * This function just discards its argument. It's used to make it
82 * appear that data is used, so that the compiler won't optimize
83 * away the code we're trying to measure.
84 *
85 * \param value
86 * Pointer to arbitrary value; it's discarded.
87 */
88 void discard(void* value) {
89 int x = *reinterpret_cast<int*>(value);
90 if (x == 0x43924776) {
91 printf("Value was 0x%x\n", x);
92 }
93 }
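
// Typical use of discard(): after a timing loop accumulates its work into a
// local variable (for example the `sum` in buffer_iterator below), pass the
// variable's address to discard() so the compiler cannot drop the loop as
// dead code.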
94
95 //----------------------------------------------------------------------
96 // Test functions start here
97 //----------------------------------------------------------------------
98
99 // Measure the cost of atomic compare-and-swap
100 double atomic_int_cmp()
101 {
102 int count = 1000000;
103 std::atomic<unsigned> value = { 11 };
104 unsigned int test = 11;
105 uint64_t start = Cycles::rdtsc();
106 for (int i = 0; i < count; i++) {
107 value.compare_exchange_strong(test, test+2);
108 test += 2;
109 }
110 uint64_t stop = Cycles::rdtsc();
111 // printf("Final value: %d\n", value.load());
112 return Cycles::to_seconds(stop - start)/count;
113 }
114
115 // Measure the cost of incrementing an atomic
116 double atomic_int_inc()
117 {
118 int count = 1000000;
119 std::atomic<int64_t> value = { 11 };
120 uint64_t start = Cycles::rdtsc();
121 for (int i = 0; i < count; i++) {
122 value++;
123 }
124 uint64_t stop = Cycles::rdtsc();
125 // printf("Final value: %d\n", value.load());
126 return Cycles::to_seconds(stop - start)/count;
127 }
128
129 // Measure the cost of reading an atomic
130 double atomic_int_read()
131 {
132 int count = 1000000;
133 std::atomic<int64_t> value = { 11 };
134 int total = 0;
135 uint64_t start = Cycles::rdtsc();
136 for (int i = 0; i < count; i++) {
137 total += value;
138 }
139 uint64_t stop = Cycles::rdtsc();
140 // printf("Total: %d\n", total);
141 return Cycles::to_seconds(stop - start)/count;
142 }
143
144 // Measure the cost of storing a new value in an atomic
145 double atomic_int_set()
146 {
147 int count = 1000000;
148 std::atomic<int64_t> value = { 11 };
149 uint64_t start = Cycles::rdtsc();
150 for (int i = 0; i < count; i++) {
151 value = 88;
152 }
153 uint64_t stop = Cycles::rdtsc();
154 return Cycles::to_seconds(stop - start)/count;
155 }
156
157 // Measure the cost of acquiring and releasing a mutex in the
158 // fast case where the mutex is free.
159 double mutex_nonblock()
160 {
161 int count = 1000000;
162 ceph::mutex m = ceph::make_mutex("mutex_nonblock::m");
163 uint64_t start = Cycles::rdtsc();
164 for (int i = 0; i < count; i++) {
165 m.lock();
166 m.unlock();
167 }
168 uint64_t stop = Cycles::rdtsc();
169 return Cycles::to_seconds(stop - start)/count;
170 }
171
172 // Measure the cost of allocating and deallocating a buffer, plus
173 // appending (logically) one ptr.
174 double buffer_basic()
175 {
176 int count = 1000000;
177 uint64_t start = Cycles::rdtsc();
178 bufferptr ptr("abcdefg", 7);
179 for (int i = 0; i < count; i++) {
180 bufferlist b;
181 b.append(ptr, 0, 5);
182 }
183 uint64_t stop = Cycles::rdtsc();
184 return Cycles::to_seconds(stop - start)/count;
185 }
186
187 struct DummyBlock {
188 int a = 1, b = 2, c = 3, d = 4;
189 void encode(bufferlist &bl) const {
190 ENCODE_START(1, 1, bl);
191 encode(a, bl);
192 encode(b, bl);
193 encode(c, bl);
194 encode(d, bl);
195 ENCODE_FINISH(bl);
196 }
197 void decode(bufferlist::const_iterator &bl) {
198 DECODE_START(1, bl);
199 decode(a, bl);
200 decode(b, bl);
201 decode(c, bl);
202 decode(d, bl);
203 DECODE_FINISH(bl);
204 }
205 };
206 WRITE_CLASS_ENCODER(DummyBlock)
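
// WRITE_CLASS_ENCODER generates the free-standing encode()/decode() wrapper
// functions for DummyBlock, which is what lets the tests below write a block
// with encode(dummy_block, b) and read it back through a const_iterator.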
207
208 // Measure the cost of encoding and decoding a buffer, plus
209 // allocating space for one chunk.
210 double buffer_encode_decode()
211 {
212 int count = 1000000;
213 uint64_t start = Cycles::rdtsc();
214 for (int i = 0; i < count; i++) {
215 bufferlist b;
216 DummyBlock dummy_block;
217 encode(dummy_block, b);
218 auto iter = b.cbegin();
219 decode(dummy_block, iter);
220 }
221 uint64_t stop = Cycles::rdtsc();
222 return Cycles::to_seconds(stop - start)/count;
223 }
224
225 // Measure the cost of allocating and deallocating a buffer, plus
226 // copying in a small block.
227 double buffer_basic_copy()
228 {
229 int count = 1000000;
230 uint64_t start = Cycles::rdtsc();
231 for (int i = 0; i < count; i++) {
232 bufferlist b;
233 b.append("abcdefg", 6);
234 }
235 uint64_t stop = Cycles::rdtsc();
236 return Cycles::to_seconds(stop - start)/count;
237 }
238
239 // Measure the cost of making a copy of parts of two ptrs.
240 double buffer_copy()
241 {
242 int count = 1000000;
243 bufferlist b;
244 b.append("abcde", 5);
245 b.append("01234", 5);
246 char copy[10];
247 uint64_t start = Cycles::rdtsc();
248 for (int i = 0; i < count; i++) {
249 b.cbegin(2).copy(6, copy);
250 }
251 uint64_t stop = Cycles::rdtsc();
252 return Cycles::to_seconds(stop - start)/count;
253 }
254
255 // Measure the cost of allocating new space by extending the
256 // bufferlist
257 double buffer_encode()
258 {
259 int count = 100000;
260 uint64_t total = 0;
261 for (int i = 0; i < count; i++) {
262 bufferlist b;
263 DummyBlock dummy_block;
264 encode(dummy_block, b);
265 uint64_t start = Cycles::rdtsc();
266 encode(dummy_block, b);
267 encode(dummy_block, b);
268 encode(dummy_block, b);
269 encode(dummy_block, b);
270 encode(dummy_block, b);
271 encode(dummy_block, b);
272 encode(dummy_block, b);
273 encode(dummy_block, b);
274 encode(dummy_block, b);
275 encode(dummy_block, b);
276 total += Cycles::rdtsc() - start;
277 }
278 return Cycles::to_seconds(total)/(count*10);
279 }
280
281 // Measure the cost of creating an iterator and iterating over 10
282 // chunks in a buffer.
283 double buffer_iterator()
284 {
285 bufferlist b;
286 const char s[] = "abcdefghijklmnopqrstuvwxyz";
287 bufferptr ptr(s, sizeof(s));
288 for (int i = 0; i < 5; i++) {
289 b.append(ptr, i, 5);
290 }
291 int count = 100000;
292 int sum = 0;
293 uint64_t start = Cycles::rdtsc();
294 for (int i = 0; i < count; i++) {
295 auto it = b.cbegin();
296 while (!it.end()) {
297 sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1];
298 ++it;
299 }
300 }
301 uint64_t stop = Cycles::rdtsc();
302 discard(&sum);
303 return Cycles::to_seconds(stop - start)/count;
304 }
305
306 // Implements the CondPingPong test.
307 class CondPingPong {
308 ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex");
309 ceph::condition_variable cond;
310 int prod = 0;
311 int cons = 0;
312 const int count = 10000;
313
314 class Consumer : public Thread {
315 CondPingPong *p;
316 public:
317 explicit Consumer(CondPingPong *p): p(p) {}
318 void* entry() override {
319 p->consume();
320 return 0;
321 }
322 } consumer;
323
324 public:
325 CondPingPong(): consumer(this) {}
326
327 double run() {
328 consumer.create("consumer");
329 uint64_t start = Cycles::rdtsc();
330 produce();
331 uint64_t stop = Cycles::rdtsc();
332 consumer.join();
333 return Cycles::to_seconds(stop - start)/count;
334 }
335
336 void produce() {
337 std::unique_lock l{mutex};
338 while (cons < count) {
339 cond.wait(l, [this] { return cons >= prod; });
340 ++prod;
341 cond.notify_all();
342 }
343 }
344
345 void consume() {
346 std::unique_lock l{mutex};
347 while (cons < count) {
348 cond.wait(l, [this] { return cons != prod; });
349 ++cons;
350 cond.notify_all();
351 }
352 }
353 };
354
355 // Measure the cost of coordinating between threads using a condition variable.
356 double cond_ping_pong()
357 {
358 return CondPingPong().run();
359 }
360
361 // Measure the cost of a 32-bit divide. Divides don't take a constant
362 // number of cycles. Values were chosen here semi-randomly to depict a
363 // fairly expensive scenario. Someone with fancy ALU knowledge could
364 // probably pick worse values.
365 double div32()
366 {
367 #if defined(__i386__) || defined(__x86_64__)
368 int count = 1000000;
369 uint64_t start = Cycles::rdtsc();
370   // NB: Expect an x86 processor exception if there's overflow.
371 uint32_t numeratorHi = 0xa5a5a5a5U;
372 uint32_t numeratorLo = 0x55aa55aaU;
373 uint32_t divisor = 0xaa55aa55U;
374 uint32_t quotient;
375 uint32_t remainder;
376 for (int i = 0; i < count; i++) {
377 __asm__ __volatile__("div %4" :
378 "=a"(quotient), "=d"(remainder) :
379 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
380 "cc");
381 }
382 uint64_t stop = Cycles::rdtsc();
383 return Cycles::to_seconds(stop - start)/count;
384 #else
385 return -1;
386 #endif
387 }
388
389 // Measure the cost of a 64-bit divide. Divides don't take a constant
390 // number of cycles. Values were chosen here semi-randomly to depict a
391 // fairly expensive scenario. Someone with fancy ALU knowledge could
392 // probably pick worse values.
393 double div64()
394 {
395 #if defined(__x86_64__) || defined(__amd64__)
396 int count = 1000000;
397   // NB: Expect an x86 processor exception if there's overflow.
398 uint64_t start = Cycles::rdtsc();
399 uint64_t numeratorHi = 0x5a5a5a5a5a5UL;
400 uint64_t numeratorLo = 0x55aa55aa55aa55aaUL;
401 uint64_t divisor = 0xaa55aa55aa55aa55UL;
402 uint64_t quotient;
403 uint64_t remainder;
404 for (int i = 0; i < count; i++) {
405 __asm__ __volatile__("divq %4" :
406 "=a"(quotient), "=d"(remainder) :
407 "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) :
408 "cc");
409 }
410 uint64_t stop = Cycles::rdtsc();
411 return Cycles::to_seconds(stop - start)/count;
412 #else
413 return -1;
414 #endif
415 }
416
417 // Measure the cost of calling a non-inlined function.
418 double function_call()
419 {
420 int count = 1000000;
421 uint64_t x = 0;
422 uint64_t start = Cycles::rdtsc();
423 for (int i = 0; i < count; i++) {
424 x = PerfHelper::plus_one(x);
425 }
426 uint64_t stop = Cycles::rdtsc();
427 return Cycles::to_seconds(stop - start)/count;
428 }
429
430 // Measure the minimum cost of EventCenter::process_events, when there are no
431 // Pollers and no Timers.
432 double eventcenter_poll()
433 {
434 int count = 1000000;
435 EventCenter center(g_ceph_context);
436 center.init(1000, 0, "posix");
437 center.set_owner();
438 uint64_t start = Cycles::rdtsc();
439 for (int i = 0; i < count; i++) {
440 center.process_events(0);
441 }
442 uint64_t stop = Cycles::rdtsc();
443 return Cycles::to_seconds(stop - start)/count;
444 }
445
446 class CenterWorker : public Thread {
447 CephContext *cct;
448 bool done;
449
450 public:
451 EventCenter center;
452 explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) {
453 center.init(100, 0, "posix");
454 }
455 void stop() {
456 done = true;
457 center.wakeup();
458 }
459 void* entry() override {
460 center.set_owner();
461 bind_thread_to_cpu(2);
462 while (!done)
463 center.process_events(1000);
464 return 0;
465 }
466 };
467
468 class CountEvent: public EventCallback {
469 std::atomic<int64_t> *count;
470
471 public:
472 explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {}
473 void do_request(uint64_t id) override {
474 (*count)--;
475 }
476 };
477
478 double eventcenter_dispatch()
479 {
480 int count = 100000;
481
482 CenterWorker worker(g_ceph_context);
483 std::atomic<int64_t> flag = { 1 };
484 worker.create("evt_center_disp");
485 EventCallbackRef count_event(new CountEvent(&flag));
486
487 worker.center.dispatch_event_external(count_event);
488   // Wait until the worker thread is up and has consumed the first event.
489 while (flag)
490 usleep(100);
491
492 uint64_t start = Cycles::rdtsc();
493 for (int i = 0; i < count; i++) {
494 flag = 1;
495 worker.center.dispatch_event_external(count_event);
496 while (flag)
497 ;
498 }
499 uint64_t stop = Cycles::rdtsc();
500 worker.stop();
501 worker.join();
502 return Cycles::to_seconds(stop - start)/count;
503 }
504
505 // Measure the cost of copying a given number of bytes with memcpy.
506 double memcpy_shared(size_t size)
507 {
508 int count = 1000000;
509 char src[size], dst[size];
510
511 memset(src, 0, sizeof(src));
512
513 uint64_t start = Cycles::rdtsc();
514 for (int i = 0; i < count; i++) {
515 memcpy(dst, src, size);
516 }
517 uint64_t stop = Cycles::rdtsc();
518 return Cycles::to_seconds(stop - start)/count;
519 }
520
521 double memcpy100()
522 {
523 return memcpy_shared(100);
524 }
525
526 double memcpy1000()
527 {
528 return memcpy_shared(1000);
529 }
530
531 double memcpy10000()
532 {
533 return memcpy_shared(10000);
534 }
535
536 // Benchmark rjenkins hashing performance on cached data.
537 template <int key_length>
538 double ceph_str_hash_rjenkins()
539 {
540 int count = 100000;
541 char buf[key_length];
542
543 uint64_t start = Cycles::rdtsc();
544 for (int i = 0; i < count; i++)
545 ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf));
546 uint64_t stop = Cycles::rdtsc();
547
548 return Cycles::to_seconds(stop - start)/count;
549 }
550
551 // Measure the cost of reading the fine-grain cycle counter.
552 double rdtsc_test()
553 {
554 int count = 1000000;
555 uint64_t start = Cycles::rdtsc();
556 uint64_t total = 0;
557 for (int i = 0; i < count; i++) {
558 total += Cycles::rdtsc();
559 }
560 uint64_t stop = Cycles::rdtsc();
561 return Cycles::to_seconds(stop - start)/count;
562 }
563
564 // Measure the cost of the Cycles::to_seconds method.
565 double perf_cycles_to_seconds()
566 {
567 int count = 1000000;
568 double total = 0;
569 uint64_t cycles = 994261;
570 uint64_t start = Cycles::rdtsc();
571 for (int i = 0; i < count; i++) {
572 total += Cycles::to_seconds(cycles);
573 }
574 uint64_t stop = Cycles::rdtsc();
575 // printf("Result: %.4f\n", total/count);
576 return Cycles::to_seconds(stop - start)/count;
577 }
578
579 // Measure the cost of the Cycles::to_nanoseconds method.
580 double perf_cycles_to_nanoseconds()
581 {
582 int count = 1000000;
583 uint64_t total = 0;
584 uint64_t cycles = 994261;
585 uint64_t start = Cycles::rdtsc();
586 for (int i = 0; i < count; i++) {
587 total += Cycles::to_nanoseconds(cycles);
588 }
589 uint64_t stop = Cycles::rdtsc();
590 // printf("Result: %lu\n", total/count);
591 return Cycles::to_seconds(stop - start)/count;
592 }
593
594
595 #ifdef HAVE_SSE
596 /**
597 * Prefetch the cache lines containing [object, object + numBytes) into the
598 * processor's caches.
599 * The best docs for this are in the Intel instruction set reference under
600 * PREFETCH.
601 * \param object
602 * The start of the region of memory to prefetch.
603 * \param num_bytes
604 * The size of the region of memory to prefetch.
605 */
606 static inline void prefetch(const void *object, uint64_t num_bytes)
607 {
608 uint64_t offset = reinterpret_cast<uint64_t>(object) & 0x3fUL;
609 const char* p = reinterpret_cast<const char*>(object) - offset;
610 for (uint64_t i = 0; i < offset + num_bytes; i += 64)
611 _mm_prefetch(p + i, _MM_HINT_T0);
612 }
613 #endif
614
615 // Measure the cost of the prefetch instruction.
616 double perf_prefetch()
617 {
618 #ifdef HAVE_SSE
619 uint64_t total_ticks = 0;
620 int count = 10;
621 char buf[16 * 64];
622
623 for (int i = 0; i < count; i++) {
624 PerfHelper::flush_cache();
625 uint64_t start = Cycles::rdtsc();
626 prefetch(&buf[576], 64);
627 prefetch(&buf[0], 64);
628 prefetch(&buf[512], 64);
629 prefetch(&buf[960], 64);
630 prefetch(&buf[640], 64);
631 prefetch(&buf[896], 64);
632 prefetch(&buf[256], 64);
633 prefetch(&buf[704], 64);
634 prefetch(&buf[320], 64);
635 prefetch(&buf[384], 64);
636 prefetch(&buf[128], 64);
637 prefetch(&buf[448], 64);
638 prefetch(&buf[768], 64);
639 prefetch(&buf[832], 64);
640 prefetch(&buf[64], 64);
641 prefetch(&buf[192], 64);
642 uint64_t stop = Cycles::rdtsc();
643 total_ticks += stop - start;
644 }
645 return Cycles::to_seconds(total_ticks) / count / 16;
646 #else
647 return -1;
648 #endif
649 }
650
651 #if defined(__x86_64__)
652 /**
653  * This function is used to serialize machine instructions so that no
654 * instructions that appear after it in the current thread can run before any
655 * instructions that appear before it.
656 *
657 * It is useful for putting around rdpmc instructions (to pinpoint cache
658  * misses) as well as before rdtsc instructions, so that instructions meant
659  * to complete before the timer starts cannot pollute the measurement.
660 */
661 static inline void serialize() {
662 uint32_t eax, ebx, ecx, edx;
663 __asm volatile("cpuid"
664 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
665 : "a" (1U));
666 }
667 #endif
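
// For illustration only (no test in this file uses this exact pattern), a
// measurement that wants to keep earlier instructions out of the timed
// region could bracket it like this:
//
//   serialize();
//   uint64_t start = Cycles::rdtsc();
//   // ... operation being measured ...
//   serialize();
//   uint64_t stop = Cycles::rdtsc();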
668
669 // Measure the cost of cpuid
670 double perf_serialize() {
671 #if defined(__x86_64__)
672 int count = 1000000;
673 uint64_t start = Cycles::rdtsc();
674 for (int i = 0; i < count; i++) {
675 serialize();
676 }
677 uint64_t stop = Cycles::rdtsc();
678 return Cycles::to_seconds(stop - start)/count;
679 #else
680 return -1;
681 #endif
682 }
683
684 // Measure the cost of an lfence instruction.
685 double lfence()
686 {
687 #ifdef HAVE_SSE2
688 int count = 1000000;
689 uint64_t start = Cycles::rdtsc();
690 for (int i = 0; i < count; i++) {
691 __asm__ __volatile__("lfence" ::: "memory");
692 }
693 uint64_t stop = Cycles::rdtsc();
694 return Cycles::to_seconds(stop - start)/count;
695 #else
696 return -1;
697 #endif
698 }
699
700 // Measure the cost of an sfence instruction.
701 double sfence()
702 {
703 #ifdef HAVE_SSE
704 int count = 1000000;
705 uint64_t start = Cycles::rdtsc();
706 for (int i = 0; i < count; i++) {
707 __asm__ __volatile__("sfence" ::: "memory");
708 }
709 uint64_t stop = Cycles::rdtsc();
710 return Cycles::to_seconds(stop - start)/count;
711 #else
712 return -1;
713 #endif
714 }
715
716 // Measure the cost of acquiring and releasing a SpinLock (assuming the
717 // lock is initially free).
718 double test_spinlock()
719 {
720 int count = 1000000;
721 ceph::spinlock lock;
722 uint64_t start = Cycles::rdtsc();
723 for (int i = 0; i < count; i++) {
724 lock.lock();
725 lock.unlock();
726 }
727 uint64_t stop = Cycles::rdtsc();
728 return Cycles::to_seconds(stop - start)/count;
729 }
730
731 // Helper for spawn_thread: a thread whose entry function is intentionally
732 // empty.
733 class ThreadHelper : public Thread {
734 void *entry() override { return 0; }
735 };
736
737 // Measure the cost of starting and joining a thread.
738 double spawn_thread()
739 {
740 int count = 10000;
741 ThreadHelper thread;
742 uint64_t start = Cycles::rdtsc();
743 for (int i = 0; i < count; i++) {
744 thread.create("thread_helper");
745 thread.join();
746 }
747 uint64_t stop = Cycles::rdtsc();
748 return Cycles::to_seconds(stop - start)/count;
749 }
750
751 class FakeContext : public Context {
752 public:
753 void finish(int r) override {}
754 };
755
756 // Measure the cost of scheduling and canceling a SafeTimer event.
757 double perf_timer()
758 {
759 int count = 1000000;
760 ceph::mutex lock = ceph::make_mutex("perf_timer::lock");
761 SafeTimer timer(g_ceph_context, lock);
762 FakeContext **c = new FakeContext*[count];
763 for (int i = 0; i < count; i++) {
764 c[i] = new FakeContext();
765 }
766 uint64_t start = Cycles::rdtsc();
767 std::lock_guard l{lock};
768 for (int i = 0; i < count; i++) {
769 if (timer.add_event_after(12345, c[i])) {
770 timer.cancel_event(c[i]);
771 }
772 }
773 uint64_t stop = Cycles::rdtsc();
774 delete[] c;
775 return Cycles::to_seconds(stop - start)/count;
776 }
777
778 // Measure the cost of throwing and catching an int. This uses an integer as
779 // the value thrown, which is presumably as fast as possible.
780 double throw_int()
781 {
782 int count = 10000;
783 uint64_t start = Cycles::rdtsc();
784 for (int i = 0; i < count; i++) {
785 try {
786 throw 0;
787 } catch (int) { // NOLINT
788 // pass
789 }
790 }
791 uint64_t stop = Cycles::rdtsc();
792 return Cycles::to_seconds(stop - start)/count;
793 }
794
795 // Measure the cost of throwing and catching an int from a function call.
796 double throw_int_call()
797 {
798 int count = 10000;
799 uint64_t start = Cycles::rdtsc();
800 for (int i = 0; i < count; i++) {
801 try {
802 PerfHelper::throw_int();
803 } catch (int) { // NOLINT
804 // pass
805 }
806 }
807 uint64_t stop = Cycles::rdtsc();
808 return Cycles::to_seconds(stop - start)/count;
809 }
810
811 // Measure the cost of throwing and catching an Exception. This uses an actual
812 // exception as the value thrown, which may be slower than throw_int.
813 double throw_exception()
814 {
815 int count = 10000;
816 uint64_t start = Cycles::rdtsc();
817 for (int i = 0; i < count; i++) {
818 try {
819 throw buffer::end_of_buffer();
820 } catch (const buffer::end_of_buffer&) {
821 // pass
822 }
823 }
824 uint64_t stop = Cycles::rdtsc();
825 return Cycles::to_seconds(stop - start)/count;
826 }
827
828 // Measure the cost of throwing and catching an Exception from a function call.
829 double throw_exception_call()
830 {
831 int count = 10000;
832 uint64_t start = Cycles::rdtsc();
833 for (int i = 0; i < count; i++) {
834 try {
835 PerfHelper::throw_end_of_buffer();
836 } catch (const buffer::end_of_buffer&) {
837 // pass
838 }
839 }
840 uint64_t stop = Cycles::rdtsc();
841 return Cycles::to_seconds(stop - start)/count;
842 }
843
844 // Measure the cost of pushing a new element on a std::vector, copying
845 // from the end to an internal element, and popping the end element.
846 double vector_push_pop()
847 {
848 int count = 100000;
849 std::vector<int> vector;
850 vector.push_back(1);
851 vector.push_back(2);
852 vector.push_back(3);
853 uint64_t start = Cycles::rdtsc();
854 for (int i = 0; i < count; i++) {
855 vector.push_back(i);
856 vector.push_back(i+1);
857 vector.push_back(i+2);
858 vector[2] = vector.back();
859 vector.pop_back();
860 vector[0] = vector.back();
861 vector.pop_back();
862 vector[1] = vector.back();
863 vector.pop_back();
864 }
865 uint64_t stop = Cycles::rdtsc();
866 return Cycles::to_seconds(stop - start)/(count*3);
867 }
868
869 // Measure the cost of ceph_clock_now
870 double perf_ceph_clock_now()
871 {
872 int count = 100000;
873 uint64_t start = Cycles::rdtsc();
874 for (int i = 0; i < count; i++) {
875 ceph_clock_now();
876 }
877 uint64_t stop = Cycles::rdtsc();
878 return Cycles::to_seconds(stop - start)/count;
879 }
880
881 // The following struct and table define each performance test in terms of
882 // a string name and a function that implements the test.
883 struct TestInfo {
884 const char* name; // Name of the performance test; this is
885 // what gets typed on the command line to
886 // run the test.
887 double (*func)(); // Function that implements the test;
888 // returns the time (in seconds) for each
889 // iteration of that test.
890 const char *description; // Short description of this test (not more
891 // than about 40 characters, so the entire
892 // test output fits on a single line).
893 };
894 TestInfo tests[] = {
895 {"atomic_int_cmp", atomic_int_cmp,
896 "atomic_t::compare_and_swap"},
897 {"atomic_int_inc", atomic_int_inc,
898 "atomic_t::inc"},
899 {"atomic_int_read", atomic_int_read,
900 "atomic_t::read"},
901 {"atomic_int_set", atomic_int_set,
902 "atomic_t::set"},
903 {"mutex_nonblock", mutex_nonblock,
904 "Mutex lock/unlock (no blocking)"},
905 {"buffer_basic", buffer_basic,
906 "buffer create, add one ptr, delete"},
907 {"buffer_encode_decode", buffer_encode_decode,
908 "buffer create, encode/decode object, delete"},
909 {"buffer_basic_copy", buffer_basic_copy,
910 "buffer create, copy small block, delete"},
911 {"buffer_copy", buffer_copy,
912 "copy out 2 small ptrs from buffer"},
913 {"buffer_encode10", buffer_encode,
914 "buffer encoding 10 structures onto existing ptr"},
915 {"buffer_iterator", buffer_iterator,
916 "iterate over buffer with 5 ptrs"},
917 {"cond_ping_pong", cond_ping_pong,
918 "condition variable round-trip"},
919 {"div32", div32,
920 "32-bit integer division instruction"},
921 {"div64", div64,
922 "64-bit integer division instruction"},
923 {"function_call", function_call,
924 "Call a function that has not been inlined"},
925 {"eventcenter_poll", eventcenter_poll,
926 "EventCenter::process_events (no timers or events)"},
927 {"eventcenter_dispatch", eventcenter_dispatch,
928 "EventCenter::dispatch_event_external latency"},
929 {"memcpy100", memcpy100,
930 "Copy 100 bytes with memcpy"},
931 {"memcpy1000", memcpy1000,
932 "Copy 1000 bytes with memcpy"},
933 {"memcpy10000", memcpy10000,
934 "Copy 10000 bytes with memcpy"},
935 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>,
936    "rjenkins hash on 16 bytes of data"},
937 {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>,
938 "rjenkins hash on 256 bytes of data"},
939 {"rdtsc", rdtsc_test,
940 "Read the fine-grain cycle counter"},
941 {"cycles_to_seconds", perf_cycles_to_seconds,
942 "Convert a rdtsc result to (double) seconds"},
943   {"cycles_to_nanoseconds", perf_cycles_to_nanoseconds,
944 "Convert a rdtsc result to (uint64_t) nanoseconds"},
945 {"prefetch", perf_prefetch,
946 "Prefetch instruction"},
947 {"serialize", perf_serialize,
948 "serialize instruction"},
949 {"lfence", lfence,
950 "Lfence instruction"},
951 {"sfence", sfence,
952 "Sfence instruction"},
953 {"spin_lock", test_spinlock,
954 "Acquire/release SpinLock"},
955 {"spawn_thread", spawn_thread,
956 "Start and stop a thread"},
957 {"perf_timer", perf_timer,
958 "Insert and cancel a SafeTimer"},
959 {"throw_int", throw_int,
960 "Throw an int"},
961 {"throw_int_call", throw_int_call,
962 "Throw an int in a function call"},
963 {"throw_exception", throw_exception,
964 "Throw an Exception"},
965 {"throw_exception_call", throw_exception_call,
966 "Throw an Exception in a function call"},
967 {"vector_push_pop", vector_push_pop,
968 "Push and pop a std::vector"},
969 {"ceph_clock_now", perf_ceph_clock_now,
970 "ceph_clock_now function"},
971 };
972
973 /**
974 * Runs a particular test and prints a one-line result message.
975 *
976 * \param info
977 * Describes the test to run.
978 */
979 void run_test(TestInfo& info)
980 {
981 double secs = info.func();
982 int width = printf("%-24s ", info.name);
983 if (secs == -1) {
984     width += printf(" architecture not supported ");
985 } else if (secs < 1.0e-06) {
986 width += printf("%8.2fns", 1e09*secs);
987 } else if (secs < 1.0e-03) {
988 width += printf("%8.2fus", 1e06*secs);
989 } else if (secs < 1.0) {
990 width += printf("%8.2fms", 1e03*secs);
991 } else {
992 width += printf("%8.2fs", secs);
993 }
994 printf("%*s %s\n", 32-width, "", info.description);
995 }
996
997 int main(int argc, char *argv[])
998 {
999 vector<const char*> args;
1000 argv_to_vec(argc, (const char **)argv, args);
1001
1002 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
1003 CODE_ENVIRONMENT_UTILITY,
1004 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
1005 common_init_finish(g_ceph_context);
1006 Cycles::init();
1007
1008 bind_thread_to_cpu(3);
1009 if (argc == 1) {
1010 // No test names specified; run all tests.
1011 for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) {
1012 run_test(tests[i]);
1013 }
1014 } else {
1015 // Run only the tests that were specified on the command line.
1016 for (int i = 1; i < argc; i++) {
1017 bool found_test = false;
1018 for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) {
1019 if (strcmp(argv[i], tests[j].name) == 0) {
1020 found_test = true;
1021 run_test(tests[j]);
1022 break;
1023 }
1024 }
1025 if (!found_test) {
1026 int width = printf("%-24s ??", argv[i]);
1027 printf("%*s No such test\n", 32-width, "");
1028 }
1029 }
1030 }
1031 }