]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com> | |
4 | * Copyright (c) 2011-2014 Stanford University | |
5 | * Copyright (c) 2011 Facebook | |
6 | * | |
7 | * Permission to use, copy, modify, and distribute this software for any | |
8 | * purpose with or without fee is hereby granted, provided that the above | |
9 | * copyright notice and this permission notice appear in all copies. | |
10 | * | |
11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES | |
12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR | |
14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
18 | */ | |
19 | ||
20 | // This program contains a collection of low-level performance measurements | |
21 | // for Ceph, which can be run either individually or altogether. These | |
22 | // tests measure performance in a single stand-alone process, not in a cluster | |
23 | // with multiple servers. Invoke the program like this: | |
24 | // | |
25 | // Perf test1 test2 ... | |
26 | // | |
27 | // test1 and test2 are the names of individual performance measurements to | |
28 | // run. If no test names are provided then all of the performance tests | |
29 | // are run. | |
30 | // | |
31 | // To add a new test: | |
32 | // * Write a function that implements the test. Use existing test functions | |
33 | // as a guideline, and be sure to generate output in the same form as | |
34 | // other tests. | |
35 | // * Create a new entry for the test in the #tests table. | |
36 | #include <vector> | |
37 | #include <sched.h> | |
38 | ||
39 | #include "acconfig.h" | |
40 | #ifdef HAVE_SSE | |
41 | #include <xmmintrin.h> | |
42 | #endif | |
43 | ||
7c673cae FG |
44 | #include "include/buffer.h" |
45 | #include "include/encoding.h" | |
46 | #include "include/ceph_hash.h" | |
11fdf7f2 | 47 | #include "include/spinlock.h" |
7c673cae FG |
48 | #include "common/ceph_argparse.h" |
49 | #include "common/Cycles.h" | |
50 | #include "common/Cond.h" | |
9f95a23c | 51 | #include "common/ceph_mutex.h" |
7c673cae FG |
52 | #include "common/Thread.h" |
53 | #include "common/Timer.h" | |
54 | #include "msg/async/Event.h" | |
55 | #include "global/global_init.h" | |
56 | ||
57 | #include "test/perf_helper.h" | |
58 | ||
31f18b77 FG |
59 | #include <atomic> |
60 | ||
20effc67 | 61 | using namespace std; |
7c673cae FG |
62 | using namespace ceph; |
63 | ||
/**
 * Ask the operating system to restrict the calling thread to a single CPU.
 *
 * When HAVE_SCHED is not defined this function does nothing.
 *
 * \param cpu
 *      Index of the CPU placed in the affinity mask that is handed to
 *      sched_setaffinity().
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t allowed;
  CPU_ZERO(&allowed);
  CPU_SET(cpu, &allowed);
  // A pid of 0 targets the caller. The return value is deliberately not
  // inspected: pinning is best-effort for these benchmarks.
  sched_setaffinity(0, sizeof(allowed), &allowed);
#endif
}
80 | ||
/*
 * Pretend to consume a value so that the compiler cannot optimize away the
 * computation that produced it.
 *
 * \param value
 *      Pointer to the data; the int it points at is read and compared with
 *      a magic constant, and printed only if the two happen to match.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
95 | ||
96 | //---------------------------------------------------------------------- | |
97 | // Test functions start here | |
98 | //---------------------------------------------------------------------- | |
99 | ||
31f18b77 | 100 | // Measure the cost of atomic compare-and-swap |
7c673cae FG |
101 | double atomic_int_cmp() |
102 | { | |
103 | int count = 1000000; | |
31f18b77 FG |
104 | std::atomic<unsigned> value = { 11 }; |
105 | unsigned int test = 11; | |
7c673cae FG |
106 | uint64_t start = Cycles::rdtsc(); |
107 | for (int i = 0; i < count; i++) { | |
31f18b77 | 108 | value.compare_exchange_strong(test, test+2); |
7c673cae FG |
109 | test += 2; |
110 | } | |
111 | uint64_t stop = Cycles::rdtsc(); | |
112 | // printf("Final value: %d\n", value.load()); | |
113 | return Cycles::to_seconds(stop - start)/count; | |
114 | } | |
115 | ||
31f18b77 | 116 | // Measure the cost of incrementing an atomic |
7c673cae FG |
117 | double atomic_int_inc() |
118 | { | |
119 | int count = 1000000; | |
31f18b77 | 120 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
121 | uint64_t start = Cycles::rdtsc(); |
122 | for (int i = 0; i < count; i++) { | |
31f18b77 | 123 | value++; |
7c673cae FG |
124 | } |
125 | uint64_t stop = Cycles::rdtsc(); | |
126 | // printf("Final value: %d\n", value.load()); | |
127 | return Cycles::to_seconds(stop - start)/count; | |
128 | } | |
129 | ||
31f18b77 | 130 | // Measure the cost of reading an atomic |
7c673cae FG |
131 | double atomic_int_read() |
132 | { | |
133 | int count = 1000000; | |
31f18b77 | 134 | std::atomic<int64_t> value = { 11 }; |
1e59de90 | 135 | [[maybe_unused]] int total = 0; |
7c673cae FG |
136 | uint64_t start = Cycles::rdtsc(); |
137 | for (int i = 0; i < count; i++) { | |
31f18b77 | 138 | total += value; |
7c673cae FG |
139 | } |
140 | uint64_t stop = Cycles::rdtsc(); | |
141 | // printf("Total: %d\n", total); | |
142 | return Cycles::to_seconds(stop - start)/count; | |
143 | } | |
144 | ||
31f18b77 | 145 | // Measure the cost of storing a new value in an atomic |
7c673cae FG |
146 | double atomic_int_set() |
147 | { | |
148 | int count = 1000000; | |
31f18b77 | 149 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
150 | uint64_t start = Cycles::rdtsc(); |
151 | for (int i = 0; i < count; i++) { | |
31f18b77 | 152 | value = 88; |
7c673cae FG |
153 | } |
154 | uint64_t stop = Cycles::rdtsc(); | |
155 | return Cycles::to_seconds(stop - start)/count; | |
156 | } | |
157 | ||
158 | // Measure the cost of acquiring and releasing a mutex in the | |
159 | // fast case where the mutex is free. | |
160 | double mutex_nonblock() | |
161 | { | |
162 | int count = 1000000; | |
9f95a23c | 163 | ceph::mutex m = ceph::make_mutex("mutex_nonblock::m"); |
7c673cae FG |
164 | uint64_t start = Cycles::rdtsc(); |
165 | for (int i = 0; i < count; i++) { | |
9f95a23c TL |
166 | m.lock(); |
167 | m.unlock(); | |
7c673cae FG |
168 | } |
169 | uint64_t stop = Cycles::rdtsc(); | |
170 | return Cycles::to_seconds(stop - start)/count; | |
171 | } | |
172 | ||
173 | // Measure the cost of allocating and deallocating a buffer, plus | |
174 | // appending (logically) one ptr. | |
175 | double buffer_basic() | |
176 | { | |
177 | int count = 1000000; | |
178 | uint64_t start = Cycles::rdtsc(); | |
179 | bufferptr ptr("abcdefg", 7); | |
180 | for (int i = 0; i < count; i++) { | |
181 | bufferlist b; | |
182 | b.append(ptr, 0, 5); | |
183 | } | |
184 | uint64_t stop = Cycles::rdtsc(); | |
185 | return Cycles::to_seconds(stop - start)/count; | |
186 | } | |
187 | ||
188 | struct DummyBlock { | |
189 | int a = 1, b = 2, c = 3, d = 4; | |
190 | void encode(bufferlist &bl) const { | |
191 | ENCODE_START(1, 1, bl); | |
11fdf7f2 TL |
192 | encode(a, bl); |
193 | encode(b, bl); | |
194 | encode(c, bl); | |
195 | encode(d, bl); | |
7c673cae FG |
196 | ENCODE_FINISH(bl); |
197 | } | |
11fdf7f2 | 198 | void decode(bufferlist::const_iterator &bl) { |
7c673cae | 199 | DECODE_START(1, bl); |
11fdf7f2 TL |
200 | decode(a, bl); |
201 | decode(b, bl); | |
202 | decode(c, bl); | |
203 | decode(d, bl); | |
7c673cae FG |
204 | DECODE_FINISH(bl); |
205 | } | |
206 | }; | |
207 | WRITE_CLASS_ENCODER(DummyBlock) | |
208 | ||
209 | // Measure the cost of encoding and decoding a buffer, plus | |
210 | // allocating space for one chunk. | |
211 | double buffer_encode_decode() | |
212 | { | |
213 | int count = 1000000; | |
214 | uint64_t start = Cycles::rdtsc(); | |
215 | for (int i = 0; i < count; i++) { | |
216 | bufferlist b; | |
217 | DummyBlock dummy_block; | |
11fdf7f2 TL |
218 | encode(dummy_block, b); |
219 | auto iter = b.cbegin(); | |
220 | decode(dummy_block, iter); | |
7c673cae FG |
221 | } |
222 | uint64_t stop = Cycles::rdtsc(); | |
223 | return Cycles::to_seconds(stop - start)/count; | |
224 | } | |
225 | ||
226 | // Measure the cost of allocating and deallocating a buffer, plus | |
227 | // copying in a small block. | |
228 | double buffer_basic_copy() | |
229 | { | |
230 | int count = 1000000; | |
231 | uint64_t start = Cycles::rdtsc(); | |
232 | for (int i = 0; i < count; i++) { | |
233 | bufferlist b; | |
234 | b.append("abcdefg", 6); | |
235 | } | |
236 | uint64_t stop = Cycles::rdtsc(); | |
237 | return Cycles::to_seconds(stop - start)/count; | |
238 | } | |
239 | ||
240 | // Measure the cost of making a copy of parts of two ptrs. | |
241 | double buffer_copy() | |
242 | { | |
243 | int count = 1000000; | |
244 | bufferlist b; | |
245 | b.append("abcde", 5); | |
246 | b.append("01234", 5); | |
247 | char copy[10]; | |
248 | uint64_t start = Cycles::rdtsc(); | |
249 | for (int i = 0; i < count; i++) { | |
9f95a23c | 250 | b.cbegin(2).copy(6, copy); |
7c673cae FG |
251 | } |
252 | uint64_t stop = Cycles::rdtsc(); | |
253 | return Cycles::to_seconds(stop - start)/count; | |
254 | } | |
255 | ||
256 | // Measure the cost of allocating new space by extending the | |
257 | // bufferlist | |
258 | double buffer_encode() | |
259 | { | |
260 | int count = 100000; | |
261 | uint64_t total = 0; | |
262 | for (int i = 0; i < count; i++) { | |
263 | bufferlist b; | |
264 | DummyBlock dummy_block; | |
11fdf7f2 | 265 | encode(dummy_block, b); |
7c673cae | 266 | uint64_t start = Cycles::rdtsc(); |
11fdf7f2 TL |
267 | encode(dummy_block, b); |
268 | encode(dummy_block, b); | |
269 | encode(dummy_block, b); | |
270 | encode(dummy_block, b); | |
271 | encode(dummy_block, b); | |
272 | encode(dummy_block, b); | |
273 | encode(dummy_block, b); | |
274 | encode(dummy_block, b); | |
275 | encode(dummy_block, b); | |
276 | encode(dummy_block, b); | |
7c673cae FG |
277 | total += Cycles::rdtsc() - start; |
278 | } | |
279 | return Cycles::to_seconds(total)/(count*10); | |
280 | } | |
281 | ||
7c673cae FG |
282 | // Measure the cost of creating an iterator and iterating over 10 |
283 | // chunks in a buffer. | |
284 | double buffer_iterator() | |
285 | { | |
286 | bufferlist b; | |
287 | const char s[] = "abcdefghijklmnopqrstuvwxyz"; | |
288 | bufferptr ptr(s, sizeof(s)); | |
289 | for (int i = 0; i < 5; i++) { | |
290 | b.append(ptr, i, 5); | |
291 | } | |
292 | int count = 100000; | |
293 | int sum = 0; | |
294 | uint64_t start = Cycles::rdtsc(); | |
295 | for (int i = 0; i < count; i++) { | |
11fdf7f2 | 296 | auto it = b.cbegin(); |
7c673cae FG |
297 | while (!it.end()) { |
298 | sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1]; | |
299 | ++it; | |
300 | } | |
301 | } | |
302 | uint64_t stop = Cycles::rdtsc(); | |
303 | discard(&sum); | |
304 | return Cycles::to_seconds(stop - start)/count; | |
305 | } | |
306 | ||
307 | // Implements the CondPingPong test. | |
308 | class CondPingPong { | |
9f95a23c TL |
309 | ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex"); |
310 | ceph::condition_variable cond; | |
311 | int prod = 0; | |
312 | int cons = 0; | |
313 | const int count = 10000; | |
7c673cae FG |
314 | |
315 | class Consumer : public Thread { | |
316 | CondPingPong *p; | |
317 | public: | |
318 | explicit Consumer(CondPingPong *p): p(p) {} | |
319 | void* entry() override { | |
320 | p->consume(); | |
321 | return 0; | |
322 | } | |
323 | } consumer; | |
324 | ||
325 | public: | |
9f95a23c | 326 | CondPingPong(): consumer(this) {} |
7c673cae FG |
327 | |
328 | double run() { | |
329 | consumer.create("consumer"); | |
330 | uint64_t start = Cycles::rdtsc(); | |
331 | produce(); | |
332 | uint64_t stop = Cycles::rdtsc(); | |
333 | consumer.join(); | |
334 | return Cycles::to_seconds(stop - start)/count; | |
335 | } | |
336 | ||
337 | void produce() { | |
9f95a23c | 338 | std::unique_lock l{mutex}; |
7c673cae | 339 | while (cons < count) { |
9f95a23c | 340 | cond.wait(l, [this] { return cons >= prod; }); |
7c673cae | 341 | ++prod; |
9f95a23c | 342 | cond.notify_all(); |
7c673cae FG |
343 | } |
344 | } | |
345 | ||
346 | void consume() { | |
9f95a23c | 347 | std::unique_lock l{mutex}; |
7c673cae | 348 | while (cons < count) { |
9f95a23c | 349 | cond.wait(l, [this] { return cons != prod; }); |
7c673cae | 350 | ++cons; |
9f95a23c | 351 | cond.notify_all(); |
7c673cae FG |
352 | } |
353 | } | |
354 | }; | |
355 | ||
356 | // Measure the cost of coordinating between threads using a condition variable. | |
357 | double cond_ping_pong() | |
358 | { | |
359 | return CondPingPong().run(); | |
360 | } | |
361 | ||
362 | // Measure the cost of a 32-bit divide. Divides don't take a constant | |
363 | // number of cycles. Values were chosen here semi-randomly to depict a | |
364 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
365 | // probably pick worse values. | |
366 | double div32() | |
367 | { | |
368 | #if defined(__i386__) || defined(__x86_64__) | |
369 | int count = 1000000; | |
370 | uint64_t start = Cycles::rdtsc(); | |
371 | // NB: Expect an x86 processor exception is there's overflow. | |
372 | uint32_t numeratorHi = 0xa5a5a5a5U; | |
373 | uint32_t numeratorLo = 0x55aa55aaU; | |
374 | uint32_t divisor = 0xaa55aa55U; | |
375 | uint32_t quotient; | |
376 | uint32_t remainder; | |
377 | for (int i = 0; i < count; i++) { | |
378 | __asm__ __volatile__("div %4" : | |
379 | "=a"(quotient), "=d"(remainder) : | |
380 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
381 | "cc"); | |
382 | } | |
383 | uint64_t stop = Cycles::rdtsc(); | |
384 | return Cycles::to_seconds(stop - start)/count; | |
f67539c2 TL |
385 | #elif defined(__aarch64__) |
386 | int count = 1000000; | |
387 | uint64_t start = Cycles::rdtsc(); | |
388 | uint64_t numerator = 0xa5a5a5a555aa55aaUL; | |
389 | uint32_t divisor = 0xaa55aa55U; | |
390 | uint32_t result; | |
391 | for (int i = 0; i < count; i++) { | |
392 | asm volatile("udiv %0, %1, %2" : "=r"(result) : | |
393 | "r"(numerator), "r"(divisor)); | |
394 | } | |
395 | uint64_t stop = Cycles::rdtsc(); | |
396 | return Cycles::to_seconds(stop - start)/count; | |
7c673cae FG |
397 | #else |
398 | return -1; | |
399 | #endif | |
400 | } | |
401 | ||
402 | // Measure the cost of a 64-bit divide. Divides don't take a constant | |
403 | // number of cycles. Values were chosen here semi-randomly to depict a | |
404 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
405 | // probably pick worse values. | |
406 | double div64() | |
407 | { | |
408 | #if defined(__x86_64__) || defined(__amd64__) | |
409 | int count = 1000000; | |
410 | // NB: Expect an x86 processor exception is there's overflow. | |
411 | uint64_t start = Cycles::rdtsc(); | |
412 | uint64_t numeratorHi = 0x5a5a5a5a5a5UL; | |
413 | uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; | |
414 | uint64_t divisor = 0xaa55aa55aa55aa55UL; | |
415 | uint64_t quotient; | |
416 | uint64_t remainder; | |
417 | for (int i = 0; i < count; i++) { | |
418 | __asm__ __volatile__("divq %4" : | |
419 | "=a"(quotient), "=d"(remainder) : | |
420 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
421 | "cc"); | |
422 | } | |
423 | uint64_t stop = Cycles::rdtsc(); | |
424 | return Cycles::to_seconds(stop - start)/count; | |
425 | #else | |
426 | return -1; | |
427 | #endif | |
428 | } | |
429 | ||
430 | // Measure the cost of calling a non-inlined function. | |
431 | double function_call() | |
432 | { | |
433 | int count = 1000000; | |
434 | uint64_t x = 0; | |
435 | uint64_t start = Cycles::rdtsc(); | |
436 | for (int i = 0; i < count; i++) { | |
437 | x = PerfHelper::plus_one(x); | |
438 | } | |
439 | uint64_t stop = Cycles::rdtsc(); | |
440 | return Cycles::to_seconds(stop - start)/count; | |
441 | } | |
442 | ||
443 | // Measure the minimum cost of EventCenter::process_events, when there are no | |
444 | // Pollers and no Timers. | |
445 | double eventcenter_poll() | |
446 | { | |
447 | int count = 1000000; | |
448 | EventCenter center(g_ceph_context); | |
449 | center.init(1000, 0, "posix"); | |
450 | center.set_owner(); | |
451 | uint64_t start = Cycles::rdtsc(); | |
452 | for (int i = 0; i < count; i++) { | |
453 | center.process_events(0); | |
454 | } | |
455 | uint64_t stop = Cycles::rdtsc(); | |
456 | return Cycles::to_seconds(stop - start)/count; | |
457 | } | |
458 | ||
459 | class CenterWorker : public Thread { | |
460 | CephContext *cct; | |
461 | bool done; | |
462 | ||
463 | public: | |
464 | EventCenter center; | |
465 | explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) { | |
466 | center.init(100, 0, "posix"); | |
467 | } | |
468 | void stop() { | |
469 | done = true; | |
470 | center.wakeup(); | |
471 | } | |
472 | void* entry() override { | |
473 | center.set_owner(); | |
474 | bind_thread_to_cpu(2); | |
475 | while (!done) | |
476 | center.process_events(1000); | |
477 | return 0; | |
478 | } | |
479 | }; | |
480 | ||
481 | class CountEvent: public EventCallback { | |
31f18b77 | 482 | std::atomic<int64_t> *count; |
7c673cae FG |
483 | |
484 | public: | |
31f18b77 | 485 | explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {} |
11fdf7f2 | 486 | void do_request(uint64_t id) override { |
31f18b77 | 487 | (*count)--; |
7c673cae FG |
488 | } |
489 | }; | |
490 | ||
491 | double eventcenter_dispatch() | |
492 | { | |
493 | int count = 100000; | |
494 | ||
495 | CenterWorker worker(g_ceph_context); | |
31f18b77 | 496 | std::atomic<int64_t> flag = { 1 }; |
7c673cae FG |
497 | worker.create("evt_center_disp"); |
498 | EventCallbackRef count_event(new CountEvent(&flag)); | |
499 | ||
500 | worker.center.dispatch_event_external(count_event); | |
501 | // Start a new thread and wait for it to ready. | |
31f18b77 | 502 | while (flag) |
7c673cae FG |
503 | usleep(100); |
504 | ||
505 | uint64_t start = Cycles::rdtsc(); | |
506 | for (int i = 0; i < count; i++) { | |
31f18b77 | 507 | flag = 1; |
7c673cae | 508 | worker.center.dispatch_event_external(count_event); |
31f18b77 | 509 | while (flag) |
7c673cae FG |
510 | ; |
511 | } | |
512 | uint64_t stop = Cycles::rdtsc(); | |
513 | worker.stop(); | |
514 | worker.join(); | |
515 | return Cycles::to_seconds(stop - start)/count; | |
516 | } | |
517 | ||
518 | // Measure the cost of copying a given number of bytes with memcpy. | |
519 | double memcpy_shared(size_t size) | |
520 | { | |
521 | int count = 1000000; | |
522 | char src[size], dst[size]; | |
523 | ||
524 | memset(src, 0, sizeof(src)); | |
525 | ||
526 | uint64_t start = Cycles::rdtsc(); | |
527 | for (int i = 0; i < count; i++) { | |
528 | memcpy(dst, src, size); | |
529 | } | |
530 | uint64_t stop = Cycles::rdtsc(); | |
531 | return Cycles::to_seconds(stop - start)/count; | |
532 | } | |
533 | ||
534 | double memcpy100() | |
535 | { | |
536 | return memcpy_shared(100); | |
537 | } | |
538 | ||
539 | double memcpy1000() | |
540 | { | |
541 | return memcpy_shared(1000); | |
542 | } | |
543 | ||
544 | double memcpy10000() | |
545 | { | |
546 | return memcpy_shared(10000); | |
547 | } | |
548 | ||
549 | // Benchmark rjenkins hashing performance on cached data. | |
550 | template <int key_length> | |
551 | double ceph_str_hash_rjenkins() | |
552 | { | |
553 | int count = 100000; | |
554 | char buf[key_length]; | |
555 | ||
556 | uint64_t start = Cycles::rdtsc(); | |
557 | for (int i = 0; i < count; i++) | |
558 | ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); | |
559 | uint64_t stop = Cycles::rdtsc(); | |
560 | ||
561 | return Cycles::to_seconds(stop - start)/count; | |
562 | } | |
563 | ||
564 | // Measure the cost of reading the fine-grain cycle counter. | |
565 | double rdtsc_test() | |
566 | { | |
567 | int count = 1000000; | |
568 | uint64_t start = Cycles::rdtsc(); | |
1e59de90 | 569 | [[maybe_unused]] uint64_t total = 0; |
7c673cae FG |
570 | for (int i = 0; i < count; i++) { |
571 | total += Cycles::rdtsc(); | |
572 | } | |
573 | uint64_t stop = Cycles::rdtsc(); | |
574 | return Cycles::to_seconds(stop - start)/count; | |
575 | } | |
576 | ||
577 | // Measure the cost of the Cycles::to_seconds method. | |
578 | double perf_cycles_to_seconds() | |
579 | { | |
580 | int count = 1000000; | |
1e59de90 | 581 | [[maybe_unused]] double total = 0; |
7c673cae FG |
582 | uint64_t cycles = 994261; |
583 | uint64_t start = Cycles::rdtsc(); | |
584 | for (int i = 0; i < count; i++) { | |
585 | total += Cycles::to_seconds(cycles); | |
586 | } | |
587 | uint64_t stop = Cycles::rdtsc(); | |
588 | // printf("Result: %.4f\n", total/count); | |
589 | return Cycles::to_seconds(stop - start)/count; | |
590 | } | |
591 | ||
592 | // Measure the cost of the Cylcles::toNanoseconds method. | |
593 | double perf_cycles_to_nanoseconds() | |
594 | { | |
595 | int count = 1000000; | |
1e59de90 | 596 | [[maybe_unused]] uint64_t total = 0; |
7c673cae FG |
597 | uint64_t cycles = 994261; |
598 | uint64_t start = Cycles::rdtsc(); | |
599 | for (int i = 0; i < count; i++) { | |
600 | total += Cycles::to_nanoseconds(cycles); | |
601 | } | |
602 | uint64_t stop = Cycles::rdtsc(); | |
603 | // printf("Result: %lu\n", total/count); | |
604 | return Cycles::to_seconds(stop - start)/count; | |
605 | } | |
606 | ||
607 | ||
#ifdef HAVE_SSE
/**
 * Issue a prefetch for every 64-byte line that overlaps
 * [object, object + num_bytes).
 * The best docs for this are in the Intel instruction set reference under
 * PREFETCH.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of `object` from the start of its 64-byte line.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* const first_line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  uint64_t done = 0;
  while (done < span) {
    _mm_prefetch(first_line + done, _MM_HINT_T0);
    done += 64;
  }
}
#elif defined(__aarch64__)
// aarch64 counterpart: one `prfm pldl1keep` per 64-byte line overlapping
// [object, object + num_bytes).
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  uint64_t done = 0;
  while (done < span) {
    asm volatile("prfm pldl1keep, %a0\n" : : "p" (line));
    done += 64;
    line += 64;
  }
}
#endif
635 | ||
// Measure the cost of the prefetch instruction. Each round flushes the
// cache and then prefetches all sixteen 64-byte slots of a 1 KiB buffer in
// a scrambled order. Returns -1 when no prefetch() is available.
double perf_prefetch()
{
#if defined(HAVE_SSE) || defined(__aarch64__)
  uint64_t ticks = 0;
  const int rounds = 10;
  char buf[16 * 64];

  for (int round = 0; round < rounds; ++round) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    // Kept unrolled so that no loop overhead lands in the timed region.
    prefetch(&buf[576], 64);
    prefetch(&buf[0],   64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64],  64);
    prefetch(&buf[192], 64);
    const uint64_t end = Cycles::rdtsc();
    ticks += end - begin;
  }
  // Sixteen prefetch() calls per round.
  return Cycles::to_seconds(ticks) / rounds / 16;
#else
  return -1;
#endif
}
671 | ||
#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before any
 * instructions that appear before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution from
 * instructions supposed to be executing before the timer starts.
 *
 * Implemented by executing cpuid with eax = 1; the register outputs are
 * ignored.
 */
static inline void serialize() {
  uint32_t a, b, c, d;
  __asm volatile("cpuid"
                 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                 : "a" (1U));
}
#endif
689 | ||
690 | // Measure the cost of cpuid | |
691 | double perf_serialize() { | |
692 | #if defined(__x86_64__) | |
693 | int count = 1000000; | |
694 | uint64_t start = Cycles::rdtsc(); | |
695 | for (int i = 0; i < count; i++) { | |
696 | serialize(); | |
697 | } | |
698 | uint64_t stop = Cycles::rdtsc(); | |
699 | return Cycles::to_seconds(stop - start)/count; | |
700 | #else | |
701 | return -1; | |
702 | #endif | |
703 | } | |
704 | ||
// Measure the cost of an lfence instruction (`dmb ishld` on aarch64).
// Returns -1 when neither variant is compiled in.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    asm volatile("dmb ishld" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
728 | ||
// Measure the cost of an sfence instruction (`dmb ishst` on aarch64).
// Returns -1 when neither variant is compiled in.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int i = 0; i < iterations; ++i) {
    asm volatile("dmb ishst" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
752 | ||
753 | // Measure the cost of acquiring and releasing a SpinLock (assuming the | |
754 | // lock is initially free). | |
755 | double test_spinlock() | |
756 | { | |
757 | int count = 1000000; | |
11fdf7f2 | 758 | ceph::spinlock lock; |
7c673cae FG |
759 | uint64_t start = Cycles::rdtsc(); |
760 | for (int i = 0; i < count; i++) { | |
761 | lock.lock(); | |
762 | lock.unlock(); | |
763 | } | |
764 | uint64_t stop = Cycles::rdtsc(); | |
765 | return Cycles::to_seconds(stop - start)/count; | |
766 | } | |
767 | ||
768 | // Helper for spawn_thread. This is the main function that the thread executes | |
769 | // (intentionally empty). | |
770 | class ThreadHelper : public Thread { | |
771 | void *entry() override { return 0; } | |
772 | }; | |
773 | ||
774 | // Measure the cost of start and joining with a thread. | |
775 | double spawn_thread() | |
776 | { | |
777 | int count = 10000; | |
778 | ThreadHelper thread; | |
779 | uint64_t start = Cycles::rdtsc(); | |
780 | for (int i = 0; i < count; i++) { | |
781 | thread.create("thread_helper"); | |
782 | thread.join(); | |
783 | } | |
784 | uint64_t stop = Cycles::rdtsc(); | |
785 | return Cycles::to_seconds(stop - start)/count; | |
786 | } | |
787 | ||
788 | class FakeContext : public Context { | |
789 | public: | |
790 | void finish(int r) override {} | |
791 | }; | |
792 | ||
793 | // Measure the cost of starting and stopping a Dispatch::Timer. | |
794 | double perf_timer() | |
795 | { | |
796 | int count = 1000000; | |
9f95a23c | 797 | ceph::mutex lock = ceph::make_mutex("perf_timer::lock"); |
7c673cae FG |
798 | SafeTimer timer(g_ceph_context, lock); |
799 | FakeContext **c = new FakeContext*[count]; | |
800 | for (int i = 0; i < count; i++) { | |
801 | c[i] = new FakeContext(); | |
802 | } | |
803 | uint64_t start = Cycles::rdtsc(); | |
9f95a23c | 804 | std::lock_guard l{lock}; |
7c673cae | 805 | for (int i = 0; i < count; i++) { |
3efd9988 FG |
806 | if (timer.add_event_after(12345, c[i])) { |
807 | timer.cancel_event(c[i]); | |
808 | } | |
7c673cae FG |
809 | } |
810 | uint64_t stop = Cycles::rdtsc(); | |
811 | delete[] c; | |
812 | return Cycles::to_seconds(stop - start)/count; | |
813 | } | |
814 | ||
815 | // Measure the cost of throwing and catching an int. This uses an integer as | |
816 | // the value thrown, which is presumably as fast as possible. | |
817 | double throw_int() | |
818 | { | |
819 | int count = 10000; | |
820 | uint64_t start = Cycles::rdtsc(); | |
821 | for (int i = 0; i < count; i++) { | |
822 | try { | |
823 | throw 0; | |
824 | } catch (int) { // NOLINT | |
825 | // pass | |
826 | } | |
827 | } | |
828 | uint64_t stop = Cycles::rdtsc(); | |
829 | return Cycles::to_seconds(stop - start)/count; | |
830 | } | |
831 | ||
832 | // Measure the cost of throwing and catching an int from a function call. | |
833 | double throw_int_call() | |
834 | { | |
835 | int count = 10000; | |
836 | uint64_t start = Cycles::rdtsc(); | |
837 | for (int i = 0; i < count; i++) { | |
838 | try { | |
839 | PerfHelper::throw_int(); | |
840 | } catch (int) { // NOLINT | |
841 | // pass | |
842 | } | |
843 | } | |
844 | uint64_t stop = Cycles::rdtsc(); | |
845 | return Cycles::to_seconds(stop - start)/count; | |
846 | } | |
847 | ||
848 | // Measure the cost of throwing and catching an Exception. This uses an actual | |
849 | // exception as the value thrown, which may be slower than throwInt. | |
850 | double throw_exception() | |
851 | { | |
852 | int count = 10000; | |
853 | uint64_t start = Cycles::rdtsc(); | |
854 | for (int i = 0; i < count; i++) { | |
855 | try { | |
856 | throw buffer::end_of_buffer(); | |
857 | } catch (const buffer::end_of_buffer&) { | |
858 | // pass | |
859 | } | |
860 | } | |
861 | uint64_t stop = Cycles::rdtsc(); | |
862 | return Cycles::to_seconds(stop - start)/count; | |
863 | } | |
864 | ||
865 | // Measure the cost of throwing and catching an Exception from a function call. | |
866 | double throw_exception_call() | |
867 | { | |
868 | int count = 10000; | |
869 | uint64_t start = Cycles::rdtsc(); | |
870 | for (int i = 0; i < count; i++) { | |
871 | try { | |
872 | PerfHelper::throw_end_of_buffer(); | |
873 | } catch (const buffer::end_of_buffer&) { | |
874 | // pass | |
875 | } | |
876 | } | |
877 | uint64_t stop = Cycles::rdtsc(); | |
878 | return Cycles::to_seconds(stop - start)/count; | |
879 | } | |
880 | ||
881 | // Measure the cost of pushing a new element on a std::vector, copying | |
882 | // from the end to an internal element, and popping the end element. | |
883 | double vector_push_pop() | |
884 | { | |
885 | int count = 100000; | |
886 | std::vector<int> vector; | |
887 | vector.push_back(1); | |
888 | vector.push_back(2); | |
889 | vector.push_back(3); | |
890 | uint64_t start = Cycles::rdtsc(); | |
891 | for (int i = 0; i < count; i++) { | |
892 | vector.push_back(i); | |
893 | vector.push_back(i+1); | |
894 | vector.push_back(i+2); | |
895 | vector[2] = vector.back(); | |
896 | vector.pop_back(); | |
897 | vector[0] = vector.back(); | |
898 | vector.pop_back(); | |
899 | vector[1] = vector.back(); | |
900 | vector.pop_back(); | |
901 | } | |
902 | uint64_t stop = Cycles::rdtsc(); | |
903 | return Cycles::to_seconds(stop - start)/(count*3); | |
904 | } | |
905 | ||
906 | // Measure the cost of ceph_clock_now | |
907 | double perf_ceph_clock_now() | |
908 | { | |
909 | int count = 100000; | |
910 | uint64_t start = Cycles::rdtsc(); | |
911 | for (int i = 0; i < count; i++) { | |
912 | ceph_clock_now(); | |
913 | } | |
914 | uint64_t stop = Cycles::rdtsc(); | |
915 | return Cycles::to_seconds(stop - start)/count; | |
916 | } | |
917 | ||
// Describes one performance test: the name used to select it, the function
// that runs it, and a short human-readable description. The table of all
// tests is built from entries of this type.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
931 | TestInfo tests[] = { | |
932 | {"atomic_int_cmp", atomic_int_cmp, | |
933 | "atomic_t::compare_and_swap"}, | |
934 | {"atomic_int_inc", atomic_int_inc, | |
935 | "atomic_t::inc"}, | |
936 | {"atomic_int_read", atomic_int_read, | |
937 | "atomic_t::read"}, | |
938 | {"atomic_int_set", atomic_int_set, | |
939 | "atomic_t::set"}, | |
940 | {"mutex_nonblock", mutex_nonblock, | |
941 | "Mutex lock/unlock (no blocking)"}, | |
942 | {"buffer_basic", buffer_basic, | |
943 | "buffer create, add one ptr, delete"}, | |
944 | {"buffer_encode_decode", buffer_encode_decode, | |
945 | "buffer create, encode/decode object, delete"}, | |
946 | {"buffer_basic_copy", buffer_basic_copy, | |
947 | "buffer create, copy small block, delete"}, | |
948 | {"buffer_copy", buffer_copy, | |
949 | "copy out 2 small ptrs from buffer"}, | |
950 | {"buffer_encode10", buffer_encode, | |
951 | "buffer encoding 10 structures onto existing ptr"}, | |
7c673cae FG |
952 | {"buffer_iterator", buffer_iterator, |
953 | "iterate over buffer with 5 ptrs"}, | |
954 | {"cond_ping_pong", cond_ping_pong, | |
955 | "condition variable round-trip"}, | |
956 | {"div32", div32, | |
957 | "32-bit integer division instruction"}, | |
958 | {"div64", div64, | |
959 | "64-bit integer division instruction"}, | |
960 | {"function_call", function_call, | |
961 | "Call a function that has not been inlined"}, | |
962 | {"eventcenter_poll", eventcenter_poll, | |
963 | "EventCenter::process_events (no timers or events)"}, | |
964 | {"eventcenter_dispatch", eventcenter_dispatch, | |
965 | "EventCenter::dispatch_event_external latency"}, | |
966 | {"memcpy100", memcpy100, | |
967 | "Copy 100 bytes with memcpy"}, | |
968 | {"memcpy1000", memcpy1000, | |
969 | "Copy 1000 bytes with memcpy"}, | |
970 | {"memcpy10000", memcpy10000, | |
971 | "Copy 10000 bytes with memcpy"}, | |
972 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, | |
973 | "rjenkins hash on 16 byte of data"}, | |
974 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, | |
975 | "rjenkins hash on 256 bytes of data"}, | |
976 | {"rdtsc", rdtsc_test, | |
977 | "Read the fine-grain cycle counter"}, | |
978 | {"cycles_to_seconds", perf_cycles_to_seconds, | |
979 | "Convert a rdtsc result to (double) seconds"}, | |
980 | {"cycles_to_seconds", perf_cycles_to_nanoseconds, | |
981 | "Convert a rdtsc result to (uint64_t) nanoseconds"}, | |
982 | {"prefetch", perf_prefetch, | |
983 | "Prefetch instruction"}, | |
984 | {"serialize", perf_serialize, | |
985 | "serialize instruction"}, | |
986 | {"lfence", lfence, | |
987 | "Lfence instruction"}, | |
988 | {"sfence", sfence, | |
989 | "Sfence instruction"}, | |
990 | {"spin_lock", test_spinlock, | |
991 | "Acquire/release SpinLock"}, | |
992 | {"spawn_thread", spawn_thread, | |
993 | "Start and stop a thread"}, | |
994 | {"perf_timer", perf_timer, | |
995 | "Insert and cancel a SafeTimer"}, | |
996 | {"throw_int", throw_int, | |
997 | "Throw an int"}, | |
998 | {"throw_int_call", throw_int_call, | |
999 | "Throw an int in a function call"}, | |
1000 | {"throw_exception", throw_exception, | |
1001 | "Throw an Exception"}, | |
1002 | {"throw_exception_call", throw_exception_call, | |
1003 | "Throw an Exception in a function call"}, | |
1004 | {"vector_push_pop", vector_push_pop, | |
1005 | "Push and pop a std::vector"}, | |
1006 | {"ceph_clock_now", perf_ceph_clock_now, | |
1007 | "ceph_clock_now function"}, | |
1008 | }; | |
1009 | ||
1010 | /** | |
1011 | * Runs a particular test and prints a one-line result message. | |
1012 | * | |
1013 | * \param info | |
1014 | * Describes the test to run. | |
1015 | */ | |
1016 | void run_test(TestInfo& info) | |
1017 | { | |
1018 | double secs = info.func(); | |
1019 | int width = printf("%-24s ", info.name); | |
1020 | if (secs == -1) { | |
1021 | width += printf(" architecture nonsupport "); | |
1022 | } else if (secs < 1.0e-06) { | |
1023 | width += printf("%8.2fns", 1e09*secs); | |
1024 | } else if (secs < 1.0e-03) { | |
1025 | width += printf("%8.2fus", 1e06*secs); | |
1026 | } else if (secs < 1.0) { | |
1027 | width += printf("%8.2fms", 1e03*secs); | |
1028 | } else { | |
1029 | width += printf("%8.2fs", secs); | |
1030 | } | |
1031 | printf("%*s %s\n", 32-width, "", info.description); | |
1032 | } | |
1033 | ||
1034 | int main(int argc, char *argv[]) | |
1035 | { | |
20effc67 | 1036 | auto args = argv_to_vec(argc, argv); |
7c673cae FG |
1037 | |
1038 | auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, | |
11fdf7f2 TL |
1039 | CODE_ENVIRONMENT_UTILITY, |
1040 | CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); | |
7c673cae FG |
1041 | common_init_finish(g_ceph_context); |
1042 | Cycles::init(); | |
1043 | ||
1044 | bind_thread_to_cpu(3); | |
1045 | if (argc == 1) { | |
1046 | // No test names specified; run all tests. | |
1047 | for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { | |
1048 | run_test(tests[i]); | |
1049 | } | |
1050 | } else { | |
1051 | // Run only the tests that were specified on the command line. | |
1052 | for (int i = 1; i < argc; i++) { | |
1053 | bool found_test = false; | |
1054 | for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { | |
1055 | if (strcmp(argv[i], tests[j].name) == 0) { | |
1056 | found_test = true; | |
1057 | run_test(tests[j]); | |
1058 | break; | |
1059 | } | |
1060 | } | |
1061 | if (!found_test) { | |
1062 | int width = printf("%-24s ??", argv[i]); | |
1063 | printf("%*s No such test\n", 32-width, ""); | |
1064 | } | |
1065 | } | |
1066 | } | |
1067 | } |