]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com> | |
4 | * Copyright (c) 2011-2014 Stanford University | |
5 | * Copyright (c) 2011 Facebook | |
6 | * | |
7 | * Permission to use, copy, modify, and distribute this software for any | |
8 | * purpose with or without fee is hereby granted, provided that the above | |
9 | * copyright notice and this permission notice appear in all copies. | |
10 | * | |
11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES | |
12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR | |
14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
18 | */ | |
19 | ||
20 | // This program contains a collection of low-level performance measurements | |
21 | // for Ceph, which can be run either individually or altogether. These | |
22 | // tests measure performance in a single stand-alone process, not in a cluster | |
23 | // with multiple servers. Invoke the program like this: | |
24 | // | |
25 | // Perf test1 test2 ... | |
26 | // | |
27 | // test1 and test2 are the names of individual performance measurements to | |
28 | // run. If no test names are provided then all of the performance tests | |
29 | // are run. | |
30 | // | |
31 | // To add a new test: | |
32 | // * Write a function that implements the test. Use existing test functions | |
33 | // as a guideline, and be sure to generate output in the same form as | |
34 | // other tests. | |
35 | // * Create a new entry for the test in the #tests table. | |
36 | #include <vector> | |
37 | #include <sched.h> | |
38 | ||
39 | #include "acconfig.h" | |
40 | #ifdef HAVE_SSE | |
41 | #include <xmmintrin.h> | |
42 | #endif | |
43 | ||
7c673cae FG |
44 | #include "include/buffer.h" |
45 | #include "include/encoding.h" | |
46 | #include "include/ceph_hash.h" | |
11fdf7f2 | 47 | #include "include/spinlock.h" |
7c673cae FG |
48 | #include "common/ceph_argparse.h" |
49 | #include "common/Cycles.h" | |
50 | #include "common/Cond.h" | |
9f95a23c | 51 | #include "common/ceph_mutex.h" |
7c673cae FG |
52 | #include "common/Thread.h" |
53 | #include "common/Timer.h" | |
54 | #include "msg/async/Event.h" | |
55 | #include "global/global_init.h" | |
56 | ||
57 | #include "test/perf_helper.h" | |
58 | ||
31f18b77 FG |
59 | #include <atomic> |
60 | ||
7c673cae FG |
61 | using namespace ceph; |
62 | ||
/**
 * Ask the operating system to pin the calling thread to a given CPU.
 *
 * This is best effort: the result of sched_setaffinity() is deliberately
 * ignored, and when HAVE_SCHED is not defined the function does nothing.
 *
 * \param cpu
 *      Index of the CPU to run on; it is handed unchanged to CPU_SET().
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  // A pid of 0 targets the caller (see sched_setaffinity(2)).
  static_cast<void>(sched_setaffinity(0, sizeof(set), &set));
#else
  // No affinity support compiled in; keep the parameter "used".
  static_cast<void>(cpu);
#endif
}
79 | ||
/*
 * This function just discards its argument.  It's used to make it
 * appear that data is used, so that the compiler won't optimize
 * away the code we're trying to measure.
 *
 * \param value
 *      Pointer to an arbitrary value; sizeof(int) bytes are read from it
 *      and then ignored.
 */
void discard(void* value) {
  // Copy the bytes out instead of dereferencing a casted pointer, so the
  // read is well defined whatever the pointee's type or alignment is.
  int x;
  memcpy(&x, value, sizeof(x));
  if (x == 0x43924776) {
    printf("Value was 0x%x\n", x);
  }
}
94 | ||
95 | //---------------------------------------------------------------------- | |
96 | // Test functions start here | |
97 | //---------------------------------------------------------------------- | |
98 | ||
31f18b77 | 99 | // Measure the cost of atomic compare-and-swap |
7c673cae FG |
100 | double atomic_int_cmp() |
101 | { | |
102 | int count = 1000000; | |
31f18b77 FG |
103 | std::atomic<unsigned> value = { 11 }; |
104 | unsigned int test = 11; | |
7c673cae FG |
105 | uint64_t start = Cycles::rdtsc(); |
106 | for (int i = 0; i < count; i++) { | |
31f18b77 | 107 | value.compare_exchange_strong(test, test+2); |
7c673cae FG |
108 | test += 2; |
109 | } | |
110 | uint64_t stop = Cycles::rdtsc(); | |
111 | // printf("Final value: %d\n", value.load()); | |
112 | return Cycles::to_seconds(stop - start)/count; | |
113 | } | |
114 | ||
31f18b77 | 115 | // Measure the cost of incrementing an atomic |
7c673cae FG |
116 | double atomic_int_inc() |
117 | { | |
118 | int count = 1000000; | |
31f18b77 | 119 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
120 | uint64_t start = Cycles::rdtsc(); |
121 | for (int i = 0; i < count; i++) { | |
31f18b77 | 122 | value++; |
7c673cae FG |
123 | } |
124 | uint64_t stop = Cycles::rdtsc(); | |
125 | // printf("Final value: %d\n", value.load()); | |
126 | return Cycles::to_seconds(stop - start)/count; | |
127 | } | |
128 | ||
31f18b77 | 129 | // Measure the cost of reading an atomic |
7c673cae FG |
130 | double atomic_int_read() |
131 | { | |
132 | int count = 1000000; | |
31f18b77 | 133 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
134 | int total = 0; |
135 | uint64_t start = Cycles::rdtsc(); | |
136 | for (int i = 0; i < count; i++) { | |
31f18b77 | 137 | total += value; |
7c673cae FG |
138 | } |
139 | uint64_t stop = Cycles::rdtsc(); | |
140 | // printf("Total: %d\n", total); | |
141 | return Cycles::to_seconds(stop - start)/count; | |
142 | } | |
143 | ||
31f18b77 | 144 | // Measure the cost of storing a new value in an atomic |
7c673cae FG |
145 | double atomic_int_set() |
146 | { | |
147 | int count = 1000000; | |
31f18b77 | 148 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
149 | uint64_t start = Cycles::rdtsc(); |
150 | for (int i = 0; i < count; i++) { | |
31f18b77 | 151 | value = 88; |
7c673cae FG |
152 | } |
153 | uint64_t stop = Cycles::rdtsc(); | |
154 | return Cycles::to_seconds(stop - start)/count; | |
155 | } | |
156 | ||
157 | // Measure the cost of acquiring and releasing a mutex in the | |
158 | // fast case where the mutex is free. | |
159 | double mutex_nonblock() | |
160 | { | |
161 | int count = 1000000; | |
9f95a23c | 162 | ceph::mutex m = ceph::make_mutex("mutex_nonblock::m"); |
7c673cae FG |
163 | uint64_t start = Cycles::rdtsc(); |
164 | for (int i = 0; i < count; i++) { | |
9f95a23c TL |
165 | m.lock(); |
166 | m.unlock(); | |
7c673cae FG |
167 | } |
168 | uint64_t stop = Cycles::rdtsc(); | |
169 | return Cycles::to_seconds(stop - start)/count; | |
170 | } | |
171 | ||
172 | // Measure the cost of allocating and deallocating a buffer, plus | |
173 | // appending (logically) one ptr. | |
174 | double buffer_basic() | |
175 | { | |
176 | int count = 1000000; | |
177 | uint64_t start = Cycles::rdtsc(); | |
178 | bufferptr ptr("abcdefg", 7); | |
179 | for (int i = 0; i < count; i++) { | |
180 | bufferlist b; | |
181 | b.append(ptr, 0, 5); | |
182 | } | |
183 | uint64_t stop = Cycles::rdtsc(); | |
184 | return Cycles::to_seconds(stop - start)/count; | |
185 | } | |
186 | ||
187 | struct DummyBlock { | |
188 | int a = 1, b = 2, c = 3, d = 4; | |
189 | void encode(bufferlist &bl) const { | |
190 | ENCODE_START(1, 1, bl); | |
11fdf7f2 TL |
191 | encode(a, bl); |
192 | encode(b, bl); | |
193 | encode(c, bl); | |
194 | encode(d, bl); | |
7c673cae FG |
195 | ENCODE_FINISH(bl); |
196 | } | |
11fdf7f2 | 197 | void decode(bufferlist::const_iterator &bl) { |
7c673cae | 198 | DECODE_START(1, bl); |
11fdf7f2 TL |
199 | decode(a, bl); |
200 | decode(b, bl); | |
201 | decode(c, bl); | |
202 | decode(d, bl); | |
7c673cae FG |
203 | DECODE_FINISH(bl); |
204 | } | |
205 | }; | |
206 | WRITE_CLASS_ENCODER(DummyBlock) | |
207 | ||
208 | // Measure the cost of encoding and decoding a buffer, plus | |
209 | // allocating space for one chunk. | |
210 | double buffer_encode_decode() | |
211 | { | |
212 | int count = 1000000; | |
213 | uint64_t start = Cycles::rdtsc(); | |
214 | for (int i = 0; i < count; i++) { | |
215 | bufferlist b; | |
216 | DummyBlock dummy_block; | |
11fdf7f2 TL |
217 | encode(dummy_block, b); |
218 | auto iter = b.cbegin(); | |
219 | decode(dummy_block, iter); | |
7c673cae FG |
220 | } |
221 | uint64_t stop = Cycles::rdtsc(); | |
222 | return Cycles::to_seconds(stop - start)/count; | |
223 | } | |
224 | ||
225 | // Measure the cost of allocating and deallocating a buffer, plus | |
226 | // copying in a small block. | |
227 | double buffer_basic_copy() | |
228 | { | |
229 | int count = 1000000; | |
230 | uint64_t start = Cycles::rdtsc(); | |
231 | for (int i = 0; i < count; i++) { | |
232 | bufferlist b; | |
233 | b.append("abcdefg", 6); | |
234 | } | |
235 | uint64_t stop = Cycles::rdtsc(); | |
236 | return Cycles::to_seconds(stop - start)/count; | |
237 | } | |
238 | ||
239 | // Measure the cost of making a copy of parts of two ptrs. | |
240 | double buffer_copy() | |
241 | { | |
242 | int count = 1000000; | |
243 | bufferlist b; | |
244 | b.append("abcde", 5); | |
245 | b.append("01234", 5); | |
246 | char copy[10]; | |
247 | uint64_t start = Cycles::rdtsc(); | |
248 | for (int i = 0; i < count; i++) { | |
9f95a23c | 249 | b.cbegin(2).copy(6, copy); |
7c673cae FG |
250 | } |
251 | uint64_t stop = Cycles::rdtsc(); | |
252 | return Cycles::to_seconds(stop - start)/count; | |
253 | } | |
254 | ||
255 | // Measure the cost of allocating new space by extending the | |
256 | // bufferlist | |
257 | double buffer_encode() | |
258 | { | |
259 | int count = 100000; | |
260 | uint64_t total = 0; | |
261 | for (int i = 0; i < count; i++) { | |
262 | bufferlist b; | |
263 | DummyBlock dummy_block; | |
11fdf7f2 | 264 | encode(dummy_block, b); |
7c673cae | 265 | uint64_t start = Cycles::rdtsc(); |
11fdf7f2 TL |
266 | encode(dummy_block, b); |
267 | encode(dummy_block, b); | |
268 | encode(dummy_block, b); | |
269 | encode(dummy_block, b); | |
270 | encode(dummy_block, b); | |
271 | encode(dummy_block, b); | |
272 | encode(dummy_block, b); | |
273 | encode(dummy_block, b); | |
274 | encode(dummy_block, b); | |
275 | encode(dummy_block, b); | |
7c673cae FG |
276 | total += Cycles::rdtsc() - start; |
277 | } | |
278 | return Cycles::to_seconds(total)/(count*10); | |
279 | } | |
280 | ||
7c673cae FG |
281 | // Measure the cost of creating an iterator and iterating over 10 |
282 | // chunks in a buffer. | |
283 | double buffer_iterator() | |
284 | { | |
285 | bufferlist b; | |
286 | const char s[] = "abcdefghijklmnopqrstuvwxyz"; | |
287 | bufferptr ptr(s, sizeof(s)); | |
288 | for (int i = 0; i < 5; i++) { | |
289 | b.append(ptr, i, 5); | |
290 | } | |
291 | int count = 100000; | |
292 | int sum = 0; | |
293 | uint64_t start = Cycles::rdtsc(); | |
294 | for (int i = 0; i < count; i++) { | |
11fdf7f2 | 295 | auto it = b.cbegin(); |
7c673cae FG |
296 | while (!it.end()) { |
297 | sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1]; | |
298 | ++it; | |
299 | } | |
300 | } | |
301 | uint64_t stop = Cycles::rdtsc(); | |
302 | discard(&sum); | |
303 | return Cycles::to_seconds(stop - start)/count; | |
304 | } | |
305 | ||
306 | // Implements the CondPingPong test. | |
307 | class CondPingPong { | |
9f95a23c TL |
308 | ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex"); |
309 | ceph::condition_variable cond; | |
310 | int prod = 0; | |
311 | int cons = 0; | |
312 | const int count = 10000; | |
7c673cae FG |
313 | |
314 | class Consumer : public Thread { | |
315 | CondPingPong *p; | |
316 | public: | |
317 | explicit Consumer(CondPingPong *p): p(p) {} | |
318 | void* entry() override { | |
319 | p->consume(); | |
320 | return 0; | |
321 | } | |
322 | } consumer; | |
323 | ||
324 | public: | |
9f95a23c | 325 | CondPingPong(): consumer(this) {} |
7c673cae FG |
326 | |
327 | double run() { | |
328 | consumer.create("consumer"); | |
329 | uint64_t start = Cycles::rdtsc(); | |
330 | produce(); | |
331 | uint64_t stop = Cycles::rdtsc(); | |
332 | consumer.join(); | |
333 | return Cycles::to_seconds(stop - start)/count; | |
334 | } | |
335 | ||
336 | void produce() { | |
9f95a23c | 337 | std::unique_lock l{mutex}; |
7c673cae | 338 | while (cons < count) { |
9f95a23c | 339 | cond.wait(l, [this] { return cons >= prod; }); |
7c673cae | 340 | ++prod; |
9f95a23c | 341 | cond.notify_all(); |
7c673cae FG |
342 | } |
343 | } | |
344 | ||
345 | void consume() { | |
9f95a23c | 346 | std::unique_lock l{mutex}; |
7c673cae | 347 | while (cons < count) { |
9f95a23c | 348 | cond.wait(l, [this] { return cons != prod; }); |
7c673cae | 349 | ++cons; |
9f95a23c | 350 | cond.notify_all(); |
7c673cae FG |
351 | } |
352 | } | |
353 | }; | |
354 | ||
355 | // Measure the cost of coordinating between threads using a condition variable. | |
356 | double cond_ping_pong() | |
357 | { | |
358 | return CondPingPong().run(); | |
359 | } | |
360 | ||
361 | // Measure the cost of a 32-bit divide. Divides don't take a constant | |
362 | // number of cycles. Values were chosen here semi-randomly to depict a | |
363 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
364 | // probably pick worse values. | |
365 | double div32() | |
366 | { | |
367 | #if defined(__i386__) || defined(__x86_64__) | |
368 | int count = 1000000; | |
369 | uint64_t start = Cycles::rdtsc(); | |
370 | // NB: Expect an x86 processor exception is there's overflow. | |
371 | uint32_t numeratorHi = 0xa5a5a5a5U; | |
372 | uint32_t numeratorLo = 0x55aa55aaU; | |
373 | uint32_t divisor = 0xaa55aa55U; | |
374 | uint32_t quotient; | |
375 | uint32_t remainder; | |
376 | for (int i = 0; i < count; i++) { | |
377 | __asm__ __volatile__("div %4" : | |
378 | "=a"(quotient), "=d"(remainder) : | |
379 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
380 | "cc"); | |
381 | } | |
382 | uint64_t stop = Cycles::rdtsc(); | |
383 | return Cycles::to_seconds(stop - start)/count; | |
f67539c2 TL |
384 | #elif defined(__aarch64__) |
385 | int count = 1000000; | |
386 | uint64_t start = Cycles::rdtsc(); | |
387 | uint64_t numerator = 0xa5a5a5a555aa55aaUL; | |
388 | uint32_t divisor = 0xaa55aa55U; | |
389 | uint32_t result; | |
390 | for (int i = 0; i < count; i++) { | |
391 | asm volatile("udiv %0, %1, %2" : "=r"(result) : | |
392 | "r"(numerator), "r"(divisor)); | |
393 | } | |
394 | uint64_t stop = Cycles::rdtsc(); | |
395 | return Cycles::to_seconds(stop - start)/count; | |
7c673cae FG |
396 | #else |
397 | return -1; | |
398 | #endif | |
399 | } | |
400 | ||
401 | // Measure the cost of a 64-bit divide. Divides don't take a constant | |
402 | // number of cycles. Values were chosen here semi-randomly to depict a | |
403 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
404 | // probably pick worse values. | |
405 | double div64() | |
406 | { | |
407 | #if defined(__x86_64__) || defined(__amd64__) | |
408 | int count = 1000000; | |
409 | // NB: Expect an x86 processor exception is there's overflow. | |
410 | uint64_t start = Cycles::rdtsc(); | |
411 | uint64_t numeratorHi = 0x5a5a5a5a5a5UL; | |
412 | uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; | |
413 | uint64_t divisor = 0xaa55aa55aa55aa55UL; | |
414 | uint64_t quotient; | |
415 | uint64_t remainder; | |
416 | for (int i = 0; i < count; i++) { | |
417 | __asm__ __volatile__("divq %4" : | |
418 | "=a"(quotient), "=d"(remainder) : | |
419 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
420 | "cc"); | |
421 | } | |
422 | uint64_t stop = Cycles::rdtsc(); | |
423 | return Cycles::to_seconds(stop - start)/count; | |
424 | #else | |
425 | return -1; | |
426 | #endif | |
427 | } | |
428 | ||
429 | // Measure the cost of calling a non-inlined function. | |
430 | double function_call() | |
431 | { | |
432 | int count = 1000000; | |
433 | uint64_t x = 0; | |
434 | uint64_t start = Cycles::rdtsc(); | |
435 | for (int i = 0; i < count; i++) { | |
436 | x = PerfHelper::plus_one(x); | |
437 | } | |
438 | uint64_t stop = Cycles::rdtsc(); | |
439 | return Cycles::to_seconds(stop - start)/count; | |
440 | } | |
441 | ||
442 | // Measure the minimum cost of EventCenter::process_events, when there are no | |
443 | // Pollers and no Timers. | |
444 | double eventcenter_poll() | |
445 | { | |
446 | int count = 1000000; | |
447 | EventCenter center(g_ceph_context); | |
448 | center.init(1000, 0, "posix"); | |
449 | center.set_owner(); | |
450 | uint64_t start = Cycles::rdtsc(); | |
451 | for (int i = 0; i < count; i++) { | |
452 | center.process_events(0); | |
453 | } | |
454 | uint64_t stop = Cycles::rdtsc(); | |
455 | return Cycles::to_seconds(stop - start)/count; | |
456 | } | |
457 | ||
458 | class CenterWorker : public Thread { | |
459 | CephContext *cct; | |
460 | bool done; | |
461 | ||
462 | public: | |
463 | EventCenter center; | |
464 | explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) { | |
465 | center.init(100, 0, "posix"); | |
466 | } | |
467 | void stop() { | |
468 | done = true; | |
469 | center.wakeup(); | |
470 | } | |
471 | void* entry() override { | |
472 | center.set_owner(); | |
473 | bind_thread_to_cpu(2); | |
474 | while (!done) | |
475 | center.process_events(1000); | |
476 | return 0; | |
477 | } | |
478 | }; | |
479 | ||
480 | class CountEvent: public EventCallback { | |
31f18b77 | 481 | std::atomic<int64_t> *count; |
7c673cae FG |
482 | |
483 | public: | |
31f18b77 | 484 | explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {} |
11fdf7f2 | 485 | void do_request(uint64_t id) override { |
31f18b77 | 486 | (*count)--; |
7c673cae FG |
487 | } |
488 | }; | |
489 | ||
490 | double eventcenter_dispatch() | |
491 | { | |
492 | int count = 100000; | |
493 | ||
494 | CenterWorker worker(g_ceph_context); | |
31f18b77 | 495 | std::atomic<int64_t> flag = { 1 }; |
7c673cae FG |
496 | worker.create("evt_center_disp"); |
497 | EventCallbackRef count_event(new CountEvent(&flag)); | |
498 | ||
499 | worker.center.dispatch_event_external(count_event); | |
500 | // Start a new thread and wait for it to ready. | |
31f18b77 | 501 | while (flag) |
7c673cae FG |
502 | usleep(100); |
503 | ||
504 | uint64_t start = Cycles::rdtsc(); | |
505 | for (int i = 0; i < count; i++) { | |
31f18b77 | 506 | flag = 1; |
7c673cae | 507 | worker.center.dispatch_event_external(count_event); |
31f18b77 | 508 | while (flag) |
7c673cae FG |
509 | ; |
510 | } | |
511 | uint64_t stop = Cycles::rdtsc(); | |
512 | worker.stop(); | |
513 | worker.join(); | |
514 | return Cycles::to_seconds(stop - start)/count; | |
515 | } | |
516 | ||
517 | // Measure the cost of copying a given number of bytes with memcpy. | |
518 | double memcpy_shared(size_t size) | |
519 | { | |
520 | int count = 1000000; | |
521 | char src[size], dst[size]; | |
522 | ||
523 | memset(src, 0, sizeof(src)); | |
524 | ||
525 | uint64_t start = Cycles::rdtsc(); | |
526 | for (int i = 0; i < count; i++) { | |
527 | memcpy(dst, src, size); | |
528 | } | |
529 | uint64_t stop = Cycles::rdtsc(); | |
530 | return Cycles::to_seconds(stop - start)/count; | |
531 | } | |
532 | ||
533 | double memcpy100() | |
534 | { | |
535 | return memcpy_shared(100); | |
536 | } | |
537 | ||
538 | double memcpy1000() | |
539 | { | |
540 | return memcpy_shared(1000); | |
541 | } | |
542 | ||
543 | double memcpy10000() | |
544 | { | |
545 | return memcpy_shared(10000); | |
546 | } | |
547 | ||
548 | // Benchmark rjenkins hashing performance on cached data. | |
549 | template <int key_length> | |
550 | double ceph_str_hash_rjenkins() | |
551 | { | |
552 | int count = 100000; | |
553 | char buf[key_length]; | |
554 | ||
555 | uint64_t start = Cycles::rdtsc(); | |
556 | for (int i = 0; i < count; i++) | |
557 | ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); | |
558 | uint64_t stop = Cycles::rdtsc(); | |
559 | ||
560 | return Cycles::to_seconds(stop - start)/count; | |
561 | } | |
562 | ||
563 | // Measure the cost of reading the fine-grain cycle counter. | |
564 | double rdtsc_test() | |
565 | { | |
566 | int count = 1000000; | |
567 | uint64_t start = Cycles::rdtsc(); | |
568 | uint64_t total = 0; | |
569 | for (int i = 0; i < count; i++) { | |
570 | total += Cycles::rdtsc(); | |
571 | } | |
572 | uint64_t stop = Cycles::rdtsc(); | |
573 | return Cycles::to_seconds(stop - start)/count; | |
574 | } | |
575 | ||
576 | // Measure the cost of the Cycles::to_seconds method. | |
577 | double perf_cycles_to_seconds() | |
578 | { | |
579 | int count = 1000000; | |
580 | double total = 0; | |
581 | uint64_t cycles = 994261; | |
582 | uint64_t start = Cycles::rdtsc(); | |
583 | for (int i = 0; i < count; i++) { | |
584 | total += Cycles::to_seconds(cycles); | |
585 | } | |
586 | uint64_t stop = Cycles::rdtsc(); | |
587 | // printf("Result: %.4f\n", total/count); | |
588 | return Cycles::to_seconds(stop - start)/count; | |
589 | } | |
590 | ||
591 | // Measure the cost of the Cylcles::toNanoseconds method. | |
592 | double perf_cycles_to_nanoseconds() | |
593 | { | |
594 | int count = 1000000; | |
595 | uint64_t total = 0; | |
596 | uint64_t cycles = 994261; | |
597 | uint64_t start = Cycles::rdtsc(); | |
598 | for (int i = 0; i < count; i++) { | |
599 | total += Cycles::to_nanoseconds(cycles); | |
600 | } | |
601 | uint64_t stop = Cycles::rdtsc(); | |
602 | // printf("Result: %lu\n", total/count); | |
603 | return Cycles::to_seconds(stop - start)/count; | |
604 | } | |
605 | ||
606 | ||
#ifdef HAVE_SSE
/**
 * Prefetch the cache lines containing [object, object + num_bytes) into the
 * processor's caches.
 * The best docs for this are in the Intel instruction set reference under
 * PREFETCH.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Round the start address down to a 64-byte boundary, then issue one
  // prefetch per 64-byte step until the end of the region is covered.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64)
    _mm_prefetch(line + done, _MM_HINT_T0);
}
#elif defined(__aarch64__)
/**
 * AArch64 counterpart of the SSE version: issues one "prfm pldl1keep" per
 * 64-byte step covering [object, object + num_bytes).
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64, line += 64)
    asm volatile("prfm pldl1keep, %a0\n" : : "p" (line));
}
#endif
634 | ||
// Measure the cost of the prefetch instruction.  Each round calls
// PerfHelper::flush_cache() and then times 16 prefetch() calls, one per
// 64-byte slice of a 1 KB stack buffer, issued in a scrambled order.
// Returns seconds per prefetch() call, or -1 where prefetch() is unavailable.
double perf_prefetch()
{
#if defined(HAVE_SSE) || defined(__aarch64__)
  uint64_t ticks = 0;
  const int rounds = 10;
  char buf[16 * 64];

  for (int r = 0; r < rounds; ++r) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    prefetch(buf + 576, 64);
    prefetch(buf + 0, 64);
    prefetch(buf + 512, 64);
    prefetch(buf + 960, 64);
    prefetch(buf + 640, 64);
    prefetch(buf + 896, 64);
    prefetch(buf + 256, 64);
    prefetch(buf + 704, 64);
    prefetch(buf + 320, 64);
    prefetch(buf + 384, 64);
    prefetch(buf + 128, 64);
    prefetch(buf + 448, 64);
    prefetch(buf + 768, 64);
    prefetch(buf + 832, 64);
    prefetch(buf + 64, 64);
    prefetch(buf + 192, 64);
    const uint64_t end = Cycles::rdtsc();
    ticks += end - begin;
  }
  return Cycles::to_seconds(ticks) / rounds / 16;
#else
  return -1;
#endif
}
670 | ||
#if defined(__x86_64__)
/**
 * Serialize machine instructions, so that no instruction appearing after
 * this call in the current thread can run before any instruction appearing
 * before it.  Implemented with the cpuid instruction.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution
 * from instructions supposed to be executing before the timer starts.
 */
static inline void serialize() {
  // cpuid outputs; the values themselves are not used.
  uint32_t out_a, out_b, out_c, out_d;
  __asm volatile("cpuid"
                 : "=a" (out_a), "=b" (out_b), "=c" (out_c), "=d" (out_d)
                 : "a" (1U));
}
#endif
688 | ||
689 | // Measure the cost of cpuid | |
690 | double perf_serialize() { | |
691 | #if defined(__x86_64__) | |
692 | int count = 1000000; | |
693 | uint64_t start = Cycles::rdtsc(); | |
694 | for (int i = 0; i < count; i++) { | |
695 | serialize(); | |
696 | } | |
697 | uint64_t stop = Cycles::rdtsc(); | |
698 | return Cycles::to_seconds(stop - start)/count; | |
699 | #else | |
700 | return -1; | |
701 | #endif | |
702 | } | |
703 | ||
// Measure the cost of an lfence instruction ("dmb ishld" on AArch64).
// Returns seconds per fence, or -1 when neither variant is available.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishld" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
727 | ||
// Measure the cost of an sfence instruction ("dmb ishst" on AArch64).
// Returns seconds per fence, or -1 when neither variant is available.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#elif defined(__aarch64__)
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    asm volatile("dmb ishst" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin) / iterations;
#else
  return -1;
#endif
}
751 | ||
752 | // Measure the cost of acquiring and releasing a SpinLock (assuming the | |
753 | // lock is initially free). | |
754 | double test_spinlock() | |
755 | { | |
756 | int count = 1000000; | |
11fdf7f2 | 757 | ceph::spinlock lock; |
7c673cae FG |
758 | uint64_t start = Cycles::rdtsc(); |
759 | for (int i = 0; i < count; i++) { | |
760 | lock.lock(); | |
761 | lock.unlock(); | |
762 | } | |
763 | uint64_t stop = Cycles::rdtsc(); | |
764 | return Cycles::to_seconds(stop - start)/count; | |
765 | } | |
766 | ||
767 | // Helper for spawn_thread. This is the main function that the thread executes | |
768 | // (intentionally empty). | |
769 | class ThreadHelper : public Thread { | |
770 | void *entry() override { return 0; } | |
771 | }; | |
772 | ||
773 | // Measure the cost of start and joining with a thread. | |
774 | double spawn_thread() | |
775 | { | |
776 | int count = 10000; | |
777 | ThreadHelper thread; | |
778 | uint64_t start = Cycles::rdtsc(); | |
779 | for (int i = 0; i < count; i++) { | |
780 | thread.create("thread_helper"); | |
781 | thread.join(); | |
782 | } | |
783 | uint64_t stop = Cycles::rdtsc(); | |
784 | return Cycles::to_seconds(stop - start)/count; | |
785 | } | |
786 | ||
787 | class FakeContext : public Context { | |
788 | public: | |
789 | void finish(int r) override {} | |
790 | }; | |
791 | ||
792 | // Measure the cost of starting and stopping a Dispatch::Timer. | |
793 | double perf_timer() | |
794 | { | |
795 | int count = 1000000; | |
9f95a23c | 796 | ceph::mutex lock = ceph::make_mutex("perf_timer::lock"); |
7c673cae FG |
797 | SafeTimer timer(g_ceph_context, lock); |
798 | FakeContext **c = new FakeContext*[count]; | |
799 | for (int i = 0; i < count; i++) { | |
800 | c[i] = new FakeContext(); | |
801 | } | |
802 | uint64_t start = Cycles::rdtsc(); | |
9f95a23c | 803 | std::lock_guard l{lock}; |
7c673cae | 804 | for (int i = 0; i < count; i++) { |
3efd9988 FG |
805 | if (timer.add_event_after(12345, c[i])) { |
806 | timer.cancel_event(c[i]); | |
807 | } | |
7c673cae FG |
808 | } |
809 | uint64_t stop = Cycles::rdtsc(); | |
810 | delete[] c; | |
811 | return Cycles::to_seconds(stop - start)/count; | |
812 | } | |
813 | ||
814 | // Measure the cost of throwing and catching an int. This uses an integer as | |
815 | // the value thrown, which is presumably as fast as possible. | |
816 | double throw_int() | |
817 | { | |
818 | int count = 10000; | |
819 | uint64_t start = Cycles::rdtsc(); | |
820 | for (int i = 0; i < count; i++) { | |
821 | try { | |
822 | throw 0; | |
823 | } catch (int) { // NOLINT | |
824 | // pass | |
825 | } | |
826 | } | |
827 | uint64_t stop = Cycles::rdtsc(); | |
828 | return Cycles::to_seconds(stop - start)/count; | |
829 | } | |
830 | ||
831 | // Measure the cost of throwing and catching an int from a function call. | |
832 | double throw_int_call() | |
833 | { | |
834 | int count = 10000; | |
835 | uint64_t start = Cycles::rdtsc(); | |
836 | for (int i = 0; i < count; i++) { | |
837 | try { | |
838 | PerfHelper::throw_int(); | |
839 | } catch (int) { // NOLINT | |
840 | // pass | |
841 | } | |
842 | } | |
843 | uint64_t stop = Cycles::rdtsc(); | |
844 | return Cycles::to_seconds(stop - start)/count; | |
845 | } | |
846 | ||
847 | // Measure the cost of throwing and catching an Exception. This uses an actual | |
848 | // exception as the value thrown, which may be slower than throwInt. | |
849 | double throw_exception() | |
850 | { | |
851 | int count = 10000; | |
852 | uint64_t start = Cycles::rdtsc(); | |
853 | for (int i = 0; i < count; i++) { | |
854 | try { | |
855 | throw buffer::end_of_buffer(); | |
856 | } catch (const buffer::end_of_buffer&) { | |
857 | // pass | |
858 | } | |
859 | } | |
860 | uint64_t stop = Cycles::rdtsc(); | |
861 | return Cycles::to_seconds(stop - start)/count; | |
862 | } | |
863 | ||
864 | // Measure the cost of throwing and catching an Exception from a function call. | |
865 | double throw_exception_call() | |
866 | { | |
867 | int count = 10000; | |
868 | uint64_t start = Cycles::rdtsc(); | |
869 | for (int i = 0; i < count; i++) { | |
870 | try { | |
871 | PerfHelper::throw_end_of_buffer(); | |
872 | } catch (const buffer::end_of_buffer&) { | |
873 | // pass | |
874 | } | |
875 | } | |
876 | uint64_t stop = Cycles::rdtsc(); | |
877 | return Cycles::to_seconds(stop - start)/count; | |
878 | } | |
879 | ||
880 | // Measure the cost of pushing a new element on a std::vector, copying | |
881 | // from the end to an internal element, and popping the end element. | |
882 | double vector_push_pop() | |
883 | { | |
884 | int count = 100000; | |
885 | std::vector<int> vector; | |
886 | vector.push_back(1); | |
887 | vector.push_back(2); | |
888 | vector.push_back(3); | |
889 | uint64_t start = Cycles::rdtsc(); | |
890 | for (int i = 0; i < count; i++) { | |
891 | vector.push_back(i); | |
892 | vector.push_back(i+1); | |
893 | vector.push_back(i+2); | |
894 | vector[2] = vector.back(); | |
895 | vector.pop_back(); | |
896 | vector[0] = vector.back(); | |
897 | vector.pop_back(); | |
898 | vector[1] = vector.back(); | |
899 | vector.pop_back(); | |
900 | } | |
901 | uint64_t stop = Cycles::rdtsc(); | |
902 | return Cycles::to_seconds(stop - start)/(count*3); | |
903 | } | |
904 | ||
905 | // Measure the cost of ceph_clock_now | |
906 | double perf_ceph_clock_now() | |
907 | { | |
908 | int count = 100000; | |
909 | uint64_t start = Cycles::rdtsc(); | |
910 | for (int i = 0; i < count; i++) { | |
911 | ceph_clock_now(); | |
912 | } | |
913 | uint64_t stop = Cycles::rdtsc(); | |
914 | return Cycles::to_seconds(stop - start)/count; | |
915 | } | |
916 | ||
// One entry of the performance-test table: the name that selects the test,
// the function that implements it, and a short summary of what it measures.
// Entries are written as aggregate initializers, so the order of the fields
// matters.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char* description;
};
930 | TestInfo tests[] = { | |
931 | {"atomic_int_cmp", atomic_int_cmp, | |
932 | "atomic_t::compare_and_swap"}, | |
933 | {"atomic_int_inc", atomic_int_inc, | |
934 | "atomic_t::inc"}, | |
935 | {"atomic_int_read", atomic_int_read, | |
936 | "atomic_t::read"}, | |
937 | {"atomic_int_set", atomic_int_set, | |
938 | "atomic_t::set"}, | |
939 | {"mutex_nonblock", mutex_nonblock, | |
940 | "Mutex lock/unlock (no blocking)"}, | |
941 | {"buffer_basic", buffer_basic, | |
942 | "buffer create, add one ptr, delete"}, | |
943 | {"buffer_encode_decode", buffer_encode_decode, | |
944 | "buffer create, encode/decode object, delete"}, | |
945 | {"buffer_basic_copy", buffer_basic_copy, | |
946 | "buffer create, copy small block, delete"}, | |
947 | {"buffer_copy", buffer_copy, | |
948 | "copy out 2 small ptrs from buffer"}, | |
949 | {"buffer_encode10", buffer_encode, | |
950 | "buffer encoding 10 structures onto existing ptr"}, | |
7c673cae FG |
951 | {"buffer_iterator", buffer_iterator, |
952 | "iterate over buffer with 5 ptrs"}, | |
953 | {"cond_ping_pong", cond_ping_pong, | |
954 | "condition variable round-trip"}, | |
955 | {"div32", div32, | |
956 | "32-bit integer division instruction"}, | |
957 | {"div64", div64, | |
958 | "64-bit integer division instruction"}, | |
959 | {"function_call", function_call, | |
960 | "Call a function that has not been inlined"}, | |
961 | {"eventcenter_poll", eventcenter_poll, | |
962 | "EventCenter::process_events (no timers or events)"}, | |
963 | {"eventcenter_dispatch", eventcenter_dispatch, | |
964 | "EventCenter::dispatch_event_external latency"}, | |
965 | {"memcpy100", memcpy100, | |
966 | "Copy 100 bytes with memcpy"}, | |
967 | {"memcpy1000", memcpy1000, | |
968 | "Copy 1000 bytes with memcpy"}, | |
969 | {"memcpy10000", memcpy10000, | |
970 | "Copy 10000 bytes with memcpy"}, | |
971 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, | |
972 | "rjenkins hash on 16 byte of data"}, | |
973 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, | |
974 | "rjenkins hash on 256 bytes of data"}, | |
975 | {"rdtsc", rdtsc_test, | |
976 | "Read the fine-grain cycle counter"}, | |
977 | {"cycles_to_seconds", perf_cycles_to_seconds, | |
978 | "Convert a rdtsc result to (double) seconds"}, | |
979 | {"cycles_to_seconds", perf_cycles_to_nanoseconds, | |
980 | "Convert a rdtsc result to (uint64_t) nanoseconds"}, | |
981 | {"prefetch", perf_prefetch, | |
982 | "Prefetch instruction"}, | |
983 | {"serialize", perf_serialize, | |
984 | "serialize instruction"}, | |
985 | {"lfence", lfence, | |
986 | "Lfence instruction"}, | |
987 | {"sfence", sfence, | |
988 | "Sfence instruction"}, | |
989 | {"spin_lock", test_spinlock, | |
990 | "Acquire/release SpinLock"}, | |
991 | {"spawn_thread", spawn_thread, | |
992 | "Start and stop a thread"}, | |
993 | {"perf_timer", perf_timer, | |
994 | "Insert and cancel a SafeTimer"}, | |
995 | {"throw_int", throw_int, | |
996 | "Throw an int"}, | |
997 | {"throw_int_call", throw_int_call, | |
998 | "Throw an int in a function call"}, | |
999 | {"throw_exception", throw_exception, | |
1000 | "Throw an Exception"}, | |
1001 | {"throw_exception_call", throw_exception_call, | |
1002 | "Throw an Exception in a function call"}, | |
1003 | {"vector_push_pop", vector_push_pop, | |
1004 | "Push and pop a std::vector"}, | |
1005 | {"ceph_clock_now", perf_ceph_clock_now, | |
1006 | "ceph_clock_now function"}, | |
1007 | }; | |
1008 | ||
1009 | /** | |
1010 | * Runs a particular test and prints a one-line result message. | |
1011 | * | |
1012 | * \param info | |
1013 | * Describes the test to run. | |
1014 | */ | |
1015 | void run_test(TestInfo& info) | |
1016 | { | |
1017 | double secs = info.func(); | |
1018 | int width = printf("%-24s ", info.name); | |
1019 | if (secs == -1) { | |
1020 | width += printf(" architecture nonsupport "); | |
1021 | } else if (secs < 1.0e-06) { | |
1022 | width += printf("%8.2fns", 1e09*secs); | |
1023 | } else if (secs < 1.0e-03) { | |
1024 | width += printf("%8.2fus", 1e06*secs); | |
1025 | } else if (secs < 1.0) { | |
1026 | width += printf("%8.2fms", 1e03*secs); | |
1027 | } else { | |
1028 | width += printf("%8.2fs", secs); | |
1029 | } | |
1030 | printf("%*s %s\n", 32-width, "", info.description); | |
1031 | } | |
1032 | ||
1033 | int main(int argc, char *argv[]) | |
1034 | { | |
1035 | vector<const char*> args; | |
1036 | argv_to_vec(argc, (const char **)argv, args); | |
1037 | ||
1038 | auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, | |
11fdf7f2 TL |
1039 | CODE_ENVIRONMENT_UTILITY, |
1040 | CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); | |
7c673cae FG |
1041 | common_init_finish(g_ceph_context); |
1042 | Cycles::init(); | |
1043 | ||
1044 | bind_thread_to_cpu(3); | |
1045 | if (argc == 1) { | |
1046 | // No test names specified; run all tests. | |
1047 | for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { | |
1048 | run_test(tests[i]); | |
1049 | } | |
1050 | } else { | |
1051 | // Run only the tests that were specified on the command line. | |
1052 | for (int i = 1; i < argc; i++) { | |
1053 | bool found_test = false; | |
1054 | for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { | |
1055 | if (strcmp(argv[i], tests[j].name) == 0) { | |
1056 | found_test = true; | |
1057 | run_test(tests[j]); | |
1058 | break; | |
1059 | } | |
1060 | } | |
1061 | if (!found_test) { | |
1062 | int width = printf("%-24s ??", argv[i]); | |
1063 | printf("%*s No such test\n", 32-width, ""); | |
1064 | } | |
1065 | } | |
1066 | } | |
1067 | } |