]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com> | |
4 | * Copyright (c) 2011-2014 Stanford University | |
5 | * Copyright (c) 2011 Facebook | |
6 | * | |
7 | * Permission to use, copy, modify, and distribute this software for any | |
8 | * purpose with or without fee is hereby granted, provided that the above | |
9 | * copyright notice and this permission notice appear in all copies. | |
10 | * | |
11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES | |
12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR | |
14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
18 | */ | |
19 | ||
20 | // This program contains a collection of low-level performance measurements | |
21 | // for Ceph, which can be run either individually or altogether. These | |
22 | // tests measure performance in a single stand-alone process, not in a cluster | |
23 | // with multiple servers. Invoke the program like this: | |
24 | // | |
25 | // Perf test1 test2 ... | |
26 | // | |
27 | // test1 and test2 are the names of individual performance measurements to | |
28 | // run. If no test names are provided then all of the performance tests | |
29 | // are run. | |
30 | // | |
31 | // To add a new test: | |
32 | // * Write a function that implements the test. Use existing test functions | |
33 | // as a guideline, and be sure to generate output in the same form as | |
34 | // other tests. | |
35 | // * Create a new entry for the test in the #tests table. | |
36 | #include <vector> | |
37 | #include <sched.h> | |
38 | ||
39 | #include "acconfig.h" | |
40 | #ifdef HAVE_SSE | |
41 | #include <xmmintrin.h> | |
42 | #endif | |
43 | ||
7c673cae FG |
44 | #include "include/buffer.h" |
45 | #include "include/encoding.h" | |
46 | #include "include/ceph_hash.h" | |
11fdf7f2 | 47 | #include "include/spinlock.h" |
7c673cae FG |
48 | #include "common/ceph_argparse.h" |
49 | #include "common/Cycles.h" | |
50 | #include "common/Cond.h" | |
9f95a23c | 51 | #include "common/ceph_mutex.h" |
7c673cae FG |
52 | #include "common/Thread.h" |
53 | #include "common/Timer.h" | |
54 | #include "msg/async/Event.h" | |
55 | #include "global/global_init.h" | |
56 | ||
57 | #include "test/perf_helper.h" | |
58 | ||
31f18b77 FG |
59 | #include <atomic> |
60 | ||
7c673cae FG |
61 | using namespace ceph; |
62 | ||
/**
 * Ask the operating system to restrict the calling thread to a single CPU.
 *
 * Only has an effect when HAVE_SCHED is defined; otherwise it is a no-op.
 * The result of sched_setaffinity() is not checked, so pinning is
 * best-effort.
 *
 * \param cpu
 *      Index of the CPU to run on; it is passed unchanged to CPU_SET().
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(cpu, &mask);
  // A pid of 0 refers to the calling thread.
  sched_setaffinity(0, sizeof(mask), &mask);
#endif
}
79 | ||
/*
 * Pretend to consume a value so that the optimizer cannot treat the
 * computation being measured as dead code and remove it.
 *
 * \param value
 *      Pointer to the data to "use". Its first int is read and compared
 *      against a magic constant; it is printed only if they match.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
94 | ||
95 | //---------------------------------------------------------------------- | |
96 | // Test functions start here | |
97 | //---------------------------------------------------------------------- | |
98 | ||
31f18b77 | 99 | // Measure the cost of atomic compare-and-swap |
7c673cae FG |
100 | double atomic_int_cmp() |
101 | { | |
102 | int count = 1000000; | |
31f18b77 FG |
103 | std::atomic<unsigned> value = { 11 }; |
104 | unsigned int test = 11; | |
7c673cae FG |
105 | uint64_t start = Cycles::rdtsc(); |
106 | for (int i = 0; i < count; i++) { | |
31f18b77 | 107 | value.compare_exchange_strong(test, test+2); |
7c673cae FG |
108 | test += 2; |
109 | } | |
110 | uint64_t stop = Cycles::rdtsc(); | |
111 | // printf("Final value: %d\n", value.load()); | |
112 | return Cycles::to_seconds(stop - start)/count; | |
113 | } | |
114 | ||
31f18b77 | 115 | // Measure the cost of incrementing an atomic |
7c673cae FG |
116 | double atomic_int_inc() |
117 | { | |
118 | int count = 1000000; | |
31f18b77 | 119 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
120 | uint64_t start = Cycles::rdtsc(); |
121 | for (int i = 0; i < count; i++) { | |
31f18b77 | 122 | value++; |
7c673cae FG |
123 | } |
124 | uint64_t stop = Cycles::rdtsc(); | |
125 | // printf("Final value: %d\n", value.load()); | |
126 | return Cycles::to_seconds(stop - start)/count; | |
127 | } | |
128 | ||
31f18b77 | 129 | // Measure the cost of reading an atomic |
7c673cae FG |
130 | double atomic_int_read() |
131 | { | |
132 | int count = 1000000; | |
31f18b77 | 133 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
134 | int total = 0; |
135 | uint64_t start = Cycles::rdtsc(); | |
136 | for (int i = 0; i < count; i++) { | |
31f18b77 | 137 | total += value; |
7c673cae FG |
138 | } |
139 | uint64_t stop = Cycles::rdtsc(); | |
140 | // printf("Total: %d\n", total); | |
141 | return Cycles::to_seconds(stop - start)/count; | |
142 | } | |
143 | ||
31f18b77 | 144 | // Measure the cost of storing a new value in an atomic |
7c673cae FG |
145 | double atomic_int_set() |
146 | { | |
147 | int count = 1000000; | |
31f18b77 | 148 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
149 | uint64_t start = Cycles::rdtsc(); |
150 | for (int i = 0; i < count; i++) { | |
31f18b77 | 151 | value = 88; |
7c673cae FG |
152 | } |
153 | uint64_t stop = Cycles::rdtsc(); | |
154 | return Cycles::to_seconds(stop - start)/count; | |
155 | } | |
156 | ||
157 | // Measure the cost of acquiring and releasing a mutex in the | |
158 | // fast case where the mutex is free. | |
159 | double mutex_nonblock() | |
160 | { | |
161 | int count = 1000000; | |
9f95a23c | 162 | ceph::mutex m = ceph::make_mutex("mutex_nonblock::m"); |
7c673cae FG |
163 | uint64_t start = Cycles::rdtsc(); |
164 | for (int i = 0; i < count; i++) { | |
9f95a23c TL |
165 | m.lock(); |
166 | m.unlock(); | |
7c673cae FG |
167 | } |
168 | uint64_t stop = Cycles::rdtsc(); | |
169 | return Cycles::to_seconds(stop - start)/count; | |
170 | } | |
171 | ||
172 | // Measure the cost of allocating and deallocating a buffer, plus | |
173 | // appending (logically) one ptr. | |
174 | double buffer_basic() | |
175 | { | |
176 | int count = 1000000; | |
177 | uint64_t start = Cycles::rdtsc(); | |
178 | bufferptr ptr("abcdefg", 7); | |
179 | for (int i = 0; i < count; i++) { | |
180 | bufferlist b; | |
181 | b.append(ptr, 0, 5); | |
182 | } | |
183 | uint64_t stop = Cycles::rdtsc(); | |
184 | return Cycles::to_seconds(stop - start)/count; | |
185 | } | |
186 | ||
187 | struct DummyBlock { | |
188 | int a = 1, b = 2, c = 3, d = 4; | |
189 | void encode(bufferlist &bl) const { | |
190 | ENCODE_START(1, 1, bl); | |
11fdf7f2 TL |
191 | encode(a, bl); |
192 | encode(b, bl); | |
193 | encode(c, bl); | |
194 | encode(d, bl); | |
7c673cae FG |
195 | ENCODE_FINISH(bl); |
196 | } | |
11fdf7f2 | 197 | void decode(bufferlist::const_iterator &bl) { |
7c673cae | 198 | DECODE_START(1, bl); |
11fdf7f2 TL |
199 | decode(a, bl); |
200 | decode(b, bl); | |
201 | decode(c, bl); | |
202 | decode(d, bl); | |
7c673cae FG |
203 | DECODE_FINISH(bl); |
204 | } | |
205 | }; | |
206 | WRITE_CLASS_ENCODER(DummyBlock) | |
207 | ||
208 | // Measure the cost of encoding and decoding a buffer, plus | |
209 | // allocating space for one chunk. | |
210 | double buffer_encode_decode() | |
211 | { | |
212 | int count = 1000000; | |
213 | uint64_t start = Cycles::rdtsc(); | |
214 | for (int i = 0; i < count; i++) { | |
215 | bufferlist b; | |
216 | DummyBlock dummy_block; | |
11fdf7f2 TL |
217 | encode(dummy_block, b); |
218 | auto iter = b.cbegin(); | |
219 | decode(dummy_block, iter); | |
7c673cae FG |
220 | } |
221 | uint64_t stop = Cycles::rdtsc(); | |
222 | return Cycles::to_seconds(stop - start)/count; | |
223 | } | |
224 | ||
225 | // Measure the cost of allocating and deallocating a buffer, plus | |
226 | // copying in a small block. | |
227 | double buffer_basic_copy() | |
228 | { | |
229 | int count = 1000000; | |
230 | uint64_t start = Cycles::rdtsc(); | |
231 | for (int i = 0; i < count; i++) { | |
232 | bufferlist b; | |
233 | b.append("abcdefg", 6); | |
234 | } | |
235 | uint64_t stop = Cycles::rdtsc(); | |
236 | return Cycles::to_seconds(stop - start)/count; | |
237 | } | |
238 | ||
239 | // Measure the cost of making a copy of parts of two ptrs. | |
240 | double buffer_copy() | |
241 | { | |
242 | int count = 1000000; | |
243 | bufferlist b; | |
244 | b.append("abcde", 5); | |
245 | b.append("01234", 5); | |
246 | char copy[10]; | |
247 | uint64_t start = Cycles::rdtsc(); | |
248 | for (int i = 0; i < count; i++) { | |
9f95a23c | 249 | b.cbegin(2).copy(6, copy); |
7c673cae FG |
250 | } |
251 | uint64_t stop = Cycles::rdtsc(); | |
252 | return Cycles::to_seconds(stop - start)/count; | |
253 | } | |
254 | ||
255 | // Measure the cost of allocating new space by extending the | |
256 | // bufferlist | |
257 | double buffer_encode() | |
258 | { | |
259 | int count = 100000; | |
260 | uint64_t total = 0; | |
261 | for (int i = 0; i < count; i++) { | |
262 | bufferlist b; | |
263 | DummyBlock dummy_block; | |
11fdf7f2 | 264 | encode(dummy_block, b); |
7c673cae | 265 | uint64_t start = Cycles::rdtsc(); |
11fdf7f2 TL |
266 | encode(dummy_block, b); |
267 | encode(dummy_block, b); | |
268 | encode(dummy_block, b); | |
269 | encode(dummy_block, b); | |
270 | encode(dummy_block, b); | |
271 | encode(dummy_block, b); | |
272 | encode(dummy_block, b); | |
273 | encode(dummy_block, b); | |
274 | encode(dummy_block, b); | |
275 | encode(dummy_block, b); | |
7c673cae FG |
276 | total += Cycles::rdtsc() - start; |
277 | } | |
278 | return Cycles::to_seconds(total)/(count*10); | |
279 | } | |
280 | ||
7c673cae FG |
281 | // Measure the cost of creating an iterator and iterating over 10 |
282 | // chunks in a buffer. | |
283 | double buffer_iterator() | |
284 | { | |
285 | bufferlist b; | |
286 | const char s[] = "abcdefghijklmnopqrstuvwxyz"; | |
287 | bufferptr ptr(s, sizeof(s)); | |
288 | for (int i = 0; i < 5; i++) { | |
289 | b.append(ptr, i, 5); | |
290 | } | |
291 | int count = 100000; | |
292 | int sum = 0; | |
293 | uint64_t start = Cycles::rdtsc(); | |
294 | for (int i = 0; i < count; i++) { | |
11fdf7f2 | 295 | auto it = b.cbegin(); |
7c673cae FG |
296 | while (!it.end()) { |
297 | sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1]; | |
298 | ++it; | |
299 | } | |
300 | } | |
301 | uint64_t stop = Cycles::rdtsc(); | |
302 | discard(&sum); | |
303 | return Cycles::to_seconds(stop - start)/count; | |
304 | } | |
305 | ||
306 | // Implements the CondPingPong test. | |
307 | class CondPingPong { | |
9f95a23c TL |
308 | ceph::mutex mutex = ceph::make_mutex("CondPingPong::mutex"); |
309 | ceph::condition_variable cond; | |
310 | int prod = 0; | |
311 | int cons = 0; | |
312 | const int count = 10000; | |
7c673cae FG |
313 | |
314 | class Consumer : public Thread { | |
315 | CondPingPong *p; | |
316 | public: | |
317 | explicit Consumer(CondPingPong *p): p(p) {} | |
318 | void* entry() override { | |
319 | p->consume(); | |
320 | return 0; | |
321 | } | |
322 | } consumer; | |
323 | ||
324 | public: | |
9f95a23c | 325 | CondPingPong(): consumer(this) {} |
7c673cae FG |
326 | |
327 | double run() { | |
328 | consumer.create("consumer"); | |
329 | uint64_t start = Cycles::rdtsc(); | |
330 | produce(); | |
331 | uint64_t stop = Cycles::rdtsc(); | |
332 | consumer.join(); | |
333 | return Cycles::to_seconds(stop - start)/count; | |
334 | } | |
335 | ||
336 | void produce() { | |
9f95a23c | 337 | std::unique_lock l{mutex}; |
7c673cae | 338 | while (cons < count) { |
9f95a23c | 339 | cond.wait(l, [this] { return cons >= prod; }); |
7c673cae | 340 | ++prod; |
9f95a23c | 341 | cond.notify_all(); |
7c673cae FG |
342 | } |
343 | } | |
344 | ||
345 | void consume() { | |
9f95a23c | 346 | std::unique_lock l{mutex}; |
7c673cae | 347 | while (cons < count) { |
9f95a23c | 348 | cond.wait(l, [this] { return cons != prod; }); |
7c673cae | 349 | ++cons; |
9f95a23c | 350 | cond.notify_all(); |
7c673cae FG |
351 | } |
352 | } | |
353 | }; | |
354 | ||
355 | // Measure the cost of coordinating between threads using a condition variable. | |
356 | double cond_ping_pong() | |
357 | { | |
358 | return CondPingPong().run(); | |
359 | } | |
360 | ||
361 | // Measure the cost of a 32-bit divide. Divides don't take a constant | |
362 | // number of cycles. Values were chosen here semi-randomly to depict a | |
363 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
364 | // probably pick worse values. | |
365 | double div32() | |
366 | { | |
367 | #if defined(__i386__) || defined(__x86_64__) | |
368 | int count = 1000000; | |
369 | uint64_t start = Cycles::rdtsc(); | |
370 | // NB: Expect an x86 processor exception is there's overflow. | |
371 | uint32_t numeratorHi = 0xa5a5a5a5U; | |
372 | uint32_t numeratorLo = 0x55aa55aaU; | |
373 | uint32_t divisor = 0xaa55aa55U; | |
374 | uint32_t quotient; | |
375 | uint32_t remainder; | |
376 | for (int i = 0; i < count; i++) { | |
377 | __asm__ __volatile__("div %4" : | |
378 | "=a"(quotient), "=d"(remainder) : | |
379 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
380 | "cc"); | |
381 | } | |
382 | uint64_t stop = Cycles::rdtsc(); | |
383 | return Cycles::to_seconds(stop - start)/count; | |
384 | #else | |
385 | return -1; | |
386 | #endif | |
387 | } | |
388 | ||
389 | // Measure the cost of a 64-bit divide. Divides don't take a constant | |
390 | // number of cycles. Values were chosen here semi-randomly to depict a | |
391 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
392 | // probably pick worse values. | |
393 | double div64() | |
394 | { | |
395 | #if defined(__x86_64__) || defined(__amd64__) | |
396 | int count = 1000000; | |
397 | // NB: Expect an x86 processor exception is there's overflow. | |
398 | uint64_t start = Cycles::rdtsc(); | |
399 | uint64_t numeratorHi = 0x5a5a5a5a5a5UL; | |
400 | uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; | |
401 | uint64_t divisor = 0xaa55aa55aa55aa55UL; | |
402 | uint64_t quotient; | |
403 | uint64_t remainder; | |
404 | for (int i = 0; i < count; i++) { | |
405 | __asm__ __volatile__("divq %4" : | |
406 | "=a"(quotient), "=d"(remainder) : | |
407 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
408 | "cc"); | |
409 | } | |
410 | uint64_t stop = Cycles::rdtsc(); | |
411 | return Cycles::to_seconds(stop - start)/count; | |
412 | #else | |
413 | return -1; | |
414 | #endif | |
415 | } | |
416 | ||
417 | // Measure the cost of calling a non-inlined function. | |
418 | double function_call() | |
419 | { | |
420 | int count = 1000000; | |
421 | uint64_t x = 0; | |
422 | uint64_t start = Cycles::rdtsc(); | |
423 | for (int i = 0; i < count; i++) { | |
424 | x = PerfHelper::plus_one(x); | |
425 | } | |
426 | uint64_t stop = Cycles::rdtsc(); | |
427 | return Cycles::to_seconds(stop - start)/count; | |
428 | } | |
429 | ||
430 | // Measure the minimum cost of EventCenter::process_events, when there are no | |
431 | // Pollers and no Timers. | |
432 | double eventcenter_poll() | |
433 | { | |
434 | int count = 1000000; | |
435 | EventCenter center(g_ceph_context); | |
436 | center.init(1000, 0, "posix"); | |
437 | center.set_owner(); | |
438 | uint64_t start = Cycles::rdtsc(); | |
439 | for (int i = 0; i < count; i++) { | |
440 | center.process_events(0); | |
441 | } | |
442 | uint64_t stop = Cycles::rdtsc(); | |
443 | return Cycles::to_seconds(stop - start)/count; | |
444 | } | |
445 | ||
446 | class CenterWorker : public Thread { | |
447 | CephContext *cct; | |
448 | bool done; | |
449 | ||
450 | public: | |
451 | EventCenter center; | |
452 | explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) { | |
453 | center.init(100, 0, "posix"); | |
454 | } | |
455 | void stop() { | |
456 | done = true; | |
457 | center.wakeup(); | |
458 | } | |
459 | void* entry() override { | |
460 | center.set_owner(); | |
461 | bind_thread_to_cpu(2); | |
462 | while (!done) | |
463 | center.process_events(1000); | |
464 | return 0; | |
465 | } | |
466 | }; | |
467 | ||
468 | class CountEvent: public EventCallback { | |
31f18b77 | 469 | std::atomic<int64_t> *count; |
7c673cae FG |
470 | |
471 | public: | |
31f18b77 | 472 | explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {} |
11fdf7f2 | 473 | void do_request(uint64_t id) override { |
31f18b77 | 474 | (*count)--; |
7c673cae FG |
475 | } |
476 | }; | |
477 | ||
478 | double eventcenter_dispatch() | |
479 | { | |
480 | int count = 100000; | |
481 | ||
482 | CenterWorker worker(g_ceph_context); | |
31f18b77 | 483 | std::atomic<int64_t> flag = { 1 }; |
7c673cae FG |
484 | worker.create("evt_center_disp"); |
485 | EventCallbackRef count_event(new CountEvent(&flag)); | |
486 | ||
487 | worker.center.dispatch_event_external(count_event); | |
488 | // Start a new thread and wait for it to ready. | |
31f18b77 | 489 | while (flag) |
7c673cae FG |
490 | usleep(100); |
491 | ||
492 | uint64_t start = Cycles::rdtsc(); | |
493 | for (int i = 0; i < count; i++) { | |
31f18b77 | 494 | flag = 1; |
7c673cae | 495 | worker.center.dispatch_event_external(count_event); |
31f18b77 | 496 | while (flag) |
7c673cae FG |
497 | ; |
498 | } | |
499 | uint64_t stop = Cycles::rdtsc(); | |
500 | worker.stop(); | |
501 | worker.join(); | |
502 | return Cycles::to_seconds(stop - start)/count; | |
503 | } | |
504 | ||
505 | // Measure the cost of copying a given number of bytes with memcpy. | |
506 | double memcpy_shared(size_t size) | |
507 | { | |
508 | int count = 1000000; | |
509 | char src[size], dst[size]; | |
510 | ||
511 | memset(src, 0, sizeof(src)); | |
512 | ||
513 | uint64_t start = Cycles::rdtsc(); | |
514 | for (int i = 0; i < count; i++) { | |
515 | memcpy(dst, src, size); | |
516 | } | |
517 | uint64_t stop = Cycles::rdtsc(); | |
518 | return Cycles::to_seconds(stop - start)/count; | |
519 | } | |
520 | ||
521 | double memcpy100() | |
522 | { | |
523 | return memcpy_shared(100); | |
524 | } | |
525 | ||
526 | double memcpy1000() | |
527 | { | |
528 | return memcpy_shared(1000); | |
529 | } | |
530 | ||
531 | double memcpy10000() | |
532 | { | |
533 | return memcpy_shared(10000); | |
534 | } | |
535 | ||
536 | // Benchmark rjenkins hashing performance on cached data. | |
537 | template <int key_length> | |
538 | double ceph_str_hash_rjenkins() | |
539 | { | |
540 | int count = 100000; | |
541 | char buf[key_length]; | |
542 | ||
543 | uint64_t start = Cycles::rdtsc(); | |
544 | for (int i = 0; i < count; i++) | |
545 | ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); | |
546 | uint64_t stop = Cycles::rdtsc(); | |
547 | ||
548 | return Cycles::to_seconds(stop - start)/count; | |
549 | } | |
550 | ||
551 | // Measure the cost of reading the fine-grain cycle counter. | |
552 | double rdtsc_test() | |
553 | { | |
554 | int count = 1000000; | |
555 | uint64_t start = Cycles::rdtsc(); | |
556 | uint64_t total = 0; | |
557 | for (int i = 0; i < count; i++) { | |
558 | total += Cycles::rdtsc(); | |
559 | } | |
560 | uint64_t stop = Cycles::rdtsc(); | |
561 | return Cycles::to_seconds(stop - start)/count; | |
562 | } | |
563 | ||
564 | // Measure the cost of the Cycles::to_seconds method. | |
565 | double perf_cycles_to_seconds() | |
566 | { | |
567 | int count = 1000000; | |
568 | double total = 0; | |
569 | uint64_t cycles = 994261; | |
570 | uint64_t start = Cycles::rdtsc(); | |
571 | for (int i = 0; i < count; i++) { | |
572 | total += Cycles::to_seconds(cycles); | |
573 | } | |
574 | uint64_t stop = Cycles::rdtsc(); | |
575 | // printf("Result: %.4f\n", total/count); | |
576 | return Cycles::to_seconds(stop - start)/count; | |
577 | } | |
578 | ||
579 | // Measure the cost of the Cylcles::toNanoseconds method. | |
580 | double perf_cycles_to_nanoseconds() | |
581 | { | |
582 | int count = 1000000; | |
583 | uint64_t total = 0; | |
584 | uint64_t cycles = 994261; | |
585 | uint64_t start = Cycles::rdtsc(); | |
586 | for (int i = 0; i < count; i++) { | |
587 | total += Cycles::to_nanoseconds(cycles); | |
588 | } | |
589 | uint64_t stop = Cycles::rdtsc(); | |
590 | // printf("Result: %lu\n", total/count); | |
591 | return Cycles::to_seconds(stop - start)/count; | |
592 | } | |
593 | ||
594 | ||
#ifdef HAVE_SSE
/**
 * Issue a prefetch (hint T0) for every 64-byte line that overlaps
 * [object, object + num_bytes).
 * The Intel instruction set reference entry for PREFETCH describes the
 * underlying instruction.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of `object` past the preceding 64-byte boundary.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  for (uint64_t done = 0; done < span; done += 64)
    _mm_prefetch(line + done, _MM_HINT_T0);
}
#endif
614 | ||
// Measure the cost of the prefetch instruction. Each round calls
// PerfHelper::flush_cache() and then times 16 prefetches, one per 64-byte
// slice of a 1 KiB stack buffer, issued in a scattered order. Returns -1
// when built without HAVE_SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t elapsed = 0;
  const int rounds = 10;
  char buf[16 * 64];

  for (int r = 0; r < rounds; ++r) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0], 64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64], 64);
    prefetch(&buf[192], 64);
    const uint64_t end = Cycles::rdtsc();
    elapsed += end - begin;
  }
  // Average over the rounds, then over the 16 prefetches in each round.
  return Cycles::to_seconds(elapsed) / rounds / 16;
#else
  return -1;
#endif
}
650 | ||
#if defined(__x86_64__)
/**
 * This function is used to serialize machine instructions so that no
 * instructions that appear after it in the current thread can run before
 * any instructions that appear before it. It does so by executing cpuid
 * (leaf 1) and ignoring the result.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution
 * from instructions supposed to be executing before the timer starts.
 */
static inline void serialize() {
  uint32_t a, b, c, d;
  __asm volatile("cpuid"
                 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                 : "a" (1U));
}
#endif
668 | ||
669 | // Measure the cost of cpuid | |
670 | double perf_serialize() { | |
671 | #if defined(__x86_64__) | |
672 | int count = 1000000; | |
673 | uint64_t start = Cycles::rdtsc(); | |
674 | for (int i = 0; i < count; i++) { | |
675 | serialize(); | |
676 | } | |
677 | uint64_t stop = Cycles::rdtsc(); | |
678 | return Cycles::to_seconds(stop - start)/count; | |
679 | #else | |
680 | return -1; | |
681 | #endif | |
682 | } | |
683 | ||
// Measure the cost of an lfence instruction. Returns -1 when built without
// HAVE_SSE2.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin)/iterations;
#else
  return -1;
#endif
}
699 | ||
// Measure the cost of an sfence instruction. Returns -1 when built without
// HAVE_SSE.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t end = Cycles::rdtsc();
  return Cycles::to_seconds(end - begin)/iterations;
#else
  return -1;
#endif
}
715 | ||
716 | // Measure the cost of acquiring and releasing a SpinLock (assuming the | |
717 | // lock is initially free). | |
718 | double test_spinlock() | |
719 | { | |
720 | int count = 1000000; | |
11fdf7f2 | 721 | ceph::spinlock lock; |
7c673cae FG |
722 | uint64_t start = Cycles::rdtsc(); |
723 | for (int i = 0; i < count; i++) { | |
724 | lock.lock(); | |
725 | lock.unlock(); | |
726 | } | |
727 | uint64_t stop = Cycles::rdtsc(); | |
728 | return Cycles::to_seconds(stop - start)/count; | |
729 | } | |
730 | ||
731 | // Helper for spawn_thread. This is the main function that the thread executes | |
732 | // (intentionally empty). | |
733 | class ThreadHelper : public Thread { | |
734 | void *entry() override { return 0; } | |
735 | }; | |
736 | ||
737 | // Measure the cost of start and joining with a thread. | |
738 | double spawn_thread() | |
739 | { | |
740 | int count = 10000; | |
741 | ThreadHelper thread; | |
742 | uint64_t start = Cycles::rdtsc(); | |
743 | for (int i = 0; i < count; i++) { | |
744 | thread.create("thread_helper"); | |
745 | thread.join(); | |
746 | } | |
747 | uint64_t stop = Cycles::rdtsc(); | |
748 | return Cycles::to_seconds(stop - start)/count; | |
749 | } | |
750 | ||
751 | class FakeContext : public Context { | |
752 | public: | |
753 | void finish(int r) override {} | |
754 | }; | |
755 | ||
756 | // Measure the cost of starting and stopping a Dispatch::Timer. | |
757 | double perf_timer() | |
758 | { | |
759 | int count = 1000000; | |
9f95a23c | 760 | ceph::mutex lock = ceph::make_mutex("perf_timer::lock"); |
7c673cae FG |
761 | SafeTimer timer(g_ceph_context, lock); |
762 | FakeContext **c = new FakeContext*[count]; | |
763 | for (int i = 0; i < count; i++) { | |
764 | c[i] = new FakeContext(); | |
765 | } | |
766 | uint64_t start = Cycles::rdtsc(); | |
9f95a23c | 767 | std::lock_guard l{lock}; |
7c673cae | 768 | for (int i = 0; i < count; i++) { |
3efd9988 FG |
769 | if (timer.add_event_after(12345, c[i])) { |
770 | timer.cancel_event(c[i]); | |
771 | } | |
7c673cae FG |
772 | } |
773 | uint64_t stop = Cycles::rdtsc(); | |
774 | delete[] c; | |
775 | return Cycles::to_seconds(stop - start)/count; | |
776 | } | |
777 | ||
778 | // Measure the cost of throwing and catching an int. This uses an integer as | |
779 | // the value thrown, which is presumably as fast as possible. | |
780 | double throw_int() | |
781 | { | |
782 | int count = 10000; | |
783 | uint64_t start = Cycles::rdtsc(); | |
784 | for (int i = 0; i < count; i++) { | |
785 | try { | |
786 | throw 0; | |
787 | } catch (int) { // NOLINT | |
788 | // pass | |
789 | } | |
790 | } | |
791 | uint64_t stop = Cycles::rdtsc(); | |
792 | return Cycles::to_seconds(stop - start)/count; | |
793 | } | |
794 | ||
795 | // Measure the cost of throwing and catching an int from a function call. | |
796 | double throw_int_call() | |
797 | { | |
798 | int count = 10000; | |
799 | uint64_t start = Cycles::rdtsc(); | |
800 | for (int i = 0; i < count; i++) { | |
801 | try { | |
802 | PerfHelper::throw_int(); | |
803 | } catch (int) { // NOLINT | |
804 | // pass | |
805 | } | |
806 | } | |
807 | uint64_t stop = Cycles::rdtsc(); | |
808 | return Cycles::to_seconds(stop - start)/count; | |
809 | } | |
810 | ||
811 | // Measure the cost of throwing and catching an Exception. This uses an actual | |
812 | // exception as the value thrown, which may be slower than throwInt. | |
813 | double throw_exception() | |
814 | { | |
815 | int count = 10000; | |
816 | uint64_t start = Cycles::rdtsc(); | |
817 | for (int i = 0; i < count; i++) { | |
818 | try { | |
819 | throw buffer::end_of_buffer(); | |
820 | } catch (const buffer::end_of_buffer&) { | |
821 | // pass | |
822 | } | |
823 | } | |
824 | uint64_t stop = Cycles::rdtsc(); | |
825 | return Cycles::to_seconds(stop - start)/count; | |
826 | } | |
827 | ||
828 | // Measure the cost of throwing and catching an Exception from a function call. | |
829 | double throw_exception_call() | |
830 | { | |
831 | int count = 10000; | |
832 | uint64_t start = Cycles::rdtsc(); | |
833 | for (int i = 0; i < count; i++) { | |
834 | try { | |
835 | PerfHelper::throw_end_of_buffer(); | |
836 | } catch (const buffer::end_of_buffer&) { | |
837 | // pass | |
838 | } | |
839 | } | |
840 | uint64_t stop = Cycles::rdtsc(); | |
841 | return Cycles::to_seconds(stop - start)/count; | |
842 | } | |
843 | ||
844 | // Measure the cost of pushing a new element on a std::vector, copying | |
845 | // from the end to an internal element, and popping the end element. | |
846 | double vector_push_pop() | |
847 | { | |
848 | int count = 100000; | |
849 | std::vector<int> vector; | |
850 | vector.push_back(1); | |
851 | vector.push_back(2); | |
852 | vector.push_back(3); | |
853 | uint64_t start = Cycles::rdtsc(); | |
854 | for (int i = 0; i < count; i++) { | |
855 | vector.push_back(i); | |
856 | vector.push_back(i+1); | |
857 | vector.push_back(i+2); | |
858 | vector[2] = vector.back(); | |
859 | vector.pop_back(); | |
860 | vector[0] = vector.back(); | |
861 | vector.pop_back(); | |
862 | vector[1] = vector.back(); | |
863 | vector.pop_back(); | |
864 | } | |
865 | uint64_t stop = Cycles::rdtsc(); | |
866 | return Cycles::to_seconds(stop - start)/(count*3); | |
867 | } | |
868 | ||
869 | // Measure the cost of ceph_clock_now | |
870 | double perf_ceph_clock_now() | |
871 | { | |
872 | int count = 100000; | |
873 | uint64_t start = Cycles::rdtsc(); | |
874 | for (int i = 0; i < count; i++) { | |
875 | ceph_clock_now(); | |
876 | } | |
877 | uint64_t stop = Cycles::rdtsc(); | |
878 | return Cycles::to_seconds(stop - start)/count; | |
879 | } | |
880 | ||
// The following struct and table define each performance test in terms of
// a string name and a function that implements the test.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
894 | TestInfo tests[] = { | |
895 | {"atomic_int_cmp", atomic_int_cmp, | |
896 | "atomic_t::compare_and_swap"}, | |
897 | {"atomic_int_inc", atomic_int_inc, | |
898 | "atomic_t::inc"}, | |
899 | {"atomic_int_read", atomic_int_read, | |
900 | "atomic_t::read"}, | |
901 | {"atomic_int_set", atomic_int_set, | |
902 | "atomic_t::set"}, | |
903 | {"mutex_nonblock", mutex_nonblock, | |
904 | "Mutex lock/unlock (no blocking)"}, | |
905 | {"buffer_basic", buffer_basic, | |
906 | "buffer create, add one ptr, delete"}, | |
907 | {"buffer_encode_decode", buffer_encode_decode, | |
908 | "buffer create, encode/decode object, delete"}, | |
909 | {"buffer_basic_copy", buffer_basic_copy, | |
910 | "buffer create, copy small block, delete"}, | |
911 | {"buffer_copy", buffer_copy, | |
912 | "copy out 2 small ptrs from buffer"}, | |
913 | {"buffer_encode10", buffer_encode, | |
914 | "buffer encoding 10 structures onto existing ptr"}, | |
7c673cae FG |
915 | {"buffer_iterator", buffer_iterator, |
916 | "iterate over buffer with 5 ptrs"}, | |
917 | {"cond_ping_pong", cond_ping_pong, | |
918 | "condition variable round-trip"}, | |
919 | {"div32", div32, | |
920 | "32-bit integer division instruction"}, | |
921 | {"div64", div64, | |
922 | "64-bit integer division instruction"}, | |
923 | {"function_call", function_call, | |
924 | "Call a function that has not been inlined"}, | |
925 | {"eventcenter_poll", eventcenter_poll, | |
926 | "EventCenter::process_events (no timers or events)"}, | |
927 | {"eventcenter_dispatch", eventcenter_dispatch, | |
928 | "EventCenter::dispatch_event_external latency"}, | |
929 | {"memcpy100", memcpy100, | |
930 | "Copy 100 bytes with memcpy"}, | |
931 | {"memcpy1000", memcpy1000, | |
932 | "Copy 1000 bytes with memcpy"}, | |
933 | {"memcpy10000", memcpy10000, | |
934 | "Copy 10000 bytes with memcpy"}, | |
935 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, | |
936 | "rjenkins hash on 16 byte of data"}, | |
937 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, | |
938 | "rjenkins hash on 256 bytes of data"}, | |
939 | {"rdtsc", rdtsc_test, | |
940 | "Read the fine-grain cycle counter"}, | |
941 | {"cycles_to_seconds", perf_cycles_to_seconds, | |
942 | "Convert a rdtsc result to (double) seconds"}, | |
943 | {"cycles_to_seconds", perf_cycles_to_nanoseconds, | |
944 | "Convert a rdtsc result to (uint64_t) nanoseconds"}, | |
945 | {"prefetch", perf_prefetch, | |
946 | "Prefetch instruction"}, | |
947 | {"serialize", perf_serialize, | |
948 | "serialize instruction"}, | |
949 | {"lfence", lfence, | |
950 | "Lfence instruction"}, | |
951 | {"sfence", sfence, | |
952 | "Sfence instruction"}, | |
953 | {"spin_lock", test_spinlock, | |
954 | "Acquire/release SpinLock"}, | |
955 | {"spawn_thread", spawn_thread, | |
956 | "Start and stop a thread"}, | |
957 | {"perf_timer", perf_timer, | |
958 | "Insert and cancel a SafeTimer"}, | |
959 | {"throw_int", throw_int, | |
960 | "Throw an int"}, | |
961 | {"throw_int_call", throw_int_call, | |
962 | "Throw an int in a function call"}, | |
963 | {"throw_exception", throw_exception, | |
964 | "Throw an Exception"}, | |
965 | {"throw_exception_call", throw_exception_call, | |
966 | "Throw an Exception in a function call"}, | |
967 | {"vector_push_pop", vector_push_pop, | |
968 | "Push and pop a std::vector"}, | |
969 | {"ceph_clock_now", perf_ceph_clock_now, | |
970 | "ceph_clock_now function"}, | |
971 | }; | |
972 | ||
973 | /** | |
974 | * Runs a particular test and prints a one-line result message. | |
975 | * | |
976 | * \param info | |
977 | * Describes the test to run. | |
978 | */ | |
979 | void run_test(TestInfo& info) | |
980 | { | |
981 | double secs = info.func(); | |
982 | int width = printf("%-24s ", info.name); | |
983 | if (secs == -1) { | |
984 | width += printf(" architecture nonsupport "); | |
985 | } else if (secs < 1.0e-06) { | |
986 | width += printf("%8.2fns", 1e09*secs); | |
987 | } else if (secs < 1.0e-03) { | |
988 | width += printf("%8.2fus", 1e06*secs); | |
989 | } else if (secs < 1.0) { | |
990 | width += printf("%8.2fms", 1e03*secs); | |
991 | } else { | |
992 | width += printf("%8.2fs", secs); | |
993 | } | |
994 | printf("%*s %s\n", 32-width, "", info.description); | |
995 | } | |
996 | ||
997 | int main(int argc, char *argv[]) | |
998 | { | |
999 | vector<const char*> args; | |
1000 | argv_to_vec(argc, (const char **)argv, args); | |
1001 | ||
1002 | auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, | |
11fdf7f2 TL |
1003 | CODE_ENVIRONMENT_UTILITY, |
1004 | CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); | |
7c673cae FG |
1005 | common_init_finish(g_ceph_context); |
1006 | Cycles::init(); | |
1007 | ||
1008 | bind_thread_to_cpu(3); | |
1009 | if (argc == 1) { | |
1010 | // No test names specified; run all tests. | |
1011 | for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { | |
1012 | run_test(tests[i]); | |
1013 | } | |
1014 | } else { | |
1015 | // Run only the tests that were specified on the command line. | |
1016 | for (int i = 1; i < argc; i++) { | |
1017 | bool found_test = false; | |
1018 | for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { | |
1019 | if (strcmp(argv[i], tests[j].name) == 0) { | |
1020 | found_test = true; | |
1021 | run_test(tests[j]); | |
1022 | break; | |
1023 | } | |
1024 | } | |
1025 | if (!found_test) { | |
1026 | int width = printf("%-24s ??", argv[i]); | |
1027 | printf("%*s No such test\n", 32-width, ""); | |
1028 | } | |
1029 | } | |
1030 | } | |
1031 | } |