]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com> | |
4 | * Copyright (c) 2011-2014 Stanford University | |
5 | * Copyright (c) 2011 Facebook | |
6 | * | |
7 | * Permission to use, copy, modify, and distribute this software for any | |
8 | * purpose with or without fee is hereby granted, provided that the above | |
9 | * copyright notice and this permission notice appear in all copies. | |
10 | * | |
11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES | |
12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR | |
14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
18 | */ | |
19 | ||
20 | // This program contains a collection of low-level performance measurements | |
21 | // for Ceph, which can be run either individually or altogether. These | |
22 | // tests measure performance in a single stand-alone process, not in a cluster | |
23 | // with multiple servers. Invoke the program like this: | |
24 | // | |
25 | // Perf test1 test2 ... | |
26 | // | |
27 | // test1 and test2 are the names of individual performance measurements to | |
28 | // run. If no test names are provided then all of the performance tests | |
29 | // are run. | |
30 | // | |
31 | // To add a new test: | |
32 | // * Write a function that implements the test. Use existing test functions | |
33 | // as a guideline, and be sure to generate output in the same form as | |
34 | // other tests. | |
35 | // * Create a new entry for the test in the #tests table. | |
36 | #include <vector> | |
37 | #include <sched.h> | |
38 | ||
39 | #include "acconfig.h" | |
40 | #ifdef HAVE_SSE | |
41 | #include <xmmintrin.h> | |
42 | #endif | |
43 | ||
7c673cae FG |
44 | #include "include/buffer.h" |
45 | #include "include/encoding.h" | |
46 | #include "include/ceph_hash.h" | |
47 | #include "include/Spinlock.h" | |
48 | #include "common/ceph_argparse.h" | |
49 | #include "common/Cycles.h" | |
50 | #include "common/Cond.h" | |
51 | #include "common/Mutex.h" | |
52 | #include "common/Thread.h" | |
53 | #include "common/Timer.h" | |
54 | #include "msg/async/Event.h" | |
55 | #include "global/global_init.h" | |
56 | ||
57 | #include "test/perf_helper.h" | |
58 | ||
31f18b77 FG |
59 | #include <atomic> |
60 | ||
7c673cae FG |
61 | using namespace ceph; |
62 | ||
/**
 * Ask the operating system to pin the calling thread to a single CPU.
 *
 * This is best effort: when HAVE_SCHED is not defined the function does
 * nothing, and a failure reported by sched_setaffinity() is ignored.
 *
 * \param cpu
 *      Index of the CPU to run on. It is handed to CPU_SET() unchanged;
 *      no CPU/hyperthread bit-encoding is applied.
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  // An index outside the cpu_set_t range can never be selected, so there
  // is nothing useful to ask the kernel for.
  if (cpu < 0 || cpu >= CPU_SETSIZE)
    return;
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  // pid 0 selects the calling thread.  The result is deliberately ignored:
  // running unpinned only makes the measurements noisier.
  (void)sched_setaffinity(0, sizeof(set), &set);
#else
  (void)cpu;
#endif
}
79 | ||
/*
 * Pretend to consume a value.  Benchmarks pass their results here so the
 * compiler cannot treat the measured computation as dead code.
 *
 * \param value
 *      Pointer to arbitrary data; the first int-sized chunk is read and
 *      compared against a magic constant, and printed only on a match.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
94 | ||
95 | //---------------------------------------------------------------------- | |
96 | // Test functions start here | |
97 | //---------------------------------------------------------------------- | |
98 | ||
31f18b77 | 99 | // Measure the cost of atomic compare-and-swap |
7c673cae FG |
100 | double atomic_int_cmp() |
101 | { | |
102 | int count = 1000000; | |
31f18b77 FG |
103 | std::atomic<unsigned> value = { 11 }; |
104 | unsigned int test = 11; | |
7c673cae FG |
105 | uint64_t start = Cycles::rdtsc(); |
106 | for (int i = 0; i < count; i++) { | |
31f18b77 | 107 | value.compare_exchange_strong(test, test+2); |
7c673cae FG |
108 | test += 2; |
109 | } | |
110 | uint64_t stop = Cycles::rdtsc(); | |
111 | // printf("Final value: %d\n", value.load()); | |
112 | return Cycles::to_seconds(stop - start)/count; | |
113 | } | |
114 | ||
31f18b77 | 115 | // Measure the cost of incrementing an atomic |
7c673cae FG |
116 | double atomic_int_inc() |
117 | { | |
118 | int count = 1000000; | |
31f18b77 | 119 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
120 | uint64_t start = Cycles::rdtsc(); |
121 | for (int i = 0; i < count; i++) { | |
31f18b77 | 122 | value++; |
7c673cae FG |
123 | } |
124 | uint64_t stop = Cycles::rdtsc(); | |
125 | // printf("Final value: %d\n", value.load()); | |
126 | return Cycles::to_seconds(stop - start)/count; | |
127 | } | |
128 | ||
31f18b77 | 129 | // Measure the cost of reading an atomic |
7c673cae FG |
130 | double atomic_int_read() |
131 | { | |
132 | int count = 1000000; | |
31f18b77 | 133 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
134 | int total = 0; |
135 | uint64_t start = Cycles::rdtsc(); | |
136 | for (int i = 0; i < count; i++) { | |
31f18b77 | 137 | total += value; |
7c673cae FG |
138 | } |
139 | uint64_t stop = Cycles::rdtsc(); | |
140 | // printf("Total: %d\n", total); | |
141 | return Cycles::to_seconds(stop - start)/count; | |
142 | } | |
143 | ||
31f18b77 | 144 | // Measure the cost of storing a new value in an atomic |
7c673cae FG |
145 | double atomic_int_set() |
146 | { | |
147 | int count = 1000000; | |
31f18b77 | 148 | std::atomic<int64_t> value = { 11 }; |
7c673cae FG |
149 | uint64_t start = Cycles::rdtsc(); |
150 | for (int i = 0; i < count; i++) { | |
31f18b77 | 151 | value = 88; |
7c673cae FG |
152 | } |
153 | uint64_t stop = Cycles::rdtsc(); | |
154 | return Cycles::to_seconds(stop - start)/count; | |
155 | } | |
156 | ||
157 | // Measure the cost of acquiring and releasing a mutex in the | |
158 | // fast case where the mutex is free. | |
159 | double mutex_nonblock() | |
160 | { | |
161 | int count = 1000000; | |
162 | Mutex m("mutex_nonblock::m"); | |
163 | uint64_t start = Cycles::rdtsc(); | |
164 | for (int i = 0; i < count; i++) { | |
165 | m.Lock(); | |
166 | m.Unlock(); | |
167 | } | |
168 | uint64_t stop = Cycles::rdtsc(); | |
169 | return Cycles::to_seconds(stop - start)/count; | |
170 | } | |
171 | ||
172 | // Measure the cost of allocating and deallocating a buffer, plus | |
173 | // appending (logically) one ptr. | |
174 | double buffer_basic() | |
175 | { | |
176 | int count = 1000000; | |
177 | uint64_t start = Cycles::rdtsc(); | |
178 | bufferptr ptr("abcdefg", 7); | |
179 | for (int i = 0; i < count; i++) { | |
180 | bufferlist b; | |
181 | b.append(ptr, 0, 5); | |
182 | } | |
183 | uint64_t stop = Cycles::rdtsc(); | |
184 | return Cycles::to_seconds(stop - start)/count; | |
185 | } | |
186 | ||
187 | struct DummyBlock { | |
188 | int a = 1, b = 2, c = 3, d = 4; | |
189 | void encode(bufferlist &bl) const { | |
190 | ENCODE_START(1, 1, bl); | |
191 | ::encode(a, bl); | |
192 | ::encode(b, bl); | |
193 | ::encode(c, bl); | |
194 | ::encode(d, bl); | |
195 | ENCODE_FINISH(bl); | |
196 | } | |
197 | void decode(bufferlist::iterator &bl) { | |
198 | DECODE_START(1, bl); | |
199 | ::decode(a, bl); | |
200 | ::decode(b, bl); | |
201 | ::decode(c, bl); | |
202 | ::decode(d, bl); | |
203 | DECODE_FINISH(bl); | |
204 | } | |
205 | }; | |
206 | WRITE_CLASS_ENCODER(DummyBlock) | |
207 | ||
208 | // Measure the cost of encoding and decoding a buffer, plus | |
209 | // allocating space for one chunk. | |
210 | double buffer_encode_decode() | |
211 | { | |
212 | int count = 1000000; | |
213 | uint64_t start = Cycles::rdtsc(); | |
214 | for (int i = 0; i < count; i++) { | |
215 | bufferlist b; | |
216 | DummyBlock dummy_block; | |
217 | ::encode(dummy_block, b); | |
218 | bufferlist::iterator iter = b.begin(); | |
219 | ::decode(dummy_block, iter); | |
220 | } | |
221 | uint64_t stop = Cycles::rdtsc(); | |
222 | return Cycles::to_seconds(stop - start)/count; | |
223 | } | |
224 | ||
225 | // Measure the cost of allocating and deallocating a buffer, plus | |
226 | // copying in a small block. | |
227 | double buffer_basic_copy() | |
228 | { | |
229 | int count = 1000000; | |
230 | uint64_t start = Cycles::rdtsc(); | |
231 | for (int i = 0; i < count; i++) { | |
232 | bufferlist b; | |
233 | b.append("abcdefg", 6); | |
234 | } | |
235 | uint64_t stop = Cycles::rdtsc(); | |
236 | return Cycles::to_seconds(stop - start)/count; | |
237 | } | |
238 | ||
239 | // Measure the cost of making a copy of parts of two ptrs. | |
240 | double buffer_copy() | |
241 | { | |
242 | int count = 1000000; | |
243 | bufferlist b; | |
244 | b.append("abcde", 5); | |
245 | b.append("01234", 5); | |
246 | char copy[10]; | |
247 | uint64_t start = Cycles::rdtsc(); | |
248 | for (int i = 0; i < count; i++) { | |
249 | b.copy(2, 6, copy); | |
250 | } | |
251 | uint64_t stop = Cycles::rdtsc(); | |
252 | return Cycles::to_seconds(stop - start)/count; | |
253 | } | |
254 | ||
255 | // Measure the cost of allocating new space by extending the | |
256 | // bufferlist | |
257 | double buffer_encode() | |
258 | { | |
259 | int count = 100000; | |
260 | uint64_t total = 0; | |
261 | for (int i = 0; i < count; i++) { | |
262 | bufferlist b; | |
263 | DummyBlock dummy_block; | |
264 | ::encode(dummy_block, b); | |
265 | uint64_t start = Cycles::rdtsc(); | |
266 | ::encode(dummy_block, b); | |
267 | ::encode(dummy_block, b); | |
268 | ::encode(dummy_block, b); | |
269 | ::encode(dummy_block, b); | |
270 | ::encode(dummy_block, b); | |
271 | ::encode(dummy_block, b); | |
272 | ::encode(dummy_block, b); | |
273 | ::encode(dummy_block, b); | |
274 | ::encode(dummy_block, b); | |
275 | ::encode(dummy_block, b); | |
276 | total += Cycles::rdtsc() - start; | |
277 | } | |
278 | return Cycles::to_seconds(total)/(count*10); | |
279 | } | |
280 | ||
281 | // Measure the cost of retrieving an object from the beginning of a buffer. | |
282 | double buffer_get_contiguous() | |
283 | { | |
284 | int count = 1000000; | |
285 | int value = 11; | |
286 | bufferlist b; | |
287 | b.append((char*)&value, sizeof(value)); | |
288 | int sum = 0; | |
289 | uint64_t start = Cycles::rdtsc(); | |
290 | for (int i = 0; i < count; i++) { | |
291 | sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value))); | |
292 | } | |
293 | uint64_t stop = Cycles::rdtsc(); | |
294 | return Cycles::to_seconds(stop - start)/count; | |
295 | } | |
296 | ||
297 | // Measure the cost of creating an iterator and iterating over 10 | |
298 | // chunks in a buffer. | |
299 | double buffer_iterator() | |
300 | { | |
301 | bufferlist b; | |
302 | const char s[] = "abcdefghijklmnopqrstuvwxyz"; | |
303 | bufferptr ptr(s, sizeof(s)); | |
304 | for (int i = 0; i < 5; i++) { | |
305 | b.append(ptr, i, 5); | |
306 | } | |
307 | int count = 100000; | |
308 | int sum = 0; | |
309 | uint64_t start = Cycles::rdtsc(); | |
310 | for (int i = 0; i < count; i++) { | |
311 | bufferlist::iterator it = b.begin(); | |
312 | while (!it.end()) { | |
313 | sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1]; | |
314 | ++it; | |
315 | } | |
316 | } | |
317 | uint64_t stop = Cycles::rdtsc(); | |
318 | discard(&sum); | |
319 | return Cycles::to_seconds(stop - start)/count; | |
320 | } | |
321 | ||
322 | // Implements the CondPingPong test. | |
323 | class CondPingPong { | |
324 | Mutex mutex; | |
325 | Cond cond; | |
326 | int prod; | |
327 | int cons; | |
328 | const int count; | |
329 | ||
330 | class Consumer : public Thread { | |
331 | CondPingPong *p; | |
332 | public: | |
333 | explicit Consumer(CondPingPong *p): p(p) {} | |
334 | void* entry() override { | |
335 | p->consume(); | |
336 | return 0; | |
337 | } | |
338 | } consumer; | |
339 | ||
340 | public: | |
341 | CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {} | |
342 | ||
343 | double run() { | |
344 | consumer.create("consumer"); | |
345 | uint64_t start = Cycles::rdtsc(); | |
346 | produce(); | |
347 | uint64_t stop = Cycles::rdtsc(); | |
348 | consumer.join(); | |
349 | return Cycles::to_seconds(stop - start)/count; | |
350 | } | |
351 | ||
352 | void produce() { | |
353 | Mutex::Locker l(mutex); | |
354 | while (cons < count) { | |
355 | while (cons < prod) | |
356 | cond.Wait(mutex); | |
357 | ++prod; | |
358 | cond.Signal(); | |
359 | } | |
360 | } | |
361 | ||
362 | void consume() { | |
363 | Mutex::Locker l(mutex); | |
364 | while (cons < count) { | |
365 | while (cons == prod) | |
366 | cond.Wait(mutex); | |
367 | ++cons; | |
368 | cond.Signal(); | |
369 | } | |
370 | } | |
371 | }; | |
372 | ||
373 | // Measure the cost of coordinating between threads using a condition variable. | |
374 | double cond_ping_pong() | |
375 | { | |
376 | return CondPingPong().run(); | |
377 | } | |
378 | ||
379 | // Measure the cost of a 32-bit divide. Divides don't take a constant | |
380 | // number of cycles. Values were chosen here semi-randomly to depict a | |
381 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
382 | // probably pick worse values. | |
383 | double div32() | |
384 | { | |
385 | #if defined(__i386__) || defined(__x86_64__) | |
386 | int count = 1000000; | |
387 | uint64_t start = Cycles::rdtsc(); | |
388 | // NB: Expect an x86 processor exception is there's overflow. | |
389 | uint32_t numeratorHi = 0xa5a5a5a5U; | |
390 | uint32_t numeratorLo = 0x55aa55aaU; | |
391 | uint32_t divisor = 0xaa55aa55U; | |
392 | uint32_t quotient; | |
393 | uint32_t remainder; | |
394 | for (int i = 0; i < count; i++) { | |
395 | __asm__ __volatile__("div %4" : | |
396 | "=a"(quotient), "=d"(remainder) : | |
397 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
398 | "cc"); | |
399 | } | |
400 | uint64_t stop = Cycles::rdtsc(); | |
401 | return Cycles::to_seconds(stop - start)/count; | |
402 | #else | |
403 | return -1; | |
404 | #endif | |
405 | } | |
406 | ||
407 | // Measure the cost of a 64-bit divide. Divides don't take a constant | |
408 | // number of cycles. Values were chosen here semi-randomly to depict a | |
409 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
410 | // probably pick worse values. | |
411 | double div64() | |
412 | { | |
413 | #if defined(__x86_64__) || defined(__amd64__) | |
414 | int count = 1000000; | |
415 | // NB: Expect an x86 processor exception is there's overflow. | |
416 | uint64_t start = Cycles::rdtsc(); | |
417 | uint64_t numeratorHi = 0x5a5a5a5a5a5UL; | |
418 | uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; | |
419 | uint64_t divisor = 0xaa55aa55aa55aa55UL; | |
420 | uint64_t quotient; | |
421 | uint64_t remainder; | |
422 | for (int i = 0; i < count; i++) { | |
423 | __asm__ __volatile__("divq %4" : | |
424 | "=a"(quotient), "=d"(remainder) : | |
425 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
426 | "cc"); | |
427 | } | |
428 | uint64_t stop = Cycles::rdtsc(); | |
429 | return Cycles::to_seconds(stop - start)/count; | |
430 | #else | |
431 | return -1; | |
432 | #endif | |
433 | } | |
434 | ||
435 | // Measure the cost of calling a non-inlined function. | |
436 | double function_call() | |
437 | { | |
438 | int count = 1000000; | |
439 | uint64_t x = 0; | |
440 | uint64_t start = Cycles::rdtsc(); | |
441 | for (int i = 0; i < count; i++) { | |
442 | x = PerfHelper::plus_one(x); | |
443 | } | |
444 | uint64_t stop = Cycles::rdtsc(); | |
445 | return Cycles::to_seconds(stop - start)/count; | |
446 | } | |
447 | ||
448 | // Measure the minimum cost of EventCenter::process_events, when there are no | |
449 | // Pollers and no Timers. | |
450 | double eventcenter_poll() | |
451 | { | |
452 | int count = 1000000; | |
453 | EventCenter center(g_ceph_context); | |
454 | center.init(1000, 0, "posix"); | |
455 | center.set_owner(); | |
456 | uint64_t start = Cycles::rdtsc(); | |
457 | for (int i = 0; i < count; i++) { | |
458 | center.process_events(0); | |
459 | } | |
460 | uint64_t stop = Cycles::rdtsc(); | |
461 | return Cycles::to_seconds(stop - start)/count; | |
462 | } | |
463 | ||
464 | class CenterWorker : public Thread { | |
465 | CephContext *cct; | |
466 | bool done; | |
467 | ||
468 | public: | |
469 | EventCenter center; | |
470 | explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) { | |
471 | center.init(100, 0, "posix"); | |
472 | } | |
473 | void stop() { | |
474 | done = true; | |
475 | center.wakeup(); | |
476 | } | |
477 | void* entry() override { | |
478 | center.set_owner(); | |
479 | bind_thread_to_cpu(2); | |
480 | while (!done) | |
481 | center.process_events(1000); | |
482 | return 0; | |
483 | } | |
484 | }; | |
485 | ||
486 | class CountEvent: public EventCallback { | |
31f18b77 | 487 | std::atomic<int64_t> *count; |
7c673cae FG |
488 | |
489 | public: | |
31f18b77 | 490 | explicit CountEvent(std::atomic<int64_t> *atomic): count(atomic) {} |
7c673cae | 491 | void do_request(int id) override { |
31f18b77 | 492 | (*count)--; |
7c673cae FG |
493 | } |
494 | }; | |
495 | ||
496 | double eventcenter_dispatch() | |
497 | { | |
498 | int count = 100000; | |
499 | ||
500 | CenterWorker worker(g_ceph_context); | |
31f18b77 | 501 | std::atomic<int64_t> flag = { 1 }; |
7c673cae FG |
502 | worker.create("evt_center_disp"); |
503 | EventCallbackRef count_event(new CountEvent(&flag)); | |
504 | ||
505 | worker.center.dispatch_event_external(count_event); | |
506 | // Start a new thread and wait for it to ready. | |
31f18b77 | 507 | while (flag) |
7c673cae FG |
508 | usleep(100); |
509 | ||
510 | uint64_t start = Cycles::rdtsc(); | |
511 | for (int i = 0; i < count; i++) { | |
31f18b77 | 512 | flag = 1; |
7c673cae | 513 | worker.center.dispatch_event_external(count_event); |
31f18b77 | 514 | while (flag) |
7c673cae FG |
515 | ; |
516 | } | |
517 | uint64_t stop = Cycles::rdtsc(); | |
518 | worker.stop(); | |
519 | worker.join(); | |
520 | return Cycles::to_seconds(stop - start)/count; | |
521 | } | |
522 | ||
523 | // Measure the cost of copying a given number of bytes with memcpy. | |
524 | double memcpy_shared(size_t size) | |
525 | { | |
526 | int count = 1000000; | |
527 | char src[size], dst[size]; | |
528 | ||
529 | memset(src, 0, sizeof(src)); | |
530 | ||
531 | uint64_t start = Cycles::rdtsc(); | |
532 | for (int i = 0; i < count; i++) { | |
533 | memcpy(dst, src, size); | |
534 | } | |
535 | uint64_t stop = Cycles::rdtsc(); | |
536 | return Cycles::to_seconds(stop - start)/count; | |
537 | } | |
538 | ||
539 | double memcpy100() | |
540 | { | |
541 | return memcpy_shared(100); | |
542 | } | |
543 | ||
544 | double memcpy1000() | |
545 | { | |
546 | return memcpy_shared(1000); | |
547 | } | |
548 | ||
549 | double memcpy10000() | |
550 | { | |
551 | return memcpy_shared(10000); | |
552 | } | |
553 | ||
554 | // Benchmark rjenkins hashing performance on cached data. | |
555 | template <int key_length> | |
556 | double ceph_str_hash_rjenkins() | |
557 | { | |
558 | int count = 100000; | |
559 | char buf[key_length]; | |
560 | ||
561 | uint64_t start = Cycles::rdtsc(); | |
562 | for (int i = 0; i < count; i++) | |
563 | ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); | |
564 | uint64_t stop = Cycles::rdtsc(); | |
565 | ||
566 | return Cycles::to_seconds(stop - start)/count; | |
567 | } | |
568 | ||
569 | // Measure the cost of reading the fine-grain cycle counter. | |
570 | double rdtsc_test() | |
571 | { | |
572 | int count = 1000000; | |
573 | uint64_t start = Cycles::rdtsc(); | |
574 | uint64_t total = 0; | |
575 | for (int i = 0; i < count; i++) { | |
576 | total += Cycles::rdtsc(); | |
577 | } | |
578 | uint64_t stop = Cycles::rdtsc(); | |
579 | return Cycles::to_seconds(stop - start)/count; | |
580 | } | |
581 | ||
582 | // Measure the cost of the Cycles::to_seconds method. | |
583 | double perf_cycles_to_seconds() | |
584 | { | |
585 | int count = 1000000; | |
586 | double total = 0; | |
587 | uint64_t cycles = 994261; | |
588 | uint64_t start = Cycles::rdtsc(); | |
589 | for (int i = 0; i < count; i++) { | |
590 | total += Cycles::to_seconds(cycles); | |
591 | } | |
592 | uint64_t stop = Cycles::rdtsc(); | |
593 | // printf("Result: %.4f\n", total/count); | |
594 | return Cycles::to_seconds(stop - start)/count; | |
595 | } | |
596 | ||
597 | // Measure the cost of the Cylcles::toNanoseconds method. | |
598 | double perf_cycles_to_nanoseconds() | |
599 | { | |
600 | int count = 1000000; | |
601 | uint64_t total = 0; | |
602 | uint64_t cycles = 994261; | |
603 | uint64_t start = Cycles::rdtsc(); | |
604 | for (int i = 0; i < count; i++) { | |
605 | total += Cycles::to_nanoseconds(cycles); | |
606 | } | |
607 | uint64_t stop = Cycles::rdtsc(); | |
608 | // printf("Result: %lu\n", total/count); | |
609 | return Cycles::to_seconds(stop - start)/count; | |
610 | } | |
611 | ||
612 | ||
613 | #ifdef HAVE_SSE | |
/**
 * Issue a prefetch (T0 hint) for every 64-byte line that overlaps
 * [object, object + num_bytes).
 * The best docs for this are in the Intel instruction set reference under
 * PREFETCH.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance from the enclosing 64-byte boundary to `object`.
  const uint64_t misalign = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* line = reinterpret_cast<const char*>(object) - misalign;
  const uint64_t span = misalign + num_bytes;
  uint64_t covered = 0;
  while (covered < span) {
    _mm_prefetch(line + covered, _MM_HINT_T0);
    covered += 64;
  }
}
631 | #endif | |
632 | ||
// Measure the cost of the prefetch instruction.  Each round flushes the
// cache, then times sixteen prefetches of distinct 64-byte slices of a
// 1 KiB buffer, visited in a scrambled order.
// Returns the average time in seconds per prefetch, or -1 without SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t ticks = 0;
  const int rounds = 10;
  char buf[16 * 64];

  for (int r = 0; r < rounds; ++r) {
    PerfHelper::flush_cache();
    const uint64_t begin = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0], 64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64], 64);
    prefetch(&buf[192], 64);
    const uint64_t end = Cycles::rdtsc();
    ticks += end - begin;
  }
  return Cycles::to_seconds(ticks) / rounds / 16;
#else
  return -1;
#endif
}
669 | ||
670 | #if defined(__x86_64__) | |
/**
 * Serialize machine instructions by executing cpuid, so that no
 * instruction that appears after this call in the current thread can run
 * before any instruction that appears before it.
 *
 * It is useful for putting around rdpmc instructions (to pinpoint cache
 * misses) as well as before rdtsc instructions, to prevent time pollution
 * from instructions supposed to be executing before the timer starts.
 */
static inline void serialize() {
  // cpuid writes all four registers; the values themselves are not used.
  uint32_t out_a, out_b, out_c, out_d;
  __asm volatile("cpuid"
                 : "=a" (out_a), "=b" (out_b), "=c" (out_c), "=d" (out_d)
                 : "a" (1U));
}
686 | #endif | |
687 | ||
688 | // Measure the cost of cpuid | |
689 | double perf_serialize() { | |
690 | #if defined(__x86_64__) | |
691 | int count = 1000000; | |
692 | uint64_t start = Cycles::rdtsc(); | |
693 | for (int i = 0; i < count; i++) { | |
694 | serialize(); | |
695 | } | |
696 | uint64_t stop = Cycles::rdtsc(); | |
697 | return Cycles::to_seconds(stop - start)/count; | |
698 | #else | |
699 | return -1; | |
700 | #endif | |
701 | } | |
702 | ||
// Measure the cost of an lfence instruction.
// Returns the average time in seconds per lfence, or -1 without SSE2.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t elapsed = Cycles::rdtsc() - begin;
  return Cycles::to_seconds(elapsed) / iterations;
#else
  return -1;
#endif
}
718 | ||
// Measure the cost of an sfence instruction.
// Returns the average time in seconds per sfence, or -1 without SSE.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t elapsed = Cycles::rdtsc() - begin;
  return Cycles::to_seconds(elapsed) / iterations;
#else
  return -1;
#endif
}
734 | ||
735 | // Measure the cost of acquiring and releasing a SpinLock (assuming the | |
736 | // lock is initially free). | |
737 | double test_spinlock() | |
738 | { | |
739 | int count = 1000000; | |
740 | Spinlock lock; | |
741 | uint64_t start = Cycles::rdtsc(); | |
742 | for (int i = 0; i < count; i++) { | |
743 | lock.lock(); | |
744 | lock.unlock(); | |
745 | } | |
746 | uint64_t stop = Cycles::rdtsc(); | |
747 | return Cycles::to_seconds(stop - start)/count; | |
748 | } | |
749 | ||
750 | // Helper for spawn_thread. This is the main function that the thread executes | |
751 | // (intentionally empty). | |
752 | class ThreadHelper : public Thread { | |
753 | void *entry() override { return 0; } | |
754 | }; | |
755 | ||
756 | // Measure the cost of start and joining with a thread. | |
757 | double spawn_thread() | |
758 | { | |
759 | int count = 10000; | |
760 | ThreadHelper thread; | |
761 | uint64_t start = Cycles::rdtsc(); | |
762 | for (int i = 0; i < count; i++) { | |
763 | thread.create("thread_helper"); | |
764 | thread.join(); | |
765 | } | |
766 | uint64_t stop = Cycles::rdtsc(); | |
767 | return Cycles::to_seconds(stop - start)/count; | |
768 | } | |
769 | ||
770 | class FakeContext : public Context { | |
771 | public: | |
772 | void finish(int r) override {} | |
773 | }; | |
774 | ||
775 | // Measure the cost of starting and stopping a Dispatch::Timer. | |
776 | double perf_timer() | |
777 | { | |
778 | int count = 1000000; | |
779 | Mutex lock("perf_timer::lock"); | |
780 | SafeTimer timer(g_ceph_context, lock); | |
781 | FakeContext **c = new FakeContext*[count]; | |
782 | for (int i = 0; i < count; i++) { | |
783 | c[i] = new FakeContext(); | |
784 | } | |
785 | uint64_t start = Cycles::rdtsc(); | |
786 | Mutex::Locker l(lock); | |
787 | for (int i = 0; i < count; i++) { | |
3efd9988 FG |
788 | if (timer.add_event_after(12345, c[i])) { |
789 | timer.cancel_event(c[i]); | |
790 | } | |
7c673cae FG |
791 | } |
792 | uint64_t stop = Cycles::rdtsc(); | |
793 | delete[] c; | |
794 | return Cycles::to_seconds(stop - start)/count; | |
795 | } | |
796 | ||
797 | // Measure the cost of throwing and catching an int. This uses an integer as | |
798 | // the value thrown, which is presumably as fast as possible. | |
799 | double throw_int() | |
800 | { | |
801 | int count = 10000; | |
802 | uint64_t start = Cycles::rdtsc(); | |
803 | for (int i = 0; i < count; i++) { | |
804 | try { | |
805 | throw 0; | |
806 | } catch (int) { // NOLINT | |
807 | // pass | |
808 | } | |
809 | } | |
810 | uint64_t stop = Cycles::rdtsc(); | |
811 | return Cycles::to_seconds(stop - start)/count; | |
812 | } | |
813 | ||
814 | // Measure the cost of throwing and catching an int from a function call. | |
815 | double throw_int_call() | |
816 | { | |
817 | int count = 10000; | |
818 | uint64_t start = Cycles::rdtsc(); | |
819 | for (int i = 0; i < count; i++) { | |
820 | try { | |
821 | PerfHelper::throw_int(); | |
822 | } catch (int) { // NOLINT | |
823 | // pass | |
824 | } | |
825 | } | |
826 | uint64_t stop = Cycles::rdtsc(); | |
827 | return Cycles::to_seconds(stop - start)/count; | |
828 | } | |
829 | ||
830 | // Measure the cost of throwing and catching an Exception. This uses an actual | |
831 | // exception as the value thrown, which may be slower than throwInt. | |
832 | double throw_exception() | |
833 | { | |
834 | int count = 10000; | |
835 | uint64_t start = Cycles::rdtsc(); | |
836 | for (int i = 0; i < count; i++) { | |
837 | try { | |
838 | throw buffer::end_of_buffer(); | |
839 | } catch (const buffer::end_of_buffer&) { | |
840 | // pass | |
841 | } | |
842 | } | |
843 | uint64_t stop = Cycles::rdtsc(); | |
844 | return Cycles::to_seconds(stop - start)/count; | |
845 | } | |
846 | ||
847 | // Measure the cost of throwing and catching an Exception from a function call. | |
848 | double throw_exception_call() | |
849 | { | |
850 | int count = 10000; | |
851 | uint64_t start = Cycles::rdtsc(); | |
852 | for (int i = 0; i < count; i++) { | |
853 | try { | |
854 | PerfHelper::throw_end_of_buffer(); | |
855 | } catch (const buffer::end_of_buffer&) { | |
856 | // pass | |
857 | } | |
858 | } | |
859 | uint64_t stop = Cycles::rdtsc(); | |
860 | return Cycles::to_seconds(stop - start)/count; | |
861 | } | |
862 | ||
863 | // Measure the cost of pushing a new element on a std::vector, copying | |
864 | // from the end to an internal element, and popping the end element. | |
865 | double vector_push_pop() | |
866 | { | |
867 | int count = 100000; | |
868 | std::vector<int> vector; | |
869 | vector.push_back(1); | |
870 | vector.push_back(2); | |
871 | vector.push_back(3); | |
872 | uint64_t start = Cycles::rdtsc(); | |
873 | for (int i = 0; i < count; i++) { | |
874 | vector.push_back(i); | |
875 | vector.push_back(i+1); | |
876 | vector.push_back(i+2); | |
877 | vector[2] = vector.back(); | |
878 | vector.pop_back(); | |
879 | vector[0] = vector.back(); | |
880 | vector.pop_back(); | |
881 | vector[1] = vector.back(); | |
882 | vector.pop_back(); | |
883 | } | |
884 | uint64_t stop = Cycles::rdtsc(); | |
885 | return Cycles::to_seconds(stop - start)/(count*3); | |
886 | } | |
887 | ||
888 | // Measure the cost of ceph_clock_now | |
889 | double perf_ceph_clock_now() | |
890 | { | |
891 | int count = 100000; | |
892 | uint64_t start = Cycles::rdtsc(); | |
893 | for (int i = 0; i < count; i++) { | |
894 | ceph_clock_now(); | |
895 | } | |
896 | uint64_t stop = Cycles::rdtsc(); | |
897 | return Cycles::to_seconds(stop - start)/count; | |
898 | } | |
899 | ||
// One entry of the performance-test table: ties the name typed on the
// command line to the function that implements the test.
struct TestInfo {
  // Signature shared by every test: no arguments, returns the time (in
  // seconds) for each iteration of that test.
  using TestFunc = double (*)();

  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;
  // Function that implements the test.
  TestFunc func;
  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
913 | TestInfo tests[] = { | |
914 | {"atomic_int_cmp", atomic_int_cmp, | |
915 | "atomic_t::compare_and_swap"}, | |
916 | {"atomic_int_inc", atomic_int_inc, | |
917 | "atomic_t::inc"}, | |
918 | {"atomic_int_read", atomic_int_read, | |
919 | "atomic_t::read"}, | |
920 | {"atomic_int_set", atomic_int_set, | |
921 | "atomic_t::set"}, | |
922 | {"mutex_nonblock", mutex_nonblock, | |
923 | "Mutex lock/unlock (no blocking)"}, | |
924 | {"buffer_basic", buffer_basic, | |
925 | "buffer create, add one ptr, delete"}, | |
926 | {"buffer_encode_decode", buffer_encode_decode, | |
927 | "buffer create, encode/decode object, delete"}, | |
928 | {"buffer_basic_copy", buffer_basic_copy, | |
929 | "buffer create, copy small block, delete"}, | |
930 | {"buffer_copy", buffer_copy, | |
931 | "copy out 2 small ptrs from buffer"}, | |
932 | {"buffer_encode10", buffer_encode, | |
933 | "buffer encoding 10 structures onto existing ptr"}, | |
934 | {"buffer_get_contiguous", buffer_get_contiguous, | |
935 | "Buffer::get_contiguous"}, | |
936 | {"buffer_iterator", buffer_iterator, | |
937 | "iterate over buffer with 5 ptrs"}, | |
938 | {"cond_ping_pong", cond_ping_pong, | |
939 | "condition variable round-trip"}, | |
940 | {"div32", div32, | |
941 | "32-bit integer division instruction"}, | |
942 | {"div64", div64, | |
943 | "64-bit integer division instruction"}, | |
944 | {"function_call", function_call, | |
945 | "Call a function that has not been inlined"}, | |
946 | {"eventcenter_poll", eventcenter_poll, | |
947 | "EventCenter::process_events (no timers or events)"}, | |
948 | {"eventcenter_dispatch", eventcenter_dispatch, | |
949 | "EventCenter::dispatch_event_external latency"}, | |
950 | {"memcpy100", memcpy100, | |
951 | "Copy 100 bytes with memcpy"}, | |
952 | {"memcpy1000", memcpy1000, | |
953 | "Copy 1000 bytes with memcpy"}, | |
954 | {"memcpy10000", memcpy10000, | |
955 | "Copy 10000 bytes with memcpy"}, | |
956 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, | |
957 | "rjenkins hash on 16 byte of data"}, | |
958 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, | |
959 | "rjenkins hash on 256 bytes of data"}, | |
960 | {"rdtsc", rdtsc_test, | |
961 | "Read the fine-grain cycle counter"}, | |
962 | {"cycles_to_seconds", perf_cycles_to_seconds, | |
963 | "Convert a rdtsc result to (double) seconds"}, | |
964 | {"cycles_to_seconds", perf_cycles_to_nanoseconds, | |
965 | "Convert a rdtsc result to (uint64_t) nanoseconds"}, | |
966 | {"prefetch", perf_prefetch, | |
967 | "Prefetch instruction"}, | |
968 | {"serialize", perf_serialize, | |
969 | "serialize instruction"}, | |
970 | {"lfence", lfence, | |
971 | "Lfence instruction"}, | |
972 | {"sfence", sfence, | |
973 | "Sfence instruction"}, | |
974 | {"spin_lock", test_spinlock, | |
975 | "Acquire/release SpinLock"}, | |
976 | {"spawn_thread", spawn_thread, | |
977 | "Start and stop a thread"}, | |
978 | {"perf_timer", perf_timer, | |
979 | "Insert and cancel a SafeTimer"}, | |
980 | {"throw_int", throw_int, | |
981 | "Throw an int"}, | |
982 | {"throw_int_call", throw_int_call, | |
983 | "Throw an int in a function call"}, | |
984 | {"throw_exception", throw_exception, | |
985 | "Throw an Exception"}, | |
986 | {"throw_exception_call", throw_exception_call, | |
987 | "Throw an Exception in a function call"}, | |
988 | {"vector_push_pop", vector_push_pop, | |
989 | "Push and pop a std::vector"}, | |
990 | {"ceph_clock_now", perf_ceph_clock_now, | |
991 | "ceph_clock_now function"}, | |
992 | }; | |
993 | ||
994 | /** | |
995 | * Runs a particular test and prints a one-line result message. | |
996 | * | |
997 | * \param info | |
998 | * Describes the test to run. | |
999 | */ | |
1000 | void run_test(TestInfo& info) | |
1001 | { | |
1002 | double secs = info.func(); | |
1003 | int width = printf("%-24s ", info.name); | |
1004 | if (secs == -1) { | |
1005 | width += printf(" architecture nonsupport "); | |
1006 | } else if (secs < 1.0e-06) { | |
1007 | width += printf("%8.2fns", 1e09*secs); | |
1008 | } else if (secs < 1.0e-03) { | |
1009 | width += printf("%8.2fus", 1e06*secs); | |
1010 | } else if (secs < 1.0) { | |
1011 | width += printf("%8.2fms", 1e03*secs); | |
1012 | } else { | |
1013 | width += printf("%8.2fs", secs); | |
1014 | } | |
1015 | printf("%*s %s\n", 32-width, "", info.description); | |
1016 | } | |
1017 | ||
1018 | int main(int argc, char *argv[]) | |
1019 | { | |
1020 | vector<const char*> args; | |
1021 | argv_to_vec(argc, (const char **)argv, args); | |
1022 | ||
1023 | auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, | |
1024 | CODE_ENVIRONMENT_UTILITY, 0); | |
1025 | common_init_finish(g_ceph_context); | |
1026 | Cycles::init(); | |
1027 | ||
1028 | bind_thread_to_cpu(3); | |
1029 | if (argc == 1) { | |
1030 | // No test names specified; run all tests. | |
1031 | for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { | |
1032 | run_test(tests[i]); | |
1033 | } | |
1034 | } else { | |
1035 | // Run only the tests that were specified on the command line. | |
1036 | for (int i = 1; i < argc; i++) { | |
1037 | bool found_test = false; | |
1038 | for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { | |
1039 | if (strcmp(argv[i], tests[j].name) == 0) { | |
1040 | found_test = true; | |
1041 | run_test(tests[j]); | |
1042 | break; | |
1043 | } | |
1044 | } | |
1045 | if (!found_test) { | |
1046 | int width = printf("%-24s ??", argv[i]); | |
1047 | printf("%*s No such test\n", 32-width, ""); | |
1048 | } | |
1049 | } | |
1050 | } | |
1051 | } |