]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* Copyright (c) 2015 Haomai Wang <haomaiwang@gmail.com> | |
4 | * Copyright (c) 2011-2014 Stanford University | |
5 | * Copyright (c) 2011 Facebook | |
6 | * | |
7 | * Permission to use, copy, modify, and distribute this software for any | |
8 | * purpose with or without fee is hereby granted, provided that the above | |
9 | * copyright notice and this permission notice appear in all copies. | |
10 | * | |
11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES | |
12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR | |
14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
18 | */ | |
19 | ||
20 | // This program contains a collection of low-level performance measurements | |
21 | // for Ceph, which can be run either individually or altogether. These | |
22 | // tests measure performance in a single stand-alone process, not in a cluster | |
23 | // with multiple servers. Invoke the program like this: | |
24 | // | |
25 | // Perf test1 test2 ... | |
26 | // | |
27 | // test1 and test2 are the names of individual performance measurements to | |
28 | // run. If no test names are provided then all of the performance tests | |
29 | // are run. | |
30 | // | |
31 | // To add a new test: | |
32 | // * Write a function that implements the test. Use existing test functions | |
33 | // as a guideline, and be sure to generate output in the same form as | |
34 | // other tests. | |
35 | // * Create a new entry for the test in the #tests table. | |
36 | #include <vector> | |
37 | #include <sched.h> | |
38 | ||
39 | #include "acconfig.h" | |
40 | #ifdef HAVE_SSE | |
41 | #include <xmmintrin.h> | |
42 | #endif | |
43 | ||
44 | #include "include/atomic.h" | |
45 | #include "include/buffer.h" | |
46 | #include "include/encoding.h" | |
47 | #include "include/ceph_hash.h" | |
48 | #include "include/Spinlock.h" | |
49 | #include "common/ceph_argparse.h" | |
50 | #include "common/Cycles.h" | |
51 | #include "common/Cond.h" | |
52 | #include "common/Mutex.h" | |
53 | #include "common/Thread.h" | |
54 | #include "common/Timer.h" | |
55 | #include "msg/async/Event.h" | |
56 | #include "global/global_init.h" | |
57 | ||
58 | #include "test/perf_helper.h" | |
59 | ||
60 | using namespace ceph; | |
61 | ||
/**
 * Ask the operating system to pin the calling thread to a single CPU.
 *
 * Pinning is best effort. When the build does not define HAVE_SCHED this
 * function does nothing. When the index is out of range or the kernel
 * rejects the request, a warning is written to stderr and the thread keeps
 * whatever affinity it had before.
 *
 * \param cpu
 *      Index of the CPU to run on, passed unchanged to CPU_SET(); it is
 *      expected to lie in [0, CPU_SETSIZE).
 */
void bind_thread_to_cpu(int cpu)
{
#ifdef HAVE_SCHED
  // Refuse indices that cannot be represented in a cpu_set_t.
  if (cpu < 0 || cpu >= CPU_SETSIZE) {
    fprintf(stderr, "bind_thread_to_cpu: invalid cpu index %d\n", cpu);
    return;
  }
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  // A pid of 0 makes sched_setaffinity act on the calling thread.
  if (sched_setaffinity(0, sizeof(set), &set) != 0) {
    perror("bind_thread_to_cpu: sched_setaffinity");
  }
#else
  (void)cpu;
#endif
}
78 | ||
/*
 * Pretend to consume a value. Benchmarks hand their results to this
 * function so that the compiler cannot prove the measured work is unused
 * and remove it.
 *
 * \param value
 *      Pointer to arbitrary data; the first int behind it is read and,
 *      unless it happens to equal one specific magic number, ignored.
 */
void discard(void* value) {
  const int observed = *reinterpret_cast<int*>(value);
  if (observed != 0x43924776) {
    return;
  }
  printf("Value was 0x%x\n", observed);
}
93 | ||
94 | //---------------------------------------------------------------------- | |
95 | // Test functions start here | |
96 | //---------------------------------------------------------------------- | |
97 | ||
98 | // Measure the cost of atomic_t::compare_and_swap | |
99 | double atomic_int_cmp() | |
100 | { | |
101 | int count = 1000000; | |
102 | atomic_t value(11); | |
103 | int test = 11; | |
104 | uint64_t start = Cycles::rdtsc(); | |
105 | for (int i = 0; i < count; i++) { | |
106 | value.compare_and_swap(test, test+2); | |
107 | test += 2; | |
108 | } | |
109 | uint64_t stop = Cycles::rdtsc(); | |
110 | // printf("Final value: %d\n", value.load()); | |
111 | return Cycles::to_seconds(stop - start)/count; | |
112 | } | |
113 | ||
114 | // Measure the cost of atomic_t::inc | |
115 | double atomic_int_inc() | |
116 | { | |
117 | int count = 1000000; | |
118 | atomic_t value(11); | |
119 | uint64_t start = Cycles::rdtsc(); | |
120 | for (int i = 0; i < count; i++) { | |
121 | value.inc(); | |
122 | } | |
123 | uint64_t stop = Cycles::rdtsc(); | |
124 | // printf("Final value: %d\n", value.load()); | |
125 | return Cycles::to_seconds(stop - start)/count; | |
126 | } | |
127 | ||
128 | // Measure the cost of reading an atomic_t | |
129 | double atomic_int_read() | |
130 | { | |
131 | int count = 1000000; | |
132 | atomic_t value(11); | |
133 | int total = 0; | |
134 | uint64_t start = Cycles::rdtsc(); | |
135 | for (int i = 0; i < count; i++) { | |
136 | total += value.read(); | |
137 | } | |
138 | uint64_t stop = Cycles::rdtsc(); | |
139 | // printf("Total: %d\n", total); | |
140 | return Cycles::to_seconds(stop - start)/count; | |
141 | } | |
142 | ||
143 | // Measure the cost of storing a new value in a atomic_t | |
144 | double atomic_int_set() | |
145 | { | |
146 | int count = 1000000; | |
147 | atomic_t value(11); | |
148 | uint64_t start = Cycles::rdtsc(); | |
149 | for (int i = 0; i < count; i++) { | |
150 | value.set(88); | |
151 | } | |
152 | uint64_t stop = Cycles::rdtsc(); | |
153 | return Cycles::to_seconds(stop - start)/count; | |
154 | } | |
155 | ||
156 | // Measure the cost of acquiring and releasing a mutex in the | |
157 | // fast case where the mutex is free. | |
158 | double mutex_nonblock() | |
159 | { | |
160 | int count = 1000000; | |
161 | Mutex m("mutex_nonblock::m"); | |
162 | uint64_t start = Cycles::rdtsc(); | |
163 | for (int i = 0; i < count; i++) { | |
164 | m.Lock(); | |
165 | m.Unlock(); | |
166 | } | |
167 | uint64_t stop = Cycles::rdtsc(); | |
168 | return Cycles::to_seconds(stop - start)/count; | |
169 | } | |
170 | ||
171 | // Measure the cost of allocating and deallocating a buffer, plus | |
172 | // appending (logically) one ptr. | |
173 | double buffer_basic() | |
174 | { | |
175 | int count = 1000000; | |
176 | uint64_t start = Cycles::rdtsc(); | |
177 | bufferptr ptr("abcdefg", 7); | |
178 | for (int i = 0; i < count; i++) { | |
179 | bufferlist b; | |
180 | b.append(ptr, 0, 5); | |
181 | } | |
182 | uint64_t stop = Cycles::rdtsc(); | |
183 | return Cycles::to_seconds(stop - start)/count; | |
184 | } | |
185 | ||
186 | struct DummyBlock { | |
187 | int a = 1, b = 2, c = 3, d = 4; | |
188 | void encode(bufferlist &bl) const { | |
189 | ENCODE_START(1, 1, bl); | |
190 | ::encode(a, bl); | |
191 | ::encode(b, bl); | |
192 | ::encode(c, bl); | |
193 | ::encode(d, bl); | |
194 | ENCODE_FINISH(bl); | |
195 | } | |
196 | void decode(bufferlist::iterator &bl) { | |
197 | DECODE_START(1, bl); | |
198 | ::decode(a, bl); | |
199 | ::decode(b, bl); | |
200 | ::decode(c, bl); | |
201 | ::decode(d, bl); | |
202 | DECODE_FINISH(bl); | |
203 | } | |
204 | }; | |
205 | WRITE_CLASS_ENCODER(DummyBlock) | |
206 | ||
207 | // Measure the cost of encoding and decoding a buffer, plus | |
208 | // allocating space for one chunk. | |
209 | double buffer_encode_decode() | |
210 | { | |
211 | int count = 1000000; | |
212 | uint64_t start = Cycles::rdtsc(); | |
213 | for (int i = 0; i < count; i++) { | |
214 | bufferlist b; | |
215 | DummyBlock dummy_block; | |
216 | ::encode(dummy_block, b); | |
217 | bufferlist::iterator iter = b.begin(); | |
218 | ::decode(dummy_block, iter); | |
219 | } | |
220 | uint64_t stop = Cycles::rdtsc(); | |
221 | return Cycles::to_seconds(stop - start)/count; | |
222 | } | |
223 | ||
224 | // Measure the cost of allocating and deallocating a buffer, plus | |
225 | // copying in a small block. | |
226 | double buffer_basic_copy() | |
227 | { | |
228 | int count = 1000000; | |
229 | uint64_t start = Cycles::rdtsc(); | |
230 | for (int i = 0; i < count; i++) { | |
231 | bufferlist b; | |
232 | b.append("abcdefg", 6); | |
233 | } | |
234 | uint64_t stop = Cycles::rdtsc(); | |
235 | return Cycles::to_seconds(stop - start)/count; | |
236 | } | |
237 | ||
238 | // Measure the cost of making a copy of parts of two ptrs. | |
239 | double buffer_copy() | |
240 | { | |
241 | int count = 1000000; | |
242 | bufferlist b; | |
243 | b.append("abcde", 5); | |
244 | b.append("01234", 5); | |
245 | char copy[10]; | |
246 | uint64_t start = Cycles::rdtsc(); | |
247 | for (int i = 0; i < count; i++) { | |
248 | b.copy(2, 6, copy); | |
249 | } | |
250 | uint64_t stop = Cycles::rdtsc(); | |
251 | return Cycles::to_seconds(stop - start)/count; | |
252 | } | |
253 | ||
254 | // Measure the cost of allocating new space by extending the | |
255 | // bufferlist | |
256 | double buffer_encode() | |
257 | { | |
258 | int count = 100000; | |
259 | uint64_t total = 0; | |
260 | for (int i = 0; i < count; i++) { | |
261 | bufferlist b; | |
262 | DummyBlock dummy_block; | |
263 | ::encode(dummy_block, b); | |
264 | uint64_t start = Cycles::rdtsc(); | |
265 | ::encode(dummy_block, b); | |
266 | ::encode(dummy_block, b); | |
267 | ::encode(dummy_block, b); | |
268 | ::encode(dummy_block, b); | |
269 | ::encode(dummy_block, b); | |
270 | ::encode(dummy_block, b); | |
271 | ::encode(dummy_block, b); | |
272 | ::encode(dummy_block, b); | |
273 | ::encode(dummy_block, b); | |
274 | ::encode(dummy_block, b); | |
275 | total += Cycles::rdtsc() - start; | |
276 | } | |
277 | return Cycles::to_seconds(total)/(count*10); | |
278 | } | |
279 | ||
280 | // Measure the cost of retrieving an object from the beginning of a buffer. | |
281 | double buffer_get_contiguous() | |
282 | { | |
283 | int count = 1000000; | |
284 | int value = 11; | |
285 | bufferlist b; | |
286 | b.append((char*)&value, sizeof(value)); | |
287 | int sum = 0; | |
288 | uint64_t start = Cycles::rdtsc(); | |
289 | for (int i = 0; i < count; i++) { | |
290 | sum += *reinterpret_cast<int*>(b.get_contiguous(0, sizeof(value))); | |
291 | } | |
292 | uint64_t stop = Cycles::rdtsc(); | |
293 | return Cycles::to_seconds(stop - start)/count; | |
294 | } | |
295 | ||
296 | // Measure the cost of creating an iterator and iterating over 10 | |
297 | // chunks in a buffer. | |
298 | double buffer_iterator() | |
299 | { | |
300 | bufferlist b; | |
301 | const char s[] = "abcdefghijklmnopqrstuvwxyz"; | |
302 | bufferptr ptr(s, sizeof(s)); | |
303 | for (int i = 0; i < 5; i++) { | |
304 | b.append(ptr, i, 5); | |
305 | } | |
306 | int count = 100000; | |
307 | int sum = 0; | |
308 | uint64_t start = Cycles::rdtsc(); | |
309 | for (int i = 0; i < count; i++) { | |
310 | bufferlist::iterator it = b.begin(); | |
311 | while (!it.end()) { | |
312 | sum += (static_cast<const char*>(it.get_current_ptr().c_str()))[it.get_remaining()-1]; | |
313 | ++it; | |
314 | } | |
315 | } | |
316 | uint64_t stop = Cycles::rdtsc(); | |
317 | discard(&sum); | |
318 | return Cycles::to_seconds(stop - start)/count; | |
319 | } | |
320 | ||
321 | // Implements the CondPingPong test. | |
322 | class CondPingPong { | |
323 | Mutex mutex; | |
324 | Cond cond; | |
325 | int prod; | |
326 | int cons; | |
327 | const int count; | |
328 | ||
329 | class Consumer : public Thread { | |
330 | CondPingPong *p; | |
331 | public: | |
332 | explicit Consumer(CondPingPong *p): p(p) {} | |
333 | void* entry() override { | |
334 | p->consume(); | |
335 | return 0; | |
336 | } | |
337 | } consumer; | |
338 | ||
339 | public: | |
340 | CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {} | |
341 | ||
342 | double run() { | |
343 | consumer.create("consumer"); | |
344 | uint64_t start = Cycles::rdtsc(); | |
345 | produce(); | |
346 | uint64_t stop = Cycles::rdtsc(); | |
347 | consumer.join(); | |
348 | return Cycles::to_seconds(stop - start)/count; | |
349 | } | |
350 | ||
351 | void produce() { | |
352 | Mutex::Locker l(mutex); | |
353 | while (cons < count) { | |
354 | while (cons < prod) | |
355 | cond.Wait(mutex); | |
356 | ++prod; | |
357 | cond.Signal(); | |
358 | } | |
359 | } | |
360 | ||
361 | void consume() { | |
362 | Mutex::Locker l(mutex); | |
363 | while (cons < count) { | |
364 | while (cons == prod) | |
365 | cond.Wait(mutex); | |
366 | ++cons; | |
367 | cond.Signal(); | |
368 | } | |
369 | } | |
370 | }; | |
371 | ||
372 | // Measure the cost of coordinating between threads using a condition variable. | |
373 | double cond_ping_pong() | |
374 | { | |
375 | return CondPingPong().run(); | |
376 | } | |
377 | ||
378 | // Measure the cost of a 32-bit divide. Divides don't take a constant | |
379 | // number of cycles. Values were chosen here semi-randomly to depict a | |
380 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
381 | // probably pick worse values. | |
382 | double div32() | |
383 | { | |
384 | #if defined(__i386__) || defined(__x86_64__) | |
385 | int count = 1000000; | |
386 | uint64_t start = Cycles::rdtsc(); | |
387 | // NB: Expect an x86 processor exception is there's overflow. | |
388 | uint32_t numeratorHi = 0xa5a5a5a5U; | |
389 | uint32_t numeratorLo = 0x55aa55aaU; | |
390 | uint32_t divisor = 0xaa55aa55U; | |
391 | uint32_t quotient; | |
392 | uint32_t remainder; | |
393 | for (int i = 0; i < count; i++) { | |
394 | __asm__ __volatile__("div %4" : | |
395 | "=a"(quotient), "=d"(remainder) : | |
396 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
397 | "cc"); | |
398 | } | |
399 | uint64_t stop = Cycles::rdtsc(); | |
400 | return Cycles::to_seconds(stop - start)/count; | |
401 | #else | |
402 | return -1; | |
403 | #endif | |
404 | } | |
405 | ||
406 | // Measure the cost of a 64-bit divide. Divides don't take a constant | |
407 | // number of cycles. Values were chosen here semi-randomly to depict a | |
408 | // fairly expensive scenario. Someone with fancy ALU knowledge could | |
409 | // probably pick worse values. | |
410 | double div64() | |
411 | { | |
412 | #if defined(__x86_64__) || defined(__amd64__) | |
413 | int count = 1000000; | |
414 | // NB: Expect an x86 processor exception is there's overflow. | |
415 | uint64_t start = Cycles::rdtsc(); | |
416 | uint64_t numeratorHi = 0x5a5a5a5a5a5UL; | |
417 | uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; | |
418 | uint64_t divisor = 0xaa55aa55aa55aa55UL; | |
419 | uint64_t quotient; | |
420 | uint64_t remainder; | |
421 | for (int i = 0; i < count; i++) { | |
422 | __asm__ __volatile__("divq %4" : | |
423 | "=a"(quotient), "=d"(remainder) : | |
424 | "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : | |
425 | "cc"); | |
426 | } | |
427 | uint64_t stop = Cycles::rdtsc(); | |
428 | return Cycles::to_seconds(stop - start)/count; | |
429 | #else | |
430 | return -1; | |
431 | #endif | |
432 | } | |
433 | ||
434 | // Measure the cost of calling a non-inlined function. | |
435 | double function_call() | |
436 | { | |
437 | int count = 1000000; | |
438 | uint64_t x = 0; | |
439 | uint64_t start = Cycles::rdtsc(); | |
440 | for (int i = 0; i < count; i++) { | |
441 | x = PerfHelper::plus_one(x); | |
442 | } | |
443 | uint64_t stop = Cycles::rdtsc(); | |
444 | return Cycles::to_seconds(stop - start)/count; | |
445 | } | |
446 | ||
447 | // Measure the minimum cost of EventCenter::process_events, when there are no | |
448 | // Pollers and no Timers. | |
449 | double eventcenter_poll() | |
450 | { | |
451 | int count = 1000000; | |
452 | EventCenter center(g_ceph_context); | |
453 | center.init(1000, 0, "posix"); | |
454 | center.set_owner(); | |
455 | uint64_t start = Cycles::rdtsc(); | |
456 | for (int i = 0; i < count; i++) { | |
457 | center.process_events(0); | |
458 | } | |
459 | uint64_t stop = Cycles::rdtsc(); | |
460 | return Cycles::to_seconds(stop - start)/count; | |
461 | } | |
462 | ||
463 | class CenterWorker : public Thread { | |
464 | CephContext *cct; | |
465 | bool done; | |
466 | ||
467 | public: | |
468 | EventCenter center; | |
469 | explicit CenterWorker(CephContext *c): cct(c), done(false), center(c) { | |
470 | center.init(100, 0, "posix"); | |
471 | } | |
472 | void stop() { | |
473 | done = true; | |
474 | center.wakeup(); | |
475 | } | |
476 | void* entry() override { | |
477 | center.set_owner(); | |
478 | bind_thread_to_cpu(2); | |
479 | while (!done) | |
480 | center.process_events(1000); | |
481 | return 0; | |
482 | } | |
483 | }; | |
484 | ||
485 | class CountEvent: public EventCallback { | |
486 | atomic_t *count; | |
487 | ||
488 | public: | |
489 | explicit CountEvent(atomic_t *atomic): count(atomic) {} | |
490 | void do_request(int id) override { | |
491 | count->dec(); | |
492 | } | |
493 | }; | |
494 | ||
495 | double eventcenter_dispatch() | |
496 | { | |
497 | int count = 100000; | |
498 | ||
499 | CenterWorker worker(g_ceph_context); | |
500 | atomic_t flag(1); | |
501 | worker.create("evt_center_disp"); | |
502 | EventCallbackRef count_event(new CountEvent(&flag)); | |
503 | ||
504 | worker.center.dispatch_event_external(count_event); | |
505 | // Start a new thread and wait for it to ready. | |
506 | while (flag.read()) | |
507 | usleep(100); | |
508 | ||
509 | uint64_t start = Cycles::rdtsc(); | |
510 | for (int i = 0; i < count; i++) { | |
511 | flag.set(1); | |
512 | worker.center.dispatch_event_external(count_event); | |
513 | while (flag.read()) | |
514 | ; | |
515 | } | |
516 | uint64_t stop = Cycles::rdtsc(); | |
517 | worker.stop(); | |
518 | worker.join(); | |
519 | return Cycles::to_seconds(stop - start)/count; | |
520 | } | |
521 | ||
522 | // Measure the cost of copying a given number of bytes with memcpy. | |
523 | double memcpy_shared(size_t size) | |
524 | { | |
525 | int count = 1000000; | |
526 | char src[size], dst[size]; | |
527 | ||
528 | memset(src, 0, sizeof(src)); | |
529 | ||
530 | uint64_t start = Cycles::rdtsc(); | |
531 | for (int i = 0; i < count; i++) { | |
532 | memcpy(dst, src, size); | |
533 | } | |
534 | uint64_t stop = Cycles::rdtsc(); | |
535 | return Cycles::to_seconds(stop - start)/count; | |
536 | } | |
537 | ||
538 | double memcpy100() | |
539 | { | |
540 | return memcpy_shared(100); | |
541 | } | |
542 | ||
543 | double memcpy1000() | |
544 | { | |
545 | return memcpy_shared(1000); | |
546 | } | |
547 | ||
548 | double memcpy10000() | |
549 | { | |
550 | return memcpy_shared(10000); | |
551 | } | |
552 | ||
553 | // Benchmark rjenkins hashing performance on cached data. | |
554 | template <int key_length> | |
555 | double ceph_str_hash_rjenkins() | |
556 | { | |
557 | int count = 100000; | |
558 | char buf[key_length]; | |
559 | ||
560 | uint64_t start = Cycles::rdtsc(); | |
561 | for (int i = 0; i < count; i++) | |
562 | ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); | |
563 | uint64_t stop = Cycles::rdtsc(); | |
564 | ||
565 | return Cycles::to_seconds(stop - start)/count; | |
566 | } | |
567 | ||
568 | // Measure the cost of reading the fine-grain cycle counter. | |
569 | double rdtsc_test() | |
570 | { | |
571 | int count = 1000000; | |
572 | uint64_t start = Cycles::rdtsc(); | |
573 | uint64_t total = 0; | |
574 | for (int i = 0; i < count; i++) { | |
575 | total += Cycles::rdtsc(); | |
576 | } | |
577 | uint64_t stop = Cycles::rdtsc(); | |
578 | return Cycles::to_seconds(stop - start)/count; | |
579 | } | |
580 | ||
581 | // Measure the cost of the Cycles::to_seconds method. | |
582 | double perf_cycles_to_seconds() | |
583 | { | |
584 | int count = 1000000; | |
585 | double total = 0; | |
586 | uint64_t cycles = 994261; | |
587 | uint64_t start = Cycles::rdtsc(); | |
588 | for (int i = 0; i < count; i++) { | |
589 | total += Cycles::to_seconds(cycles); | |
590 | } | |
591 | uint64_t stop = Cycles::rdtsc(); | |
592 | // printf("Result: %.4f\n", total/count); | |
593 | return Cycles::to_seconds(stop - start)/count; | |
594 | } | |
595 | ||
596 | // Measure the cost of the Cylcles::toNanoseconds method. | |
597 | double perf_cycles_to_nanoseconds() | |
598 | { | |
599 | int count = 1000000; | |
600 | uint64_t total = 0; | |
601 | uint64_t cycles = 994261; | |
602 | uint64_t start = Cycles::rdtsc(); | |
603 | for (int i = 0; i < count; i++) { | |
604 | total += Cycles::to_nanoseconds(cycles); | |
605 | } | |
606 | uint64_t stop = Cycles::rdtsc(); | |
607 | // printf("Result: %lu\n", total/count); | |
608 | return Cycles::to_seconds(stop - start)/count; | |
609 | } | |
610 | ||
611 | ||
612 | #ifdef HAVE_SSE | |
/**
 * Issue a T0 prefetch for every 64-byte-aligned line that overlaps
 * [object, object + num_bytes).
 * The Intel instruction set reference entry for PREFETCH is the best
 * description of what the hint does.
 * \param object
 *      The start of the region of memory to prefetch.
 * \param num_bytes
 *      The size of the region of memory to prefetch.
 */
static inline void prefetch(const void *object, uint64_t num_bytes)
{
  // Distance of `object` from the preceding 64-byte boundary.
  const uint64_t misalignment = reinterpret_cast<uint64_t>(object) & 0x3fUL;
  const char* const first_line =
    reinterpret_cast<const char*>(object) - misalignment;
  const uint64_t span = misalignment + num_bytes;
  uint64_t covered = 0;
  while (covered < span) {
    _mm_prefetch(first_line + covered, _MM_HINT_T0);
    covered += 64;
  }
}
630 | #endif | |
631 | ||
// Time the prefetch helper.  Each round flushes the cache through
// PerfHelper::flush_cache and then times 16 prefetches, one for each
// 64-byte slice of a 1 KiB stack buffer, issued in a scrambled order.
// Returns the average time per prefetch in seconds, or -1 when built
// without HAVE_SSE.
double perf_prefetch()
{
#ifdef HAVE_SSE
  uint64_t cycles_spent = 0;
  const int rounds = 10;
  char buf[16 * 64];
  uint64_t begin, end;

  for (int r = 0; r < rounds; ++r) {
    PerfHelper::flush_cache();
    begin = Cycles::rdtsc();
    prefetch(&buf[576], 64);
    prefetch(&buf[0], 64);
    prefetch(&buf[512], 64);
    prefetch(&buf[960], 64);
    prefetch(&buf[640], 64);
    prefetch(&buf[896], 64);
    prefetch(&buf[256], 64);
    prefetch(&buf[704], 64);
    prefetch(&buf[320], 64);
    prefetch(&buf[384], 64);
    prefetch(&buf[128], 64);
    prefetch(&buf[448], 64);
    prefetch(&buf[768], 64);
    prefetch(&buf[832], 64);
    prefetch(&buf[64], 64);
    prefetch(&buf[192], 64);
    end = Cycles::rdtsc();
    cycles_spent += end - begin;
  }
  return Cycles::to_seconds(cycles_spent) / rounds / 16;
#else
  return -1;
#endif
}
668 | ||
669 | #if defined(__x86_64__) | |
/**
 * Execute `cpuid` (leaf 1) to serialize machine instructions, so that no
 * instruction appearing after it in the current thread can run before any
 * instruction appearing before it.
 *
 * It is useful around rdpmc instructions (to pinpoint cache misses) and
 * before rdtsc instructions, to keep work that should have finished before
 * the timer starts from polluting the measurement.
 */
static inline void serialize() {
  // The four output registers are written by cpuid and then ignored.
  uint32_t a, b, c, d;
  __asm volatile("cpuid"
                 : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                 : "a" (1U));
}
685 | #endif | |
686 | ||
687 | // Measure the cost of cpuid | |
688 | double perf_serialize() { | |
689 | #if defined(__x86_64__) | |
690 | int count = 1000000; | |
691 | uint64_t start = Cycles::rdtsc(); | |
692 | for (int i = 0; i < count; i++) { | |
693 | serialize(); | |
694 | } | |
695 | uint64_t stop = Cycles::rdtsc(); | |
696 | return Cycles::to_seconds(stop - start)/count; | |
697 | #else | |
698 | return -1; | |
699 | #endif | |
700 | } | |
701 | ||
// Time the lfence instruction.
// Returns the average time per instruction in seconds, or -1 when built
// without HAVE_SSE2.
double lfence()
{
#ifdef HAVE_SSE2
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("lfence" ::: "memory");
  }
  const uint64_t elapsed = Cycles::rdtsc() - begin;
  return Cycles::to_seconds(elapsed)/iterations;
#else
  return -1;
#endif
}
717 | ||
// Time the sfence instruction.
// Returns the average time per instruction in seconds, or -1 when built
// without HAVE_SSE.
double sfence()
{
#ifdef HAVE_SSE
  const int iterations = 1000000;
  const uint64_t begin = Cycles::rdtsc();
  for (int n = 0; n < iterations; ++n) {
    __asm__ __volatile__("sfence" ::: "memory");
  }
  const uint64_t elapsed = Cycles::rdtsc() - begin;
  return Cycles::to_seconds(elapsed)/iterations;
#else
  return -1;
#endif
}
733 | ||
734 | // Measure the cost of acquiring and releasing a SpinLock (assuming the | |
735 | // lock is initially free). | |
736 | double test_spinlock() | |
737 | { | |
738 | int count = 1000000; | |
739 | Spinlock lock; | |
740 | uint64_t start = Cycles::rdtsc(); | |
741 | for (int i = 0; i < count; i++) { | |
742 | lock.lock(); | |
743 | lock.unlock(); | |
744 | } | |
745 | uint64_t stop = Cycles::rdtsc(); | |
746 | return Cycles::to_seconds(stop - start)/count; | |
747 | } | |
748 | ||
749 | // Helper for spawn_thread. This is the main function that the thread executes | |
750 | // (intentionally empty). | |
751 | class ThreadHelper : public Thread { | |
752 | void *entry() override { return 0; } | |
753 | }; | |
754 | ||
755 | // Measure the cost of start and joining with a thread. | |
756 | double spawn_thread() | |
757 | { | |
758 | int count = 10000; | |
759 | ThreadHelper thread; | |
760 | uint64_t start = Cycles::rdtsc(); | |
761 | for (int i = 0; i < count; i++) { | |
762 | thread.create("thread_helper"); | |
763 | thread.join(); | |
764 | } | |
765 | uint64_t stop = Cycles::rdtsc(); | |
766 | return Cycles::to_seconds(stop - start)/count; | |
767 | } | |
768 | ||
769 | class FakeContext : public Context { | |
770 | public: | |
771 | void finish(int r) override {} | |
772 | }; | |
773 | ||
774 | // Measure the cost of starting and stopping a Dispatch::Timer. | |
775 | double perf_timer() | |
776 | { | |
777 | int count = 1000000; | |
778 | Mutex lock("perf_timer::lock"); | |
779 | SafeTimer timer(g_ceph_context, lock); | |
780 | FakeContext **c = new FakeContext*[count]; | |
781 | for (int i = 0; i < count; i++) { | |
782 | c[i] = new FakeContext(); | |
783 | } | |
784 | uint64_t start = Cycles::rdtsc(); | |
785 | Mutex::Locker l(lock); | |
786 | for (int i = 0; i < count; i++) { | |
787 | timer.add_event_after(12345, c[i]); | |
788 | timer.cancel_event(c[i]); | |
789 | } | |
790 | uint64_t stop = Cycles::rdtsc(); | |
791 | delete[] c; | |
792 | return Cycles::to_seconds(stop - start)/count; | |
793 | } | |
794 | ||
795 | // Measure the cost of throwing and catching an int. This uses an integer as | |
796 | // the value thrown, which is presumably as fast as possible. | |
797 | double throw_int() | |
798 | { | |
799 | int count = 10000; | |
800 | uint64_t start = Cycles::rdtsc(); | |
801 | for (int i = 0; i < count; i++) { | |
802 | try { | |
803 | throw 0; | |
804 | } catch (int) { // NOLINT | |
805 | // pass | |
806 | } | |
807 | } | |
808 | uint64_t stop = Cycles::rdtsc(); | |
809 | return Cycles::to_seconds(stop - start)/count; | |
810 | } | |
811 | ||
812 | // Measure the cost of throwing and catching an int from a function call. | |
813 | double throw_int_call() | |
814 | { | |
815 | int count = 10000; | |
816 | uint64_t start = Cycles::rdtsc(); | |
817 | for (int i = 0; i < count; i++) { | |
818 | try { | |
819 | PerfHelper::throw_int(); | |
820 | } catch (int) { // NOLINT | |
821 | // pass | |
822 | } | |
823 | } | |
824 | uint64_t stop = Cycles::rdtsc(); | |
825 | return Cycles::to_seconds(stop - start)/count; | |
826 | } | |
827 | ||
828 | // Measure the cost of throwing and catching an Exception. This uses an actual | |
829 | // exception as the value thrown, which may be slower than throwInt. | |
830 | double throw_exception() | |
831 | { | |
832 | int count = 10000; | |
833 | uint64_t start = Cycles::rdtsc(); | |
834 | for (int i = 0; i < count; i++) { | |
835 | try { | |
836 | throw buffer::end_of_buffer(); | |
837 | } catch (const buffer::end_of_buffer&) { | |
838 | // pass | |
839 | } | |
840 | } | |
841 | uint64_t stop = Cycles::rdtsc(); | |
842 | return Cycles::to_seconds(stop - start)/count; | |
843 | } | |
844 | ||
845 | // Measure the cost of throwing and catching an Exception from a function call. | |
846 | double throw_exception_call() | |
847 | { | |
848 | int count = 10000; | |
849 | uint64_t start = Cycles::rdtsc(); | |
850 | for (int i = 0; i < count; i++) { | |
851 | try { | |
852 | PerfHelper::throw_end_of_buffer(); | |
853 | } catch (const buffer::end_of_buffer&) { | |
854 | // pass | |
855 | } | |
856 | } | |
857 | uint64_t stop = Cycles::rdtsc(); | |
858 | return Cycles::to_seconds(stop - start)/count; | |
859 | } | |
860 | ||
861 | // Measure the cost of pushing a new element on a std::vector, copying | |
862 | // from the end to an internal element, and popping the end element. | |
863 | double vector_push_pop() | |
864 | { | |
865 | int count = 100000; | |
866 | std::vector<int> vector; | |
867 | vector.push_back(1); | |
868 | vector.push_back(2); | |
869 | vector.push_back(3); | |
870 | uint64_t start = Cycles::rdtsc(); | |
871 | for (int i = 0; i < count; i++) { | |
872 | vector.push_back(i); | |
873 | vector.push_back(i+1); | |
874 | vector.push_back(i+2); | |
875 | vector[2] = vector.back(); | |
876 | vector.pop_back(); | |
877 | vector[0] = vector.back(); | |
878 | vector.pop_back(); | |
879 | vector[1] = vector.back(); | |
880 | vector.pop_back(); | |
881 | } | |
882 | uint64_t stop = Cycles::rdtsc(); | |
883 | return Cycles::to_seconds(stop - start)/(count*3); | |
884 | } | |
885 | ||
886 | // Measure the cost of ceph_clock_now | |
887 | double perf_ceph_clock_now() | |
888 | { | |
889 | int count = 100000; | |
890 | uint64_t start = Cycles::rdtsc(); | |
891 | for (int i = 0; i < count; i++) { | |
892 | ceph_clock_now(); | |
893 | } | |
894 | uint64_t stop = Cycles::rdtsc(); | |
895 | return Cycles::to_seconds(stop - start)/count; | |
896 | } | |
897 | ||
// Describes one performance test.  The table that follows holds one of
// these per test, pairing a command-line name with the function that
// implements it.
struct TestInfo {
  // Name of the performance test; this is what gets typed on the command
  // line to run the test.
  const char* name;

  // Function that implements the test; returns the time (in seconds) for
  // each iteration of that test.
  double (*func)();

  // Short description of this test (not more than about 40 characters, so
  // the entire test output fits on a single line).
  const char *description;
};
911 | TestInfo tests[] = { | |
912 | {"atomic_int_cmp", atomic_int_cmp, | |
913 | "atomic_t::compare_and_swap"}, | |
914 | {"atomic_int_inc", atomic_int_inc, | |
915 | "atomic_t::inc"}, | |
916 | {"atomic_int_read", atomic_int_read, | |
917 | "atomic_t::read"}, | |
918 | {"atomic_int_set", atomic_int_set, | |
919 | "atomic_t::set"}, | |
920 | {"mutex_nonblock", mutex_nonblock, | |
921 | "Mutex lock/unlock (no blocking)"}, | |
922 | {"buffer_basic", buffer_basic, | |
923 | "buffer create, add one ptr, delete"}, | |
924 | {"buffer_encode_decode", buffer_encode_decode, | |
925 | "buffer create, encode/decode object, delete"}, | |
926 | {"buffer_basic_copy", buffer_basic_copy, | |
927 | "buffer create, copy small block, delete"}, | |
928 | {"buffer_copy", buffer_copy, | |
929 | "copy out 2 small ptrs from buffer"}, | |
930 | {"buffer_encode10", buffer_encode, | |
931 | "buffer encoding 10 structures onto existing ptr"}, | |
932 | {"buffer_get_contiguous", buffer_get_contiguous, | |
933 | "Buffer::get_contiguous"}, | |
934 | {"buffer_iterator", buffer_iterator, | |
935 | "iterate over buffer with 5 ptrs"}, | |
936 | {"cond_ping_pong", cond_ping_pong, | |
937 | "condition variable round-trip"}, | |
938 | {"div32", div32, | |
939 | "32-bit integer division instruction"}, | |
940 | {"div64", div64, | |
941 | "64-bit integer division instruction"}, | |
942 | {"function_call", function_call, | |
943 | "Call a function that has not been inlined"}, | |
944 | {"eventcenter_poll", eventcenter_poll, | |
945 | "EventCenter::process_events (no timers or events)"}, | |
946 | {"eventcenter_dispatch", eventcenter_dispatch, | |
947 | "EventCenter::dispatch_event_external latency"}, | |
948 | {"memcpy100", memcpy100, | |
949 | "Copy 100 bytes with memcpy"}, | |
950 | {"memcpy1000", memcpy1000, | |
951 | "Copy 1000 bytes with memcpy"}, | |
952 | {"memcpy10000", memcpy10000, | |
953 | "Copy 10000 bytes with memcpy"}, | |
954 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, | |
955 | "rjenkins hash on 16 byte of data"}, | |
956 | {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, | |
957 | "rjenkins hash on 256 bytes of data"}, | |
958 | {"rdtsc", rdtsc_test, | |
959 | "Read the fine-grain cycle counter"}, | |
960 | {"cycles_to_seconds", perf_cycles_to_seconds, | |
961 | "Convert a rdtsc result to (double) seconds"}, | |
962 | {"cycles_to_seconds", perf_cycles_to_nanoseconds, | |
963 | "Convert a rdtsc result to (uint64_t) nanoseconds"}, | |
964 | {"prefetch", perf_prefetch, | |
965 | "Prefetch instruction"}, | |
966 | {"serialize", perf_serialize, | |
967 | "serialize instruction"}, | |
968 | {"lfence", lfence, | |
969 | "Lfence instruction"}, | |
970 | {"sfence", sfence, | |
971 | "Sfence instruction"}, | |
972 | {"spin_lock", test_spinlock, | |
973 | "Acquire/release SpinLock"}, | |
974 | {"spawn_thread", spawn_thread, | |
975 | "Start and stop a thread"}, | |
976 | {"perf_timer", perf_timer, | |
977 | "Insert and cancel a SafeTimer"}, | |
978 | {"throw_int", throw_int, | |
979 | "Throw an int"}, | |
980 | {"throw_int_call", throw_int_call, | |
981 | "Throw an int in a function call"}, | |
982 | {"throw_exception", throw_exception, | |
983 | "Throw an Exception"}, | |
984 | {"throw_exception_call", throw_exception_call, | |
985 | "Throw an Exception in a function call"}, | |
986 | {"vector_push_pop", vector_push_pop, | |
987 | "Push and pop a std::vector"}, | |
988 | {"ceph_clock_now", perf_ceph_clock_now, | |
989 | "ceph_clock_now function"}, | |
990 | }; | |
991 | ||
992 | /** | |
993 | * Runs a particular test and prints a one-line result message. | |
994 | * | |
995 | * \param info | |
996 | * Describes the test to run. | |
997 | */ | |
998 | void run_test(TestInfo& info) | |
999 | { | |
1000 | double secs = info.func(); | |
1001 | int width = printf("%-24s ", info.name); | |
1002 | if (secs == -1) { | |
1003 | width += printf(" architecture nonsupport "); | |
1004 | } else if (secs < 1.0e-06) { | |
1005 | width += printf("%8.2fns", 1e09*secs); | |
1006 | } else if (secs < 1.0e-03) { | |
1007 | width += printf("%8.2fus", 1e06*secs); | |
1008 | } else if (secs < 1.0) { | |
1009 | width += printf("%8.2fms", 1e03*secs); | |
1010 | } else { | |
1011 | width += printf("%8.2fs", secs); | |
1012 | } | |
1013 | printf("%*s %s\n", 32-width, "", info.description); | |
1014 | } | |
1015 | ||
1016 | int main(int argc, char *argv[]) | |
1017 | { | |
1018 | vector<const char*> args; | |
1019 | argv_to_vec(argc, (const char **)argv, args); | |
1020 | ||
1021 | auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, | |
1022 | CODE_ENVIRONMENT_UTILITY, 0); | |
1023 | common_init_finish(g_ceph_context); | |
1024 | Cycles::init(); | |
1025 | ||
1026 | bind_thread_to_cpu(3); | |
1027 | if (argc == 1) { | |
1028 | // No test names specified; run all tests. | |
1029 | for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { | |
1030 | run_test(tests[i]); | |
1031 | } | |
1032 | } else { | |
1033 | // Run only the tests that were specified on the command line. | |
1034 | for (int i = 1; i < argc; i++) { | |
1035 | bool found_test = false; | |
1036 | for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { | |
1037 | if (strcmp(argv[i], tests[j].name) == 0) { | |
1038 | found_test = true; | |
1039 | run_test(tests[j]); | |
1040 | break; | |
1041 | } | |
1042 | } | |
1043 | if (!found_test) { | |
1044 | int width = printf("%-24s ??", argv[i]); | |
1045 | printf("%*s No such test\n", 32-width, ""); | |
1046 | } | |
1047 | } | |
1048 | } | |
1049 | } |