]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/dpdk/test/test/test_ring_perf.c
update download target update for octopus release
[ceph.git] / ceph / src / spdk / dpdk / test / test / test_ring_perf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
3 */
4
5
6 #include <stdio.h>
7 #include <inttypes.h>
8 #include <rte_ring.h>
9 #include <rte_cycles.h>
10 #include <rte_launch.h>
11 #include <rte_pause.h>
12
13 #include "test.h"
14
15 /*
16 * Ring
17 * ====
18 *
19 * Measures performance of various operations using rdtsc
20 * * Empty ring dequeue
21 * * Enqueue/dequeue of bursts in 1 threads
22 * * Enqueue/dequeue of bursts in 2 threads
23 */
24
25 #define RING_NAME "RING_PERF"
26 #define RING_SIZE 4096
27 #define MAX_BURST 32
28
/*
 * the burst/bulk sizes to enqueue and dequeue in testing; every sized
 * test below runs once per entry
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };
34
/* a pair of EAL lcore ids on which the paired enqueue/dequeue tests run */
struct lcore_pair {
	unsigned c1, c2;
};
38
/*
 * Rendezvous counter for the paired tests: each of the two worker
 * functions increments it and spins until it reaches 2, so both timed
 * loops start together. run_on_core_pair() resets it before each run.
 */
static volatile unsigned lcore_count = 0;
40
41 /**** Functions to analyse our core mask to get cores for different tests ***/
42
43 static int
44 get_two_hyperthreads(struct lcore_pair *lcp)
45 {
46 unsigned id1, id2;
47 unsigned c1, c2, s1, s2;
48 RTE_LCORE_FOREACH(id1) {
49 /* inner loop just re-reads all id's. We could skip the first few
50 * elements, but since number of cores is small there is little point
51 */
52 RTE_LCORE_FOREACH(id2) {
53 if (id1 == id2)
54 continue;
55 c1 = lcore_config[id1].core_id;
56 c2 = lcore_config[id2].core_id;
57 s1 = lcore_config[id1].socket_id;
58 s2 = lcore_config[id2].socket_id;
59 if ((c1 == c2) && (s1 == s2)){
60 lcp->c1 = id1;
61 lcp->c2 = id2;
62 return 0;
63 }
64 }
65 }
66 return 1;
67 }
68
69 static int
70 get_two_cores(struct lcore_pair *lcp)
71 {
72 unsigned id1, id2;
73 unsigned c1, c2, s1, s2;
74 RTE_LCORE_FOREACH(id1) {
75 RTE_LCORE_FOREACH(id2) {
76 if (id1 == id2)
77 continue;
78 c1 = lcore_config[id1].core_id;
79 c2 = lcore_config[id2].core_id;
80 s1 = lcore_config[id1].socket_id;
81 s2 = lcore_config[id2].socket_id;
82 if ((c1 != c2) && (s1 == s2)){
83 lcp->c1 = id1;
84 lcp->c2 = id2;
85 return 0;
86 }
87 }
88 }
89 return 1;
90 }
91
92 static int
93 get_two_sockets(struct lcore_pair *lcp)
94 {
95 unsigned id1, id2;
96 unsigned s1, s2;
97 RTE_LCORE_FOREACH(id1) {
98 RTE_LCORE_FOREACH(id2) {
99 if (id1 == id2)
100 continue;
101 s1 = lcore_config[id1].socket_id;
102 s2 = lcore_config[id2].socket_id;
103 if (s1 != s2){
104 lcp->c1 = id1;
105 lcp->c2 = id2;
106 return 0;
107 }
108 }
109 }
110 return 1;
111 }
112
113 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
114 static void
115 test_empty_dequeue(struct rte_ring *r)
116 {
117 const unsigned iter_shift = 26;
118 const unsigned iterations = 1<<iter_shift;
119 unsigned i = 0;
120 void *burst[MAX_BURST];
121
122 const uint64_t sc_start = rte_rdtsc();
123 for (i = 0; i < iterations; i++)
124 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
125 const uint64_t sc_end = rte_rdtsc();
126
127 const uint64_t mc_start = rte_rdtsc();
128 for (i = 0; i < iterations; i++)
129 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
130 const uint64_t mc_end = rte_rdtsc();
131
132 printf("SC empty dequeue: %.2F\n",
133 (double)(sc_end-sc_start) / iterations);
134 printf("MC empty dequeue: %.2F\n",
135 (double)(mc_end-mc_start) / iterations);
136 }
137
/*
 * Shared parameter block for the paired enqueue/dequeue worker threads:
 * one input (the burst size) in, two outputs (cycle averages) back.
 */
struct thread_params {
	struct rte_ring *r;	/* ring under test, set by run_on_core_pair() */
	unsigned size; /* input value, the burst size */
	double spsc, mpmc; /* output value, the single or multi timings */
};
147
148 /*
149 * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
150 * thread running dequeue_bulk function
151 */
152 static int
153 enqueue_bulk(void *p)
154 {
155 const unsigned iter_shift = 23;
156 const unsigned iterations = 1<<iter_shift;
157 struct thread_params *params = p;
158 struct rte_ring *r = params->r;
159 const unsigned size = params->size;
160 unsigned i;
161 void *burst[MAX_BURST] = {0};
162
163 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
164 while(lcore_count != 2)
165 rte_pause();
166
167 const uint64_t sp_start = rte_rdtsc();
168 for (i = 0; i < iterations; i++)
169 while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
170 rte_pause();
171 const uint64_t sp_end = rte_rdtsc();
172
173 const uint64_t mp_start = rte_rdtsc();
174 for (i = 0; i < iterations; i++)
175 while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
176 rte_pause();
177 const uint64_t mp_end = rte_rdtsc();
178
179 params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
180 params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
181 return 0;
182 }
183
184 /*
185 * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
186 * thread running enqueue_bulk function
187 */
188 static int
189 dequeue_bulk(void *p)
190 {
191 const unsigned iter_shift = 23;
192 const unsigned iterations = 1<<iter_shift;
193 struct thread_params *params = p;
194 struct rte_ring *r = params->r;
195 const unsigned size = params->size;
196 unsigned i;
197 void *burst[MAX_BURST] = {0};
198
199 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
200 while(lcore_count != 2)
201 rte_pause();
202
203 const uint64_t sc_start = rte_rdtsc();
204 for (i = 0; i < iterations; i++)
205 while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
206 rte_pause();
207 const uint64_t sc_end = rte_rdtsc();
208
209 const uint64_t mc_start = rte_rdtsc();
210 for (i = 0; i < iterations; i++)
211 while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
212 rte_pause();
213 const uint64_t mc_end = rte_rdtsc();
214
215 params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
216 params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
217 return 0;
218 }
219
220 /*
221 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
222 * used to measure ring perf between hyperthreads, cores and sockets.
223 */
224 static void
225 run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
226 lcore_function_t f1, lcore_function_t f2)
227 {
228 struct thread_params param1 = {0}, param2 = {0};
229 unsigned i;
230 for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
231 lcore_count = 0;
232 param1.size = param2.size = bulk_sizes[i];
233 param1.r = param2.r = r;
234 if (cores->c1 == rte_get_master_lcore()) {
235 rte_eal_remote_launch(f2, &param2, cores->c2);
236 f1(&param1);
237 rte_eal_wait_lcore(cores->c2);
238 } else {
239 rte_eal_remote_launch(f1, &param1, cores->c1);
240 rte_eal_remote_launch(f2, &param2, cores->c2);
241 rte_eal_wait_lcore(cores->c1);
242 rte_eal_wait_lcore(cores->c2);
243 }
244 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
245 param1.spsc + param2.spsc);
246 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
247 param1.mpmc + param2.mpmc);
248 }
249 }
250
/*
 * Time an enqueue + dequeue of one object on a single lcore, for both
 * the sp/sc and mp/mc APIs. Serves as a baseline against the bulk
 * enq+deq results.
 */
static void
test_single_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 24;
	const unsigned iterations = 1<<iter_shift;
	void *obj = NULL;
	unsigned n;

	const uint64_t sc_start = rte_rdtsc();
	for (n = 0; n < iterations; n++) {
		rte_ring_sp_enqueue(r, obj);
		rte_ring_sc_dequeue(r, &obj);
	}
	const uint64_t sc_cycles = rte_rdtsc() - sc_start;

	const uint64_t mc_start = rte_rdtsc();
	for (n = 0; n < iterations; n++) {
		rte_ring_mp_enqueue(r, obj);
		rte_ring_mc_dequeue(r, &obj);
	}
	const uint64_t mc_cycles = rte_rdtsc() - mc_start;

	/* the right-shift divides by the (power-of-two) iteration count */
	printf("SP/SC single enq/dequeue: %"PRIu64"\n",
			sc_cycles >> iter_shift);
	printf("MP/MC single enq/dequeue: %"PRIu64"\n",
			mc_cycles >> iter_shift);
}
282
283 /*
284 * Test that does both enqueue and dequeue on a core using the burst() API calls
285 * instead of the bulk() calls used in other tests. Results should be the same
286 * as for the bulk function called on a single lcore.
287 */
288 static void
289 test_burst_enqueue_dequeue(struct rte_ring *r)
290 {
291 const unsigned iter_shift = 23;
292 const unsigned iterations = 1<<iter_shift;
293 unsigned sz, i = 0;
294 void *burst[MAX_BURST] = {0};
295
296 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
297 const uint64_t sc_start = rte_rdtsc();
298 for (i = 0; i < iterations; i++) {
299 rte_ring_sp_enqueue_burst(r, burst,
300 bulk_sizes[sz], NULL);
301 rte_ring_sc_dequeue_burst(r, burst,
302 bulk_sizes[sz], NULL);
303 }
304 const uint64_t sc_end = rte_rdtsc();
305
306 const uint64_t mc_start = rte_rdtsc();
307 for (i = 0; i < iterations; i++) {
308 rte_ring_mp_enqueue_burst(r, burst,
309 bulk_sizes[sz], NULL);
310 rte_ring_mc_dequeue_burst(r, burst,
311 bulk_sizes[sz], NULL);
312 }
313 const uint64_t mc_end = rte_rdtsc();
314
315 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
316 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
317
318 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
319 sc_avg);
320 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
321 mc_avg);
322 }
323 }
324
325 /* Times enqueue and dequeue on a single lcore */
326 static void
327 test_bulk_enqueue_dequeue(struct rte_ring *r)
328 {
329 const unsigned iter_shift = 23;
330 const unsigned iterations = 1<<iter_shift;
331 unsigned sz, i = 0;
332 void *burst[MAX_BURST] = {0};
333
334 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
335 const uint64_t sc_start = rte_rdtsc();
336 for (i = 0; i < iterations; i++) {
337 rte_ring_sp_enqueue_bulk(r, burst,
338 bulk_sizes[sz], NULL);
339 rte_ring_sc_dequeue_bulk(r, burst,
340 bulk_sizes[sz], NULL);
341 }
342 const uint64_t sc_end = rte_rdtsc();
343
344 const uint64_t mc_start = rte_rdtsc();
345 for (i = 0; i < iterations; i++) {
346 rte_ring_mp_enqueue_bulk(r, burst,
347 bulk_sizes[sz], NULL);
348 rte_ring_mc_dequeue_bulk(r, burst,
349 bulk_sizes[sz], NULL);
350 }
351 const uint64_t mc_end = rte_rdtsc();
352
353 double sc_avg = ((double)(sc_end-sc_start) /
354 (iterations * bulk_sizes[sz]));
355 double mc_avg = ((double)(mc_end-mc_start) /
356 (iterations * bulk_sizes[sz]));
357
358 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
359 sc_avg);
360 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
361 mc_avg);
362 }
363 }
364
365 static int
366 test_ring_perf(void)
367 {
368 struct lcore_pair cores;
369 struct rte_ring *r = NULL;
370
371 r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
372 if (r == NULL)
373 return -1;
374
375 printf("### Testing single element and burst enq/deq ###\n");
376 test_single_enqueue_dequeue(r);
377 test_burst_enqueue_dequeue(r);
378
379 printf("\n### Testing empty dequeue ###\n");
380 test_empty_dequeue(r);
381
382 printf("\n### Testing using a single lcore ###\n");
383 test_bulk_enqueue_dequeue(r);
384
385 if (get_two_hyperthreads(&cores) == 0) {
386 printf("\n### Testing using two hyperthreads ###\n");
387 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
388 }
389 if (get_two_cores(&cores) == 0) {
390 printf("\n### Testing using two physical cores ###\n");
391 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
392 }
393 if (get_two_sockets(&cores) == 0) {
394 printf("\n### Testing using two NUMA nodes ###\n");
395 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
396 }
397 rte_ring_free(r);
398 return 0;
399 }
400
/* expose this test to the harness as the "ring_perf_autotest" command */
REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);