tools/perf/util/stat-shadow.c

   1 #include <stdio.h>
   2 #include "evsel.h"
   3 #include "stat.h"
   4 #include "color.h"
   5 #include "pmu.h"
   6 #include "rblist.h"
   7 #include "evlist.h"
   8 #include "expr.h"
   9
  10 enum {
  11         CTX_BIT_USER    = 1 << 0,
  12         CTX_BIT_KERNEL  = 1 << 1,
  13         CTX_BIT_HV      = 1 << 2,
  14         CTX_BIT_HOST    = 1 << 3,
  15         CTX_BIT_IDLE    = 1 << 4,
  16         CTX_BIT_MAX     = 1 << 5,
  17 };
  18
  19 #define NUM_CTX CTX_BIT_MAX
  20
  21 /*
  22  * AGGR_GLOBAL: Use CPU 0
  23  * AGGR_SOCKET: Use first CPU of socket
  24  * AGGR_CORE: Use first CPU of core
  25  * AGGR_NONE: Use matching CPU
  26  * AGGR_THREAD: Not supported?
  27  */
  28 static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
  29 static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
  30 static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
  31 static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
  32 static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
  33 static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
  34 static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
  35 static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
  36 static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
  37 static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
  38 static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
  39 static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
  40 static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
  41 static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
  42 static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
  43 static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
  44 static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
  45 static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
  46 static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
  47 static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
  48 static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
  49 static struct rblist runtime_saved_values;
  50 static bool have_frontend_stalled;
  51
  52 struct stats walltime_nsecs_stats;
  53
/*
 * One entry of runtime_saved_values: the accumulated stats of a single
 * event, identified by the (evsel, cpu, ctx) triple.
 */
struct saved_value {
        struct rb_node rb_node;         /* linkage into the rblist tree */
        struct perf_evsel *evsel;       /* key: event the values belong to */
        int cpu;                        /* key: CPU index */
        int ctx;                        /* key: context mask (see evsel_context()) */
        struct stats stats;             /* running statistics for this key */
};
  61
  62 static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
  63 {
  64         struct saved_value *a = container_of(rb_node,
  65                                              struct saved_value,
  66                                              rb_node);
  67         const struct saved_value *b = entry;
  68
  69         if (a->ctx != b->ctx)
  70                 return a->ctx - b->ctx;
  71         if (a->cpu != b->cpu)
  72                 return a->cpu - b->cpu;
  73         if (a->evsel == b->evsel)
  74                 return 0;
  75         if ((char *)a->evsel < (char *)b->evsel)
  76                 return -1;
  77         return +1;
  78 }
  79
  80 static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
  81                                      const void *entry)
  82 {
  83         struct saved_value *nd = malloc(sizeof(struct saved_value));
  84
  85         if (!nd)
  86                 return NULL;
  87         memcpy(nd, entry, sizeof(struct saved_value));
  88         return &nd->rb_node;
  89 }
  90
  91 static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
  92                                               int cpu, int ctx,
  93                                               bool create)
  94 {
  95         struct rb_node *nd;
  96         struct saved_value dm = {
  97                 .cpu = cpu,
  98                 .ctx = ctx,
  99                 .evsel = evsel,
 100         };
 101         nd = rblist__find(&runtime_saved_values, &dm);
 102         if (nd)
 103                 return container_of(nd, struct saved_value, rb_node);
 104         if (create) {
 105                 rblist__add_node(&runtime_saved_values, &dm);
 106                 nd = rblist__find(&runtime_saved_values, &dm);
 107                 if (nd)
 108                         return container_of(nd, struct saved_value, rb_node);
 109         }
 110         return NULL;
 111 }
 112
 113 void perf_stat__init_shadow_stats(void)
 114 {
 115         have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
 116         rblist__init(&runtime_saved_values);
 117         runtime_saved_values.node_cmp = saved_value_cmp;
 118         runtime_saved_values.node_new = saved_value_new;
 119         /* No delete for now */
 120 }
 121
 122 static int evsel_context(struct perf_evsel *evsel)
 123 {
 124         int ctx = 0;
 125
 126         if (evsel->attr.exclude_kernel)
 127                 ctx |= CTX_BIT_KERNEL;
 128         if (evsel->attr.exclude_user)
 129                 ctx |= CTX_BIT_USER;
 130         if (evsel->attr.exclude_hv)
 131                 ctx |= CTX_BIT_HV;
 132         if (evsel->attr.exclude_host)
 133                 ctx |= CTX_BIT_HOST;
 134         if (evsel->attr.exclude_idle)
 135                 ctx |= CTX_BIT_IDLE;
 136
 137         return ctx;
 138 }
 139
 140 void perf_stat__reset_shadow_stats(void)
 141 {
 142         struct rb_node *pos, *next;
 143
 144         memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
 145         memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
 146         memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
 147         memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
 148         memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
 149         memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
 150         memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
 151         memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
 152         memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
 153         memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
 154         memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
 155         memset(runtime_cycles_in_tx_stats, 0,
 156                         sizeof(runtime_cycles_in_tx_stats));
 157         memset(runtime_transaction_stats, 0,
 158                 sizeof(runtime_transaction_stats));
 159         memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
 160         memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
 161         memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
 162         memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
 163         memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
 164         memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
 165         memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
 166         memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
 167         memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));
 168
 169         next = rb_first(&runtime_saved_values.entries);
 170         while (next) {
 171                 pos = next;
 172                 next = rb_next(pos);
 173                 memset(&container_of(pos, struct saved_value, rb_node)->stats,
 174                        0,
 175                        sizeof(struct stats));
 176         }
 177 }
 178
 179 /*
 180  * Update various tracking values we maintain to print
 181  * more semantic information such as miss/hit ratios,
 182  * instruction rates, etc:
 183  */
 184 void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
 185                                     int cpu)
 186 {
 187         int ctx = evsel_context(counter);
 188
 189         if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
 190             perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
 191                 update_stats(&runtime_nsecs_stats[cpu], count[0]);
 192         else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
 193                 update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
 194         else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
 195                 update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
 196         else if (perf_stat_evsel__is(counter, TRANSACTION_START))
 197                 update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
 198         else if (perf_stat_evsel__is(counter, ELISION_START))
 199                 update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
 200         else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
 201                 update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
 202         else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
 203                 update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
 204         else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
 205                 update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
 206         else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
 207                 update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]);
 208         else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
 209                 update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
 210         else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
 211                 update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
 212         else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
 213                 update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
 214         else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
 215                 update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
 216         else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
 217                 update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
 218         else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
 219                 update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
 220         else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
 221                 update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
 222         else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
 223                 update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
 224         else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
 225                 update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
 226         else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
 227                 update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
 228         else if (perf_stat_evsel__is(counter, SMI_NUM))
 229                 update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
 230         else if (perf_stat_evsel__is(counter, APERF))
 231                 update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);
 232
 233         if (counter->collect_stat) {
 234                 struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
 235                                                            true);
 236                 update_stats(&v->stats, count[0]);
 237         }
 238 }
 239
 240 /* used for get_ratio_color() */
 241 enum grc_type {
 242         GRC_STALLED_CYCLES_FE,
 243         GRC_STALLED_CYCLES_BE,
 244         GRC_CACHE_MISSES,
 245         GRC_MAX_NR
 246 };
 247
 248 static const char *get_ratio_color(enum grc_type type, double ratio)
 249 {
 250         static const double grc_table[GRC_MAX_NR][3] = {
 251                 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
 252                 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
 253                 [GRC_CACHE_MISSES]      = { 20.0, 10.0, 5.0 },
 254         };
 255         const char *color = PERF_COLOR_NORMAL;
 256
 257         if (ratio > grc_table[type][0])
 258                 color = PERF_COLOR_RED;
 259         else if (ratio > grc_table[type][1])
 260                 color = PERF_COLOR_MAGENTA;
 261         else if (ratio > grc_table[type][2])
 262                 color = PERF_COLOR_YELLOW;
 263
 264         return color;
 265 }
 266
 267 static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
 268                                                 const char *name)
 269 {
 270         struct perf_evsel *c2;
 271
 272         evlist__for_each_entry (evsel_list, c2) {
 273                 if (!strcasecmp(c2->name, name))
 274                         return c2;
 275         }
 276         return NULL;
 277 }
 278
 279 /* Mark MetricExpr target events and link events using them to them. */
 280 void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
 281 {
 282         struct perf_evsel *counter, *leader, **metric_events, *oc;
 283         bool found;
 284         const char **metric_names;
 285         int i;
 286         int num_metric_names;
 287
 288         evlist__for_each_entry(evsel_list, counter) {
 289                 bool invalid = false;
 290
 291                 leader = counter->leader;
 292                 if (!counter->metric_expr)
 293                         continue;
 294                 metric_events = counter->metric_events;
 295                 if (!metric_events) {
 296                         if (expr__find_other(counter->metric_expr, counter->name,
 297                                                 &metric_names, &num_metric_names) < 0)
 298                                 continue;
 299
 300                         metric_events = calloc(sizeof(struct perf_evsel *),
 301                                                num_metric_names + 1);
 302                         if (!metric_events)
 303                                 return;
 304                         counter->metric_events = metric_events;
 305                 }
 306
 307                 for (i = 0; i < num_metric_names; i++) {
 308                         found = false;
 309                         if (leader) {
 310                                 /* Search in group */
 311                                 for_each_group_member (oc, leader) {
 312                                         if (!strcasecmp(oc->name, metric_names[i])) {
 313                                                 found = true;
 314                                                 break;
 315                                         }
 316                                 }
 317                         }
 318                         if (!found) {
 319                                 /* Search ignoring groups */
 320                                 oc = perf_stat__find_event(evsel_list, metric_names[i]);
 321                         }
 322                         if (!oc) {
 323                                 /* Deduping one is good enough to handle duplicated PMUs. */
 324                                 static char *printed;
 325
 326                                 /*
 327                                  * Adding events automatically would be difficult, because
 328                                  * it would risk creating groups that are not schedulable.
 329                                  * perf stat doesn't understand all the scheduling constraints
 330                                  * of events. So we ask the user instead to add the missing
 331                                  * events.
 332                                  */
 333                                 if (!printed || strcasecmp(printed, metric_names[i])) {
 334                                         fprintf(stderr,
 335                                                 "Add %s event to groups to get metric expression for %s\n",
 336                                                 metric_names[i],
 337                                                 counter->name);
 338                                         printed = strdup(metric_names[i]);
 339                                 }
 340                                 invalid = true;
 341                                 continue;
 342                         }
 343                         metric_events[i] = oc;
 344                         oc->collect_stat = true;
 345                 }
 346                 metric_events[i] = NULL;
 347                 free(metric_names);
 348                 if (invalid) {
 349                         free(metric_events);
 350                         counter->metric_events = NULL;
 351                         counter->metric_expr = NULL;
 352                 }
 353         }
 354 }
 355
 356 static void print_stalled_cycles_frontend(int cpu,
 357                                           struct perf_evsel *evsel, double avg,
 358                                           struct perf_stat_output_ctx *out)
 359 {
 360         double total, ratio = 0.0;
 361         const char *color;
 362         int ctx = evsel_context(evsel);
 363
 364         total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 365
 366         if (total)
 367                 ratio = avg / total * 100.0;
 368
 369         color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
 370
 371         if (ratio)
 372                 out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
 373                                   ratio);
 374         else
 375                 out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
 376 }
 377
 378 static void print_stalled_cycles_backend(int cpu,
 379                                          struct perf_evsel *evsel, double avg,
 380                                          struct perf_stat_output_ctx *out)
 381 {
 382         double total, ratio = 0.0;
 383         const char *color;
 384         int ctx = evsel_context(evsel);
 385
 386         total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 387
 388         if (total)
 389                 ratio = avg / total * 100.0;
 390
 391         color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 392
 393         out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
 394 }
 395
 396 static void print_branch_misses(int cpu,
 397                                 struct perf_evsel *evsel,
 398                                 double avg,
 399                                 struct perf_stat_output_ctx *out)
 400 {
 401         double total, ratio = 0.0;
 402         const char *color;
 403         int ctx = evsel_context(evsel);
 404
 405         total = avg_stats(&runtime_branches_stats[ctx][cpu]);
 406
 407         if (total)
 408                 ratio = avg / total * 100.0;
 409
 410         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 411
 412         out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
 413 }
 414
 415 static void print_l1_dcache_misses(int cpu,
 416                                    struct perf_evsel *evsel,
 417                                    double avg,
 418                                    struct perf_stat_output_ctx *out)
 419 {
 420         double total, ratio = 0.0;
 421         const char *color;
 422         int ctx = evsel_context(evsel);
 423
 424         total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);
 425
 426         if (total)
 427                 ratio = avg / total * 100.0;
 428
 429         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 430
 431         out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
 432 }
 433
 434 static void print_l1_icache_misses(int cpu,
 435                                    struct perf_evsel *evsel,
 436                                    double avg,
 437                                    struct perf_stat_output_ctx *out)
 438 {
 439         double total, ratio = 0.0;
 440         const char *color;
 441         int ctx = evsel_context(evsel);
 442
 443         total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);
 444
 445         if (total)
 446                 ratio = avg / total * 100.0;
 447
 448         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 449         out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
 450 }
 451
 452 static void print_dtlb_cache_misses(int cpu,
 453                                     struct perf_evsel *evsel,
 454                                     double avg,
 455                                     struct perf_stat_output_ctx *out)
 456 {
 457         double total, ratio = 0.0;
 458         const char *color;
 459         int ctx = evsel_context(evsel);
 460
 461         total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);
 462
 463         if (total)
 464                 ratio = avg / total * 100.0;
 465
 466         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 467         out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
 468 }
 469
 470 static void print_itlb_cache_misses(int cpu,
 471                                     struct perf_evsel *evsel,
 472                                     double avg,
 473                                     struct perf_stat_output_ctx *out)
 474 {
 475         double total, ratio = 0.0;
 476         const char *color;
 477         int ctx = evsel_context(evsel);
 478
 479         total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);
 480
 481         if (total)
 482                 ratio = avg / total * 100.0;
 483
 484         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 485         out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
 486 }
 487
 488 static void print_ll_cache_misses(int cpu,
 489                                   struct perf_evsel *evsel,
 490                                   double avg,
 491                                   struct perf_stat_output_ctx *out)
 492 {
 493         double total, ratio = 0.0;
 494         const char *color;
 495         int ctx = evsel_context(evsel);
 496
 497         total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);
 498
 499         if (total)
 500                 ratio = avg / total * 100.0;
 501
 502         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 503         out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
 504 }
 505
 506 /*
 * High level "TopDown" CPU core pipeline bottleneck breakdown.
 508  *
 509  * Basic concept following
 510  * Yasin, A Top Down Method for Performance analysis and Counter architecture
 511  * ISPASS14
 512  *
 513  * The CPU pipeline is divided into 4 areas that can be bottlenecks:
 514  *
 515  * Frontend -> Backend -> Retiring
 516  * BadSpeculation in addition means out of order execution that is thrown away
 517  * (for example branch mispredictions)
 518  * Frontend is instruction decoding.
 519  * Backend is execution, like computation and accessing data in memory
 520  * Retiring is good execution that is not directly bottlenecked
 521  *
 522  * The formulas are computed in slots.
 * A slot is an entry in the pipeline; each cycle provides as many slots as
 * the pipeline is wide (for example a 4-wide pipeline has 4 slots for each
 * cycle)
 525  *
 526  * Formulas:
 527  * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
 528  *                      TotalSlots
 529  * Retiring = SlotsRetired / TotalSlots
 530  * FrontendBound = FetchBubbles / TotalSlots
 531  * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
 532  *
 533  * The kernel provides the mapping to the low level CPU events and any scaling
 534  * needed for the CPU pipeline width, for example:
 535  *
 536  * TotalSlots = Cycles * 4
 537  *
 538  * The scaling factor is communicated in the sysfs unit.
 539  *
 540  * In some cases the CPU may not be able to measure all the formulas due to
 541  * missing events. In this case multiple formulas are combined, as possible.
 542  *
 543  * Full TopDown supports more levels to sub-divide each area: for example
 544  * BackendBound into computing bound and memory bound. For now we only
 545  * support Level 1 TopDown.
 546  */
 547
 548 static double sanitize_val(double x)
 549 {
 550         if (x < 0 && x >= -0.02)
 551                 return 0.0;
 552         return x;
 553 }
 554
 555 static double td_total_slots(int ctx, int cpu)
 556 {
 557         return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
 558 }
 559
 560 static double td_bad_spec(int ctx, int cpu)
 561 {
 562         double bad_spec = 0;
 563         double total_slots;
 564         double total;
 565
 566         total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
 567                 avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
 568                 avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
 569         total_slots = td_total_slots(ctx, cpu);
 570         if (total_slots)
 571                 bad_spec = total / total_slots;
 572         return sanitize_val(bad_spec);
 573 }
 574
 575 static double td_retiring(int ctx, int cpu)
 576 {
 577         double retiring = 0;
 578         double total_slots = td_total_slots(ctx, cpu);
 579         double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
 580
 581         if (total_slots)
 582                 retiring = ret_slots / total_slots;
 583         return retiring;
 584 }
 585
 586 static double td_fe_bound(int ctx, int cpu)
 587 {
 588         double fe_bound = 0;
 589         double total_slots = td_total_slots(ctx, cpu);
 590         double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
 591
 592         if (total_slots)
 593                 fe_bound = fetch_bub / total_slots;
 594         return fe_bound;
 595 }
 596
 597 static double td_be_bound(int ctx, int cpu)
 598 {
 599         double sum = (td_fe_bound(ctx, cpu) +
 600                       td_bad_spec(ctx, cpu) +
 601                       td_retiring(ctx, cpu));
 602         if (sum == 0)
 603                 return 0;
 604         return sanitize_val(1.0 - sum);
 605 }
 606
 607 static void print_smi_cost(int cpu, struct perf_evsel *evsel,
 608                            struct perf_stat_output_ctx *out)
 609 {
 610         double smi_num, aperf, cycles, cost = 0.0;
 611         int ctx = evsel_context(evsel);
 612         const char *color = NULL;
 613
 614         smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
 615         aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
 616         cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 617
 618         if ((cycles == 0) || (aperf == 0))
 619                 return;
 620
 621         if (smi_num)
 622                 cost = (aperf - cycles) / aperf * 100.00;
 623
 624         if (cost > 10)
 625                 color = PERF_COLOR_RED;
 626         out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
 627         out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
 628 }
 629
 630 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 631                                    double avg, int cpu,
 632                                    struct perf_stat_output_ctx *out)
 633 {
 634         void *ctxp = out->ctx;
 635         print_metric_t print_metric = out->print_metric;
 636         double total, ratio = 0.0, total2;
 637         const char *color = NULL;
 638         int ctx = evsel_context(evsel);
 639
 640         if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
 641                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 642                 if (total) {
 643                         ratio = avg / total;
 644                         print_metric(ctxp, NULL, "%7.2f ",
 645                                         "insn per cycle", ratio);
 646                 } else {
 647                         print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
 648                 }
 649                 total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
 650                 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
 651
 652                 if (total && avg) {
 653                         out->new_line(ctxp);
 654                         ratio = total / avg;
 655                         print_metric(ctxp, NULL, "%7.2f ",
 656                                         "stalled cycles per insn",
 657                                         ratio);
 658                 } else if (have_frontend_stalled) {
 659                         print_metric(ctxp, NULL, NULL,
 660                                      "stalled cycles per insn", 0);
 661                 }
 662         } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
 663                 if (runtime_branches_stats[ctx][cpu].n != 0)
 664                         print_branch_misses(cpu, evsel, avg, out);
 665                 else
 666                         print_metric(ctxp, NULL, NULL, "of all branches", 0);
 667         } else if (
 668                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
 669                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
 670                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 671                                          ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 672                 if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
 673                         print_l1_dcache_misses(cpu, evsel, avg, out);
 674                 else
 675                         print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
 676         } else if (
 677                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
 678                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
 679                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 680                                          ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 681                 if (runtime_l1_icache_stats[ctx][cpu].n != 0)
 682                         print_l1_icache_misses(cpu, evsel, avg, out);
 683                 else
 684                         print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
 685         } else if (
 686                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
 687                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
 688                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 689                                          ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 690                 if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
 691                         print_dtlb_cache_misses(cpu, evsel, avg, out);
 692                 else
 693                         print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
 694         } else if (
 695                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
 696                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
 697                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 698                                          ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 699                 if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
 700                         print_itlb_cache_misses(cpu, evsel, avg, out);
 701                 else
 702                         print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
 703         } else if (
 704                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
 705                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
 706                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 707                                          ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 708                 if (runtime_ll_cache_stats[ctx][cpu].n != 0)
 709                         print_ll_cache_misses(cpu, evsel, avg, out);
 710                 else
 711                         print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
 712         } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
 713                 total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
 714
 715                 if (total)
 716                         ratio = avg * 100 / total;
 717
 718                 if (runtime_cacherefs_stats[ctx][cpu].n != 0)
 719                         print_metric(ctxp, NULL, "%8.3f %%",
 720                                      "of all cache refs", ratio);
 721                 else
 722                         print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
 723         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
 724                 print_stalled_cycles_frontend(cpu, evsel, avg, out);
 725         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
 726                 print_stalled_cycles_backend(cpu, evsel, avg, out);
 727         } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
 728                 total = avg_stats(&runtime_nsecs_stats[cpu]);
 729
 730                 if (total) {
 731                         ratio = avg / total;
 732                         print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
 733                 } else {
 734                         print_metric(ctxp, NULL, NULL, "Ghz", 0);
 735                 }
 736         } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
 737                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 738                 if (total)
 739                         print_metric(ctxp, NULL,
 740                                         "%7.2f%%", "transactional cycles",
 741                                         100.0 * (avg / total));
 742                 else
 743                         print_metric(ctxp, NULL, NULL, "transactional cycles",
 744                                      0);
 745         } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
 746                 total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 747                 total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 748                 if (total2 < avg)
 749                         total2 = avg;
 750                 if (total)
 751                         print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
 752                                 100.0 * ((total2-avg) / total));
 753                 else
 754                         print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
 755         } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
 756                 total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 757
 758                 if (avg)
 759                         ratio = total / avg;
 760
 761                 if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
 762                         print_metric(ctxp, NULL, "%8.0f",
 763                                      "cycles / transaction", ratio);
 764                 else
 765                         print_metric(ctxp, NULL, NULL, "cycles / transaction",
 766                                      0);
 767         } else if (perf_stat_evsel__is(evsel, ELISION_START)) {
 768                 total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 769
 770                 if (avg)
 771                         ratio = total / avg;
 772
 773                 print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
 774         } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
 775                    perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
 776                 if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
 777                         print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
 778                                      avg / ratio);
 779                 else
 780                         print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
 781         } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
 782                 double fe_bound = td_fe_bound(ctx, cpu);
 783
 784                 if (fe_bound > 0.2)
 785                         color = PERF_COLOR_RED;
 786                 print_metric(ctxp, color, "%8.1f%%", "frontend bound",
 787                                 fe_bound * 100.);
 788         } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
 789                 double retiring = td_retiring(ctx, cpu);
 790
 791                 if (retiring > 0.7)
 792                         color = PERF_COLOR_GREEN;
 793                 print_metric(ctxp, color, "%8.1f%%", "retiring",
 794                                 retiring * 100.);
 795         } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
 796                 double bad_spec = td_bad_spec(ctx, cpu);
 797
 798                 if (bad_spec > 0.1)
 799                         color = PERF_COLOR_RED;
 800                 print_metric(ctxp, color, "%8.1f%%", "bad speculation",
 801                                 bad_spec * 100.);
 802         } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
 803                 double be_bound = td_be_bound(ctx, cpu);
 804                 const char *name = "backend bound";
 805                 static int have_recovery_bubbles = -1;
 806
 807                 /* In case the CPU does not support topdown-recovery-bubbles */
 808                 if (have_recovery_bubbles < 0)
 809                         have_recovery_bubbles = pmu_have_event("cpu",
 810                                         "topdown-recovery-bubbles");
 811                 if (!have_recovery_bubbles)
 812                         name = "backend bound/bad spec";
 813
 814                 if (be_bound > 0.2)
 815                         color = PERF_COLOR_RED;
 816                 if (td_total_slots(ctx, cpu) > 0)
 817                         print_metric(ctxp, color, "%8.1f%%", name,
 818                                         be_bound * 100.);
 819                 else
 820                         print_metric(ctxp, NULL, NULL, name, 0);
 821         } else if (evsel->metric_expr) {
 822                 struct parse_ctx pctx;
 823                 int i;
 824
 825                 expr__ctx_init(&pctx);
 826                 expr__add_id(&pctx, evsel->name, avg);
 827                 for (i = 0; evsel->metric_events[i]; i++) {
 828                         struct saved_value *v;
 829
 830                         v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false);
 831                         if (!v)
 832                                 break;
 833                         expr__add_id(&pctx, evsel->metric_events[i]->name,
 834                                              avg_stats(&v->stats));
 835                 }
 836                 if (!evsel->metric_events[i]) {
 837                         const char *p = evsel->metric_expr;
 838
 839                         if (expr__parse(&ratio, &pctx, &p) == 0)
 840                                 print_metric(ctxp, NULL, "%8.1f",
 841                                         evsel->metric_name ?
 842                                         evsel->metric_name :
 843                                         out->force_header ?  evsel->name : "",
 844                                         ratio);
 845                         else
 846                                 print_metric(ctxp, NULL, NULL, "", 0);
 847                 } else
 848                         print_metric(ctxp, NULL, NULL, "", 0);
 849         } else if (runtime_nsecs_stats[cpu].n != 0) {
 850                 char unit = 'M';
 851                 char unit_buf[10];
 852
 853                 total = avg_stats(&runtime_nsecs_stats[cpu]);
 854
 855                 if (total)
 856                         ratio = 1000.0 * avg / total;
 857                 if (ratio < 0.001) {
 858                         ratio *= 1000;
 859                         unit = 'K';
 860                 }
 861                 snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 862                 print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
 863         } else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
 864                 print_smi_cost(cpu, evsel, out);
 865         } else {
 866                 print_metric(ctxp, NULL, NULL, NULL, 0);
 867         }
 868 }