#!/usr/bin/env python3

import abc
import argparse
import distutils.util
import enum
import functools
import glob
import itertools
import logging
import math
import multiprocessing
import os
import pathlib
import pyudev
import re
import shutil
import subprocess
import sys
import urllib.request
import yaml
import platform
import shlex

dry_run_mode = False
def perftune_print(log_msg, *args, **kwargs):
    if dry_run_mode:
        log_msg = "# " + log_msg
    print(log_msg, *args, **kwargs)

def __run_one_command(prog_args, stderr=None, check=True):
    proc = subprocess.Popen(prog_args, stdout = subprocess.PIPE, stderr = stderr)
    outs, errs = proc.communicate()
    outs = str(outs, 'utf-8')

    if check and proc.returncode != 0:
        raise subprocess.CalledProcessError(returncode=proc.returncode, cmd=" ".join(prog_args), output=outs, stderr=errs)

    return outs

def run_one_command(prog_args, stderr=None, check=True):
    if dry_run_mode:
        print(" ".join([shlex.quote(x) for x in prog_args]))
    else:
        __run_one_command(prog_args, stderr=stderr, check=check)

def run_read_only_command(prog_args, stderr=None, check=True):
    return __run_one_command(prog_args, stderr=stderr, check=check)

def run_hwloc_distrib(prog_args):
    """
    Returns a list of strings - each representing a single line of hwloc-distrib output.
    """
    return run_read_only_command(['hwloc-distrib'] + prog_args).splitlines()

def run_hwloc_calc(prog_args):
    """
    Returns a single string with the result of the execution.
    """
    return run_read_only_command(['hwloc-calc'] + prog_args).rstrip()

def run_ethtool(prog_args):
    """
    Returns a list of strings - each representing a single line of ethtool output.
    """
    return run_read_only_command(['ethtool'] + prog_args).splitlines()

def fwriteln(fname, line, log_message, log_errors=True):
    try:
        if dry_run_mode:
            print("echo {} > {}".format(line, fname))
            return
        else:
            with open(fname, 'w') as f:
                f.write(line)
            print(log_message)
    except:
        if log_errors:
            print("{}: failed to write into {}: {}".format(log_message, fname, sys.exc_info()))

def readlines(fname):
    try:
        with open(fname, 'r') as f:
            return f.readlines()
    except:
        print("Failed to read {}: {}".format(fname, sys.exc_info()))
        return []

def fwriteln_and_log(fname, line, log_errors=True):
    msg = "Writing '{}' to {}".format(line, fname)
    fwriteln(fname, line, log_message=msg, log_errors=log_errors)

double_commas_pattern = re.compile(',,')

def set_one_mask(conf_file, mask, log_errors=True):
    if not os.path.exists(conf_file):
        raise Exception("Configure file to set mask doesn't exist: {}".format(conf_file))
    mask = re.sub('0x', '', mask)

    while double_commas_pattern.search(mask):
        mask = double_commas_pattern.sub(',0,', mask)
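    # Illustration: re.sub() replaces only non-overlapping matches, so a single
    # pass over "ff,,,ff" yields "ff,0,,ff"; the loop above keeps substituting
    # until no ',,' is left: "ff,0,0,ff".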

    msg = "Setting mask {} in {}".format(mask, conf_file)
    fwriteln(conf_file, mask, log_message=msg, log_errors=log_errors)

def distribute_irqs(irqs, cpu_mask, log_errors=True):
    # If IRQs' list is empty - do nothing
    if not irqs:
        return

    for i, mask in enumerate(run_hwloc_distrib(["{}".format(len(irqs)), '--single', '--restrict', cpu_mask])):
        set_one_mask("/proc/irq/{}/smp_affinity".format(irqs[i]), mask, log_errors=log_errors)

def is_process_running(name):
    return len(list(filter(lambda ps_line : not re.search('<defunct>', ps_line), run_read_only_command(['ps', '--no-headers', '-C', name], check=False).splitlines()))) > 0

def restart_irqbalance(banned_irqs):
    """
    Restart irqbalance if it's running and ban it from moving the IRQs from the
    given list.
    """
    config_file = '/etc/default/irqbalance'
    options_key = 'OPTIONS'
    systemd = False
    banned_irqs_list = list(banned_irqs)

    # If there is nothing to ban - quit
    if not banned_irqs_list:
        return

    # return early if irqbalance is not running
    if not is_process_running('irqbalance'):
        perftune_print("irqbalance is not running")
        return

    # If this file exists - this is a "new (systemd) style" irqbalance packaging.
    # This type of packaging uses IRQBALANCE_ARGS as an option key name, "old (init.d) style"
    # packaging uses an OPTIONS key.
    if os.path.exists('/lib/systemd/system/irqbalance.service') or \
            os.path.exists('/usr/lib/systemd/system/irqbalance.service'):
        options_key = 'IRQBALANCE_ARGS'
        systemd = True

    if not os.path.exists(config_file):
        if os.path.exists('/etc/sysconfig/irqbalance'):
            config_file = '/etc/sysconfig/irqbalance'
        elif os.path.exists('/etc/conf.d/irqbalance'):
            config_file = '/etc/conf.d/irqbalance'
            options_key = 'IRQBALANCE_OPTS'
            with open('/proc/1/comm', 'r') as comm:
                systemd = 'systemd' in comm.read()
        else:
            perftune_print("Unknown system configuration - not restarting irqbalance!")
            perftune_print("You have to prevent it from moving IRQs {} manually!".format(banned_irqs_list))
            return

    orig_file = "{}.scylla.orig".format(config_file)

    # Save the original file
    if not dry_run_mode:
        if not os.path.exists(orig_file):
            print("Saving the original irqbalance configuration in {}".format(orig_file))
            shutil.copyfile(config_file, orig_file)
        else:
            print("File {} already exists - not overwriting.".format(orig_file))

    # Read the config file lines
    cfile_lines = open(config_file, 'r').readlines()

    # Build the new config_file contents with the new options configuration
    perftune_print("Restarting irqbalance: going to ban the following IRQ numbers: {} ...".format(", ".join(banned_irqs_list)))

    # Search for the original options line
    opt_lines = list(filter(lambda line : re.search("^\s*{}".format(options_key), line), cfile_lines))
    if not opt_lines:
        new_options = "{}=\"".format(options_key)
    elif len(opt_lines) == 1:
        # cut the last "
        new_options = re.sub("\"\s*$", "", opt_lines[0].rstrip())
        opt_lines = opt_lines[0].strip()
    else:
        raise Exception("Invalid format in {}: more than one line with the {} key".format(config_file, options_key))

    for irq in banned_irqs_list:
        # prevent duplicate "ban" entries for the same IRQ
        patt_str = "\-\-banirq\={}\Z|\-\-banirq\={}\s".format(irq, irq)
        if not re.search(patt_str, new_options):
            new_options += " --banirq={}".format(irq)

    new_options += "\""
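    # For illustration, with banned IRQs 41 and 42 on a systemd-style package
    # the resulting line would look like:
    #     IRQBALANCE_ARGS="--banirq=41 --banirq=42"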

    if dry_run_mode:
        if opt_lines:
            print("sed -i 's/^{}/#{}/g' {}".format(options_key, options_key, config_file))
        print("echo {} | tee -a {}".format(new_options, config_file))
    else:
        with open(config_file, 'w') as cfile:
            for line in cfile_lines:
                if not re.search("^\s*{}".format(options_key), line):
                    cfile.write(line)

            cfile.write(new_options + "\n")

    if systemd:
        perftune_print("Restarting irqbalance via systemctl...")
        run_one_command(['systemctl', 'try-restart', 'irqbalance'])
    else:
        perftune_print("Restarting irqbalance directly (init.d)...")
        run_one_command(['/etc/init.d/irqbalance', 'restart'])

def learn_irqs_from_proc_interrupts(pattern, irq2procline):
    return [ irq for irq, proc_line in filter(lambda irq_proc_line_pair : re.search(pattern, irq_proc_line_pair[1]), irq2procline.items()) ]

def learn_all_irqs_one(irq_conf_dir, irq2procline, xen_dev_name):
    """
    Returns a list of IRQs of a single device.

    irq_conf_dir: a /sys/... directory with the IRQ information for the given device
    irq2procline: a map of IRQs to the corresponding lines in /proc/interrupts
    xen_dev_name: a device name pattern as it appears in /proc/interrupts on Xen systems
    """
    msi_irqs_dir_name = os.path.join(irq_conf_dir, 'msi_irqs')
    # Device uses MSI IRQs
    if os.path.exists(msi_irqs_dir_name):
        return os.listdir(msi_irqs_dir_name)

    irq_file_name = os.path.join(irq_conf_dir, 'irq')
    # Device uses INT#x
    if os.path.exists(irq_file_name):
        return [ line.lstrip().rstrip() for line in open(irq_file_name, 'r').readlines() ]

    # No irq file detected
    modalias = open(os.path.join(irq_conf_dir, 'modalias'), 'r').readline()

    # virtio case
    if re.search("^virtio", modalias):
        return list(itertools.chain.from_iterable(
            map(lambda dirname : learn_irqs_from_proc_interrupts(dirname, irq2procline),
                filter(lambda dirname : re.search('virtio', dirname),
                       itertools.chain.from_iterable([ dirnames for dirpath, dirnames, filenames in os.walk(os.path.join(irq_conf_dir, 'driver')) ])))))

    # xen case
    if re.search("^xen:", modalias):
        return learn_irqs_from_proc_interrupts(xen_dev_name, irq2procline)

    return []

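# For illustration, a (shortened) /proc/interrupts line such as
#   "98:   0   3   IR-PCI-MSI 1048576-edge   eth0-TxRx-0"
# becomes the map entry { '98': <the whole line> } below.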
def get_irqs2procline_map():
    return { line.split(':')[0].lstrip().rstrip() : line for line in open('/proc/interrupts', 'r').readlines() }


class AutodetectError(Exception):
    pass


def auto_detect_irq_mask(cpu_mask, cores_per_irq_core):
    """
    The logic of auto-detection of what was once a 'mode' is generic and is all about the number of CPUs and NUMA
    nodes that are present and a restricting 'cpu_mask'.
    This function implements this logic:

    * up to 4 CPU threads: use 'cpu_mask'
    * up to 4 CPU cores (on x86 this would translate to 8 CPU threads): use a single CPU thread out of allowed
    * up to 16 CPU cores: use a single CPU core out of allowed
    * more than 16 CPU cores: use a single CPU core for each 16 CPU cores and distribute them evenly among all
      present NUMA nodes.

    An AutodetectError exception is raised if 'cpu_mask' is defined in a way that there is a different number of threads
    and/or cores among different NUMA nodes. In such a case a user needs to provide
    an IRQ CPUs definition explicitly using the 'irq_cpu_mask' parameter.

    :param cpu_mask: CPU mask that defines which of the present CPUs can be considered for tuning
    :param cores_per_irq_core: number of cores to allocate a single IRQ core out of, e.g. 6 means allocate a single IRQ
        core out of every 6 CPU cores.
    :return: CPU mask to bind IRQs to, a.k.a. irq_cpu_mask
    """
    cores_key = 'cores'
    PUs_key = 'PUs'

    # List of NUMA IDs that own CPUs from the given CPU mask
    numa_ids_list = run_hwloc_calc(['-I', 'numa', cpu_mask]).split(",")

    # Let's calculate the number of HTs and cores on each NUMA node belonging to the given CPU set
    cores_PUs_per_numa = {} # { <numa_id> : {'cores': <number of cores>, 'PUs': <number of PUs>}}
    for n in numa_ids_list:
        num_cores = int(run_hwloc_calc(['--restrict', cpu_mask, '--number-of', 'core', f'numa:{n}']))
        num_PUs = int(run_hwloc_calc(['--restrict', cpu_mask, '--number-of', 'PU', f'numa:{n}']))
        cores_PUs_per_numa[n] = {cores_key: num_cores, PUs_key: num_PUs}

    # Let's check that the configuration on each NUMA node is the same. If it's not then we can't auto-detect the IRQs CPU set
    # and a user needs to provide it explicitly
    num_cores0 = cores_PUs_per_numa[numa_ids_list[0]][cores_key]
    num_PUs0 = cores_PUs_per_numa[numa_ids_list[0]][PUs_key]
    for n in numa_ids_list:
        if cores_PUs_per_numa[n][cores_key] != num_cores0 or cores_PUs_per_numa[n][PUs_key] != num_PUs0:
            raise AutodetectError(f"NUMA{n} has a different configuration from NUMA0 for a given CPU mask {cpu_mask}: "
                                  f"{cores_PUs_per_numa[n][cores_key]}:{cores_PUs_per_numa[n][PUs_key]} vs "
                                  f"{num_cores0}:{num_PUs0}. Auto-detection of IRQ CPUs is not possible. "
                                  f"Please, provide irq_cpu_mask explicitly.")

    # Auto-detection of IRQ CPU set is possible - let's get to it!
    #
    # Total counts for the whole machine
    num_cores = int(run_hwloc_calc(['--restrict', cpu_mask, '--number-of', 'core', 'machine:0']))
    num_PUs = int(run_hwloc_calc(['--restrict', cpu_mask, '--number-of', 'PU', 'machine:0']))

    if num_PUs <= 4:
        return cpu_mask
    elif num_cores <= 4:
        return run_hwloc_calc(['--restrict', cpu_mask, 'PU:0'])
    elif num_cores <= cores_per_irq_core:
        return run_hwloc_calc(['--restrict', cpu_mask, 'core:0'])
    else:
        # Big machine.
        # Let's allocate a full core out of every cores_per_irq_core cores.
        # Let's distribute IRQ cores among the present NUMA nodes
        num_irq_cores = math.ceil(num_cores / cores_per_irq_core)
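        # For illustration: with 64 allowed cores spread over 2 NUMA nodes and
        # cores_per_irq_core=16, we pick ceil(64/16) = 4 IRQ cores round-robin:
        # node:0.core:0, node:1.core:0, node:0.core:1, node:1.core:1.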
        hwloc_args = []
        numa_cores_count = {n: 0 for n in numa_ids_list}
        added_cores = 0
        while added_cores < num_irq_cores:
            for numa in numa_ids_list:
                hwloc_args.append(f"node:{numa}.core:{numa_cores_count[numa]}")
                added_cores += 1
                numa_cores_count[numa] += 1

                if added_cores >= num_irq_cores:
                    break

        return run_hwloc_calc(['--restrict', cpu_mask] + hwloc_args)


################################################################################
class PerfTunerBase(metaclass=abc.ABCMeta):
    def __init__(self, args):
        self.__args = args
        self.__args.cpu_mask = run_hwloc_calc(['--restrict', self.__args.cpu_mask, 'all'])
        self.__mode = None
        self.__compute_cpu_mask = None

        if self.args.mode:
            self.mode = PerfTunerBase.SupportedModes[self.args.mode]
        elif args.irq_cpu_mask:
            self.irqs_cpu_mask = args.irq_cpu_mask
        else:
            self.irqs_cpu_mask = auto_detect_irq_mask(self.cpu_mask, self.cores_per_irq_core)

        self.__is_aws_i3_nonmetal_instance = None

#### Public methods ##########################
    class CPUMaskIsZeroException(Exception):
        """Thrown if CPU mask turns out to be zero"""
        pass

    class SupportedModes(enum.IntEnum):
        """
        Modes are ordered from the one that cuts the biggest number of CPUs
        from the compute CPUs' set to the one that takes the smallest ('mq' doesn't
        cut any CPU from the compute set).

        This fact is used when we calculate the 'common quotient' mode out of a
        given set of modes (e.g. default modes of different Tuners) - this would
        be the smallest among the given modes.
        """
        sq_split = 0
        sq = 1
        mq = 2

        # Note: no_irq_restrictions should always have the greatest value in the enum since it's the least restricting mode.
        no_irq_restrictions = 9999

    @staticmethod
    def names():
        return PerfTunerBase.SupportedModes.__members__.keys()

    @staticmethod
    def combine(modes):
        """
        :param modes: a set of modes of the PerfTunerBase.SupportedModes type
        :return: the mode that is the "common ground" for a given set of modes.
        """

        # Perform an explicit cast in order to verify that the values in 'modes' are compatible with the
        # expected PerfTunerBase.SupportedModes type.
        return min([PerfTunerBase.SupportedModes(m) for m in modes])

    @staticmethod
    def cpu_mask_is_zero(cpu_mask):
        """
        The cpu_mask is a comma-separated list of 32-bit hex values with possibly omitted zero components,
        e.g. 0xffff,,0xffff
        We want to estimate if the whole mask is all-zeros.
        :param cpu_mask: hwloc-calc generated CPU mask
        :return: True if mask is zero, False otherwise
        """
        for cur_cpu_mask in cpu_mask.split(','):
            if cur_cpu_mask and int(cur_cpu_mask, 16) != 0:
                return False

        return True

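    # For illustration (assuming PU:0 maps to bit 0 and core:0 holds PUs 0
    # and 4): for cpu_mask 0xff, 'sq' computes 0xfe (all but PU:0),
    # 'sq_split' computes 0xee (all but core:0), and 'mq' keeps 0xff.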
    @staticmethod
    def compute_cpu_mask_for_mode(mq_mode, cpu_mask):
        mq_mode = PerfTunerBase.SupportedModes(mq_mode)

        if mq_mode == PerfTunerBase.SupportedModes.sq:
            # all but CPU0
            compute_cpu_mask = run_hwloc_calc([cpu_mask, '~PU:0'])
        elif mq_mode == PerfTunerBase.SupportedModes.sq_split:
            # all but CPU0 and its HT siblings
            compute_cpu_mask = run_hwloc_calc([cpu_mask, '~core:0'])
        elif mq_mode == PerfTunerBase.SupportedModes.mq:
            # all available cores
            compute_cpu_mask = cpu_mask
        elif mq_mode == PerfTunerBase.SupportedModes.no_irq_restrictions:
            # all available cores
            compute_cpu_mask = cpu_mask
        else:
            raise Exception("Unsupported mode: {}".format(mq_mode))

        if PerfTunerBase.cpu_mask_is_zero(compute_cpu_mask):
            raise PerfTunerBase.CPUMaskIsZeroException("Bad configuration mode ({}) and cpu-mask value ({}): this results in a zero-mask for compute".format(mq_mode.name, cpu_mask))

        return compute_cpu_mask

    @staticmethod
    def irqs_cpu_mask_for_mode(mq_mode, cpu_mask):
        mq_mode = PerfTunerBase.SupportedModes(mq_mode)
        irqs_cpu_mask = 0

        if mq_mode != PerfTunerBase.SupportedModes.mq and mq_mode != PerfTunerBase.SupportedModes.no_irq_restrictions:
            irqs_cpu_mask = run_hwloc_calc([cpu_mask, "~{}".format(PerfTunerBase.compute_cpu_mask_for_mode(mq_mode, cpu_mask))])
        else: # mq_mode == PerfTunerBase.SupportedModes.mq or mq_mode == PerfTunerBase.SupportedModes.no_irq_restrictions
            # distribute equally between all available cores
            irqs_cpu_mask = cpu_mask

        if PerfTunerBase.cpu_mask_is_zero(irqs_cpu_mask):
            raise PerfTunerBase.CPUMaskIsZeroException("Bad configuration mode ({}) and cpu-mask value ({}): this results in a zero-mask for IRQs".format(mq_mode.name, cpu_mask))

        return irqs_cpu_mask

    @property
    def mode(self):
        """
        Return the configuration mode
        """
        return self.__mode

    @mode.setter
    def mode(self, new_mode):
        """
        Set the new configuration mode and recalculate the corresponding masks.
        """
        # Make sure the new_mode is of the PerfTunerBase.SupportedModes type
        self.__mode = PerfTunerBase.SupportedModes(new_mode)
        self.__compute_cpu_mask = PerfTunerBase.compute_cpu_mask_for_mode(self.__mode, self.__args.cpu_mask)
        self.__irq_cpu_mask = PerfTunerBase.irqs_cpu_mask_for_mode(self.__mode, self.__args.cpu_mask)

    @property
    def cpu_mask(self):
        """
        Return the CPU mask we operate on (the total CPU set)
        """

        return self.__args.cpu_mask

    @property
    def cores_per_irq_core(self):
        """
        Return the number of cores we are going to allocate a single IRQ core out of when auto-detecting
        """
        return self.__args.cores_per_irq_core

    @staticmethod
    def min_cores_per_irq_core():
        """
        A minimum value of cores_per_irq_core.
        We don't allocate a full IRQ core if the total number of CPU cores is less than or equal to 4.
        """
        return 5

    @property
    def compute_cpu_mask(self):
        """
        Return the CPU mask to use for seastar application binding.
        """
        return self.__compute_cpu_mask

    @property
    def irqs_cpu_mask(self):
        """
        Return the mask of CPUs used for IRQs distribution.
        """
        return self.__irq_cpu_mask

    @irqs_cpu_mask.setter
    def irqs_cpu_mask(self, new_irq_cpu_mask):
        self.__irq_cpu_mask = new_irq_cpu_mask

        # Sanity check
        if PerfTunerBase.cpu_mask_is_zero(self.__irq_cpu_mask):
            raise PerfTunerBase.CPUMaskIsZeroException("Bad configuration: zero IRQ CPU mask is given")

        if run_hwloc_calc([self.__irq_cpu_mask]) == run_hwloc_calc([self.cpu_mask]):
            # Special case: if the IRQ CPU mask is the same as the total CPU mask - set the compute CPU mask to cpu_mask
            self.__compute_cpu_mask = self.cpu_mask
        else:
            # Otherwise, the compute CPU mask is the CPU mask without the IRQ CPU mask bits
            self.__compute_cpu_mask = run_hwloc_calc([self.cpu_mask, f"~{self.__irq_cpu_mask}"])

        # Sanity check
        if PerfTunerBase.cpu_mask_is_zero(self.__compute_cpu_mask):
            raise PerfTunerBase.CPUMaskIsZeroException(
                f"Bad configuration: cpu_mask:{self.cpu_mask}, irq_cpu_mask:{self.__irq_cpu_mask}: "
                f"results in a zero-mask for compute")

    @property
    def is_aws_i3_non_metal_instance(self):
        """
        :return: True if we are running on an AWS i3.nonmetal instance, e.g. i3.4xlarge
        """
        if self.__is_aws_i3_nonmetal_instance is None:
            self.__check_host_type()

        return self.__is_aws_i3_nonmetal_instance

    @property
    def args(self):
        return self.__args

    @property
    def irqs(self):
        return self._get_irqs()

#### "Protected"/Public (pure virtual) methods ###########
    @abc.abstractmethod
    def tune(self):
        pass

    @abc.abstractmethod
    def _get_irqs(self):
        """
        Return an iterable with all IRQs to be configured.
        """
        pass

#### Private methods ############################
    def __check_host_type(self):
        """
        Check if we are running on an AWS i3 nonmetal instance.
        If yes, set self.__is_aws_i3_nonmetal_instance to True, and to False otherwise.
        """
        try:
            aws_instance_type = urllib.request.urlopen("http://169.254.169.254/latest/meta-data/instance-type", timeout=0.1).read().decode()
            if re.match(r'^i3\.((?!metal)\w)+$', aws_instance_type):
                self.__is_aws_i3_nonmetal_instance = True
            else:
                self.__is_aws_i3_nonmetal_instance = False

            return
        except (urllib.error.URLError, ConnectionError, TimeoutError):
            # Non-AWS case
            pass
        except:
            logging.warning("Unexpected exception while attempting to access the AWS metadata server: {}".format(sys.exc_info()[0]))

        self.__is_aws_i3_nonmetal_instance = False

#################################################
class NetPerfTuner(PerfTunerBase):
    def __init__(self, args):
        super().__init__(args)

        self.nics=args.nics

        self.__nic_is_bond_iface = self.__check_dev_is_bond_iface()
        self.__slaves = self.__learn_slaves()

        # check that self.nics contain a HW device or a bonding interface
        self.__check_nics()

        # Fetch IRQs related info
        self.__get_irqs_info()


#### Public methods ############################
    def tune(self):
        """
        Tune the networking server configuration.
        """
        for nic in self.nics:
            if self.nic_is_hw_iface(nic):
                perftune_print("Setting a physical interface {}...".format(nic))
                self.__setup_one_hw_iface(nic)
            else:
                perftune_print("Setting {} bonding interface...".format(nic))
                self.__setup_bonding_iface(nic)

        # Increase the socket listen() backlog
        fwriteln_and_log('/proc/sys/net/core/somaxconn', '4096')

        # Increase the maximum number of remembered connection requests that have
        # not yet received an acknowledgment from the connecting client.
        fwriteln_and_log('/proc/sys/net/ipv4/tcp_max_syn_backlog', '4096')

    def nic_is_bond_iface(self, nic):
        return self.__nic_is_bond_iface[nic]

    def nic_exists(self, nic):
        return self.__iface_exists(nic)

    def nic_is_hw_iface(self, nic):
        return self.__dev_is_hw_iface(nic)

    def slaves(self, nic):
        """
        Returns an iterator over all slaves of the nic.
        If args.nic is not a bonding interface an attempt to use the returned iterator
        will immediately raise a StopIteration exception - use the __dev_is_bond_iface() check to avoid this.
        """
        return iter(self.__slaves[nic])

#### Protected methods ##########################
    def _get_irqs(self):
        """
        Returns an iterator over all IRQs that are going to be configured (according to the args.nics parameter).
        For instance, for a bonding interface that's going to include the IRQs of all its slaves.
        """
        return itertools.chain.from_iterable(self.__nic2irqs.values())

#### Private methods ############################
    def __get_irqs_info(self):
        self.__irqs2procline = get_irqs2procline_map()
        self.__nic2irqs = self.__learn_irqs()

    @property
    def __rfs_table_size(self):
        return 32768

    def __check_nics(self):
        """
        Checks that self.nics are supported interfaces
        """
        for nic in self.nics:
            if not self.nic_exists(nic):
                raise Exception("Device {} does not exist".format(nic))
            if not self.nic_is_hw_iface(nic) and not self.nic_is_bond_iface(nic):
                raise Exception("Not supported virtual device {}".format(nic))

    def __get_irqs_one(self, iface):
        """
        Returns the list of IRQ numbers for the given interface.
        """
        return self.__nic2irqs[iface]

    def __setup_rfs(self, iface):
        rps_limits = glob.glob("/sys/class/net/{}/queues/*/rps_flow_cnt".format(iface))
        one_q_limit = int(self.__rfs_table_size / len(rps_limits))

        # If the RFS feature is not present - get out
        try:
            run_one_command(['sysctl', 'net.core.rps_sock_flow_entries'])
        except:
            return

        # Enable RFS
        perftune_print("Setting net.core.rps_sock_flow_entries to {}".format(self.__rfs_table_size))
        run_one_command(['sysctl', '-w', 'net.core.rps_sock_flow_entries={}'.format(self.__rfs_table_size)])

        # Set each RPS queue limit
        for rfs_limit_cnt in rps_limits:
            msg = "Setting limit {} in {}".format(one_q_limit, rfs_limit_cnt)
            fwriteln(rfs_limit_cnt, "{}".format(one_q_limit), log_message=msg)

        # Enable/Disable ntuple filtering HW offload on the NIC. This is going to enable/disable aRFS on NICs supporting
        # aRFS since ntuple is a prerequisite for the aRFS feature.
        # If no explicit configuration has been requested enable ntuple (and thereby aRFS) only in MQ mode.
        #
        # aRFS acts similarly to (SW) RFS: it places a TCP packet on a HW queue that is supposed to be "close" to the
        # application thread that sent a packet on the same TCP stream.
        #
        # For instance if a given TCP stream was sent from CPU3 then the next Rx packet is going to be placed in an Rx
        # HW queue whose IRQ affinity is set to CPU3 or otherwise to the one with affinity close enough to CPU3.
        #
        # Read more here: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/performance_tuning_guide/network-acc-rfs
        #
        # Obviously it would achieve the best result if there is at least one Rx HW queue with an affinity set to each
        # application thread that handles TCP.
        #
        # And, similarly, if we know in advance that there won't be any such HW queue (sq and sq_split modes) - there is
        # no sense in enabling aRFS.
        op = "Enable"
        value = 'on'

        if (self.args.enable_arfs is None and self.irqs_cpu_mask == self.cpu_mask) or self.args.enable_arfs is False:
            op = "Disable"
            value = 'off'

        ethtool_msg = "{} ntuple filtering HW offload for {}...".format(op, iface)

        if dry_run_mode:
            perftune_print(ethtool_msg)
            run_one_command(['ethtool','-K', iface, 'ntuple', value], stderr=subprocess.DEVNULL)
        else:
            try:
                print("Trying to {} ntuple filtering HW offload for {}...".format(op.lower(), iface), end='')
                run_one_command(['ethtool','-K', iface, 'ntuple', value], stderr=subprocess.DEVNULL)
                print("ok")
            except:
                print("not supported")

    def __setup_rps(self, iface, mask):
        for one_rps_cpus in self.__get_rps_cpus(iface):
            set_one_mask(one_rps_cpus, mask)

        self.__setup_rfs(iface)

    def __setup_xps(self, iface):
        xps_cpus_list = glob.glob("/sys/class/net/{}/queues/*/xps_cpus".format(iface))
        masks = run_hwloc_distrib(["{}".format(len(xps_cpus_list))])

        for i, mask in enumerate(masks):
            set_one_mask(xps_cpus_list[i], mask)

    def __iface_exists(self, iface):
        if len(iface) == 0:
            return False
        return os.path.exists("/sys/class/net/{}".format(iface))

    def __dev_is_hw_iface(self, iface):
        return os.path.exists("/sys/class/net/{}/device".format(iface))

    def __check_dev_is_bond_iface(self):
        bond_dict = {}
        if not os.path.exists('/sys/class/net/bonding_masters'):
            for nic in self.nics:
                bond_dict[nic] = False
            # return False for every nic
            return bond_dict
        for nic in self.nics:
            bond_dict[nic] = any([re.search(nic, line) for line in open('/sys/class/net/bonding_masters', 'r').readlines()])
        return bond_dict

    def __learn_slaves(self):
        slaves_list_per_nic = {}
        for nic in self.nics:
            if self.nic_is_bond_iface(nic):
                slaves_list_per_nic[nic] = list(itertools.chain.from_iterable([line.split() for line in open("/sys/class/net/{}/bonding/slaves".format(nic), 'r').readlines()]))

        return slaves_list_per_nic

    def __intel_irq_to_queue_idx(self, irq):
        """
        Return the HW queue index for a given IRQ for Intel NICs in order to sort the IRQs' list by this index.

        Intel's fast path IRQs have the following name convention:
             <bla-bla>-TxRx-<queue index>

        Intel NICs also have an IRQ for Flow Director (which is not a regular fast path IRQ) whose name looks like
        this:
             <bla-bla>:fdir-TxRx-<index>

        We want to put the Flow Director's IRQ at the end of the sorted list of IRQs.

        :param irq: IRQ number
        :return: HW queue index for Intel NICs and sys.maxsize for all other NICs
        """
        intel_fp_irq_re = re.compile("\-TxRx\-(\d+)")
        fdir_re = re.compile("fdir\-TxRx\-\d+")

        m = intel_fp_irq_re.search(self.__irqs2procline[irq])
        m1 = fdir_re.search(self.__irqs2procline[irq])
        if m and not m1:
            return int(m.group(1))
        else:
            return sys.maxsize

    def __mlx_irq_to_queue_idx(self, irq):
        """
        Return the HW queue index for a given IRQ for Mellanox NICs in order to sort the IRQs' list by this index.

        Mellanox NICs have IRQ names that look like this:
             mlx5_comp<index>, e.g. mlx5_comp23
        or this:
             mlx4-<index>, e.g. mlx4-6

        :param irq: IRQ number
        :return: HW queue index for Mellanox NICs and sys.maxsize for all other NICs
        """
        mlx5_fp_irq_re = re.compile("mlx5_comp(\d+)")
        mlx4_fp_irq_re = re.compile("mlx4\-(\d+)")

        m5 = mlx5_fp_irq_re.search(self.__irqs2procline[irq])
        if m5:
            return int(m5.group(1))
        else:
            m4 = mlx4_fp_irq_re.search(self.__irqs2procline[irq])
            if m4:
                return int(m4.group(1))

        return sys.maxsize

    def __virtio_irq_to_queue_idx(self, irq):
        """
        Return the HW queue index for a given IRQ for VIRTIO in order to sort the IRQs' list by this index.

        Queue K of a virtio device virtioY (where Y is some integer) is served by 2 IRQs
        with the following names:
            * Tx IRQ:
                virtioY-output.K
            * Rx IRQ:
                virtioY-input.K

        :param irq: IRQ number
        :return: HW queue index for a VIRTIO fast path IRQ and sys.maxsize for all other IRQs
        """
        virtio_fp_re = re.compile(r"virtio\d+-(input|output)\.(\d+)$")

        virtio_fp_irq = virtio_fp_re.search(self.__irqs2procline[irq])
        if virtio_fp_irq:
            return int(virtio_fp_irq.group(2))

        return sys.maxsize


    def __get_driver_name(self, iface):
        """
        :param iface: Interface to check
        :return: driver name from ethtool
        """

        driver_name = ''
        ethtool_i_lines = run_ethtool(['-i', iface])
        driver_re = re.compile("driver:")
        driver_lines = list(filter(lambda one_line: driver_re.search(one_line), ethtool_i_lines))

        if driver_lines:
            if len(driver_lines) > 1:
                raise Exception("More than one 'driver:' entry in the 'ethtool -i {}' output. Unable to continue.".format(iface))

            driver_name = driver_lines[0].split()[1].strip()

        return driver_name

    def __learn_irqs_one(self, iface):
        """
        This is a slow method that is going to read from the system files. Never
        use it outside the initialization code. Use __get_irqs_one() instead.

        Filter the fast path queues IRQs from the __get_all_irqs_one() result according to the known
        patterns.
        Right now we know about the following naming convention of the fast path queues vectors:
          - Intel:    <bla-bla>-TxRx-<bla-bla>
          - Broadcom: <bla-bla>-fp-<bla-bla>
          - ena:      <bla-bla>-Tx-Rx-<bla-bla>
          - Mellanox: for mlx4
                      mlx4-<queue idx>@<bla-bla>
                      or for mlx5
                      mlx5_comp<queue idx>@<bla-bla>
          - VIRTIO:   virtioN-[input|output].D

        So, we will try to filter the entries in /proc/interrupts for IRQs we've got from get_all_irqs_one()
        according to the patterns above.

        If as a result all IRQs are filtered out (if there are no IRQs with the names from the patterns above) then
        this means that the given NIC uses a different IRQs naming pattern. In this case we won't filter any IRQ.

        Otherwise, we will use only IRQs whose names fit one of the patterns above.

        For NICs with a limited number of Rx queues the IRQs that handle Rx are going to be at the beginning of the
        list.
        """
        # filter 'all_irqs' to only reference valid keys from 'irqs2procline' and avoid an IndexError on the 'irqs' search below
        all_irqs = set(learn_all_irqs_one("/sys/class/net/{}/device".format(iface), self.__irqs2procline, iface)).intersection(self.__irqs2procline.keys())
        fp_irqs_re = re.compile("\-TxRx\-|\-fp\-|\-Tx\-Rx\-|mlx4-\d+@|mlx5_comp\d+@|virtio\d+-(input|output)")
        irqs = sorted(list(filter(lambda irq : fp_irqs_re.search(self.__irqs2procline[irq]), all_irqs)))
        if irqs:
            irqs.sort(key=self.__get_irq_to_queue_idx_functor(iface))
            return irqs
        else:
            return list(all_irqs)

    def __get_irq_to_queue_idx_functor(self, iface):
        """
        Get a functor returning a queue index for a given IRQ.
        This functor is needed for NICs that are known not to release IRQs when the number of Rx
        channels is reduced, or that have extra IRQs for non-RSS channels.

        Therefore, for these NICs we need a functor that would allow us to pick IRQs that belong to channels that are
        going to handle TCP traffic: the first X channels, where the value of X depends on the NIC's type and configuration.

        For others, e.g. ENA, or Broadcom, which are only going to allocate IRQs that belong to TCP handling channels,
        we don't really need to sort them as long as we filter fast path IRQs and distribute them evenly among IRQ CPUs.

        :param iface: NIC's interface name, e.g. eth19
        :return: A functor that returns a queue index for a given IRQ if a mapping is known,
                 or a constant big integer value if the mapping is unknown.
        """
        # There are a few known drivers for which we know how to get a queue index from an IRQ name in /proc/interrupts
        driver_name = self.__get_driver_name(iface)

        # Every functor returns sys.maxsize for an unknown driver's IRQs.
        # So, choosing Intel's as a default is as good as any other.
        irq_to_idx_func = self.__intel_irq_to_queue_idx
        if driver_name.startswith("mlx"):
            irq_to_idx_func = self.__mlx_irq_to_queue_idx
        elif driver_name.startswith("virtio"):
            irq_to_idx_func = self.__virtio_irq_to_queue_idx

        return irq_to_idx_func

    def __irq_lower_bound_by_queue(self, iface, irqs, queue_idx):
        """
        Get the index of the first element in the irqs array whose queue index is greater than or equal to the given one.
        The IRQs array is supposed to be sorted by the queue numbers the IRQs belong to.

        There are additional assumptions:
        * The queue numbers of the IRQs array items are monotonically non-decreasing, and when the number increases it
          increases by one.
        * Queue indexes are numbered starting from zero.

        :param irqs: IRQs array sorted by the queue numbers the IRQs belong to
        :param queue_idx: Queue index to partition by
        :return: The first index in the IRQs array that corresponds to a queue number greater than or equal to
                 queue_idx. If there is no such IRQ - returns len(irqs).
        """
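        # For illustration: for IRQs whose queue indexes are [0, 0, 1, 1, 2, 3]
        # and queue_idx=2, the lower bound is position 4 - the first IRQ whose
        # queue index is >= 2.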
        irq_to_idx_func = self.__get_irq_to_queue_idx_functor(iface)

        if queue_idx < len(irqs):
            for idx in range(queue_idx, len(irqs)):
                if irq_to_idx_func(irqs[idx]) >= queue_idx:
                    return idx

        return len(irqs)

    def __learn_irqs(self):
        """
        This is a slow method that is going to read from the system files. Never
        use it outside the initialization code.
        """
        nic_irq_dict={}
        for nic in self.nics:
            if self.nic_is_bond_iface(nic):
                for slave in filter(self.__dev_is_hw_iface, self.slaves(nic)):
                    nic_irq_dict[slave] = self.__learn_irqs_one(slave)
            else:
                nic_irq_dict[nic] = self.__learn_irqs_one(nic)
        return nic_irq_dict

    def __get_rps_cpus(self, iface):
        """
        Prints all rps_cpus file names for the given HW interface.

        There is a single rps_cpus file for each RPS queue and there is a single RPS
        queue for each HW Rx queue. Each HW Rx queue should have an IRQ.
        Therefore the number of these files is equal to the number of fast path Rx IRQs for this interface.
        """
        return glob.glob("/sys/class/net/{}/queues/*/rps_cpus".format(iface))

    def __set_rx_channels_count(self, iface, count):
        """
        Try to set the number of Rx channels of a given interface to a given value.

        Rx channels of any NIC can be configured using the 'ethtool -L' command with one of the following semantics:

        ethtool -L <iface> rx <count>
        or
        ethtool -L <iface> combined <count>

        If a specific semantics is not supported by a given NIC or if changing the number of channels is not supported,
        ethtool is going to return an error.

        Instead of parsing and trying to detect which of the above semantics a given interface supports, we will
        simply try to use both semantics until either one of them succeeds or both fail.


        :param iface: NIC interface name, e.g. eth4
        :param count: number of Rx channels we want to configure
        :return: True if configuration was successful, False otherwise
        """
        options = ["rx", "combined"]
        for o in options:
            try:
                cmd = ['ethtool', '-L', iface, o, f"{count}"]
                perftune_print(f"Executing: {' '.join(cmd)}")
                run_one_command(cmd, stderr=subprocess.DEVNULL)
                return True
            except subprocess.CalledProcessError:
                pass

        return False

    def __setup_one_hw_iface(self, iface):
        # Set the Rx channels count to the number of IRQ CPUs unless an explicit count is given
        if self.args.num_rx_queues is not None:
            num_rx_channels = self.args.num_rx_queues
        else:
            num_rx_channels = 0

            # If a mask is wider than 32 bits it's going to be presented as a comma-separated list of 32-bit masks
            # with possibly omitted zero components, e.g. 0x01,0x100,,0x12122
            for m in self.irqs_cpu_mask.split(","):
                if m:
                    num_rx_channels += bin(int(m, 16)).count('1')
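            # For illustration: an irqs_cpu_mask of "0x5,,0x3" contributes
            # 2 + 0 + 2 = 4 Rx channels - one per set bit, with the omitted
            # component counting as zero.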

        # Let's try setting the number of Rx channels to the number of IRQ CPUs.
        #
        # If we were able to change the number of Rx channels, the number of IRQs could have changed.
        # In this case let's refresh the IRQs info.
        rx_channels_set = self.__set_rx_channels_count(iface, num_rx_channels)
        if rx_channels_set:
            self.__get_irqs_info()

        max_num_rx_queues = self.__max_rx_queue_count(iface)
        all_irqs = self.__get_irqs_one(iface)

        # Bind the NIC's IRQs according to the configuration mode
        #
        # If this NIC has a limited number of Rx queues then we want to distribute their IRQs separately.
        # For such NICs we've sorted the IRQs list so that IRQs that handle Rx are all at the head of the list.
        if rx_channels_set or max_num_rx_queues < len(all_irqs):
            num_rx_queues = self.__get_rx_queue_count(iface)
            tcp_irqs_lower_bound = self.__irq_lower_bound_by_queue(iface, all_irqs, num_rx_queues)
            perftune_print(f"Distributing IRQs handling Rx and Tx for the first {num_rx_queues} channels:")
            distribute_irqs(all_irqs[0:tcp_irqs_lower_bound], self.irqs_cpu_mask)
            perftune_print("Distributing the rest of IRQs")
            distribute_irqs(all_irqs[tcp_irqs_lower_bound:], self.irqs_cpu_mask)
        else:
            perftune_print("Distributing all IRQs")
            distribute_irqs(all_irqs, self.irqs_cpu_mask)

        self.__setup_rps(iface, self.cpu_mask)
        self.__setup_xps(iface)

    def __setup_bonding_iface(self, nic):
        for slave in self.slaves(nic):
            if self.__dev_is_hw_iface(slave):
                perftune_print("Setting up {}...".format(slave))
                self.__setup_one_hw_iface(slave)
            else:
                perftune_print("Skipping {} (not a physical slave device?)".format(slave))

    def __max_rx_queue_count(self, iface):
        """
        :param iface: Interface to check
        :return: The maximum number of RSS queues for the given interface if there is a known limitation, and sys.maxsize
                 otherwise.

        Networking drivers serving HW with a known maximum RSS queue limitation (due to lack of RSS bits):

        ixgbe:   PF NICs support up to 16 RSS queues.
        ixgbevf: VF NICs support up to 4 RSS queues.
        i40e:    PF NICs support up to 64 RSS queues.
        i40evf:  VF NICs support up to 16 RSS queues.
        """
        driver_to_max_rss = {'ixgbe': 16, 'ixgbevf': 4, 'i40e': 64, 'i40evf': 16}

        driver_name = self.__get_driver_name(iface)
        return driver_to_max_rss.get(driver_name, sys.maxsize)

    def __get_rx_queue_count(self, iface):
        """
        :return: the RSS Rx queues count for the given interface.
        """
        num_irqs = len(self.__get_irqs_one(iface))
        rx_queues_count = len(self.__get_rps_cpus(iface))

        if rx_queues_count == 0:
            rx_queues_count = num_irqs

        return min(self.__max_rx_queue_count(iface), rx_queues_count)


class ClocksourceManager:
    class PreferredClockSourceNotAvailableException(Exception):
        pass

    def __init__(self, args):
        self.__args = args
        self._preferred = {"x86_64": "tsc", "kvm": "kvm-clock"}
        self._arch = self._get_arch()
        self._available_clocksources_file = "/sys/devices/system/clocksource/clocksource0/available_clocksource"
        self._current_clocksource_file = "/sys/devices/system/clocksource/clocksource0/current_clocksource"
        self._recommendation_if_unavailable = { "x86_64": "The tsc clocksource is not available. Consider using a hardware platform where the tsc clocksource is available, or try forcing it with the tsc=reliable boot option", "kvm": "kvm-clock is not available" }

    def _available_clocksources(self):
        return open(self._available_clocksources_file).readline().split()

    def _current_clocksource(self):
        return open(self._current_clocksource_file).readline().strip()

    def _get_arch(self):
        try:
            virt = run_read_only_command(['systemd-detect-virt']).strip()
            if virt == "kvm":
                return virt
        except:
            pass
        return platform.machine()

    def enforce_preferred_clocksource(self):
        fwriteln(self._current_clocksource_file, self._preferred[self._arch], "Setting clocksource to {}".format(self._preferred[self._arch]))

    def preferred(self):
        return self._preferred[self._arch]

    def setting_available(self):
        return self._arch in self._preferred

    def preferred_clocksource_available(self):
        return self._preferred[self._arch] in self._available_clocksources()

    def recommendation_if_unavailable(self):
        return self._recommendation_if_unavailable[self._arch]

class SystemPerfTuner(PerfTunerBase):
    def __init__(self, args):
        super().__init__(args)
        self._clocksource_manager = ClocksourceManager(args)

    def tune(self):
        if self.args.tune_clock:
            if not self._clocksource_manager.setting_available():
                perftune_print("Clocksource setting not available or not needed for this architecture. Not tuning")
            elif not self._clocksource_manager.preferred_clocksource_available():
                perftune_print(self._clocksource_manager.recommendation_if_unavailable())
            else:
                self._clocksource_manager.enforce_preferred_clocksource()

#### Protected methods ##########################
    def _get_irqs(self):
        return []


#################################################
class DiskPerfTuner(PerfTunerBase):
    class SupportedDiskTypes(enum.IntEnum):
        nvme = 0
        non_nvme = 1

    def __init__(self, args):
        super().__init__(args)

        if not (self.args.dirs or self.args.devs):
            raise Exception("'disks' tuning was requested but neither directories nor storage devices were given")

        self.__pyudev_ctx = pyudev.Context()
        self.__dir2disks = self.__learn_directories()
        self.__irqs2procline = get_irqs2procline_map()
        self.__disk2irqs = self.__learn_irqs()
        self.__type2diskinfo = self.__group_disks_info_by_type()

        # sets of devices that have already been tuned
        self.__io_scheduler_tuned_devs = set()
        self.__nomerges_tuned_devs = set()
        self.__write_back_cache_tuned_devs = set()

#### Public methods #############################
    def tune(self):
        """
        Distribute IRQs according to the requested mode (args.mode):
           - Distribute NVMe disks' IRQs equally among all available CPUs.
           - Distribute non-NVMe disks' IRQs equally among designated CPUs or among
             all available CPUs in the 'mq' mode.
        """
        non_nvme_disks, non_nvme_irqs = self.__disks_info_by_type(DiskPerfTuner.SupportedDiskTypes.non_nvme)
        if non_nvme_disks:
            perftune_print("Setting non-NVMe disks: {}...".format(", ".join(non_nvme_disks)))
            distribute_irqs(non_nvme_irqs, self.irqs_cpu_mask)
            self.__tune_disks(non_nvme_disks)
        else:
            perftune_print("No non-NVMe disks to tune")

        nvme_disks, nvme_irqs = self.__disks_info_by_type(DiskPerfTuner.SupportedDiskTypes.nvme)
        if nvme_disks:
            # The Linux kernel is going to use the IRQD_AFFINITY_MANAGED mode for NVMe IRQs
            # on most systems (currently only AWS i3 non-metal are known to have a
            # different configuration). The SMP affinity of an IRQ in this mode may not be
            # changed and an attempt to modify it is going to fail. However right now
            # the only way to determine that the IRQD_AFFINITY_MANAGED mode has been used
            # is to attempt to modify the IRQ SMP affinity (and fail), therefore we prefer
            # to always do it.
            #
            # What we don't want however is to see annoying errors every time we
            # detect that IRQD_AFFINITY_MANAGED was actually used. Therefore we will only log
            # them in the "verbose" mode or when we run on an i3.nonmetal AWS instance.
            perftune_print("Setting NVMe disks: {}...".format(", ".join(nvme_disks)))
            distribute_irqs(nvme_irqs, self.args.cpu_mask,
                            log_errors=(self.is_aws_i3_non_metal_instance or self.args.verbose))
            self.__tune_disks(nvme_disks)
        else:
            perftune_print("No NVMe disks to tune")

#### Protected methods ##########################
    def _get_irqs(self):
        return itertools.chain.from_iterable(irqs for disks, irqs in self.__type2diskinfo.values())

#### Private methods ############################
    @property
    def __io_schedulers(self):
        """
        :return: An ordered list of IO schedulers that we want to configure. Schedulers are ordered by their priority
        from the highest (left most) to the lowest.
        """
        return ["none", "noop"]

    @property
    def __nomerges(self):
        return '2'

    @property
    def __write_cache_config(self):
        """
        :return: None - if write cache mode configuration is not requested, or the corresponding write cache
        configuration value string
        """
        if self.args.set_write_back is None:
            return None

        return "write back" if self.args.set_write_back else "write through"

    def __disks_info_by_type(self, disks_type):
        """
        Returns a tuple ( [<disks>], [<irqs>] ) for the given disks type.
        IRQ numbers in the second list are promised to be unique.
        """
        return self.__type2diskinfo[DiskPerfTuner.SupportedDiskTypes(disks_type)]

    def __nvme_fast_path_irq_filter(self, irq):
        """
        Return True for fast path NVMe IRQs.
        For an NVMe device only queues 1-<number of CPUs> are going to do fast path work.

        NVMe IRQs have the following name convention:
             nvme<device index>q<queue index>, e.g. nvme0q7

        :param irq: IRQ number
        :return: True if this IRQ is an IRQ of a FP NVMe queue.
        """
        nvme_irq_re = re.compile(r'(\s|^)nvme\d+q(\d+)(\s|$)')

        # There may be more than a single HW queue bound to the same IRQ. In this case queue names are going to be
        # comma separated
        split_line = self.__irqs2procline[irq].split(",")

        for line in split_line:
            m = nvme_irq_re.search(line)
            if m and 0 < int(m.group(2)) <= multiprocessing.cpu_count():
                return True

        return False

1260 def __group_disks_info_by_type(self):
1261 """
1262 Return a map of tuples ( [<disks>], [<irqs>] ), where "disks" are all disks of the specific type
1263 and "irqs" are the corresponding IRQs.
1264
1265 It's promised that every element is "disks" and "irqs" is unique.
1266
1267 The disk types are 'nvme' and 'non-nvme'
1268 """
1269 disks_info_by_type = {}
1270 nvme_disks = set()
1271 nvme_irqs = set()
1272 non_nvme_disks = set()
1273 non_nvme_irqs = set()
1274 nvme_disk_name_pattern = re.compile('^nvme')
1275
1276 for disk, irqs in self.__disk2irqs.items():
1277 if nvme_disk_name_pattern.search(disk):
1278 nvme_disks.add(disk)
1279 for irq in irqs:
1280 nvme_irqs.add(irq)
1281 else:
1282 non_nvme_disks.add(disk)
1283 for irq in irqs:
1284 non_nvme_irqs.add(irq)
1285
1286 if not (nvme_disks or non_nvme_disks):
1287 raise Exception("'disks' tuning was requested but no disks were found")
1288
1289 nvme_irqs = list(nvme_irqs)
1290
1291 # There is a known issue with Xen hypervisor that exposes itself on AWS i3 instances where nvme module
1292 # over-allocates HW queues and uses only queues 1,2,3,..., <up to number of CPUs> for data transfer.
1293 # On these instances we will distribute only these queues.
9f95a23c
TL
1294
1295 if self.is_aws_i3_non_metal_instance:
1296 nvme_irqs = list(filter(self.__nvme_fast_path_irq_filter, nvme_irqs))
11fdf7f2
TL
1297
1298 # Sort IRQs for easier verification
1299 nvme_irqs.sort(key=lambda irq_num_str: int(irq_num_str))
1300
1301 disks_info_by_type[DiskPerfTuner.SupportedDiskTypes.nvme] = (list(nvme_disks), nvme_irqs)
1302 disks_info_by_type[DiskPerfTuner.SupportedDiskTypes.non_nvme] = ( list(non_nvme_disks), list(non_nvme_irqs) )
1303
1304 return disks_info_by_type
1305
1306 def __learn_directories(self):
1307 return { directory : self.__learn_directory(directory) for directory in self.args.dirs }
1308
1309 def __learn_directory(self, directory, recur=False):
1310 """
1311 Returns a list of disks the given directory is mounted on (there will be more than one if
1312 the mount point is on the RAID volume)
1313 """
1314 if not os.path.exists(directory):
1315 if not recur:
9f95a23c 1316 perftune_print("{} doesn't exist - skipping".format(directory))
11fdf7f2
TL
1317
1318 return []
1319
1320 try:
9f95a23c 1321 udev_obj = pyudev.Devices.from_device_number(self.__pyudev_ctx, 'block', os.stat(directory).st_dev)
11fdf7f2
TL
1322 return self.__get_phys_devices(udev_obj)
1323 except:
1324 # handle cases like ecryptfs where the directory is mounted to another directory and not to some block device
9f95a23c 1325 filesystem = run_read_only_command(['df', '-P', directory]).splitlines()[-1].split()[0].strip()
11fdf7f2
TL
1326 if not re.search(r'^/dev/', filesystem):
1327 devs = self.__learn_directory(filesystem, True)
1328 else:
1329 raise Exception("Logic error: failed to create a udev device while 'df -P' {} returns a {}".format(directory, filesystem))
1330
1331 # log error only for the original directory
1332 if not recur and not devs:
9f95a23c 1333 perftune_print("Can't get a block device for {} - skipping".format(directory))
11fdf7f2
TL
1334
1335 return devs
1336
1337 def __get_phys_devices(self, udev_obj):
1338 # if device is a virtual device - the underlying physical devices are going to be its slaves
1339 if re.search(r'virtual', udev_obj.sys_path):
20effc67
TL
1340 slaves = os.listdir(os.path.join(udev_obj.sys_path, 'slaves'))
1341 # If the device is virtual but doesn't have slaves (e.g. as nvm-subsystem virtual devices) handle it
1342 # as a regular device.
1343 if slaves:
1344 return list(itertools.chain.from_iterable([ self.__get_phys_devices(pyudev.Devices.from_device_file(self.__pyudev_ctx, "/dev/{}".format(slave))) for slave in slaves ]))
1345
1346 # device node is something like /dev/sda1 - we need only the part without /dev/
1347 return [ re.match(r'/dev/(\S+\d*)', udev_obj.device_node).group(1) ]

    def __learn_irqs(self):
        disk2irqs = {}

        for devices in list(self.__dir2disks.values()) + [self.args.devs]:
            for device in devices:
                # It may happen that some of the given directories are on the same disk.
                # There is no need to rediscover IRQs of a disk we have already handled.
                if device in disk2irqs:
                    continue

                udev_obj = pyudev.Devices.from_device_file(self.__pyudev_ctx, "/dev/{}".format(device))
                dev_sys_path = udev_obj.sys_path

                # If the device is a virtual NVMe device its sysfs name goes as follows:
                # /sys/devices/virtual/nvme-subsystem/nvme-subsys0/nvme0n1
                #
                # and then there is this symlink:
                # /sys/devices/virtual/nvme-subsystem/nvme-subsys0/nvme0n1/device/nvme0 -> ../../../pci0000:85/0000:85:01.0/0000:87:00.0/nvme/nvme0
                #
                # So, the "main device" is an "nvme\d+" prefix of the actual device name.
                if re.search(r'virtual', udev_obj.sys_path):
                    m = re.match(r'(nvme\d+)\S*', device)
                    if m:
                        dev_sys_path = "{}/device/{}".format(udev_obj.sys_path, m.group(1))

                split_sys_path = list(pathlib.PurePath(pathlib.Path(dev_sys_path).resolve()).parts)

                # The first part is always /sys/devices/pciXXX ...
                controller_path_parts = split_sys_path[0:4]

                # ...then there is a chain of one or more "domain:bus:device.function" components, followed by the
                # storage device enumeration suffix, e.g.
                #   /sys/devices/pci0000:00/0000:00:1f.2/ata2/host1/target1:0:0/1:0:0:0/block/sda/sda3 or
                #   /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/host6/target6:2:0/6:2:0:0/block/sda/sda1
                # We want only the path up to and including the last BDF - it contains the IRQ information.

                patt = re.compile(r"^[0-9A-Fa-f]{4}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9A-Fa-f]$")
                for split_sys_path_branch in split_sys_path[4:]:
                    if patt.search(split_sys_path_branch):
                        controller_path_parts.append(split_sys_path_branch)
                    else:
                        break

                controller_path_str = os.path.join(*controller_path_parts)
                disk2irqs[device] = learn_all_irqs_one(controller_path_str, self.__irqs2procline, 'blkif')

        return disk2irqs

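    # For illustration only, a minimal sketch (with a hypothetical sysfs layout) of how the
    # loop above reduces a resolved device path to its controller path:
    #
    #   split_sys_path = ['/', 'sys', 'devices', 'pci0000:00',
    #                     '0000:00:02.0', '0000:02:00.0',
    #                     'host6', 'target6:2:0', '6:2:0:0', 'block', 'sda', 'sda1']
    #   # controller_path_parts starts as ['/', 'sys', 'devices', 'pci0000:00'];
    #   # the BDF pattern matches '0000:00:02.0' and '0000:02:00.0' and stops at 'host6',
    #   # so the controller path becomes /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0
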
    def __get_feature_file(self, dev_node, path_creator):
        """
        Find the closest ancestor with the given feature and return its ('feature file', 'device node') tuple.

        If there isn't such an ancestor - return a (None, None) tuple.

        :param dev_node Device node file name, e.g. /dev/sda1
        :param path_creator A functor that creates a feature file name given a device system file name
        """
        # Sanity check
        if dev_node is None or path_creator is None:
            return None, None

        udev = pyudev.Devices.from_device_file(pyudev.Context(), dev_node)
        feature_file = path_creator(udev.sys_path)

        if os.path.exists(feature_file):
            return feature_file, dev_node
        elif udev.parent is not None:
            return self.__get_feature_file(udev.parent.device_node, path_creator)
        else:
            return None, None

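    # For illustration only (hypothetical paths): resolving the 'scheduler' feature file
    # for a partition starts at /dev/sda1, finds no queue/scheduler file there, and then
    # walks up to the parent /dev/sda, whose sysfs directory does contain one:
    #
    #   self.__get_feature_file('/dev/sda1', lambda p: os.path.join(p, 'queue', 'scheduler'))
    #   # -> ('/sys/devices/.../block/sda/queue/scheduler', '/dev/sda')
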
    def __tune_one_feature(self, dev_node, path_creator, value, tuned_devs_set):
        """
        Find the closest ancestor that has the given feature, configure it and
        return True.

        If there isn't such an ancestor - return False.

        :param dev_node Device node file name, e.g. /dev/sda1
        :param path_creator A functor that creates a feature file name given a device system file name
        :param value Value to write into the feature file
        :param tuned_devs_set Set of device nodes that have already been tuned for this feature
        """
        feature_file, feature_node = self.__get_feature_file(dev_node, path_creator)

        if feature_file is None:
            return False

        if feature_node not in tuned_devs_set:
            fwriteln_and_log(feature_file, value)
            tuned_devs_set.add(feature_node)

        return True

    def __tune_io_scheduler(self, dev_node, io_scheduler):
        return self.__tune_one_feature(dev_node, lambda p: os.path.join(p, 'queue', 'scheduler'), io_scheduler, self.__io_scheduler_tuned_devs)

    def __tune_nomerges(self, dev_node):
        return self.__tune_one_feature(dev_node, lambda p: os.path.join(p, 'queue', 'nomerges'), self.__nomerges, self.__nomerges_tuned_devs)

    def __tune_write_back_cache(self, dev_node):
        # If write cache configuration is not requested - return True immediately
        if self.__write_cache_config is None:
            return True

        return self.__tune_one_feature(dev_node, lambda p: os.path.join(p, 'queue', 'write_cache'), self.__write_cache_config, self.__write_back_cache_tuned_devs)

    def __get_io_scheduler(self, dev_node):
        """
        Return a supported scheduler that is also present in the required schedulers list (__io_schedulers).

        If there isn't such a supported scheduler - return None.
        """
        feature_file, feature_node = self.__get_feature_file(dev_node, lambda p: os.path.join(p, 'queue', 'scheduler'))

        lines = readlines(feature_file)
        if not lines:
            return None

        # Supported schedulers appear in the feature file as a single line as follows:
        #
        # sched1 [sched2] sched3
        #
        # ...with one or more schedulers, where the currently selected scheduler is the one in brackets.
        #
        # Return the scheduler with the highest priority among those that are supported for the current device.
        supported_schedulers = frozenset([scheduler.strip("[]\n") for scheduler in lines[0].split(" ")])
        return next((scheduler for scheduler in self.__io_schedulers if scheduler in supported_schedulers), None)

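    # For illustration: if the scheduler feature file contains the line
    #
    #   noop deadline [cfq]
    #
    # and self.__io_schedulers is, say, ['none', 'noop'], then supported_schedulers is
    # frozenset({'noop', 'deadline', 'cfq'}) and the method returns 'noop' - the
    # highest-priority scheduler that the device supports.
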
    def __tune_disk(self, device):
        dev_node = "/dev/{}".format(device)
        io_scheduler = self.__get_io_scheduler(dev_node)

        if not io_scheduler:
            perftune_print("Not setting I/O Scheduler for {} - required schedulers ({}) are not supported".format(device, list(self.__io_schedulers)))
        elif not self.__tune_io_scheduler(dev_node, io_scheduler):
            perftune_print("Not setting I/O Scheduler for {} - feature not present".format(device))

        if not self.__tune_nomerges(dev_node):
            perftune_print("Not setting 'nomerges' for {} - feature not present".format(device))

        if not self.__tune_write_back_cache(dev_node):
            perftune_print("Not setting 'write_cache' for {} - feature not present".format(device))

    def __tune_disks(self, disks):
        for disk in disks:
            self.__tune_disk(disk)

1494################################################################################
1495class TuneModes(enum.Enum):
1496 disks = 0
1497 net = 1
9f95a23c 1498 system = 2
11fdf7f2
TL
1499
1500 @staticmethod
1501 def names():
1502 return list(TuneModes.__members__.keys())
1503
argp = argparse.ArgumentParser(description='Configure various system parameters in order to improve the seastar application performance.', formatter_class=argparse.RawDescriptionHelpFormatter,
                               epilog=
'''
This script will:

    - Ban relevant IRQs from being moved by irqbalance.
    - Configure various system parameters in /proc/sys.
    - Distribute the IRQs (using SMP affinity configuration) among CPUs according to the configuration mode (see below)
      or an 'irq_cpu_mask' value.

As a result, some of the CPUs may be dedicated to handling only IRQs and taken out of the CPU set
that should be used to run the seastar application ("compute CPU set").

Modes description:

    sq - set all IRQs of a given NIC to CPU0 and configure RPS
         to spread NAPIs' handling between other CPUs.

    sq_split - divide all IRQs of a given NIC between CPU0 and its HT siblings and configure RPS
               to spread NAPIs' handling between other CPUs.

    mq - distribute NIC's IRQs among all CPUs instead of binding
         them all to CPU0. In this mode RPS is always enabled to
         spread NAPIs' handling between all CPUs.

If no mode is given, the script will use a default mode:

    - If the number of CPU cores is greater than 16, allocate a single IRQ CPU core for every 16 CPU cores in 'cpu_mask'.
      IRQ cores are allocated evenly on the available NUMA nodes according to the 'cpu_mask' value.
    - If the number of physical CPU cores per Rx HW queue is greater than 4 and less than 16 - use the 'sq_split' mode.
    - Otherwise, if the number of hyper-threads per Rx HW queue is greater than 4 - use the 'sq' mode.
    - Otherwise use the 'mq' mode.

Default values:

    --nic NIC - default: eth0
    --cpu-mask MASK - default: all available cores mask
    --tune-clock - default: false
''')
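
# For illustration (hypothetical command lines and paths), typical invocations look like:
#
#   perftune.py --tune net --nic eth5
#   perftune.py --tune net --tune disks --dir /var/lib/scylla --irq-cpu-mask 0x5
#   perftune.py --tune net --mode sq_split --get-cpu-mask
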
argp.add_argument('--mode', choices=PerfTunerBase.SupportedModes.names(), help='configuration mode (deprecated, use --irq-cpu-mask instead)')
argp.add_argument('--nic', action='append', help="network interface name(s), by default uses 'eth0' (may appear more than once)", dest='nics', default=[])
argp.add_argument('--tune-clock', action='store_true', help='Force tuning of the system clocksource')
argp.add_argument('--get-cpu-mask', action='store_true', help="print the CPU mask to be used for compute")
argp.add_argument('--get-cpu-mask-quiet', action='store_true', help="print the CPU mask to be used for compute, print the zero CPU set if that's what it turns out to be")
argp.add_argument('--get-irq-cpu-mask', action='store_true', help="print the CPU mask to be used for IRQs binding")
argp.add_argument('--verbose', action='store_true', help="be more verbose about operations and their result")
argp.add_argument('--tune', choices=TuneModes.names(), help="components to configure (may be given more than once)", action='append', default=[])
argp.add_argument('--cpu-mask', help="mask of cores to use, by default use all available cores", metavar='MASK')
argp.add_argument('--irq-cpu-mask', help="mask of cores to use for IRQs binding", metavar='MASK')
argp.add_argument('--dir', help="directory to optimize (may appear more than once)", action='append', dest='dirs', default=[])
argp.add_argument('--dev', help="device to optimize (may appear more than once), e.g. sda1", action='append', dest='devs', default=[])
argp.add_argument('--options-file', help="configuration YAML file")
argp.add_argument('--dump-options-file', action='store_true', help="Print the configuration YAML file containing the current configuration")
argp.add_argument('--dry-run', action='store_true', help="Don't take any action, just recommend what to do.")
argp.add_argument('--write-back-cache', help="Enable/Disable 'write back' write cache mode.", dest="set_write_back")
argp.add_argument('--arfs', help="Enable/Disable aRFS", dest="enable_arfs")
argp.add_argument('--num-rx-queues', help="Set a given number of Rx queues", type=int)
argp.add_argument('--irq-core-auto-detection-ratio', help="Use a given ratio for IRQ mask auto-detection. For "
                                                          "instance, if 8 is given and auto-detection is requested, a "
                                                          "single IRQ CPU core is going to be allocated for every 8 "
                                                          "CPU cores available according to the 'cpu_mask' value. "
                                                          "Default is 16.",
                  type=int, default=16, dest='cores_per_irq_core')

def parse_cpu_mask_from_yaml(y, field_name, fname):
    hex_32bit_pattern = '0x[0-9a-fA-F]{1,8}'
    mask_pattern = re.compile('^{}((,({})?)*,{})*$'.format(hex_32bit_pattern, hex_32bit_pattern, hex_32bit_pattern))

    if mask_pattern.match(str(y[field_name])):
        return y[field_name]
    else:
        raise Exception("Bad '{}' value in {}: {}".format(field_name, fname, str(y[field_name])))

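# For illustration: the pattern above accepts comma-separated 32-bit hex words, where an
# empty word stands for an all-zero 32-bit block, e.g. (with a hypothetical file name)
#
#   parse_cpu_mask_from_yaml({'cpu_mask': '0xff'}, 'cpu_mask', 'opts.yaml')        # -> '0xff'
#   parse_cpu_mask_from_yaml({'cpu_mask': '0x1,,0xf0'}, 'cpu_mask', 'opts.yaml')   # -> '0x1,,0xf0'
#   parse_cpu_mask_from_yaml({'cpu_mask': 'ff'}, 'cpu_mask', 'opts.yaml')          # raises Exception
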
def extend_and_unique(orig_list, iterable):
    """
    Extend a list with the given items and return the deduplicated result.
    Note: the order of the returned list is not preserved.
    """
    assert isinstance(orig_list, list)
    assert isinstance(iterable, list)
    orig_list.extend(iterable)
    return list(set(orig_list))

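# For illustration: extend_and_unique(['eth0'], ['eth0', 'eth1']) returns a list
# containing 'eth0' and 'eth1' exactly once each, in no particular order (the
# deduplication goes through a set).
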
def parse_tri_state_arg(value, arg_name):
    try:
        if value is not None:
            return distutils.util.strtobool(value)
        else:
            return None
    except:
        sys.exit("Invalid {} value: should be a boolean, but got: {}".format(arg_name, value))

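# For illustration: distutils.util.strtobool() maps 'y'/'yes'/'t'/'true'/'on'/'1' to 1
# and 'n'/'no'/'f'/'false'/'off'/'0' to 0, so e.g.
#
#   parse_tri_state_arg('yes', '--arfs/arfs')    # -> 1
#   parse_tri_state_arg(None, '--arfs/arfs')     # -> None
#   parse_tri_state_arg('maybe', '--arfs/arfs')  # exits with an error message
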
def parse_options_file(prog_args):
    if not prog_args.options_file:
        return

    y = yaml.safe_load(open(prog_args.options_file))
    if y is None:
        return

    if 'mode' in y and not prog_args.mode:
        if not y['mode'] in PerfTunerBase.SupportedModes.names():
            raise Exception("Bad 'mode' value in {}: {}".format(prog_args.options_file, y['mode']))
        prog_args.mode = y['mode']

    if 'nic' in y:
        # Multiple NICs have been supported since commit a2fc9d72c31b97840bc75ae49dbd6f4b6d394e25:
        # a 'nic' option dumped to a config file is a list after that change, but the 'nic'
        # option in an old config file generated before the change may still be a string.
        # So convert a string option to a list here.
        if not isinstance(y['nic'], list):
            y['nic'] = [y['nic']]
        prog_args.nics = extend_and_unique(prog_args.nics, y['nic'])

    if 'tune_clock' in y and not prog_args.tune_clock:
        prog_args.tune_clock = y['tune_clock']

    if 'tune' in y:
        if set(y['tune']) <= set(TuneModes.names()):
            prog_args.tune = extend_and_unique(prog_args.tune, y['tune'])
        else:
            raise Exception("Bad 'tune' value in {}: {}".format(prog_args.options_file, y['tune']))

    if 'cpu_mask' in y and not prog_args.cpu_mask:
        prog_args.cpu_mask = parse_cpu_mask_from_yaml(y, 'cpu_mask', prog_args.options_file)

    if 'irq_cpu_mask' in y and not prog_args.irq_cpu_mask:
        prog_args.irq_cpu_mask = parse_cpu_mask_from_yaml(y, 'irq_cpu_mask', prog_args.options_file)

    if 'dir' in y:
        prog_args.dirs = extend_and_unique(prog_args.dirs, y['dir'])

    if 'dev' in y:
        prog_args.devs = extend_and_unique(prog_args.devs, y['dev'])

    if 'write_back_cache' in y:
        prog_args.set_write_back = distutils.util.strtobool("{}".format(y['write_back_cache']))

    if 'arfs' in y:
        prog_args.enable_arfs = distutils.util.strtobool("{}".format(y['arfs']))

    if 'num_rx_queues' in y:
        prog_args.num_rx_queues = int(y['num_rx_queues'])

    if 'irq_core_auto_detection_ratio' in y:
        prog_args.cores_per_irq_core = int(y['irq_core_auto_detection_ratio'])

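# For illustration, a hypothetical options YAML file exercising the keys parsed above:
#
#   mode: sq_split
#   nic:
#     - eth0
#     - eth1
#   tune:
#     - net
#     - disks
#   dir:
#     - /data
#   write_back_cache: false
#   irq_core_auto_detection_ratio: 8
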
def dump_config(prog_args):
    prog_options = {}

    if prog_args.mode:
        assert prog_args.cpu_mask, "cpu_mask has to always be set. Something is terribly wrong (a bug in perftune.py?)"
        mode = PerfTunerBase.SupportedModes[prog_args.mode]
        prog_options['irq_cpu_mask'] = PerfTunerBase.irqs_cpu_mask_for_mode(mode, prog_args.cpu_mask)

    if prog_args.nics:
        prog_options['nic'] = list(set(prog_args.nics))

    if prog_args.tune_clock:
        prog_options['tune_clock'] = prog_args.tune_clock

    if prog_args.tune:
        prog_options['tune'] = list(set(prog_args.tune))

    if prog_args.cpu_mask:
        prog_options['cpu_mask'] = prog_args.cpu_mask

    if prog_args.irq_cpu_mask:
        prog_options['irq_cpu_mask'] = prog_args.irq_cpu_mask

    if prog_args.dirs:
        prog_options['dir'] = list(set(prog_args.dirs))

    if prog_args.devs:
        prog_options['dev'] = list(set(prog_args.devs))

    if prog_args.set_write_back is not None:
        prog_options['write_back_cache'] = prog_args.set_write_back

    if prog_args.enable_arfs is not None:
        prog_options['arfs'] = prog_args.enable_arfs

    if prog_args.num_rx_queues is not None:
        prog_options['num_rx_queues'] = f"{prog_args.num_rx_queues}"

    prog_options['irq_core_auto_detection_ratio'] = prog_args.cores_per_irq_core

    perftune_print(yaml.dump(prog_options, default_flow_style=False))

################################################################################

args = argp.parse_args()

# Sanity check
args.set_write_back = parse_tri_state_arg(args.set_write_back, "--write-back-cache/write_back_cache")
args.enable_arfs = parse_tri_state_arg(args.enable_arfs, "--arfs/arfs")

dry_run_mode = args.dry_run
parse_options_file(args)

# if nothing needs to be configured - quit
if not args.tune:
    sys.exit("ERROR: At least one tune mode MUST be given.")

# There must be either a 'mode' or an explicit 'irq_cpu_mask' given - not both
if args.mode and args.irq_cpu_mask:
    sys.exit("ERROR: Provide either tune mode or IRQs CPU mask - not both.")

# Sanity check
if args.cores_per_irq_core < PerfTunerBase.min_cores_per_irq_core():
    sys.exit(f"ERROR: irq_core_auto_detection_ratio value must be greater than or equal to "
             f"{PerfTunerBase.min_cores_per_irq_core()}")

# set default values #####################
if not args.nics:
    args.nics = ['eth0']

if not args.cpu_mask:
    args.cpu_mask = run_hwloc_calc(['all'])
##########################################

# Sanity: irq_cpu_mask should be a subset of cpu_mask
# ('hwloc-calc A B' computes the union of the two masks, so the union equals
# cpu_mask exactly when irq_cpu_mask is contained in it)
if args.irq_cpu_mask and run_hwloc_calc([args.cpu_mask]) != run_hwloc_calc([args.cpu_mask, args.irq_cpu_mask]):
    sys.exit("ERROR: IRQ CPU mask({}) must be a subset of CPU mask({})".format(args.irq_cpu_mask, args.cpu_mask))

if args.dump_options_file:
    dump_config(args)
    sys.exit(0)

try:
    tuners = []

    if TuneModes.disks.name in args.tune:
        tuners.append(DiskPerfTuner(args))

    if TuneModes.net.name in args.tune:
        tuners.append(NetPerfTuner(args))

    if TuneModes.system.name in args.tune:
        tuners.append(SystemPerfTuner(args))

    if args.get_cpu_mask or args.get_cpu_mask_quiet:
        # Print the compute mask from the first tuner - it's going to be the same in all of them
        perftune_print(tuners[0].compute_cpu_mask)
    elif args.get_irq_cpu_mask:
        perftune_print(tuners[0].irqs_cpu_mask)
    else:
        # Tune the system
        restart_irqbalance(itertools.chain.from_iterable([tuner.irqs for tuner in tuners]))

        for tuner in tuners:
            tuner.tune()
except PerfTunerBase.CPUMaskIsZeroException as e:
    # Print a zero CPU set if --get-cpu-mask-quiet was requested.
    if args.get_cpu_mask_quiet:
        perftune_print("0x0")
    else:
        sys.exit("ERROR: {}. Your system can't be tuned until the issue is fixed.".format(e))
except Exception as e:
    sys.exit("ERROR: {}. Your system can't be tuned until the issue is fixed.".format(e))