import abc
import argparse
+import distutils.util
import enum
import functools
import glob
"""
return run_read_only_command(['hwloc-calc'] + prog_args).rstrip()
+def run_ethtool(prog_args):
+ """
+ Returns a list of strings - each representing a single line of ethtool output.
+ """
+ return run_read_only_command(['ethtool'] + prog_args).splitlines()
+
def fwriteln(fname, line, log_message, log_errors=True):
try:
if dry_run_mode:
self.__compute_cpu_mask = PerfTunerBase.compute_cpu_mask_for_mode(self.__mode, self.__args.cpu_mask)
self.__irq_cpu_mask = PerfTunerBase.irqs_cpu_mask_for_mode(self.__mode, self.__args.cpu_mask)
+ @property
+ def cpu_mask(self):
+ """
+ Return the CPU mask we operate on (the total CPU set)
+ """
+
+ return self.__args.cpu_mask
+
@property
def compute_cpu_mask(self):
"""
def __init__(self, args):
super().__init__(args)
+ self.nics=args.nics
+
self.__nic_is_bond_iface = self.__check_dev_is_bond_iface()
self.__slaves = self.__learn_slaves()
- # check that self.nic is either a HW device or a bonding interface
- self.__check_nic()
+ # check that each NIC in self.nics is either a HW device or a bonding interface
+ self.__check_nics()
self.__irqs2procline = get_irqs2procline_map()
self.__nic2irqs = self.__learn_irqs()
+
#### Public methods ############################
def tune(self):
"""
Tune the networking server configuration.
"""
- if self.nic_is_hw_iface:
- perftune_print("Setting a physical interface {}...".format(self.nic))
- self.__setup_one_hw_iface(self.nic)
- else:
- perftune_print("Setting {} bonding interface...".format(self.nic))
- self.__setup_bonding_iface()
+ for nic in self.nics:
+ if self.nic_is_hw_iface(nic):
+ perftune_print("Setting a physical interface {}...".format(nic))
+ self.__setup_one_hw_iface(nic)
+ else:
+ perftune_print("Setting {} bonding interface...".format(nic))
+ self.__setup_bonding_iface(nic)
# Increase the socket listen() backlog
fwriteln_and_log('/proc/sys/net/core/somaxconn', '4096')
# did not receive an acknowledgment from connecting client.
fwriteln_and_log('/proc/sys/net/ipv4/tcp_max_syn_backlog', '4096')
- @property
- def nic_is_bond_iface(self):
- return self.__nic_is_bond_iface
+ def nic_is_bond_iface(self, nic):
+ return self.__nic_is_bond_iface[nic]
- @property
- def nic(self):
- return self.args.nic
-
- @property
- def nic_exists(self):
- return self.__iface_exists(self.nic)
+ def nic_exists(self, nic):
+ return self.__iface_exists(nic)
- @property
- def nic_is_hw_iface(self):
- return self.__dev_is_hw_iface(self.nic)
+ def nic_is_hw_iface(self, nic):
+ return self.__dev_is_hw_iface(nic)
- @property
- def slaves(self):
+ def slaves(self, nic):
"""
- Returns an iterator for all slaves of the args.nic.
+ Returns an iterator for all slaves of the nic.
If args.nic is not a bonding interface an attempt to use the returned iterator
will immediately raise a StopIteration exception - use __dev_is_bond_iface() check to avoid this.
"""
- return iter(self.__slaves)
+ return iter(self.__slaves[nic])
#### Protected methods ##########################
def _get_def_mode(self):
- if self.nic_is_bond_iface:
- return min(map(self.__get_hw_iface_def_mode, filter(self.__dev_is_hw_iface, self.slaves)))
- else:
- return self.__get_hw_iface_def_mode(self.nic)
+ mode=PerfTunerBase.SupportedModes.no_irq_restrictions
+ for nic in self.nics:
+ if self.nic_is_bond_iface(nic):
+ mode = min(mode, min(map(self.__get_hw_iface_def_mode, filter(self.__dev_is_hw_iface, self.slaves(nic)))))
+ else:
+ mode = min(mode, self.__get_hw_iface_def_mode(nic))
+ return mode
def _get_irqs(self):
"""
- Returns the iterator for all IRQs that are going to be configured (according to args.nic parameter).
+ Returns the iterator for all IRQs that are going to be configured (according to args.nics parameter).
For instance, for a bonding interface that's going to include IRQs of all its slaves.
"""
return itertools.chain.from_iterable(self.__nic2irqs.values())
def __rfs_table_size(self):
return 32768
- def __check_nic(self):
+ def __check_nics(self):
"""
- Checks that self.nic is a supported interface
+ Checks that self.nics are supported interfaces
"""
- if not self.nic_exists:
- raise Exception("Device {} does not exist".format(self.nic))
- if not self.nic_is_hw_iface and not self.nic_is_bond_iface:
- raise Exception("Not supported virtual device {}".format(self.nic))
+ for nic in self.nics:
+ if not self.nic_exists(nic):
+ raise Exception("Device {} does not exist".format(nic))
+ if not self.nic_is_hw_iface(nic) and not self.nic_is_bond_iface(nic):
+ raise Exception("Not supported virtual device {}".format(nic))
def __get_irqs_one(self, iface):
"""
return os.path.exists("/sys/class/net/{}/device".format(iface))
def __check_dev_is_bond_iface(self):
+ bond_dict = {}
if not os.path.exists('/sys/class/net/bonding_masters'):
- return False
-
- return any([re.search(self.nic, line) for line in open('/sys/class/net/bonding_masters', 'r').readlines()])
+ for nic in self.nics:
+ bond_dict[nic] = False
+ # return False for every NIC
+ return bond_dict
+ for nic in self.nics:
+ bond_dict[nic] = any([re.search(nic, line) for line in open('/sys/class/net/bonding_masters', 'r').readlines()])
+ return bond_dict
def __learn_slaves(self):
- if self.nic_is_bond_iface:
- return list(itertools.chain.from_iterable([ line.split() for line in open("/sys/class/net/{}/bonding/slaves".format(self.nic), 'r').readlines() ]))
+ slaves_list_per_nic = {}
+ for nic in self.nics:
+ if self.nic_is_bond_iface(nic):
+ slaves_list_per_nic[nic] = list(itertools.chain.from_iterable([line.split() for line in open("/sys/class/net/{}/bonding/slaves".format(nic), 'r').readlines()]))
- return []
+ return slaves_list_per_nic
def __intel_irq_to_queue_idx(self, irq):
"""
else:
return sys.maxsize
+ def __mlx_irq_to_queue_idx(self, irq):
+ """
+ Return the HW queue index for a given IRQ for Mellanox NICs in order to sort the IRQs' list by this index.
+
+ Mellanox NICs have the IRQ which name looks like
+ this:
+ mlx5_comp23
+ mlx5_comp<index>
+ or this:
+ mlx4-6
+ mlx4-<index>
+
+ :param irq: IRQ number
+ :return: HW queue index for Mellanox NICs and 0 for all other NICs
+ """
+ mlx5_fp_irq_re = re.compile("mlx5_comp(\d+)")
+ mlx4_fp_irq_re = re.compile("mlx4\-(\d+)")
+
+ m5 = mlx5_fp_irq_re.search(self.__irqs2procline[irq])
+ if m5:
+ return int(m5.group(1))
+ else:
+ m4 = mlx4_fp_irq_re.search(self.__irqs2procline[irq])
+ if m4:
+ return int(m4.group(1))
+
+ return sys.maxsize
+
+ def __get_driver_name(self, iface):
+ """
+ :param iface: Interface to check
+ :return: driver name from ethtool
+ """
+
+ driver_name = ''
+ ethtool_i_lines = run_ethtool(['-i', iface])
+ driver_re = re.compile("driver:")
+ driver_lines = list(filter(lambda one_line: driver_re.search(one_line), ethtool_i_lines))
+
+ if driver_lines:
+ if len(driver_lines) > 1:
+ raise Exception("More than one 'driver:' entries in the 'ethtool -i {}' output. Unable to continue.".format(iface))
+
+ driver_name = driver_lines[0].split()[1].strip()
+
+ return driver_name
+
def __learn_irqs_one(self, iface):
"""
This is a slow method that is going to read from the system files. Never
- Intel: <bla-bla>-TxRx-<bla-bla>
- Broadcom: <bla-bla>-fp-<bla-bla>
- ena: <bla-bla>-Tx-Rx-<bla-bla>
- - Mellanox: mlx<device model index>-<queue idx>@<bla-bla>
+ - Mellanox: for mlx4
+ mlx4-<queue idx>@<bla-bla>
+ or for mlx5
+ mlx5_comp<queue idx>@<bla-bla>
So, we will try to filter the etries in /proc/interrupts for IRQs we've got from get_all_irqs_one()
according to the patterns above.
"""
# filter 'all_irqs' to only reference valid keys from 'irqs2procline' and avoid an IndexError on the 'irqs' search below
all_irqs = set(learn_all_irqs_one("/sys/class/net/{}/device".format(iface), self.__irqs2procline, iface)).intersection(self.__irqs2procline.keys())
- fp_irqs_re = re.compile("\-TxRx\-|\-fp\-|\-Tx\-Rx\-|mlx\d+\-\d+@")
+ fp_irqs_re = re.compile("\-TxRx\-|\-fp\-|\-Tx\-Rx\-|mlx4-\d+@|mlx5_comp\d+@")
irqs = list(filter(lambda irq : fp_irqs_re.search(self.__irqs2procline[irq]), all_irqs))
if irqs:
- irqs.sort(key=self.__intel_irq_to_queue_idx)
+ driver_name = self.__get_driver_name(iface)
+ if (driver_name.startswith("mlx")):
+ irqs.sort(key=self.__mlx_irq_to_queue_idx)
+ else:
+ irqs.sort(key=self.__intel_irq_to_queue_idx)
return irqs
else:
return list(all_irqs)
This is a slow method that is going to read from the system files. Never
use it outside the initialization code.
"""
- if self.nic_is_bond_iface:
- return { slave : self.__learn_irqs_one(slave) for slave in filter(self.__dev_is_hw_iface, self.slaves) }
- else:
- return { self.nic : self.__learn_irqs_one(self.nic) }
+ nic_irq_dict={}
+ for nic in self.nics:
+ if self.nic_is_bond_iface(nic):
+ for slave in filter(self.__dev_is_hw_iface, self.slaves(nic)):
+ nic_irq_dict[slave] = self.__learn_irqs_one(slave)
+ else:
+ nic_irq_dict[nic] = self.__learn_irqs_one(nic)
+ return nic_irq_dict
def __get_rps_cpus(self, iface):
"""
perftune_print("Distributing all IRQs")
distribute_irqs(all_irqs, self.irqs_cpu_mask)
- self.__setup_rps(iface, self.compute_cpu_mask)
+ self.__setup_rps(iface, self.cpu_mask)
self.__setup_xps(iface)
- def __setup_bonding_iface(self):
- for slave in self.slaves:
+ def __setup_bonding_iface(self, nic):
+ for slave in self.slaves(nic):
if self.__dev_is_hw_iface(slave):
perftune_print("Setting up {}...".format(slave))
self.__setup_one_hw_iface(slave)
"""
driver_to_max_rss = {'ixgbe': 16, 'ixgbevf': 4, 'i40e': 64, 'i40evf': 16}
- driver_name = ''
- ethtool_i_lines = run_read_only_command(['ethtool', '-i', iface]).splitlines()
- driver_re = re.compile("driver:")
- driver_lines = list(filter(lambda one_line: driver_re.search(one_line), ethtool_i_lines))
-
- if driver_lines:
- if len(driver_lines) > 1:
- raise Exception("More than one 'driver:' entries in the 'ethtool -i {}' output. Unable to continue.".format(iface))
-
- driver_name = driver_lines[0].split()[1].strip()
-
+ driver_name = self.__get_driver_name(iface)
return driver_to_max_rss.get(driver_name, sys.maxsize)
def __get_rx_queue_count(self, iface):
"""
rx_queues_count = self.__get_rx_queue_count(iface)
- num_cores = int(run_hwloc_calc(['--number-of', 'core', 'machine:0', '--restrict', self.args.cpu_mask]))
- num_PUs = int(run_hwloc_calc(['--number-of', 'PU', 'machine:0', '--restrict', self.args.cpu_mask]))
+ num_cores = int(run_hwloc_calc(['--restrict', self.args.cpu_mask, '--number-of', 'core', 'machine:0']))
+ num_PUs = int(run_hwloc_calc(['--restrict', self.args.cpu_mask, '--number-of', 'PU', 'machine:0']))
if num_PUs <= 4 or rx_queues_count == num_PUs:
return PerfTunerBase.SupportedModes.mq
# sets of devices that have already been tuned
self.__io_scheduler_tuned_devs = set()
self.__nomerges_tuned_devs = set()
+ self.__write_back_cache_tuned_devs = set()
#### Public methods #############################
def tune(self):
if not non_nvme_disks:
return PerfTunerBase.SupportedModes.mq
- num_cores = int(run_hwloc_calc(['--number-of', 'core', 'machine:0', '--restrict', self.args.cpu_mask]))
- num_PUs = int(run_hwloc_calc(['--number-of', 'PU', 'machine:0', '--restrict', self.args.cpu_mask]))
+ num_cores = int(run_hwloc_calc(['--restrict', self.args.cpu_mask, '--number-of', 'core', 'machine:0']))
+ num_PUs = int(run_hwloc_calc(['--restrict', self.args.cpu_mask, '--number-of', 'PU', 'machine:0']))
if num_PUs <= 4:
return PerfTunerBase.SupportedModes.mq
elif num_cores <= 4:
def __nomerges(self):
return '2'
+ @property
+ def __write_cache_config(self):
+ """
+ :return: None - if write cache mode configuration is not requested or the corresponding write cache
+ configuration value string
+ """
+ if self.args.set_write_back is None:
+ return None
+
+ return "write back" if self.args.set_write_back else "write through"
+
def __disks_info_by_type(self, disks_type):
"""
Returns a tuple ( [<disks>], [<irqs>] ) for the given disks type.
def __get_phys_devices(self, udev_obj):
# if device is a virtual device - the underlying physical devices are going to be its slaves
if re.search(r'virtual', udev_obj.sys_path):
- return list(itertools.chain.from_iterable([ self.__get_phys_devices(pyudev.Devices.from_device_file(self.__pyudev_ctx, "/dev/{}".format(slave))) for slave in os.listdir(os.path.join(udev_obj.sys_path, 'slaves')) ]))
- else:
- # device node is something like /dev/sda1 - we need only the part without /dev/
- return [ re.match(r'/dev/(\S+\d*)', udev_obj.device_node).group(1) ]
+ slaves = os.listdir(os.path.join(udev_obj.sys_path, 'slaves'))
+ # If the device is virtual but doesn't have slaves (e.g. nvme-subsystem virtual devices) handle it
+ # as a regular device.
+ if slaves:
+ return list(itertools.chain.from_iterable([ self.__get_phys_devices(pyudev.Devices.from_device_file(self.__pyudev_ctx, "/dev/{}".format(slave))) for slave in slaves ]))
+
+ # device node is something like /dev/sda1 - we need only the part without /dev/
+ return [ re.match(r'/dev/(\S+\d*)', udev_obj.device_node).group(1) ]
def __learn_irqs(self):
disk2irqs = {}
udev_obj = pyudev.Devices.from_device_file(self.__pyudev_ctx, "/dev/{}".format(device))
dev_sys_path = udev_obj.sys_path
- split_sys_path = list(pathlib.PurePath(dev_sys_path).parts)
+
+ # If the device is a virtual NVMe device its sys file name goes as follows:
+ # /sys/devices/virtual/nvme-subsystem/nvme-subsys0/nvme0n1
+ #
+ # and then there is this symlink:
+ # /sys/devices/virtual/nvme-subsystem/nvme-subsys0/nvme0n1/device/nvme0 -> ../../../pci0000:85/0000:85:01.0/0000:87:00.0/nvme/nvme0
+ #
+ # So, the "main device" is a "nvme\d+" prefix of the actual device name.
+ if re.search(r'virtual', udev_obj.sys_path):
+ m = re.match(r'(nvme\d+)\S*', device)
+ if m:
+ dev_sys_path = "{}/device/{}".format(udev_obj.sys_path, m.group(1))
+
+ split_sys_path = list(pathlib.PurePath(pathlib.Path(dev_sys_path).resolve()).parts)
# first part is always /sys/devices/pciXXX ...
controller_path_parts = split_sys_path[0:4]
:param dev_node Device node file name, e.g. /dev/sda1
:param path_creator A functor that creates a feature file name given a device system file name
"""
+ # Sanity check
+ if dev_node is None or path_creator is None:
+ return None, None
+
udev = pyudev.Devices.from_device_file(pyudev.Context(), dev_node)
feature_file = path_creator(udev.sys_path)
def __tune_nomerges(self, dev_node):
return self.__tune_one_feature(dev_node, lambda p : os.path.join(p, 'queue', 'nomerges'), self.__nomerges, self.__nomerges_tuned_devs)
+ # __tune_write_back_cache() below returns True immediately if write cache configuration is not requested
+ def __tune_write_back_cache(self, dev_node):
+ if self.__write_cache_config is None:
+ return True
+
+ return self.__tune_one_feature(dev_node, lambda p : os.path.join(p, 'queue', 'write_cache'), self.__write_cache_config, self.__write_back_cache_tuned_devs)
+
def __get_io_scheduler(self, dev_node):
"""
Return a supported scheduler that is also present in the required schedulers list (__io_schedulers).
# ...with one or more schedulers where currently selected scheduler is the one in brackets.
#
# Return the scheduler with the highest priority among those that are supported for the current device.
- supported_schedulers = frozenset([scheduler.lstrip("[").rstrip("]") for scheduler in lines[0].split(" ")])
+ supported_schedulers = frozenset([scheduler.lstrip("[").rstrip("]").rstrip("\n") for scheduler in lines[0].split(" ")])
return next((scheduler for scheduler in self.__io_schedulers if scheduler in supported_schedulers), None)
def __tune_disk(self, device):
if not self.__tune_nomerges(dev_node):
perftune_print("Not setting 'nomerges' for {} - feature not present".format(device))
+ if not self.__tune_write_back_cache(dev_node):
+ perftune_print("Not setting 'write_cache' for {} - feature not present".format(device))
+
def __tune_disks(self, disks):
for disk in disks:
self.__tune_disk(disk)
--tune-clock - default: false
''')
argp.add_argument('--mode', choices=PerfTunerBase.SupportedModes.names(), help='configuration mode')
-argp.add_argument('--nic', help='network interface name, by default uses \'eth0\'')
+argp.add_argument('--nic', action='append', help='network interface name(s), by default uses \'eth0\' (may appear more than once)', dest='nics', default=[])
argp.add_argument('--tune-clock', action='store_true', help='Force tuning of the system clocksource')
argp.add_argument('--get-cpu-mask', action='store_true', help="print the CPU mask to be used for compute")
argp.add_argument('--get-cpu-mask-quiet', action='store_true', help="print the CPU mask to be used for compute, print the zero CPU set if that's what it turns out to be")
argp.add_argument('--options-file', help="configuration YAML file")
argp.add_argument('--dump-options-file', action='store_true', help="Print the configuration YAML file containing the current configuration")
argp.add_argument('--dry-run', action='store_true', help="Don't take any action, just recommend what to do.")
+argp.add_argument('--write-back-cache', help="Enable/Disable \'write back\' write cache mode.", dest="set_write_back")
def parse_cpu_mask_from_yaml(y, field_name, fname):
hex_32bit_pattern='0x[0-9a-fA-F]{1,8}'
else:
raise Exception("Bad '{}' value in {}: {}".format(field_name, fname, str(y[field_name])))
+def extend_and_unique(orig_list, iterable):
+ """
+ Extend orig_list with the items of iterable and return the result with duplicates removed (ordering is not preserved).
+ """
+ assert(isinstance(orig_list, list))
+ assert(isinstance(iterable, list))
+ orig_list.extend(iterable)
+ return list(set(orig_list))
+
def parse_options_file(prog_args):
if not prog_args.options_file:
return
raise Exception("Bad 'mode' value in {}: {}".format(prog_args.options_file, y['mode']))
prog_args.mode = y['mode']
- if 'nic' in y and not prog_args.nic:
- prog_args.nic = y['nic']
+ if 'nic' in y:
+ # Support for multiple NICs was added by commit a2fc9d72c31b97840bc75ae49dbd6f4b6d394e25.
+ # The `nic' option dumped to a config file is a list after this change, but the `nic'
+ # option in an old config file (generated before this change) is still a string.
+ # So here we convert a string option to a list.
+ if not isinstance(y['nic'], list):
+ y['nic'] = [y['nic']]
+ prog_args.nics = extend_and_unique(prog_args.nics, y['nic'])
if 'tune_clock' in y and not prog_args.tune_clock:
prog_args.tune_clock= y['tune_clock']
if 'tune' in y:
if set(y['tune']) <= set(TuneModes.names()):
- prog_args.tune.extend(y['tune'])
+ prog_args.tune = extend_and_unique(prog_args.tune, y['tune'])
else:
raise Exception("Bad 'tune' value in {}: {}".format(prog_args.options_file, y['tune']))
prog_args.irq_cpu_mask = parse_cpu_mask_from_yaml(y, 'irq_cpu_mask', prog_args.options_file)
if 'dir' in y:
- prog_args.dirs.extend(y['dir'])
+ prog_args.dirs = extend_and_unique(prog_args.dirs, y['dir'])
if 'dev' in y:
- prog_args.devs.extend(y['dev'])
+ prog_args.devs = extend_and_unique(prog_args.devs, y['dev'])
+
+ if 'write_back_cache' in y:
+ prog_args.set_write_back = distutils.util.strtobool("{}".format(y['write_back_cache']))
def dump_config(prog_args):
prog_options = {}
if prog_args.mode:
prog_options['mode'] = prog_args.mode
- if prog_args.nic:
- prog_options['nic'] = prog_args.nic
+ if prog_args.nics:
+ prog_options['nic'] = prog_args.nics
if prog_args.tune_clock:
prog_options['tune_clock'] = prog_args.tune_clock
if prog_args.devs:
prog_options['dev'] = prog_args.devs
+ if prog_args.set_write_back is not None:
+ prog_options['write_back_cache'] = prog_args.set_write_back
+
perftune_print(yaml.dump(prog_options, default_flow_style=False))
################################################################################
args = argp.parse_args()
+
+# Sanity check
+try:
+ if args.set_write_back:
+ args.set_write_back = distutils.util.strtobool(args.set_write_back)
+except:
+ sys.exit("Invalid --write-back-cache value: should be boolean but given: {}".format(args.set_write_back))
+
dry_run_mode = args.dry_run
parse_options_file(args)
sys.exit("ERROR: Provide either tune mode or IRQs CPU mask - not both.")
# set default values #####################
-if not args.nic:
- args.nic = 'eth0'
+if not args.nics:
+ args.nics = ['eth0']
if not args.cpu_mask:
args.cpu_mask = run_hwloc_calc(['all'])