import logging
import json
import datetime
import time

from .mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)
UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

class TestInsights(MgrTestCase):
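    """Exercise the mgr insights module: health check history, report
    schema, and crash history reporting."""
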
    def setUp(self):
        super(TestInsights, self).setUp()
        self.setup_mgrs()
        self._load_module("insights")
        self._load_module("selftest")
        self.crash_ids = []

    def tearDown(self):
        self._clear_crashes()

    def _insights(self):
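        """Run the "ceph insights" command and return the parsed JSON report."""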
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
        return json.loads(retstr)

    def _add_crash(self, hours, make_invalid = False):
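        """Post a synthetic crash report dated `hours` hours in the past.

        If `make_invalid` is set, the timestamp is replaced with an
        unparsable string.
        """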
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(hours = hours)
        timestamp = timestamp.strftime(DATEFMT) + 'Z'
        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
        crash = {
            'crash_id': crash_id,
            'timestamp': timestamp,
        }
        if make_invalid:
            crash["timestamp"] = "not a timestamp"

        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=json.dumps(crash)
        )
        self.crash_ids.append(crash_id)
        self.assertEqual(0, ret)

    def _clear_crashes(self):
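        """Remove every crash report posted by this test."""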
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )

    def _wait_for_health_history_checks(self, *args):
        """Wait for a set of health checks to appear in the health history"""
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds = 15)
        while True:
            report = self._insights()
            missing = False
            for check in args:
                if check not in report["health"]["history"]["checks"]:
                    missing = True
                    break
            if not missing:
                return
            self.assertGreater(timeout,
                datetime.datetime.utcnow())
            time.sleep(0.25)

    def _wait_for_curr_health_cleared(self, check):
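        """Wait until `check` disappears from the current health checks."""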
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds = 15)
        while True:
            report = self._insights()
            if check not in report["health"]["current"]["checks"]:
                return
            self.assertGreater(timeout,
                datetime.datetime.utcnow())
            time.sleep(0.25)

    def test_health_history(self):
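        """Injected health checks should appear in the insights health
        history, survive a mgr restart, and be removed by
        "insights prune-health"."""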
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # generate health check history entries. we want to avoid the edge
        # case of running these tests at _exactly_ the top of the hour so we
        # can explicitly control when hourly work occurs. for this we pin the
        # current time to half past the hour.
        now = datetime.datetime.utcnow()
        now = datetime.datetime(
            year = now.year,
            month = now.month,
            day = now.day,
            hour = now.hour,
            minute = 30)

        check_names = set()
        for hours in [-18, -11, -5, -1, 0]:
            # change the insight module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past
            unique_check_name = "insights_health_check_{}".format(hours)
            health_check = {
                unique_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(unique_check_name)

            # and also set the same health check to test deduplication
            dupe_check_name = "insights_health_check"
            health_check = {
                dupe_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(dupe_check_name)

            # wait for the health check to show up in the history report
            self._wait_for_health_history_checks(unique_check_name, dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the restart.
        # we don't require them all to be present because "earlier" checks may
        # not have sat in memory long enough to be flushed.
        all_missing = True
        report = self._insights()
        for check in check_names:
            if check in report["health"]["history"]["checks"]:
                all_missing = False
                break
        self.assertFalse(all_missing)

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_schema(self):
        """TODO: assert conformance to a full schema specification?"""
        report = self._insights()
        for key in ["osd_metadata",
                    "pg_summary",
                    "mon_status",
                    "manager_map",
                    "service_map",
                    "mon_map",
                    "crush_map",
                    "fs_map",
                    "osd_tree",
                    "df",
                    "osd_dump",
                    "config",
                    "health",
                    "crashes",
                    "version",
                    "errors"]:
            self.assertIn(key, report)

    def test_crash_history(self):
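        """Posted crash reports should appear in the insights crash summary."""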
        self._clear_crashes()
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertFalse(report["errors"])

        # crashes show up in the report
        self._add_crash(1)
        report = self._insights()
        self.assertTrue(report["crashes"]["summary"])
        self.assertFalse(report["errors"])
        log.warning("{}".format(json.dumps(report["crashes"], indent=2)))

        self._clear_crashes()