1 CEPH-MIB DEFINITIONS ::= BEGIN
4 MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
6 MODULE-COMPLIANCE, NOTIFICATION-GROUP
10 -- Linting information:
12 -- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
14 -- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
19 "202111010000Z" -- Nov 01, 2021
26 Send comments to: <dev@ceph.io>"
28 "The MIB module for Ceph. In its current form it only
29 supports Notifications, since Ceph itself doesn't provide
30 any SNMP agent functionality.
32 Notifications are provided through a Prometheus/Alertmanager
33 webhook passing alerts to an external gateway service that is
34 responsible for formatting, forwarding and authenticating to
38 "202111010000Z" --Nov 01, 2021
40 "Latest version including the following updates:
42 - MIB restructure to align with linting
43 - names shortened and simplified (less verbose)
44 - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
46 - notifications updated
47 - Added module compliance
48 - Updated to latest prometheus alert rule definitions
50 ::= { enterprises 50495 }
52 cephCluster OBJECT IDENTIFIER ::= { ceph 1 }
53 cephConformance OBJECT IDENTIFIER ::= { ceph 2 } -- parent of cephAlertGroups and cephCompliances
55 -- cephMetadata is a placeholder for possible future expansion via an agent
56 -- where we could provide an overview of the cluster's configuration
57 cephMetadata OBJECT IDENTIFIER ::= { cephCluster 1 }
58 cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 }
60 prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 } -- parent of the prom* notification branches
63 -- Notifications: first we define the notification 'branches', one per
64 -- category of notification / alert; each is a direct child of 'prometheus'
65 promGeneric OBJECT IDENTIFIER ::= { prometheus 1 }
66 promHealthStatus OBJECT IDENTIFIER ::= { prometheus 2 }
67 promMon OBJECT IDENTIFIER ::= { prometheus 3 }
68 promOsd OBJECT IDENTIFIER ::= { prometheus 4 }
69 promMds OBJECT IDENTIFIER ::= { prometheus 5 }
70 promMgr OBJECT IDENTIFIER ::= { prometheus 6 }
71 promPGs OBJECT IDENTIFIER ::= { prometheus 7 }
72 promNode OBJECT IDENTIFIER ::= { prometheus 8 }
73 promPool OBJECT IDENTIFIER ::= { prometheus 9 }
74 promRados OBJECT IDENTIFIER ::= { prometheus 10 }
75 promCephadm OBJECT IDENTIFIER ::= { prometheus 11 }
76 promPrometheus OBJECT IDENTIFIER ::= { prometheus 12 }
78 promGenericNotification NOTIFICATION-TYPE
80 DESCRIPTION "Generic alert issued when the Prometheus rule doesn't provide an OID."
83 promGenericDaemonCrash NOTIFICATION-TYPE
85 DESCRIPTION "One or more daemons have crashed recently, and are yet to be archived"
88 promHealthStatusError NOTIFICATION-TYPE
90 DESCRIPTION "Ceph in health_error state for too long."
91 ::= { promHealthStatus 1 }
93 promHealthStatusWarning NOTIFICATION-TYPE
95 DESCRIPTION "Ceph in health_warn for too long."
96 ::= { promHealthStatus 2 }
98 promMonLowQuorum NOTIFICATION-TYPE
100 DESCRIPTION "Monitor count in quorum is low."
103 promMonDiskSpaceCritical NOTIFICATION-TYPE
105 DESCRIPTION "Monitor disk space is critically low."
108 promOsdDownHigh NOTIFICATION-TYPE
110 DESCRIPTION "A high number of OSDs are down."
113 promOsdDown NOTIFICATION-TYPE
115 DESCRIPTION "One or more OSDs are down."
118 promOsdNearFull NOTIFICATION-TYPE
120 DESCRIPTION "An OSD is dangerously full."
123 promOsdFlapping NOTIFICATION-TYPE
125 DESCRIPTION "An OSD was marked down and back up at least once a minute for 5 minutes."
128 promOsdHighPgDeviation NOTIFICATION-TYPE
130 DESCRIPTION "An OSD deviates by more than 30% from average PG count."
133 promOsdFull NOTIFICATION-TYPE
135 DESCRIPTION "An OSD has reached its full threshold."
138 promOsdHighPredictedFailures NOTIFICATION-TYPE
140 DESCRIPTION "Normal self healing unable to cope with the number of devices predicted to fail."
143 promOsdHostDown NOTIFICATION-TYPE
145 DESCRIPTION "Ceph OSD host is down."
148 promMdsDamaged NOTIFICATION-TYPE
150 DESCRIPTION "Cephfs filesystem is damaged."
153 promMdsReadOnly NOTIFICATION-TYPE
155 DESCRIPTION "Cephfs filesystem marked as READ-ONLY"
158 promMdsOffline NOTIFICATION-TYPE
160 DESCRIPTION "Cephfs filesystem is unavailable/offline."
163 promMdsDegraded NOTIFICATION-TYPE
165 DESCRIPTION "Cephfs filesystem is in a degraded state."
168 promMdsNoStandby NOTIFICATION-TYPE
170 DESCRIPTION "Cephfs MDS daemon failure, no standby available"
173 promMgrModuleCrash NOTIFICATION-TYPE
175 DESCRIPTION "Ceph mgr module has crashed recently"
178 promMgrPrometheusInactive NOTIFICATION-TYPE
180 DESCRIPTION "Ceph mgr prometheus module not responding"
183 promPGsInactive NOTIFICATION-TYPE
185 DESCRIPTION "One or more PGs are inactive for more than 5 minutes."
188 promPGsUnclean NOTIFICATION-TYPE
190 DESCRIPTION "One or more PGs are not clean for more than 15 minutes."
193 promPGsUnavailable NOTIFICATION-TYPE
195 DESCRIPTION "One or more PGs are unavailable, blocking I/O to those objects."
198 promPGsDamaged NOTIFICATION-TYPE
200 DESCRIPTION "One or more PGs are damaged."
203 promPGsRecoveryFull NOTIFICATION-TYPE
205 DESCRIPTION "PG recovery is impaired due to full OSDs."
208 promPGsBackfillFull NOTIFICATION-TYPE
210 DESCRIPTION "PG backfill is impaired due to full OSDs."
213 promNodeRootVolumeFull NOTIFICATION-TYPE
215 DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)."
218 promNodeNetworkPacketDrops NOTIFICATION-TYPE
220 DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
223 promNodeNetworkPacketErrors NOTIFICATION-TYPE
225 DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface."
228 promNodeStorageFilling NOTIFICATION-TYPE
230 DESCRIPTION "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
233 promPoolFull NOTIFICATION-TYPE
235 DESCRIPTION "A pool is at 90% capacity or over."
238 promPoolFilling NOTIFICATION-TYPE
240 DESCRIPTION "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
243 promRadosUnfound NOTIFICATION-TYPE
245 DESCRIPTION "A RADOS object cannot be found, even though all OSDs are online."
248 promCephadmDaemonDown NOTIFICATION-TYPE
250 DESCRIPTION "Cephadm has determined that a daemon is down."
251 ::= { promCephadm 1 }
253 promCephadmUpgradeFailure NOTIFICATION-TYPE
255 DESCRIPTION "Cephadm attempted to upgrade the cluster and encountered a problem."
256 ::= { promCephadm 2 }
258 promPrometheusJobMissing NOTIFICATION-TYPE
260 DESCRIPTION "The prometheus scrape job is not defined."
261 ::= { promPrometheus 1 }
262 -- ---------------------------------------------------------- --
263 -- CEPH-MIB - Conformance Information
264 -- ---------------------------------------------------------- --
266 cephAlertGroups OBJECT IDENTIFIER ::= { cephConformance 1 } -- parent of cephNotificationGroup
267 cephCompliances OBJECT IDENTIFIER ::= { cephConformance 2 } -- parent of cephCompliance
269 -- ---------------------------------------------------------- --
270 -- units of conformance
271 -- ---------------------------------------------------------- --
273 -- ---------------------------------------------------------- --
274 -- The Trap Notification Group
275 -- ---------------------------------------------------------- --
277 cephNotificationGroup NOTIFICATION-GROUP
279 promGenericNotification,
280 promGenericDaemonCrash,
281 promHealthStatusError,
282 promHealthStatusWarning,
284 promMonDiskSpaceCritical,
289 promOsdHighPgDeviation,
291 promOsdHighPredictedFailures,
299 promMgrPrometheusInactive,
306 promNodeRootVolumeFull,
307 promNodeNetworkPacketDrops,
308 promNodeNetworkPacketErrors,
309 promNodeStorageFilling,
313 promCephadmDaemonDown,
314 promCephadmUpgradeFailure,
315 promPrometheusJobMissing
319 "A collection of notifications triggered by the Prometheus
320 rules to convey Ceph cluster state"
321 ::= { cephAlertGroups 2 }
323 -- ---------------------------------------------------------- --
324 -- compliance statements
325 -- ---------------------------------------------------------- --
327 cephCompliance MODULE-COMPLIANCE
330 "The Compliance statement for the Ceph MIB"
333 cephNotificationGroup
335 ::= { cephCompliances 1 }