ANF CEPH 2022 du 03 au 07/10/2022 Sébastien Geiger # exemple de configuration de nrpe avec check_ceph_health depuis ceph1 [almalinux@ceph1 ~]$ sudo cephadm shell ceph auth get-or-create client.nagios mon 'allow r' > ceph.client.nagios.keyring # vérification du fichier ceph.client.nagios.keyring [almalinux@ceph1 ~]$ cat ceph.client.nagios.keyring [client.nagios] key = AQCq6SljtKmNOBAAWPUHom5WEnd5UPBz2Fs6qg== [almalinux@ceph1 ~]$ scp ceph.client.nagios.keyring root@cephclt:/etc/ceph/ depuis cephclt sudo yum install -y epel-release.noarch sudo yum -y install nrpe nrpe-selinux sudo yum -y install nagios-plugins-load nagios-plugins-nrpe sudo curl https://raw.githubusercontent.com/ceph/ceph-nagios-plugins/master/src/check_ceph_health --output /usr/lib64/nagios/plugins/check_ceph_health sudo chmod 755 /usr/lib64/nagios/plugins/check_ceph_health sudo semanage fcontext -a --type nagios_unconfined_plugin_exec_t /usr/lib64/nagios/plugins/check_ceph_health sudo restorecon /usr/lib64/nagios/plugins/check_ceph_health echo "command[check_ceph_health]=/usr/lib64/nagios/plugins/check_ceph_health --id nagios --keyring /etc/ceph/ceph.client.nagios.keyring" |sudo tee -a /etc/nrpe.d/ceph_health.cfg sudo alternatives --set python /usr/bin/python3 sudo chown -R root:nrpe /etc/ceph sudo chmod u+rwx,g+rwx /etc/ceph sudo systemctl enable nrpe sudo systemctl restart nrpe /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health HEALTH OK Module Prometheus Le service mgr intègre un exporteur Prometheus qui permet de récupérer les métriques depuis le port 9283. 
[ceph: root@ceph1 /]# ceph mgr services { "dashboard": "https://172.16.7.16:8443/", "prometheus": "http://172.16.7.16:9283/" } [ceph: root@ceph1 /]# curl --stderr - http://ceph1:9283/metrics |grep ceph_health_status # HELP ceph_health_status Cluster health status # TYPE ceph_health_status untyped ceph_health_status 0.0 grafana extérieur https://grafana.com/grafana/dashboards/2842 # arrêter un osd et vérifier l'état de check_ceph_health ou de ceph_health_status via prometheus [ceph: root@ceph1 /]# ceph orch daemon stop osd.0 [almalinux@cephclt ~]$ /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health WARNING: OSD_DOWN( 1 osds down ) WARNING: PG_DEGRADED( Degraded data redundancy: 396/2836 objects degraded (13.963%), 76 pgs degraded ) [almalinux@cephclt ~]$ curl --stderr - http://ceph1:9283/metrics |grep ceph_health_status # HELP ceph_health_status Cluster health status # TYPE ceph_health_status untyped ceph_health_status 1.0 # redémarrer l'osd [ceph: root@ceph1 /]# ceph orch daemon start osd.0 Scheduled to start osd.0 on host 'ceph3' # après quelques minutes le service est à nouveau HEALTH OK [almalinux@cephclt ~]$ /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health HEALTH OK