ANF CEPH 2022 du 03 au 07/10/2022 Sébastien Geiger # exemple de configuration de nrpe avec check_ceph_health depuis ceph1 [almalinux@ceph1 ~]$ sudo cephadm shell ceph auth get-or-create client.nagios mon 'allow r' > ceph.client.nagios.keyring # vérification du fichier ceph.client.nagios.keyring [almalinux@ceph1 ~]$ cat ceph.client.nagios.keyring [client.nagios] key = AQCq6SljtKmNOBAAWPUHom5WEnd5UPBz2Fs6qg== [almalinux@ceph1 ~]$ scp ceph.client.nagios.keyring root@cephclt:/etc/ceph/ depuis cephclt sudo yum install -y epel-release.noarch sudo yum -y install nrpe nrpe-selinux sudo yum -y install nagios-plugins-load nagios-plugins-nrpe sudo curl https://raw.githubusercontent.com/ceph/ceph-nagios-plugins/master/src/check_ceph_health --output /usr/lib64/nagios/plugins/check_ceph_health sudo chmod 755 /usr/lib64/nagios/plugins/check_ceph_health sudo semanage fcontext -a --type nagios_unconfined_plugin_exec_t /usr/lib64/nagios/plugins/check_ceph_health sudo restorecon /usr/lib64/nagios/plugins/check_ceph_health echo "command[check_ceph_health]=/usr/lib64/nagios/plugins/check_ceph_health --id nagios --keyring /etc/ceph/ceph.client.nagios.keyring" |sudo tee -a /etc/nrpe.d/ceph_health.cfg sudo alternatives --set python /usr/bin/python3 sudo chown -R root:nrpe /etc/ceph sudo chmod u+rwx,g+rwx /etc/ceph sudo systemctl enable nrpe sudo systemctl restart nrpe /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health HEALTH OK Module Prometheus Le service mgr intègre un exporteur Prometheus qui permet de récupérer les métriques depuis le port 9283. 
[ceph: root@ceph1 /]# ceph mgr services { "dashboard": "https://172.16.7.16:8443/", "prometheus": "http://172.16.7.16:9283/" } [ceph: root@ceph1 /]# curl --stderr - http://ceph1:9283/metrics |grep ceph_health_status # HELP ceph_health_status Cluster health status # TYPE ceph_health_status untyped ceph_health_status 0.0 grafana extérieur https://grafana.com/grafana/dashboards/2842 # arrêter un osd et vérifier l'état de check_ceph_health ou de ceph_health_status via prometheus [ceph: root@ceph1 /]# ceph orch daemon stop osd.0 [almalinux@cephclt ~]$ /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health WARNING: OSD_DOWN( 1 osds down ) WARNING: PG_DEGRADED( Degraded data redundancy: 396/2836 objects degraded (13.963%), 76 pgs degraded ) [almalinux@cephclt ~]$ curl --stderr - http://ceph1:9283/metrics |grep ceph_health_status # HELP ceph_health_status Cluster health status # TYPE ceph_health_status untyped ceph_health_status 1.0 # redémarrer l'osd [ceph: root@ceph1 /]# ceph orch daemon start osd.0 Scheduled to start osd.0 on host 'ceph3' # après quelques minutes le service est à nouveau HEALTH OK [almalinux@cephclt ~]$ /usr/lib64/nagios/plugins/check_nrpe -H localhost -c check_ceph_health HEALTH OK