Difference between revisions of "OpenShift v4x health check"

From Bitbull Wiki
Jump to navigation Jump to search
Line 1: Line 1:
 +
=Links=
 +
* [https://docs.openshift.com/container-platform/3.9/day_two_guide/environment_health_checks.html https://docs.openshift.com/container-platform/3.9/day_two_guide/environment_health_checks.html]
 +
https://docs.openshift.com/container-platform/4.4/backup_and_restore/replacing-unhealthy-etcd-member.html
  
 +
=Health Checks=
 +
==Nodes==
 +
[chris@control(zabbix-dev/system:admin) ~]$ '''oc get nodes -o wide'''
 +
NAME      STATUS  ROLES          AGE  VERSION  INTERNAL-IP      EXTERNAL-IP  OS-IMAGE                                  KERNEL-VERSION                CONTAINER-RUNTIME
 +
master01  '''Ready'''    master,worker  40d  v1.17.1  192.168.100.221  <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)  4.18.0-147.8.1.el8_1.x86_64  cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
 +
master02  '''Ready'''    master,worker  40d  v1.17.1  192.168.100.222  <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)  4.18.0-147.8.1.el8_1.x86_64  cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
 +
master03  '''Ready'''    master,worker  40d  v1.17.1  192.168.100.223  <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)  4.18.0-147.8.1.el8_1.x86_64  cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
 +
worker01  '''Ready'''    worker          40d  v1.17.1  192.168.100.231  <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)  4.18.0-147.8.1.el8_1.x86_64  cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
 +
worker02  '''Ready'''    worker          40d  v1.17.1  192.168.100.232  <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)  4.18.0-147.8.1.el8_1.x86_64  cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
  
  
  
 +
==etcd==
 +
===v3.9===
 +
[root@master(zabbix/admin) ~]# '''source /etc/etcd/etcd.conf'''
 +
[root@master(zabbix/admin) ~]# '''etcdctl --cert-file=$ETCD_PEER_CERT_FILE --key-file=$ETCD_PEER_KEY_FILE  --ca-file=/etc/etcd/ca.crt --endpoints=$ETCD_LISTEN_CLIENT_URLS cluster-health'''
 +
member da1c9720d5fee664 is healthy: got healthy result from https://192.168.223.74:2379
 +
'''cluster is healthy'''
  
 +
===v4.4===
 +
[chris@control(zabbix-dev/system:admin) ~]$ '''oc get etcd -o=jsonpath='{range .items[0].status.conditions[?(@.type=="EtcdMembersAvailable")]}{.message}{"\n"}''''
 +
'''master02,master01,master03 members are available''',  have not started,  are unhealthy,  are unknown
  
  
 +
==router==
 +
===v3.9===
 +
[root@master(zabbix/admin) ~]# '''oc -n default get deploymentconfigs/router'''
 +
NAME      REVISION  '''DESIRED  CURRENT'''  TRIGGERED BY
 +
router    1          '''1        1'''        config
  
  
  
 +
==registry==
 +
===v3.9===
 +
[root@master(zabbix/admin) ~]# '''oc -n default get deploymentconfigs/docker-registry'''
 +
NAME              REVISION  '''DESIRED  CURRENT'''  TRIGGERED BY
 +
docker-registry  1          '''1        1'''        config
  
 +
===v4.4===
 +
[chris@control(zabbix-dev/system:admin) ~]$ '''oc get all -n openshift-image-registry'''
 +
NAME                                                  '''READY'''  STATUS    '''RESTARTS'''  AGE
 +
pod/cluster-image-registry-operator-7bff4c7595-hkbqx  '''2/2'''    Running  '''0'''          2d20h
 +
pod/image-registry-6b6745b4f9-wqwdx                    '''1/1'''    Running  '''0'''          2d22h
 +
pod/node-ca-6wgpw                                      '''1/1'''    Running  '''0'''          2d23h
 +
pod/node-ca-gjmhw                                      '''1/1'''    Running  '''0'''          2d23h
 +
pod/node-ca-gnp7n                                      '''1/1'''    Running  '''0'''          2d23h
 +
pod/node-ca-gtvt9                                      '''1/1'''    Running  '''0'''          2d23h
 +
pod/node-ca-ps7v9                                      '''1/1'''    Running  '''0'''          2d23h
  
 +
NAME                              TYPE        CLUSTER-IP      EXTERNAL-IP  PORT(S)    AGE
 +
service/image-registry            ClusterIP  172.30.229.236  <none>        5000/TCP    40d
 +
service/image-registry-operator  ClusterIP  None            <none>        60000/TCP  40d
 +
 +
NAME                    '''DESIRED'''  CURRENT  '''READY'''  UP-TO-DATE  AVAILABLE  NODE SELECTOR            AGE
 +
daemonset.apps/node-ca  5        5        5      5            5          kubernetes.io/os=linux  40d
 +
 +
NAME                                              READY  UP-TO-DATE  AVAILABLE  AGE
 +
deployment.apps/cluster-image-registry-operator  1/1    1            1          40d
 +
deployment.apps/image-registry                    1/1    1            1          40d
 +
 +
NAME                                                        '''DESIRED'''  CURRENT  '''READY'''  AGE
 +
replicaset.apps/cluster-image-registry-operator-6f78cddbbc  0        0        0      4d5h
 +
replicaset.apps/cluster-image-registry-operator-7bff4c7595  '''1'''        1        '''1'''      2d23h
 +
replicaset.apps/cluster-image-registry-operator-86476f46bc  0        0        0      6d7h
 +
replicaset.apps/cluster-image-registry-operator-f9697f69d    0        0        0      40d
 +
replicaset.apps/cluster-image-registry-operator-fc9dfb566    0        0        0      3d3h
 +
replicaset.apps/image-registry-58cc7948d8                    0        0        0      3d3h
 +
replicaset.apps/image-registry-688fb696dc                    0        0        0      40d
 +
replicaset.apps/image-registry-6948d8479b                    0        0        0      4d5h
 +
replicaset.apps/image-registry-6b6745b4f9                    '''1'''        1        '''1'''      2d23h
 +
replicaset.apps/image-registry-7bbdbc5dc7                    0        0        0      6d7h
 +
replicaset.apps/image-registry-9dc4885b                      0        0        0      6d7h
 +
replicaset.apps/image-registry-d4cf5448b                    0        0        0      40d
 +
replicaset.apps/image-registry-f488f9578                    0        0        0      6d7h
 +
replicaset.apps/image-registry-f5647c6d8                    0        0        0      40d
 +
 +
NAME                        SCHEDULE    SUSPEND  ACTIVE  LAST SCHEDULE  AGE
 +
cronjob.batch/image-pruner  0 0 * * *  True      0        <none>          2d23h
 +
 +
 +
 +
 +
==v4 ClusterOperators==
 +
[chris@control(zabbix-dev/system:admin) ~]$ '''oc -n default get clusteroperators'''
 +
NAME                                      VERSION  AVAILABLE  PROGRESSING  '''DEGRADED'''  SINCE
 +
authentication                            4.4.4    True        False        '''False'''      35d
 +
cloud-credential                          4.4.4    True        False        '''False'''      40d
 +
cluster-autoscaler                        4.4.4    True        False        '''False'''      40d
 +
console                                    4.4.4    True        False        '''False'''      33h
 +
csi-snapshot-controller                    4.4.4    True        False        '''False'''      33h
 +
dns                                        4.4.4    True        False        '''False'''      33h
 +
etcd                                      4.4.4    True        False        '''False'''      2d20h
 +
image-registry                            4.4.4    True        False        '''False'''      33h
 +
ingress                                    4.4.4    True        False        '''False'''      33h
 +
insights                                  4.4.4    True        False        '''False'''      40d
 +
kube-apiserver                            4.4.4    True        False        '''False'''      40d
 +
kube-controller-manager                    4.4.4    True        False        '''False'''      2d23h
 +
kube-scheduler                            4.4.4    True        False        '''False'''      2d23h
 +
kube-storage-version-migrator              4.4.4    True        False        '''False'''      33h
 +
machine-api                                4.4.4    True        False        '''False'''      40d
 +
machine-config                            4.4.4    True        False        '''False'''      2d19h
 +
marketplace                                4.4.4    True        False        '''False'''      2d19h
 +
monitoring                                4.4.4    True        False        '''False'''      2d10h
 +
network                                    4.4.4    True        False        '''False'''      40d
 +
node-tuning                                4.4.4    True        False        '''False'''      33h
 +
openshift-apiserver                        4.4.4    True        False        '''False'''      33h
 +
openshift-controller-manager              4.4.4    True        False        '''False'''      33h
 +
openshift-samples                          4.4.4    True        False        '''False'''      7m37s
 +
operator-lifecycle-manager                4.4.4    True        False        '''False'''      40d
 +
operator-lifecycle-manager-catalog        4.4.4    True        False        '''False'''      40d
 +
operator-lifecycle-manager-packageserver  4.4.4    True        False        '''False'''      33h
 +
service-ca                                4.4.4    True        False        '''False'''      40d
 +
service-catalog-apiserver                  4.4.4    True        False        '''False'''      40d
 +
service-catalog-controller-manager        4.4.4    True        False        '''False'''      40d
 +
storage                                    4.4.4    True        False        '''False'''      2d23h
  
  
Line 16: Line 123:
 
[[Category:OpenShift]]
 
[[Category:OpenShift]]
 
[[Category:V43]]
 
[[Category:V43]]
 +
[[Category:V39]]
 
[[Category:ReferenceCards]]
 
[[Category:ReferenceCards]]

Revision as of 12:17, 26 May 2020

1 Links

https://docs.openshift.com/container-platform/3.9/day_two_guide/environment_health_checks.html
https://docs.openshift.com/container-platform/4.4/backup_and_restore/replacing-unhealthy-etcd-member.html

2 Health Checks

2.1 Nodes

[chris@control(zabbix-dev/system:admin) ~]$ oc get nodes -o wide
NAME       STATUS   ROLES           AGE   VERSION   INTERNAL-IP       EXTERNAL-IP   OS-IMAGE                                   KERNEL-VERSION                CONTAINER-RUNTIME
master01   Ready    master,worker   40d   v1.17.1   192.168.100.221   <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)   4.18.0-147.8.1.el8_1.x86_64   cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
master02   Ready    master,worker   40d   v1.17.1   192.168.100.222   <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)   4.18.0-147.8.1.el8_1.x86_64   cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
master03   Ready    master,worker   40d   v1.17.1   192.168.100.223   <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)   4.18.0-147.8.1.el8_1.x86_64   cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
worker01   Ready    worker          40d   v1.17.1   192.168.100.231   <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)   4.18.0-147.8.1.el8_1.x86_64   cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8
worker02   Ready    worker          40d   v1.17.1   192.168.100.232   <none>        RHEL CoreOS 44.81.202005062110-0 (Ootpa)   4.18.0-147.8.1.el8_1.x86_64   cri-o://1.17.4-8.dev.rhaos4.4.git5f5c5e4.el8


2.2 etcd

2.2.1 v3.9

[root@master(zabbix/admin) ~]# source /etc/etcd/etcd.conf
[root@master(zabbix/admin) ~]# etcdctl --cert-file=$ETCD_PEER_CERT_FILE --key-file=$ETCD_PEER_KEY_FILE   --ca-file=/etc/etcd/ca.crt --endpoints=$ETCD_LISTEN_CLIENT_URLS cluster-health
member da1c9720d5fee664 is healthy: got healthy result from https://192.168.223.74:2379
cluster is healthy

2.2.2 v4.4

[chris@control(zabbix-dev/system:admin) ~]$ oc get etcd -o=jsonpath='{range .items[0].status.conditions[?(@.type=="EtcdMembersAvailable")]}{.message}{"\n"}'
master02,master01,master03 members are available,  have not started,  are unhealthy,  are unknown


2.3 router

2.3.1 v3.9

[root@master(zabbix/admin) ~]# oc -n default get deploymentconfigs/router
NAME      REVISION   DESIRED   CURRENT   TRIGGERED BY
router    1          1         1         config


2.4 registry

2.4.1 v3.9

[root@master(zabbix/admin) ~]# oc -n default get deploymentconfigs/docker-registry
NAME              REVISION   DESIRED   CURRENT   TRIGGERED BY
docker-registry   1          1         1         config

2.4.2 v4.4

[chris@control(zabbix-dev/system:admin) ~]$ oc get all -n openshift-image-registry
NAME                                                   READY   STATUS    RESTARTS   AGE
pod/cluster-image-registry-operator-7bff4c7595-hkbqx   2/2     Running   0          2d20h
pod/image-registry-6b6745b4f9-wqwdx                    1/1     Running   0          2d22h
pod/node-ca-6wgpw                                      1/1     Running   0          2d23h
pod/node-ca-gjmhw                                      1/1     Running   0          2d23h
pod/node-ca-gnp7n                                      1/1     Running   0          2d23h
pod/node-ca-gtvt9                                      1/1     Running   0          2d23h
pod/node-ca-ps7v9                                      1/1     Running   0          2d23h

NAME                              TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)     AGE
service/image-registry            ClusterIP   172.30.229.236   <none>        5000/TCP    40d
service/image-registry-operator   ClusterIP   None             <none>        60000/TCP   40d

NAME                     DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR            AGE
daemonset.apps/node-ca   5         5         5       5            5           kubernetes.io/os=linux   40d

NAME                                              READY   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/cluster-image-registry-operator   1/1     1            1           40d
deployment.apps/image-registry                    1/1     1            1           40d

NAME                                                         DESIRED   CURRENT   READY   AGE
replicaset.apps/cluster-image-registry-operator-6f78cddbbc   0         0         0       4d5h
replicaset.apps/cluster-image-registry-operator-7bff4c7595   1         1         1       2d23h
replicaset.apps/cluster-image-registry-operator-86476f46bc   0         0         0       6d7h
replicaset.apps/cluster-image-registry-operator-f9697f69d    0         0         0       40d
replicaset.apps/cluster-image-registry-operator-fc9dfb566    0         0         0       3d3h
replicaset.apps/image-registry-58cc7948d8                    0         0         0       3d3h
replicaset.apps/image-registry-688fb696dc                    0         0         0       40d
replicaset.apps/image-registry-6948d8479b                    0         0         0       4d5h
replicaset.apps/image-registry-6b6745b4f9                    1         1         1       2d23h
replicaset.apps/image-registry-7bbdbc5dc7                    0         0         0       6d7h
replicaset.apps/image-registry-9dc4885b                      0         0         0       6d7h
replicaset.apps/image-registry-d4cf5448b                     0         0         0       40d
replicaset.apps/image-registry-f488f9578                     0         0         0       6d7h
replicaset.apps/image-registry-f5647c6d8                     0         0         0       40d

NAME                         SCHEDULE    SUSPEND   ACTIVE   LAST SCHEDULE   AGE
cronjob.batch/image-pruner   0 0 * * *   True      0        <none>          2d23h



2.5 v4 ClusterOperators

[chris@control(zabbix-dev/system:admin) ~]$ oc -n default get clusteroperators
NAME                                       VERSION   AVAILABLE   PROGRESSING   DEGRADED   SINCE
authentication                             4.4.4     True        False         False      35d
cloud-credential                           4.4.4     True        False         False      40d
cluster-autoscaler                         4.4.4     True        False         False      40d
console                                    4.4.4     True        False         False      33h
csi-snapshot-controller                    4.4.4     True        False         False      33h
dns                                        4.4.4     True        False         False      33h
etcd                                       4.4.4     True        False         False      2d20h
image-registry                             4.4.4     True        False         False      33h
ingress                                    4.4.4     True        False         False      33h
insights                                   4.4.4     True        False         False      40d
kube-apiserver                             4.4.4     True        False         False      40d
kube-controller-manager                    4.4.4     True        False         False      2d23h
kube-scheduler                             4.4.4     True        False         False      2d23h
kube-storage-version-migrator              4.4.4     True        False         False      33h
machine-api                                4.4.4     True        False         False      40d
machine-config                             4.4.4     True        False         False      2d19h
marketplace                                4.4.4     True        False         False      2d19h
monitoring                                 4.4.4     True        False         False      2d10h
network                                    4.4.4     True        False         False      40d
node-tuning                                4.4.4     True        False         False      33h
openshift-apiserver                        4.4.4     True        False         False      33h
openshift-controller-manager               4.4.4     True        False         False      33h
openshift-samples                          4.4.4     True        False         False      7m37s
operator-lifecycle-manager                 4.4.4     True        False         False      40d
operator-lifecycle-manager-catalog         4.4.4     True        False         False      40d
operator-lifecycle-manager-packageserver   4.4.4     True        False         False      33h
service-ca                                 4.4.4     True        False         False      40d
service-catalog-apiserver                  4.4.4     True        False         False      40d
service-catalog-controller-manager         4.4.4     True        False         False      40d
storage                                    4.4.4     True        False         False      2d23h