[READINESS] Adding LICENSE file
[oom/readiness.git] / ready.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 # Copyright © 2020 Orange
4 # Copyright © 2020 Nokia
5 #
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
9 #
10 #       http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
17
18 """
19 Kubernetes readiness check.
20
21 Checks if a container is ready or if a job is finished.
22 The check is done according to the name of the container, not the name of
23 its parent (Job, Deployment, StatefulSet, DaemonSet).
24 """
25
26 import getopt
27 import logging
28 import os
29 import sys
30 import time
31 import random
32 import requests
33 import socket
34 from contextlib import closing
35
36 from kubernetes import client, config
37 from kubernetes.client.rest import ApiException
38
# Kubernetes namespace the checks run against; main() assigns the real value.
namespace = ""

# Logging: timestamped records at INFO level, written to stdout.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
log.addHandler(handler)

# Load the cluster configuration; for local testing, call
# config.load_kube_config() here instead.
config.load_incluster_config()

# API clients shared by every check in this module.
batchV1Api = client.BatchV1Api()
api = client.AppsV1Api()
coreV1Api = client.CoreV1Api()

56
def is_job_complete(job_name):
    """
    Check if Job is complete.

    The Job is considered complete when its status reports at least one
    succeeded pod and carries a condition of type "Complete" whose status
    is "True".

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including when the API
        call fails; the failure is logged).
    """
    complete = False
    log.info("Checking if job %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
        status = response.status
        # "succeeded" counts pods, so a Job with several completions reports
        # more than 1; any non-zero value means work has succeeded.
        if status.succeeded:
            # The condition list may be unset and "Complete" is not
            # necessarily its first entry, so scan all of it.
            conditions = status.conditions or []
            if any(cond.type == "Complete" and cond.status == "True"
                   for cond in conditions):
                complete = True
                log.info("%s is complete", job_name)
            else:
                log.info("%s is NOT complete", job_name)
        else:
            log.info("%s has not succeeded yet", job_name)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
    return complete
84
85
def wait_for_statefulset_complete(statefulset_name):
    """
    Tell whether a StatefulSet has reached its desired state.

    The StatefulSet is ready when both its current and its ready replica
    counts equal the replica count in its spec, and the observed generation
    equals the metadata generation.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if StatefulSet is running, false otherwise (API errors are
        logged and reported as false)
    """
    try:
        sts = api.read_namespaced_stateful_set(statefulset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
        return False

    observed = sts.status
    if (observed.replicas != sts.spec.replicas
            or observed.ready_replicas != sts.spec.replicas
            or observed.observed_generation != sts.metadata.generation):
        log.info("Statefulset %s is NOT ready", statefulset_name)
        return False

    log.info("Statefulset %s is ready", statefulset_name)
    return True
111
112
def wait_for_deployment_complete(deployment_name):
    """
    Tell whether a Deployment has reached its desired state.

    The Deployment is ready when it reports no unavailable replicas, its
    updated replica count is either unset or equal to the spec, its current
    and ready replica counts equal the spec, and the observed generation
    equals the metadata generation.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if Deployment is running, false otherwise (API errors are
        logged and reported as false)
    """
    try:
        deployment = api.read_namespaced_deployment(deployment_name,
                                                    namespace)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
        return False

    observed = deployment.status
    nothing_unavailable = observed.unavailable_replicas is None
    rollout_done = (observed.updated_replicas is None
                    or observed.updated_replicas == deployment.spec.replicas)
    counts_match = (observed.replicas == deployment.spec.replicas
                    and observed.ready_replicas == deployment.spec.replicas)
    generation_seen = (observed.observed_generation
                       == deployment.metadata.generation)

    if (nothing_unavailable and rollout_done and counts_match
            and generation_seen):
        log.info("Deployment %s is ready", deployment_name)
        return True

    log.info("Deployment %s is NOT ready", deployment_name)
    return False
140
141
def wait_for_daemonset_complete(daemonset_name):
    """
    Tell whether a DaemonSet has reached its desired state.

    The DaemonSet is ready when its number of ready pods equals its desired
    number of scheduled pods.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if DaemonSet is running, false otherwise (API errors are
        logged and reported as false)
    """
    try:
        daemonset = api.read_namespaced_daemon_set(daemonset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
        return False

    observed = daemonset.status
    complete = observed.desired_number_scheduled == observed.number_ready
    message = ("DaemonSet: %s/%s nodes ready --> %s is ready" if complete
               else "DaemonSet: %s/%s nodes ready --> %s is NOT ready")
    log.info(message, observed.number_ready,
             observed.desired_number_scheduled, daemonset_name)
    return complete
169
170
def is_ready(container_name):
    """
    Check if a container is ready.

    The first pod in the namespace that has a container with the given name
    and at least one owner decides the result.
    For a container owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas.
    Pods without any owner reference are skipped, since there is no parent
    whose state could be checked.

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise (also when no matching
        pod exists, the owner kind is not handled, or an API call fails)
    """
    ready = False
    log.info("Checking if container %s is ready", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            statuses = item.status.container_statuses or []
            if not any(status.name == container_name for status in statuses):
                continue
            owners = item.metadata.owner_references
            if not owners:
                # owner_references is None for bare pods; indexing it
                # would raise instead of reporting "not ready".
                log.info("Pod %s has no owner, skipping", item.metadata.name)
                continue
            owner = owners[0]
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # The pod is owned by a ReplicaSet; the readiness check is
                # done on the Deployment owning that ReplicaSet.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready

211
def is_service_ready(service_name):
    """
    Check if a service is ready.

    Every service whose name starts with service_name is examined in turn.
    If the service has a selector, the first owned pod matching that
    selector is checked; otherwise the first address of the first subset of
    an Endpoints object whose name starts with service_name is used to find
    the pod. The result is whatever is_pod_ready() reports for that pod.

    Args:
        service_name (str): the name (prefix) of the service.

    Returns:
        True if service is ready, false otherwise (also when no pod could
        be resolved for the service or an API call fails)
    """
    ready = False
    log.info("Checking if service %s is ready", service_name)
    try:
        services = coreV1Api.list_namespaced_service(namespace=namespace,
                                                     watch=False)
        for svc in services.items:
            if not svc.metadata.name.startswith(service_name):
                continue
            if svc.spec.selector:
                # convert the selector dictionary into a string selector
                # for example: {"app":"redis"} => "app=redis"
                selector = ",".join(
                    key + "=" + value
                    for key, value in svc.spec.selector.items())
                log.info("Found Selector %s", selector)
                # Get the pods that match the selector
                pods = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                     label_selector=selector,
                                                     watch=False)
                for item in pods.items:
                    if not item.metadata.owner_references:
                        # Bare pod: no parent to derive a name from.
                        log.info("Pod %s has no owner, skipping",
                                 item.metadata.name)
                        continue
                    name = read_name(item)
                    log.info("Found pod %s selected by service %s",
                             name, service_name)
                    return is_pod_ready(name)
            else:
                log.info("No Selector found, check Endpoints")
                endpoints = coreV1Api.list_namespaced_endpoints(
                    namespace=namespace, watch=False)
                for ep in endpoints.items:
                    if not ep.metadata.name.startswith(service_name):
                        continue
                    if not ep.subsets:
                        continue
                    addresses = ep.subsets[0].addresses
                    if not addresses:
                        continue
                    target = addresses[0].target_ref
                    if target is None:
                        # Manually maintained endpoints carry no reference
                        # to a pod, so there is nothing to check here.
                        log.info("Endpoint %s has no target reference, "
                                 "skipping", ep.metadata.name)
                        continue
                    log.info("Found pod %s selected by service %s",
                             target.name, service_name)
                    return is_pod_ready(target.name)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_service: %s\n", exc)
    return ready

264
def is_pod_ready(pod_name):
    """
    Check if a pod is ready.

    The first pod in the namespace whose name starts with pod_name and that
    has at least one owner decides the result.
    For a pod owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas.
    Pods without any owner reference are skipped, since there is no parent
    whose state could be checked.

    Args:
        pod_name (str): the name (prefix) of the pod.

    Returns:
        True if pod is ready, false otherwise (also when no matching pod
        exists, the owner kind is not handled, or an API call fails)
    """
    ready = False
    log.info("Checking if pod %s is ready", pod_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            if not item.metadata.name.startswith(pod_name):
                continue
            owners = item.metadata.owner_references
            if not owners:
                # owner_references is None for bare pods; indexing it
                # would raise instead of reporting "not ready".
                log.info("Pod %s has no owner, skipping", item.metadata.name)
                continue
            owner = owners[0]
            log.info("Found pod %s", owner.name)
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # The pod is owned by a ReplicaSet; the readiness check is
                # done on the Deployment owning that ReplicaSet.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready

302
def is_app_ready(app_name):
    """
    Check if a pod with app-label is ready.

    The first owned pod in the namespace whose "app" label equals app_name
    is selected; the name of its first owner is then handed to
    is_pod_ready(), which decides the result.
    Pods without labels never match, and pods without any owner reference
    are skipped.

    Args:
        app_name (str): the app label of the pod.

    Returns:
        True if pod is ready, false otherwise (also when no matching pod
        exists or the API call fails)
    """
    ready = False
    log.info("Checking if pod with app-label %s is ready", app_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # labels is None for unlabelled pods; treat that as "no labels".
            labels = item.metadata.labels or {}
            if labels.get('app') != app_name:
                continue
            if not item.metadata.owner_references:
                # Bare pod: no parent to derive a name from.
                log.info("Pod %s has no owner, skipping", item.metadata.name)
                continue
            name = read_name(item)
            log.info("Found pod %s", name)
            return is_pod_ready(name)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready

330
def service_mesh_job_check(container_name):
    """
    Check if a Job's primary container is complete.

    Used for ensuring the sidecar can be killed after Job completion.
    Only pods in the "Running" phase are inspected; the container counts as
    complete when its state reports it as terminated.

    Args:
        container_name (str): the name of the Job's primary container.

    Returns:
        True if job's container is in the completed state, false otherwise
        (also when the API call fails)
    """
    complete = False
    log.info("Checking if container %s is complete", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            if item.status.phase != "Running":
                continue
            for container in item.status.container_statuses:
                if container.name != container_name:
                    continue
                log.info("Container Details  %s ", container)
                log.info("Container Status  %s ", container.state.terminated)

                if container.state.terminated:
                    log.info("Container Terminated with reason  %s ",
                             container.state.terminated.reason)
                    complete = True

    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return complete

362
def read_name(item):
    """
    Give the name of the first owner referenced by an object.

    Args:
        item: an object exposing metadata.owner_references; the list is
            indexed at position 0, so it has to hold at least one entry.

    Returns:
        the name of the item's first owner
    """
    first_owner = item.metadata.owner_references[0]
    return first_owner.name
374
375
def get_deployment_name(replicaset):
    """
    Look up which Deployment owns a ReplicaSet.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the first owner of the ReplicaSet, i.e. its Deployment
    """
    replica_set = api.read_namespaced_replica_set_status(replicaset,
                                                         namespace)
    return replica_set.metadata.owner_references[0].name

390
def check_socket(host, port, timeout=5.0):
    """
    Check if a TCP port accepts connections.

    The outcome is also printed to stdout.

    Args:
        host (str): host name or IPv4 address to connect to.
        port (int): TCP port to connect to.
        timeout (float): seconds to wait for the connection attempt, so
            that an unresponsive host cannot block the caller indefinitely.

    Returns:
        True if the connection succeeded, false otherwise
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        # connect_ex reports failure as a non-zero error code instead of
        # raising, which also covers an expired timeout.
        is_open = sock.connect_ex((host, port)) == 0
    if is_open:
        print("Port is open")
    else:
        print("Port is not open")
    return is_open

399
def quitquitquit_post(apiurl):
    """
    Ask the sidecar to shut down by POSTing to its quitquitquit endpoint.

    When nothing listens on 127.0.0.1:15020 no request is sent and the
    call counts as successful.

    Args:
        apiurl (str): the URL to send the POST request to.

    Returns:
        True if no sidecar is listening or the request returned a
        successful status, false if the request failed or returned an
        error status
    """
    if not check_socket("127.0.0.1", 15020):
        log.info("no sidecar exists, exiting")
        return True
    try:
        # The request itself is what can fail (refused connection, timeout),
        # so it has to be inside the try; the timeout keeps a stuck sidecar
        # from blocking forever.
        response = requests.post(url=apiurl, timeout=10)
    except requests.exceptions.RequestException as exc:
        log.info("quitquitquit call failed with exception: %s", exc)
        return False
    if response.ok:
        log.info("quitquitquit returned True")
        return True
    log.info("quitquitquit returned False")
    return False

414     except:
415         log.info("quitquitquit call failed with exception")
416
# Default time, in minutes, to wait for each checked item.
DEF_TIMEOUT = 10
# Default sidecar shutdown endpoint used by the service mesh check.
DEF_URL = "http://127.0.0.1:15020/quitquitquit"
DESCRIPTION = "Kubernetes container readiness check utility"
# Help text printed for -h/--help and after argument errors.
USAGE = (
    "Usage: ready.py [-t <timeout>] [-n <namespace>] [-u <url>] "
    "-c <container_name> .. \n"
    "| -s <service_name> .. | -p <pod_name> .. | -a <app_name> .. \n"
    "| -j <job_name> .. | -m <mesh_container_name> .. \n"
    "where\n"
    "<timeout> - wait for container readiness timeout in min, "
    "default is " + str(DEF_TIMEOUT) + "\n"
    "<namespace> - K8S namespace the check is done, "
    "default is the NAMESPACE environment variable\n"
    "<url> - sidecar quitquitquit URL used with -m, "
    "default is " + DEF_URL + "\n"
    "<service_name> - name of the service to wait for\n"
    "<container_name> - name of the container to wait for\n"
    "<pod_name> - name of the pod to wait for\n"
    "<app_name> - app label of the pod to wait for\n"
    "<job_name> - name of the job to wait for\n"
    "<mesh_container_name> - name of the job's primary container to wait "
    "for before stopping the sidecar\n"
)

432
433
def _wait_until_ready(check, names, timeout_min, on_ready=None):
    """
    Poll check(name) for each name until it returns True or time runs out.

    Args:
        check (callable): takes a name, returns True once it is ready.
        names (list): the names to wait for, one after the other.
        timeout_min (float): minutes to wait for each name.
        on_ready (callable): optional, called without arguments each time a
            name becomes ready.

    Exits the process with status 1 when a name is not ready in time.
    """
    for name in names:
        # A fresh deadline per name: the timeout in minutes must never be
        # overwritten by the absolute deadline derived from it.
        deadline = time.time() + timeout_min * 60
        while True:
            if check(name) is True:
                if on_ready is not None:
                    on_ready()
                break
            if time.time() > deadline:
                log.warning("timed out waiting for '%s' to be ready", name)
                sys.exit(1)
            # spread in time potentially parallel execution in multiple
            # containers
            time.sleep(random.randint(5, 11))


def main(argv):
    """
    Checks if a container, pod or service is ready,
    if a job is finished or if the main container of a job has completed.
    The check is done according to the name of the container or pod,
    not the name of its parent (Job, Deployment, StatefulSet, DaemonSet).

    Services are waited for first, then containers, pods, apps, jobs and
    finally service mesh job containers. The process exits with status 2 on
    invalid or missing parameters and with status 1 when a wait times out.

    Args:
        argv: the command line
    """
    global namespace
    container_names = []
    service_names = []
    pod_names = []
    app_names = []
    job_names = []
    service_mesh_job_container_names = []
    timeout = DEF_TIMEOUT
    url = DEF_URL
    ns = ""
    try:
        opts, _args = getopt.getopt(argv, "hj:s:c:p:a:t:m:u:n:",
                                    ["service-name=",
                                     "container-name=",
                                     "pod-name=",
                                     "app-name=",
                                     "timeout=",
                                     "service-mesh-check=",
                                     "url=",
                                     "job-name=",
                                     "namespace=",
                                     "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-s", "--service-name"):
                service_names.append(arg)
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-p", "--pod-name"):
                pod_names.append(arg)
            elif opt in ("-a", "--app-name"):
                app_names.append(arg)
            elif opt in ("-j", "--job-name"):
                job_names.append(arg)
            elif opt in ("-m", "--service-mesh-check"):
                service_mesh_job_container_names.append(arg)
            elif opt in ("-u", "--url"):
                url = arg
            elif opt in ("-n", "--namespace"):
                ns = arg
            elif opt in ("-t", "--timeout"):
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)
    if not any((container_names, job_names, pod_names, app_names,
                service_mesh_job_container_names, service_names)):
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)
    if ns == "":
        # extract ns from env variable
        try:
            namespace = os.environ['NAMESPACE']
        except KeyError:
            print("Missing namespace: use -n <namespace> or set the "
                  "NAMESPACE environment variable\n")
            print(USAGE)
            sys.exit(2)
    else:
        namespace = ns

    def stop_sidecar():
        # Once the job's main container is done, ask the sidecar to quit.
        if quitquitquit_post(url) is True:
            log.info("Side Car Killed through QuitQuitQuit API")
        else:
            log.info("Side Car Failed to be Killed through QuitQuitQuit API")

    waits = (
        (is_service_ready, service_names, None),
        (is_ready, container_names, None),
        (is_pod_ready, pod_names, None),
        (is_app_ready, app_names, None),
        (is_job_complete, job_names, None),
        (service_mesh_job_check, service_mesh_job_container_names,
         stop_sidecar),
    )
    for check, names, on_ready in waits:
        _wait_until_ready(check, names, timeout, on_ready)


if __name__ == "__main__":
    main(sys.argv[1:])