2 # -*- coding: utf-8 -*-
3 # Copyright © 2020 Orange
4 # Copyright © 2020 Nokia
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
10 # http://www.apache.org/licenses/LICENSE-2.0
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
"""
Kubernetes readiness check.

Checks if a container is ready or if a job is finished.
The check is done according to the name of the container, not the name of
its parent (Job, Deployment, StatefulSet, DaemonSet).
"""
import getopt
import logging
import os
import random
import socket
import sys
import time
from contextlib import closing

import requests
from kubernetes import client, config
from kubernetes.client.rest import ApiException
# Module logger: INFO and above, written to stdout with a timestamped format.
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
log.addHandler(handler)
log.setLevel(logging.INFO)

# Load the in-cluster Kubernetes configuration at import time.
config.load_incluster_config()
# use for local testing:
# config.load_kube_config()

# API clients shared by every check in this module.
coreV1Api = client.CoreV1Api()
api = client.AppsV1Api()
batchV1Api = client.BatchV1Api()
def is_job_complete(job_name):
    """
    Check if Job is complete.

    A Job counts as complete when it reports at least one succeeded pod and
    carries a "Complete" condition whose status is "True". The condition is
    searched for in the whole list rather than assumed to be the first entry.

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including on API errors)
    """
    complete = False
    log.info("Checking if job %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
        status = response.status
        if status.succeeded:
            # conditions may be None while the Job controller has not
            # written any condition yet.
            conditions = status.conditions or []
            complete = any(
                cond.type == "Complete" and cond.status == "True"
                for cond in conditions
            )
            if complete:
                log.info("%s is complete", job_name)
            else:
                log.info("%s is NOT complete", job_name)
        else:
            log.info("%s has not succeeded yet", job_name)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
    return complete
def wait_for_statefulset_complete(statefulset_name):
    """
    Check if StatefulSet is running.

    The StatefulSet is considered running when its current and ready replica
    counts both equal the requested count and the controller has observed
    the latest generation of the object.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if StatefulSet is running, false otherwise (including on API
        errors)
    """
    try:
        response = api.read_namespaced_stateful_set(statefulset_name,
                                                    namespace)
        status = response.status
        wanted = response.spec.replicas
        if (status.replicas == wanted and
                status.ready_replicas == wanted and
                status.observed_generation == response.metadata.generation):
            log.info("Statefulset %s is ready", statefulset_name)
            return True
        log.info("Statefulset %s is NOT ready", statefulset_name)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
    return False
def wait_for_deployment_complete(deployment_name):
    """
    Check if Deployment is running.

    The Deployment is considered running when no replica is unavailable,
    updated replicas (when reported) match the requested count, current and
    ready replicas match the requested count, and the controller has
    observed the latest generation of the object.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if Deployment is running, false otherwise (including on API
        errors)
    """
    try:
        response = api.read_namespaced_deployment(deployment_name, namespace)
        status = response.status
        wanted = response.spec.replicas
        if (status.unavailable_replicas is None and
                (status.updated_replicas is None or
                 status.updated_replicas == wanted) and
                status.replicas == wanted and
                status.ready_replicas == wanted and
                status.observed_generation == response.metadata.generation):
            log.info("Deployment %s is ready", deployment_name)
            return True
        log.info("Deployment %s is NOT ready", deployment_name)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
    return False
def wait_for_daemonset_complete(daemonset_name):
    """
    Check if DaemonSet is running.

    The DaemonSet is considered running when the number of ready nodes
    equals the number of nodes it is desired to be scheduled on.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if DaemonSet is running, false otherwise (including on API
        errors)
    """
    try:
        response = api.read_namespaced_daemon_set(
            daemonset_name, namespace)
        status = response.status
        if status.desired_number_scheduled == status.number_ready:
            log.info("DaemonSet: %s/%s nodes ready --> %s is ready",
                     status.number_ready, status.desired_number_scheduled,
                     daemonset_name)
            return True
        log.info("DaemonSet: %s/%s nodes ready --> %s is NOT ready",
                 status.number_ready, status.desired_number_scheduled,
                 daemonset_name)
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
    return False
def is_ready(container_name):
    """
    Check if a container is ready.

    For a container owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    The first pod in the namespace that reports a container with the given
    name and has an owner decides the result. Pods without any owner
    reference are skipped, because there is no parent to check.

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise (including when no
        matching pod is found or the API call fails)
    """
    ready = False
    log.info("Checking if container %s is ready", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            statuses = item.status.container_statuses or []
            if not any(c.name == container_name for c in statuses):
                continue
            owners = item.metadata.owner_references
            if not owners:
                log.info("Pod %s has no owner, skipping", item.metadata.name)
                continue
            owner = owners[0]
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # The pod is owned by a ReplicaSet; readiness is judged on
                # the Deployment that owns that ReplicaSet.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready
def is_service_ready(service_name):
    """
    Check if a service is ready.

    The service is ready, if the selected pod is finally deployed.
    It means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    Services are matched by name prefix. When a matching service has a
    selector, the first pod matching that selector is checked; otherwise
    the Endpoints objects with the same name prefix are inspected and the
    pod referenced by the first address is checked.

    Args:
        service_name (str): the name of the service.

    Returns:
        True if service is ready, false otherwise (including when nothing
        matches or the API call fails)
    """
    log.info("Checking if service %s is ready", service_name)
    try:
        services = coreV1Api.list_namespaced_service(namespace=namespace,
                                                     watch=False)
        for svc in services.items:
            if not svc.metadata.name.startswith(service_name):
                continue
            if svc.spec.selector:
                # convert the selector dictionary into a string selector
                # for example: {"app":"redis"} => "app=redis"
                selector = ",".join(
                    k + '=' + v for k, v in svc.spec.selector.items())
                log.info("Found Selector %s", selector)
                # Get the pods that match the selector
                pods = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                     label_selector=selector,
                                                     watch=False)
                for item in pods.items:
                    name = read_name(item)
                    log.info("Found pod %s selected by service %s",
                             name, service_name)
                    return is_pod_ready(name)
            else:
                log.info("No Selector found, check Endpoints")
                endpoints = coreV1Api.list_namespaced_endpoints(
                    namespace=namespace, watch=False)
                for ep in endpoints.items:
                    if not ep.metadata.name.startswith(service_name):
                        continue
                    # subsets, addresses and target_ref may all be unset
                    # while no backend has registered yet.
                    if not ep.subsets:
                        continue
                    addresses = ep.subsets[0].addresses
                    if not addresses or addresses[0].target_ref is None:
                        continue
                    name = addresses[0].target_ref.name
                    log.info("Found pod %s selected by service %s",
                             name, service_name)
                    return is_pod_ready(name)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_service: %s\n", exc)
    return False
def is_pod_ready(pod_name):
    """
    Check if a pod is ready.

    For a pod owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    Pods are matched by name prefix; the first matching pod that has an
    owner decides the result. Pods without any owner reference are skipped,
    because there is no parent to check.

    Args:
        pod_name (str): the name of the pod.

    Returns:
        True if pod is ready, false otherwise (including when no matching
        pod is found or the API call fails)
    """
    ready = False
    log.info("Checking if pod %s is ready", pod_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            if not item.metadata.name.startswith(pod_name):
                continue
            owners = item.metadata.owner_references
            if not owners:
                log.info("Pod %s has no owner, skipping", item.metadata.name)
                continue
            owner = owners[0]
            log.info("Found pod %s", owner.name)
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # The pod is owned by a ReplicaSet; readiness is judged on
                # the Deployment that owns that ReplicaSet.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready
def is_app_ready(app_name):
    """
    Check if a pod with app-label is ready.

    For a pod owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    The first pod whose "app" label equals app_name decides the result.
    Pods that carry no labels at all are ignored.

    Args:
        app_name (str): the app label of the pod.

    Returns:
        True if pod is ready, false otherwise (including when no matching
        pod is found or the API call fails)
    """
    log.info("Checking if pod with app-label %s is ready", app_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # labels is None for a pod created without any label.
            labels = item.metadata.labels or {}
            if labels.get('app', "NOKEY") == app_name:
                name = read_name(item)
                log.info("Found pod %s", name)
                return is_pod_ready(name)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return False
def service_mesh_job_check(container_name):
    """
    Check if a Job's primary container is complete. Used for ensuring the
    sidecar can be killed after Job completion.

    Only pods in the "Running" phase are considered: the named container
    has terminated while the pod itself is still running.

    Args:
        container_name (str): the name of the Job's primary container.

    Returns:
        True if job's container is in the completed state, false otherwise
        (including on API errors)
    """
    complete = False
    log.info("Checking if container %s is complete", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace, watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            if item.status.phase != "Running":
                continue
            for container in item.status.container_statuses:
                if container.name != container_name:
                    continue
                log.info("Container Details %s ", container)
                log.info("Container Status %s ", container.state.terminated)
                if container.state.terminated:
                    log.info("Container Terminated with reason %s ",
                             container.state.terminated.reason)
                    complete = True
    except ApiException as exc:
        # The failing call here is list_namespaced_pod; name it correctly.
        log.error("Exception when calling list_namespaced_pod: %s\n",
                  exc)
    return complete
def read_name(item):
    """
    Return the name of the owner's item.

    Args:
        item: an object exposing metadata.owner_references (for example a
            pod or a ReplicaSet returned by the Kubernetes client).

    Returns:
        the name of first owner's item

    Raises:
        TypeError: if the item has no owner references (None).
        IndexError: if the owner reference list is empty.
    """
    return item.metadata.owner_references[0].name
def get_deployment_name(replicaset):
    """
    Return the name of the Deployment owning the ReplicaSet.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the Deployment owning the ReplicaSet (the ReplicaSet's
        first owner reference)
    """
    api_response = api.read_namespaced_replica_set_status(replicaset,
                                                          namespace)
    return read_name(api_response)
def check_socket(host, port):
    """
    Check whether a TCP port accepts connections.

    Args:
        host (str): host name or IPv4 address to connect to.
        port (int): TCP port to probe.

    Returns:
        True if the connection attempt succeeds, false otherwise
    """
    # The socket is its own context manager, so it is closed on every path.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        # connect_ex returns 0 on success and an errno value on failure
        # instead of raising.
        if sock.connect_ex((host, port)) == 0:
            print("Port is open")
            return True
        print("Port is not open")
        return False
def quitquitquit_post(apiurl):
    """
    Ask the sidecar to shut down by POSTing to its quitquitquit endpoint.

    If nothing listens on 127.0.0.1:15020 there is no sidecar to stop, which
    is reported as success.

    Args:
        apiurl (str): the URL of the quitquitquit endpoint.

    Returns:
        True if no sidecar exists or the endpoint answered with a success
        status, false if it answered with an error status or the request
        itself failed
    """
    if check_socket("127.0.0.1", 15020) is False:
        log.info("no sidecar exists, exiting")
        return True
    # The request itself is what can fail, so it must sit inside the try.
    try:
        response = requests.post(url=apiurl)
    except requests.exceptions.RequestException:
        log.info("quitquitquit call failed with exception")
        return False
    if response.ok:
        log.info("quitquitquit returned True")
        return True
    log.info("quitquitquit returned False")
    return False
# Default sidecar shutdown endpoint used by the service-mesh check.
DEF_URL = "http://127.0.0.1:15020/quitquitquit"
DESCRIPTION = "Kubernetes container readiness check utility"
# The "default is" text belongs to <timeout>; keep it on the timeout line and
# give <namespace> a line of its own.
USAGE = (
    "Usage: ready.py [-t <timeout>] [-n <namespace>] -c <container_name> .. \n"
    "| -s <service_name> .. | -p <pod_name> .. | -a <app_name> .. \n"
    "| -j <job_name> .. \n"
    "where\n"
    "<timeout> - wait for container readiness timeout in min, "
    "default is " + str(DEF_TIMEOUT) + "\n"
    "<namespace> - K8S namespace the check is done\n"
    "<service_name> - name of the service to wait for\n"
    "<container_name> - name of the container to wait for\n"
    "<pod_name> - name of the pod to wait for\n"
    "<app_name> - app label of the pod to wait for\n"
    "<job_name> - name of the job to wait for\n"
)
def _wait_until_ready(check, name, timeout_minutes):
    """Poll check(name) until it returns True; exit(1) once the deadline passes."""
    # A fresh deadline per item: the timeout in minutes is never overwritten.
    deadline = time.time() + timeout_minutes * 60
    while True:
        if check(name) is True:
            return
        if time.time() > deadline:
            log.warning("timed out waiting for '%s' to be ready", name)
            sys.exit(1)
        # spread in time potentially parallel execution in multiple
        # containers
        time.sleep(random.randint(5, 11))


def main(argv):
    """
    Checks if a container, pod or service is ready,
    if a job is finished or if the main container of a job has completed.
    The check is done according to the name of the container or pod,
    not the name of its parent (Job, Deployment, StatefulSet, DaemonSet).

    Every requested item is waited for in turn, each with its own timeout.
    The process exits with status 2 on bad arguments and status 1 when an
    item does not become ready in time.

    Args:
        argv: the command line
    """
    # The check functions read the module-level namespace.
    global namespace
    # args are a list of container names
    container_names = []
    service_names = []
    pod_names = []
    app_names = []
    job_names = []
    service_mesh_job_container_names = []
    timeout = DEF_TIMEOUT
    url = DEF_URL
    ns = ""
    try:
        opts, _args = getopt.getopt(argv, "hj:s:c:p:a:t:m:u:n:", [
            "service-name=",
            "container-name=",
            "pod-name=",
            "app-name=",
            "job-name=",
            "timeout=",
            "service-mesh-check=",
            "url=",
            "namespace=",
            "help",
        ])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-s", "--service-name"):
                service_names.append(arg)
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-p", "--pod-name"):
                pod_names.append(arg)
            elif opt in ("-a", "--app-name"):
                app_names.append(arg)
            elif opt in ("-j", "--job-name"):
                job_names.append(arg)
            elif opt in ("-m", "--service-mesh-check"):
                service_mesh_job_container_names.append(arg)
            elif opt in ("-u", "--url"):
                url = arg
            elif opt in ("-n", "--namespace"):
                ns = arg
            elif opt in ("-t", "--timeout"):
                # A non-numeric value raises ValueError, handled below.
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)
    if not (container_names or job_names or pod_names or app_names
            or service_mesh_job_container_names or service_names):
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)

    if ns:
        namespace = ns
    else:
        # extract ns from env variable
        namespace = os.environ['NAMESPACE']

    for service_name in service_names:
        _wait_until_ready(is_service_ready, service_name, timeout)
    for container_name in container_names:
        _wait_until_ready(is_ready, container_name, timeout)
    for pod_name in pod_names:
        _wait_until_ready(is_pod_ready, pod_name, timeout)
    for app_name in app_names:
        _wait_until_ready(is_app_ready, app_name, timeout)
    for job_name in job_names:
        _wait_until_ready(is_job_complete, job_name, timeout)
    for service_mesh_job_container_name in service_mesh_job_container_names:
        _wait_until_ready(service_mesh_job_check,
                          service_mesh_job_container_name, timeout)
        # The Job's main container is done: ask the sidecar to stop.
        sideCarKilled = quitquitquit_post(url)
        if sideCarKilled is True:
            log.info("Side Car Killed through QuitQuitQuit API")
        else:
            log.info("Side Car Failed to be Killed through QuitQuitQuit API")
594 if __name__ == "__main__":