3 PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
11 HELM_CHART_RELEASE_NAME=
24 COLOR_ON_RED='\033[0;31;1m'
25 COLOR_ON_GREEN='\033[0;32;1m'
36 ${CMD} - simple tool for fixing onap helm deployment
39 This script does nothing smart or special it just tries to
40 redeploy onap component. It can fix only problems related to
41 race conditions or timeouts. Nothing else. It will not fix
42 broken ONAP - there is no such ambition - that effort should
43 be directed in the upstream.
49 ${CMD} -n|--namespace <namespace>
50 (-f|--file <override>)...
51 (-s|--storage <directory>)|--no-storage-deletion
52 [-p|--release-prefix <release prefix>]
54 [(-c|--component <component release name>)...|
61 Usage 1: (simple heuristics - redeploy failed components):
62 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap
64 Usage 2: (redeploy ONLY explicitly listed components):
65 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap \\
66 -c onap-aaf -c onap-sdc -c onap-portal
68 Usage 3: (delete EVERYTHING and redeploy):
69 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap --delete-all
71 Usage 4: (delete EVERYTHING and DO NOT redeploy - clean env.)
72 ${CMD} -n onap -s /dockerdata-nfs/onap --delete-all --clean-only
76 Namespace argument (always) and at least one override file (if you don't
77 use '--delete-all') are mandatory for this script to execute. Also you must
78 provide path to the storage ('--storage') OR explicitly request to not
79 delete file storage of the component ('--no-storage-deletion').
81 The storage should be a directory where persistent volume resides. It will
82 work only if the component created the persistent volume with the same
83 filename as its release name. Otherwise no files are deleted. The exception
84 is when '--delete-all' is used - in that case all content of the storage is
85 deleted (because ONAP is not consistent with the volume directory names
'--file' can be used multiple times and it is used for override files
89 which are passed on to helm. The order is significant because if two
90 override files modify one value the latest one is used. This option is
91 ignored if '--clean-only' is used.
93 CAUTION 1: filename of an override file cannot contain whitespace! This is
94 actually helm/onap deploy plugin issue which does not handle such files. So
I dropped the more complicated version of this script because there is no
reason to support something that helm deploy would choke on anyway.
'--release-prefix' option is a helm release argument - it is actually a prefix
used when you list the helm releases - helm is a little confusing here.
101 CAUTION 2: By default release prefix is 'onap' - if you deployed release
102 'onap' and now run this script with different prefix then it will skip all
103 'onap-*' components and will deploy a new release with new prefix - BEWARE
104 TO USE PROPER RELEASE PREFIX!
106 Timeout sets the waiting time for helm deploy per component.
'--component' refers to the release name of the chart which you want to
redeploy explicitly - otherwise 'ALL FAILED' components will be
110 redeployed. You can target more than one component at once - just use the
111 argument multiple times.
113 Component option is mutually exclusive with the '--delete-all' which will
114 delete all components - healthy or not. Actually it will delete the whole
115 NAMESPACE and everything in it. Also to be sure it will cleanup all
116 orphaned images and volumes on all kubernetes nodes.
118 '--clean-only' can be used with any usage: heuristics, explicit component
119 list or with '--delete-all'. It basically just skips the last step - the
122 '--debug' will turn on helm's verbose output
128 printf "Try help: ${CMD} --help\n"
# msg - print a green INFO line built from all arguments
# The message is passed as a printf *argument* rather than as part of the
# format string, so '%' characters in it are printed literally (fixes
# ShellCheck SC2059)
msg()
{
    printf "${COLOR_ON_GREEN}INFO: %s ${COLOR_OFF}\n" "$*"
}
# error - print a red ERROR line built from all arguments, to stderr
# The message is a printf argument (not the format string), so '%' is safe
# (fixes ShellCheck SC2059); diagnostics now go to stderr so they are not
# mixed into any captured stdout
error()
{
    printf "${COLOR_ON_RED}ERROR: %s ${COLOR_OFF}\n" "$*" >&2
}
# remove all successfully completed jobs
# column layout mirrors the 'read' below: NAME COMPLETIONS DURATION AGE
kubectl get jobs -n ${NAMESPACE} \
--ignore-not-found=true \
--no-headers=true | \
while read -r _job _completion _duration _age ; do
# COMPLETIONS is reported as "<done>/<desired>", e.g. "1/1"
_done=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $1;}')
_desired=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $2;}')
# act only on jobs that reached their desired completion count
# NOTE(review): the pipeline runs this loop in a subshell, so variables
# assigned here do not survive past the loop
if [ "$_desired" -eq "$_done" ] ; then
163 get_labels 'status.phase==Failed'
# arg: [optional: selector]
# print the value of each pod's 'release=...' label, optionally restricted
# by a kubectl field selector (e.g. 'status.phase==Failed')
if [ -n "$1" ] ; then
_selector="--field-selector=${1}"
kubectl get pods -n ${NAMESPACE} \
--ignore-not-found=true \
--no-headers=true | \
while read -r _pod _ready _status _restart _age _labels ; do
# NOTE(review): 'break' (not 'continue') stops at the first pod without a
# labels column - presumably the omitted kubectl line uses --show-labels
# so the column is always present; confirm
[ -z "$_labels" ] && break
for _label in $(echo "$_labels" | tr ',' ' ') ; do
# strip the 'release=' prefix to obtain the helm release name
_label=$(echo "$_label" | sed 's/release=//')
# arg: <release name>
# helm_undeploy - remove a single helm release; honours the optional
# HELM_DEBUG flag (left unquoted on purpose: it is either empty or one flag)
helm_undeploy()
{
    msg "Undeploy helm release name: ${1}"
    helm ${HELM_DEBUG} -n "${NAMESPACE}" undeploy "${1}"
}
203 msg helm ${HELM_DEBUG} -n ${NAMESPACE} deploy ${RELEASE_PREFIX} local/onap --create-namespace --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
204 helm ${HELM_DEBUG} -n ${NAMESPACE} deploy ${RELEASE_PREFIX} local/onap --create-namespace --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
# delete the job, then poll once per second until its pods are gone
# NOTE(review): _output is presumably primed non-empty on an omitted line
# before the loop; otherwise the loop would never run - confirm
kubectl delete job -n ${NAMESPACE} \
# wait for job to be deleted
while [ -n "$_output" ] && sleep 1 ; do
_output=$(kubectl get pods -n ${NAMESPACE} \
--ignore-not-found=true \
--selector="job-name=${1}")
# arg: <release name>
# get_resources_for_component - print "TYPE/NAME" of every live resource in
# the component's helm manifest; kubectl repeats a NAME header and a blank
# line per resource type, so both are filtered out
get_resources_for_component()
{
    helm -n ${NAMESPACE} get manifest "$1" | kubectl -n ${NAMESPACE} get -f - | awk '{print $1}' | grep -v NAME | grep -v '^$'
}
# split "kind/name" into its halves for the polling query further below
local _kind="${_resource%/*}"
local _name="${_resource#*/}"
# act only if the resource still exists (helm undeploy may have removed it)
if kubectl -n ${NAMESPACE} get ${_resource} >/dev/null 2>&1; then
msg "${_resource} has not been removed with helm undeploy, manual removal is required. Proceeding"
# delete it; harmless 'not found' noise is filtered out after the pipe
kubectl delete ${_resource} -n ${NAMESPACE} \
2>&1 | grep -iv 'not[[:space:]]*found'
# wait for resource to be deleted
# NOTE(review): _output is presumably primed non-empty before this loop
while [ -n "$_output" ] && sleep 1 ; do
_output=$(kubectl get ${_kind} ${_name} -n ${NAMESPACE} \
--ignore-not-found=true \
# remove the entire namespace, then poll until 'kubectl get all' is empty
msg "Delete the whole namespace: ${NAMESPACE}"
kubectl delete namespace \
# wait for namespace to be deleted
while [ -n "$_output" ] && sleep 1 ; do
_output=$(kubectl get all -n ${NAMESPACE} \
--ignore-not-found=true \
# arg: <persistent volume resource, e.g. 'persistentvolume/<name>'>
# TODO confirm the exact argument form against the caller
delete_persistent_volume()
_persistent_volume=$1
# act only if the PV still exists after helm undeploy
if kubectl get ${_persistent_volume} >/dev/null 2>&1; then
msg "${_persistent_volume} has not been removed with helm undeploy, manual removal is required. Proceeding"
#very often k8s hangs on Terminating state for pv due to still active pvc. It is better to delete pvc directly
# look up the bound claim and delete it first to unblock the PV
_claim=$(kubectl get ${_persistent_volume} -o jsonpath='{ .spec.claimRef.name}')
delete_resource PersistentVolumeClaim/${_claim}
# arg: [optional: directory]
# delete persistent data on the first listed worker node via ssh - either
# the single directory given as $1 or everything under ${VOLUME_STORAGE}
_node=$(kubectl get nodes \
--selector=node-role.kubernetes.io/worker \
--no-headers=true | \
awk '{print $6}' | head -n 1)
# NOTE(review): column $6 is assumed to hold a reachable node address (as
# with 'kubectl get nodes -o wide') - confirm against the omitted lines
if [ -z "$_node" ] ; then
error "Could not list kubernetes nodes - SKIPPING DELETION"
if [ -n "$1" ] ; then
msg "Delete directory '${1}' on $_node"
# CAUTION: remote recursive delete - ${1} is expanded locally before ssh
ssh $_node "rm -rf '${1}'"
msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
# CAUTION: remote recursive delete of everything under VOLUME_STORAGE
ssh $_node "find '${VOLUME_STORAGE}' -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;"
# prune unused docker images/containers/volumes on every worker node
_nodes=$(kubectl get nodes \
--selector=node-role.kubernetes.io/worker \
--no-headers=true | \
if [ -z "$_nodes" ] ; then
error "Could not list kubernetes nodes - SKIPPING docker cleanup"
# run the prune on all nodes in parallel - one background ssh per node
for _node in $_nodes ; do
msg "Docker cleanup on $_node"
ssh $_node "docker system prune --force --all --volumes" >/dev/null &
# NOTE(review): the omitted lines after this message presumably 'wait'
# for the background ssh jobs - confirm
msg "We are waiting now for docker cleanup to finish on all nodes..."
# is_helm_serve_running - succeed (return 0) when a local 'helm serve'
# repository answers with HTTP 200 on 127.0.0.1:8879, fail otherwise.
# Fixes: quoted the '-w' format argument and replaced the bashism '==' in
# '[' with the POSIX '='; the function's exit status is the test result.
is_helm_serve_running()
{
    # healthy result: HTTP/1.1 200 OK
    _helm_serve_result=$(curl -w '%{http_code}' --silent --connect-timeout 3 http://127.0.0.1:8879/ -o /dev/null)
    [ "$_helm_serve_result" = "200" ]
}
# arg: <release name>
# presumably the body of undeploy_component (invoked from the main loop):
# gather the component's resources, undeploy the release, then manually
# remove any orphaned resources, persistent volumes and their on-disk data
#Because Helm undeploy is not reliable: Gathering resources assigned to component to track and remove orphans later
_component_resources=($(get_resources_for_component ${_component}))
declare -a _persistent_volumes
declare -a _unknown_kinds
# sort each resource into standard / persistent-volume / unknown buckets
# (the omitted lines hold the enclosing 'case ... in' and 'esac')
for resource in ${_component_resources[@]}; do
cronjob/* | job.batch/* | secret/* | configmap/* | service/* | deployment.apps/* | statefulset.apps/* | serviceaccount/* | rolebinding.rbac.authorization.k8s.io/* | role.rbac.authorization.k8s.io/* | poddisruptionbudget.policy/* | clusterrolebinding.rbac.authorization.k8s.io/*)
_standard+=(${resource});;
#Ignoring PVC, they will be handled along with PV as 'helm' status does not return them for some components
persistentvolumeclaim/*)
_persistent_volumes+=(${resource});;
_unknown_kinds+=(${resource})
#Gathering physical location of directories for persistent volumes to delete them after undeploy
# NOTE(review): the jsonpath assumes hostPath-backed volumes
declare -a _physical_locations
for volume in ${_persistent_volumes[@]}; do
_physical_locations+=($(kubectl get ${volume} -o jsonpath='{ .spec.hostPath.path}' ))
helm_undeploy ${_component}
#Manual items removal
for resource in ${_standard[@]}; do
delete_resource ${resource}
for volume in ${_persistent_volumes[@]}; do
delete_persistent_volume ${volume}
for subdir in ${_physical_locations[@]}; do
delete_storage ${subdir}
# anything not recognised above still gets a best-effort delete attempt
if [ "${#_unknown_kinds[@]}" -ne 0 ] ; then
for resource in ${_unknown_kinds[@]}; do
error "Untracked resource kind present: ${resource}, attempting to delete it..."
delete_resource ${resource}
399 # arg: <release name>
402 # TODO: until I can verify that this does the same for this component as helm deploy
403 #msg "Redeployment of the component ${1}..."
404 #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
405 error "NOT IMPLEMENTED"
# command line argument parsing (the omitted lines hold the enclosing
# 'case "$1" in'); each option records its value in an arg_* variable and
# duplicate or conflicting options are reported but otherwise ignored
# NOTE(review): 'Duplicit' in the messages below is a typo for 'Duplicate';
# left untouched here because these strings are runtime output
while [ -n "$1" ] ; do
--no-storage-deletion)
# '--no-storage-deletion' conflicts with '-s|--storage'
if [ -n "$arg_storage" ] ; then
error "Usage of storage argument together with no storage deletion option!"
elif [ -z "$arg_nostorage" ] ; then
arg_nostorage=nostorage
error "Duplicit argument for no storage option! (IGNORING)"
# an explicit component list conflicts with '--delete-all' (and vice versa)
if [ -n "$arg_deleteall" ] ; then
error "'Delete all components' used already - argument mismatch"
if [ -n "$arg_components" ] ; then
error "Explicit component(s) provided already - argument mismatch"
elif [ -z "$arg_deleteall" ] ; then
arg_deleteall=deleteall
error "Duplicit argument for 'delete all' option! (IGNORING)"
if [ -z "$arg_cleanonly" ] ; then
arg_cleanonly=cleanonly
error "Duplicit argument for 'clean only' option! (IGNORING)"
error "Unknown parameter: $1"
if [ -z "$arg_namespace" ] ; then
error "Duplicit argument for namespace!"
# override files must exist; they accumulate in helm '-f <file>' form
if ! [ -f "$1" ] ; then
error "Wrong filename for override file: $1"
arg_overrides="${arg_overrides} -f $1"
arg_components="${arg_components} $1"
if [ -z "$arg_prefix" ] ; then
error "Duplicit argument for release prefix!"
# timeout must be a plain integer (seconds)
if [ -z "$arg_timeout" ] ; then
if ! echo "$1" | grep -q '^[0-9]\+$' ; then
error "Timeout must be an integer: $1"
error "Duplicit argument for timeout!"
# '-s|--storage' conflicts with '--no-storage-deletion'
if [ -n "$arg_nostorage" ] ; then
error "Usage of storage argument together with no storage deletion option!"
elif [ -z "$arg_storage" ] ; then
error "Duplicit argument for storage!"
# sanity-check mandatory arguments and copy them into the script globals
if [ -z "$arg_namespace" ] ; then
error "Missing namespace"
NAMESPACE="$arg_namespace"
# override files are required unless we only clean up
if [ -z "$arg_overrides" ] && [ -z "$arg_cleanonly" ] ; then
error "Missing override file(s) or use '--clean-only'"
OVERRIDES="$arg_overrides"
if [ -n "$arg_prefix" ] ; then
RELEASE_PREFIX="$arg_prefix"
if [ -n "$arg_timeout" ] ; then
# helm expects a unit suffix - the user supplies a plain integer (seconds)
HELM_TIMEOUT="${arg_timeout}s"
# storage directory is mandatory unless explicitly opted out
if [ -n "$arg_storage" ] ; then
VOLUME_STORAGE="$arg_storage"
elif [ -z "$arg_nostorage" ] ; then
error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
if [ -n "$arg_components" ] ; then
HELM_CHART_RELEASE_NAME="$arg_components"
if [ -n "$arg_deleteall" ] ; then
if [ -n "$arg_cleanonly" ] ; then
# set trap for this script cleanup
trap on_exit INT QUIT TERM EXIT
# another sanity checks
# NOTE(review): 'command -v' would be the portable alternative to 'which'
for tool in helm kubectl curl ; do
if ! which "$tool" >/dev/null 2>&1 ; then
error "Missing '${tool}' command"
if ! is_helm_serve_running ; then
error "'helm serve' is not running (http://localhost:8879)"
# if --delete-all is used then redeploy all components (the current namespace is deleted)
if [ -n "$HELM_DELETE_ALL" ] ; then
# undeploy helm release (prefix)
helm_undeploy "$RELEASE_PREFIX"
# we will delete the whole namespace
# we will cleanup docker on each node
# we will delete the content of storage (volumes)
if [ -n "$VOLUME_STORAGE" ] ; then
# delete and redeploy explicit or failed components...
# if a helm chart release name was given then just redeploy said component and quit
if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
_COMPONENTS="$HELM_CHART_RELEASE_NAME"
# simple heuristics: redeploy only failed components
msg "Delete successfully completed jobs..."
msg "Find failed components..."
_COMPONENTS=$(get_failed_labels)
# only components matching the configured release prefix are touched
for _component in ${_COMPONENTS} ; do
if echo "$_component" | grep -q "^${RELEASE_PREFIX}-" ; then
msg "Redeploy component: ${_component}"
undeploy_component ${_component}
error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
# final step: full redeploy (skipped when '--clean-only' was requested)
if [ -z "$HELM_SKIP_DEPLOY" ] ; then
# TODO: this is suboptimal - find a way how to deploy only the affected component...
msg "Redeploy onap..."
msg "Clean only option used: Skipping redeploy..."