3 PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
11 HELM_CHART_RELEASE_NAME=
23 COLOR_ON_RED='\033[0;31;1m'
24 COLOR_ON_GREEN='\033[0;32;1m'
35 ${CMD} - simple tool for fixing onap helm deployment
38 This script does nothing smart or special it just tries to
39 redeploy onap component. It can fix only problems related to
40 race conditions or timeouts. Nothing else. It will not fix
41 broken ONAP - there is no such ambition - that effort should
42 be directed in the upstream.
48 ${CMD} -n|--namespace <namespace>
49 (-f|--file <override>)...
50 (-s|--storage <directory>)|--no-storage-deletion
51 [-p|--release-prefix <release prefix>]
53 [(-c|--component <component release name>)...|
59 Usage 1: (simple heuristics - redeploy failed components):
60 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap
62 Usage 2: (redeploy ONLY explicitly listed components):
63 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap \\
64 -c onap-aaf -c onap-sdc -c onap-portal
66 Usage 3: (delete EVERYTHING and redeploy):
67 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs/onap --delete-all
69 Usage 4: (delete EVERYTHING and DO NOT redeploy - clean env.)
70 ${CMD} -n onap -s /dockerdata-nfs/onap --delete-all --clean-only
74 Namespace argument (always) and at least one override file (if you don't
75 use '--delete-all') are mandatory for this script to execute. Also you must
76 provide path to the storage ('--storage') OR explicitly request to not
77 delete file storage of the component ('--no-storage-deletion').
79 The storage should be a directory where persistent volume resides. It will
80 work only if the component created the persistent volume with the same
81 filename as its release name. Otherwise no files are deleted. The exception
82 is when '--delete-all' is used - in that case all content of the storage is
83 deleted (because ONAP is not consistent with the volume directory names
86 '--file' can be used multiple times and it is used for override files
87 which are passed on to helm. The order is significant because if two
88 override files modify one value the latest one is used. This option is
89 ignored if '--clean-only' is used.
91 CAUTION 1: filename of an override file cannot contain whitespace! This is
92 actually helm/onap deploy plugin issue which does not handle such files. So
93 I dropped the more complicated version of this script when there is no
94 reason to support something that helm deploy will choke on anyway.
96 '--prefix' option is helm release argument - it is actually prefix when you
97 list the helm releases - helm is a little confusing here.
99 CAUTION 2: By default release prefix is 'onap' - if you deployed release
100 'onap' and now run this script with different prefix then it will skip all
101 'onap-*' components and will deploy a new release with new prefix - BEWARE
102 TO USE PROPER RELEASE PREFIX!
104 Timeout sets the waiting time for helm deploy per component.
106 '--component' references to the release name of the chart which you want to
107 redeploy explicitly - otherwise 'ALL FAILED' components will be
108 redeployed. You can target more than one component at once - just use the
109 argument multiple times.
111 Component option is mutually exclusive with the '--delete-all' which will
112 delete all components - healthy or not. Actually it will delete the whole
113 NAMESPACE and everything in it. Also to be sure it will cleanup all
114 orphaned images and volumes on all kubernetes nodes.
116 '--clean-only' can be used with any usage: heuristics, explicit component
117 list or with '--delete-all'. It basically just skips the last step - the
124 printf "Try help: ${CMD} --help\n"
129 printf "${COLOR_ON_GREEN}INFO: $@ ${COLOR_OFF}\n"
134 printf "${COLOR_ON_RED}ERROR: $@ ${COLOR_OFF}\n"
142 # remove all successfully completed jobs
145 kubectl get jobs -n ${NAMESPACE} \
146 --ignore-not-found=true \
147 --no-headers=true | \
148 while read -r _job _completion _duration _age ; do
149 _done=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $1;}')
150 _desired=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $2;}')
151 if [ "$_desired" -eq "$_done" ] ; then
159 get_labels 'status.phase==Failed'
162 # arg: [optional: selector]
165 if [ -n "$1" ] ; then
166 _selector="--field-selector=${1}"
171 kubectl get pods -n ${NAMESPACE} \
174 --ignore-not-found=true \
175 --no-headers=true | \
176 while read -r _pod _ready _status _restart _age _labels ; do
177 [ -z "$_labels" ] && break
178 for _label in $(echo "$_labels" | tr ',' ' ') ; do
181 _label=$(echo "$_label" | sed 's/release=//')
189 # arg: <release name>
192 msg "Undeploy helm release name: ${1}"
193 # Helm v3 does not support "--purge" flag since it's a default behavior for v3
194 if [[ $(helm version --template "{{.Version}}") =~ ^v3 ]];then
197 helm undeploy ${1} --purge
204 # Helm v3 needs "--create-namespace" to create the namespace if it does not exist
205 if [[ $(helm version --template "{{.Version}}") =~ ^v3 ]];then
206 msg helm deploy ${RELEASE_PREFIX} local/onap --create-namespace --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
207 helm deploy ${RELEASE_PREFIX} local/onap --create-namespace --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
209 msg helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
210 helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
217 kubectl delete job -n ${NAMESPACE} \
223 # wait for job to be deleted
225 while [ -n "$_output" ] && sleep 1 ; do
226 _output=$(kubectl get pods -n ${NAMESPACE} \
227 --ignore-not-found=true \
229 --selector="job-name=${1}")
234 get_resources_for_component()
237 helm status $1 | awk -f <(cat - <<-'EOD'
245 if ( $1 == "RESOURCES:" ) {
258 $1 != "NAME" && $1 != "==>" && work == "yes" && $1 !~ ":" && $1 != "" {
259 printf "%s/%s\n", kind, $1
270 local _kind="${_resource%/*}"
271 local _name="${_resource#*/}"
274 if kubectl get ${_resource} >/dev/null 2>&1; then
275 msg "${_resource} has not been removed with helm undeploy, manual removal is required. Proceeding"
276 kubectl delete ${_resource} -n ${NAMESPACE} \
280 2>&1 | grep -iv 'not[[:space:]]*found'
282 # wait for resource to be deleted
284 while [ -n "$_output" ] && sleep 1 ; do
285 _output=$(kubectl get ${_kind} ${_name} -n ${NAMESPACE} \
286 --ignore-not-found=true \
295 msg "Delete the whole namespace: ${NAMESPACE}"
296 kubectl delete namespace \
302 # wait for namespace to be deleted
304 while [ -n "$_output" ] && sleep 1 ; do
305 _output=$(kubectl get all -n ${NAMESPACE} \
306 --ignore-not-found=true \
311 delete_persistent_volume()
313 _persistent_volume=$1
314 if kubectl get ${_persistent_volume} >/dev/null 2>&1; then
315 msg "${_persistent_volume} has not been removed with helm undeploy, manual removal is required. Proceeding"
316 # Very often k8s hangs in the Terminating state for a PV because its PVC is still active, so it is better to delete the PVC directly
317 _claim=$(kubectl get ${_persistent_volume} -o jsonpath='{ .spec.claimRef.name}')
318 delete_resource PersistentVolumeClaim/${_claim}
322 # arg: [optional: directory]
325 _node=$(kubectl get nodes \
326 --selector=node-role.kubernetes.io/worker \
328 --no-headers=true | \
329 awk '{print $6}' | head -n 1)
331 if [ -z "$_node" ] ; then
332 error "Could not list kubernetes nodes - SKIPPING DELETION"
334 if [ -n "$1" ] ; then
335 msg "Delete directory '${1}' on $_node"
336 ssh $_node "rm -rf '${1}'"
338 msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
339 ssh $_node "find '${VOLUME_STORAGE}' -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;"
346 _nodes=$(kubectl get nodes \
347 --selector=node-role.kubernetes.io/worker \
349 --no-headers=true | \
352 if [ -z "$_nodes" ] ; then
353 error "Could not list kubernetes nodes - SKIPPING docker cleanup"
357 for _node in $_nodes ; do
358 msg "Docker cleanup on $_node"
359 ssh $_node "docker system prune --force --all --volumes" >/dev/null &
362 msg "We are waiting now for docker cleanup to finish on all nodes..."
366 is_helm_serve_running()
368 # healthy result: HTTP/1.1 200 OK
369 _helm_serve_result=$(curl -w %{http_code} --silent --connect-timeout 3 http://127.0.0.1:8879/ -o /dev/null)
371 if [ "$_helm_serve_result" == "200" ] ; then
378 # arg: <release name>
383 # Because helm undeploy is not reliable: gather the resources assigned to the component so that orphans can be tracked and removed later
384 _component_resources=($(get_resources_for_component ${_component}))
386 declare -a _persistent_volumes
388 declare -a _unknown_kinds
390 for resource in ${_component_resources[@]}; do
392 CronJob/* | Job/* | Secret/* | ConfigMap/* | Pod/* | Service/* | Deployment/* | StatefulSet/*)
393 _standard+=(${resource});;
394 #Ignoring PVC, they will be handled along with PV as 'helm' status does not return them for some components
395 PersistentVolumeClaim/*)
398 _persistent_volumes+=(${resource});;
400 _unknown_kinds+=(${resource})
405 #Gathering physical location of directories for persistent volumes to delete them after undeploy
406 declare -a _physical_locations
407 for volume in ${_persistent_volumes[@]}; do
408 _physical_locations+=($(kubectl get ${volume} -o jsonpath='{ .spec.hostPath.path}' ))
411 helm_undeploy ${_component}
413 #Manual items removal
414 for resource in ${_standard[@]}; do
415 delete_resource ${resource}
418 for volume in ${_persistent_volumes[@]}; do
419 delete_persistent_volume ${volume}
422 for subdir in ${_physical_locations[@]}; do
423 delete_storage ${subdir}
426 if [ "${#_unknown_kinds[@]}" -ne 0 ] ; then
427 for resource in ${_unknown_kinds[@]}; do
428 error "Untracked resource kind present: ${resource}, attempting to delete it..."
429 delete_resource ${resource}
435 # arg: <release name>
438 # TODO: until I can verify that this does the same for this component as helm deploy
439 #msg "Redeployment of the component ${1}..."
440 #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
441 error "NOT IMPLEMENTED"
459 while [ -n "$1" ] ; do
479 --no-storage-deletion)
480 if [ -n "$arg_storage" ] ; then
481 error "Usage of storage argument together with no storage deletion option!"
484 elif [ -z "$arg_nostorage" ] ; then
485 arg_nostorage=nostorage
487 error "Duplicit argument for no storage option! (IGNORING)"
491 if [ -n "$arg_deleteall" ] ; then
492 error "'Delete all components' used already - argument mismatch"
499 if [ -n "$arg_components" ] ; then
500 error "Explicit component(s) provided already - argument mismatch"
503 elif [ -z "$arg_deleteall" ] ; then
504 arg_deleteall=deleteall
506 error "Duplicit argument for 'delete all' option! (IGNORING)"
513 if [ -z "$arg_cleanonly" ] ; then
514 arg_cleanonly=cleanonly
516 error "Duplicit argument for 'clean only' option! (IGNORING)"
520 error "Unknown parameter: $1"
527 if [ -z "$arg_namespace" ] ; then
531 error "Duplicit argument for namespace!"
537 if ! [ -f "$1" ] ; then
538 error "Wrong filename for override file: $1"
542 arg_overrides="${arg_overrides} -f $1"
546 arg_components="${arg_components} $1"
550 if [ -z "$arg_prefix" ] ; then
554 error "Duplicit argument for release prefix!"
560 if [ -z "$arg_timeout" ] ; then
561 if ! echo "$1" | grep -q '^[0-9]\+$' ; then
562 error "Timeout must be an integer: $1"
569 error "Duplicit argument for timeout!"
575 if [ -n "$arg_nostorage" ] ; then
576 error "Usage of storage argument together with no storage deletion option!"
579 elif [ -z "$arg_storage" ] ; then
583 error "Duplicit argument for storage!"
594 if [ -z "$arg_namespace" ] ; then
595 error "Missing namespace"
599 NAMESPACE="$arg_namespace"
602 if [ -z "$arg_overrides" ] && [ -z "$arg_cleanonly" ] ; then
603 error "Missing override file(s) or use '--clean-only'"
607 OVERRIDES="$arg_overrides"
610 if [ -n "$arg_prefix" ] ; then
611 RELEASE_PREFIX="$arg_prefix"
614 if [ -n "$arg_timeout" ] ; then
615 HELM_TIMEOUT="$arg_timeout"
618 if [ -n "$arg_storage" ] ; then
619 VOLUME_STORAGE="$arg_storage"
620 elif [ -z "$arg_nostorage" ] ; then
621 error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
626 if [ -n "$arg_components" ] ; then
627 HELM_CHART_RELEASE_NAME="$arg_components"
630 if [ -n "$arg_deleteall" ] ; then
634 if [ -n "$arg_cleanonly" ] ; then
638 # If running with helm v3 a time unit has to be appended to HELM_TIMEOUT
639 if [[ $(helm version --template "{{.Version}}") =~ ^v3 ]];then
640 HELM_TIMEOUT="${HELM_TIMEOUT}s"
647 # set trap for this script cleanup
648 trap on_exit INT QUIT TERM EXIT
650 # additional sanity checks
651 for tool in helm kubectl curl ; do
652 if ! which "$tool" >/dev/null 2>&1 ; then
653 error "Missing '${tool}' command"
658 if ! is_helm_serve_running ; then
659 error "'helm serve' is not running (http://localhost:8879)"
663 # if --delete-all is used then redeploy all components (the current namespace is deleted)
664 if [ -n "$HELM_DELETE_ALL" ] ; then
665 # undeploy helm release (prefix)
666 helm_undeploy "$RELEASE_PREFIX"
668 # we will delete the whole namespace
671 # we will cleanup docker on each node
674 # we will delete the content of storage (volumes)
675 if [ -n "$VOLUME_STORAGE" ] ; then
678 # delete and redeploy explicit or failed components...
680 # if a helm chart release name was given then just redeploy said component and quit
681 if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
682 msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
683 _COMPONENTS="$HELM_CHART_RELEASE_NAME"
684 # simple heuristics: redeploy only failed components
686 msg "Delete successfully completed jobs..."
689 msg "Find failed components..."
690 _COMPONENTS=$(get_failed_labels)
693 for _component in ${_COMPONENTS} ; do
694 if echo "$_component" | grep -q "^${RELEASE_PREFIX}-" ; then
695 msg "Redeploy component: ${_component}"
696 undeploy_component ${_component}
698 error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
703 if [ -z "$HELM_SKIP_DEPLOY" ] ; then
704 # TODO: this is suboptimal - find a way how to deploy only the affected component...
705 msg "Redeploy onap..."
708 msg "Clean only option used: Skipping redeploy..."