# Pin a minimal, predictable PATH so the script behaves the same from cron,
# CI or an interactive shell.
3 PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Release name(s) of component(s) to redeploy; empty means "use heuristics"
# (filled later from the -c|--component arguments).
11 HELM_CHART_RELEASE_NAME=
# ANSI colour escapes for error/info output (bold red / bold green);
# the matching COLOR_OFF reset is defined outside this excerpt.
23 COLOR_ON_RED='\033[0;31;1m'
24 COLOR_ON_GREEN='\033[0;32;1m'
35 ${CMD} - simple tool for fixing onap helm deployment
38 This script does nothing smart or special; it just tries to
39 redeploy onap component. It can fix only problems related to
40 race conditions or timeouts. Nothing else. It will not fix
41 broken ONAP - there is no such ambition - that effort should
42 be directed in the upstream.
48 ${CMD} -n|--namespace <namespace>
49 (-f|--file <override>)...
50 (-s|--storage <directory>)|--no-storage-deletion
51 [-p|--release-prefix <release prefix>]
53 [(-c|--component <component release name>)...|
59 Usage 1: (simple heuristics - redeploy failed components):
60 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs
62 Usage 2: (redeploy ONLY explicitly listed components):
63 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
64 -c onap-aaf -c onap-sdc -c onap-portal
66 Usage 3: (delete EVERYTHING and redeploy):
67 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs --delete-all
69 Usage 4: (delete EVERYTHING and DO NOT redeploy - clean env.)
70 ${CMD} -n onap -s /dockerdata-nfs --delete-all --clean-only
74 Namespace argument (always) and at least one override file (if you don't
75 use '--delete-all') are mandatory for this script to execute. Also you must
76 provide path to the storage ('--storage') OR explicitly request to not
77 delete file storage of the component ('--no-storage-deletion').
79 The storage should be a directory where persistent volume resides. It will
80 work only if the component created the persistent volume with the same
81 filename as its release name. Otherwise no files are deleted. The exception
82 is when '--delete-all' is used - in that case all content of the storage is
83 deleted (because ONAP is not consistent with the volume directory names
86 '--file' can be used multiple times and it is used for override files
87 which are passed on to helm. The order is significant because if two
88 override files modify one value the latest one is used. This option is
89 ignored if '--clean-only' is used.
91 CAUTION 1: filename of an override file cannot contain whitespace! This is
92 actually helm/onap deploy plugin issue which does not handle such files. So
93 I dropped the more complicated version of this script as there is no
94 reason to support something that helm deploy will choke on anyway.
96 '--prefix' option is helm release argument - it is actually prefix when you
97 list the helm releases - helm is a little confusing here.
99 CAUTION 2: By default release prefix is 'onap' - if you deployed release
100 'onap' and now run this script with different prefix then it will skip all
101 'onap-*' components and will deploy a new release with new prefix - BEWARE
102 TO USE PROPER RELEASE PREFIX!
104 Timeout sets the waiting time for helm deploy per component.
106 '--component' refers to the release name of the chart which you want to
107 redeploy explicitly - otherwise 'ALL FAILED' components will be
108 redeployed. You can target more than one component at once - just use the
109 argument multiple times.
111 Component option is mutually exclusive with the '--delete-all' which will
112 delete all components - healthy or not. Actually it will delete the whole
113 NAMESPACE and everything in it. Also to be sure it will cleanup all
114 orphaned images and volumes on all kubernetes nodes.
116 '--clean-only' can be used with any usage: heuristics, explicit component
117 list or with '--delete-all'. It basically just skips the last step - the
# NOTE(review): fragments of the usage-hint, msg() and error() helpers;
# the enclosing function definitions lie outside this excerpt.
124 printf "Try help: ${CMD} --help\n"
# Green INFO line. Unquoted $@ inside a format string flattens all
# arguments into the message — presumably intentional; confirm in full source.
129 printf "${COLOR_ON_GREEN}INFO: $@ ${COLOR_OFF}\n"
# Red ERROR line — NOTE(review): ideally redirected to stderr (>&2);
# verify whether the full function does so.
134 printf "${COLOR_ON_RED}ERROR: $@ ${COLOR_OFF}\n"
142 # remove all successfully completed jobs
# List jobs without headers; the second column is COMPLETIONS ("done/desired").
145 kubectl get jobs -n ${NAMESPACE} \
146 --ignore-not-found=true \
147 --no-headers=true | \
148 while read -r _job _completion _duration _age ; do
# Split "done/desired" on '/' into the two counters.
149 _done=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $1;}')
150 _desired=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $2;}')
# Only jobs that reached their desired completion count are deleted
# (the deletion command itself is outside this excerpt).
151 if [ "$_desired" -eq "$_done" ] ; then
# get_failed_labels: thin wrapper — list release labels of pods in Failed phase.
159 get_labels 'status.phase==Failed'
162 # arg: [optional: selector]
# Turn the optional argument into a kubectl field selector.
165 if [ -n "$1" ] ; then
166 _selector="--field-selector=${1}"
# Pod listing presumably requests labels as an extra column — TODO confirm
# the elided kubectl flags (original lines 172-173) include --show-labels.
171 kubectl get pods -n ${NAMESPACE} \
174 --ignore-not-found=true \
175 --no-headers=true | \
176 while read -r _pod _ready _status _restart _age _labels ; do
# No labels column means nothing more to parse in this stream.
177 [ -z "$_labels" ] && break
# Labels come comma-separated; scan them for the release=<name> label.
178 for _label in $(echo "$_labels" | tr ',' ' ') ; do
# Strip the "release=" prefix, leaving the bare release name.
181 _label=$(echo "$_label" | sed 's/release=//')
189 # arg: <release name>
192 msg "Undeploy helm release name: ${1}"
193 # Helm v3 does not support "--purge" flag since it's a default behavior for v3
# Branch on the helm client major version; the v3 call (without --purge)
# is on an elided line.
194 if [[ $(helm version --template "{{.Version}}") =~ ^v3 ]];then
# Helm v2 path: 'undeploy' is the ONAP helm plugin; --purge removes
# the release from the store completely.
197 helm undeploy ${1} --purge
# Belt-and-braces: also delete the job object directly via kubectl.
205 kubectl delete job -n ${NAMESPACE} \
211 # wait for job to be deleted
# Poll once per second until no pod of this job remains.
213 while [ -n "$_output" ] && sleep 1 ; do
214 _output=$(kubectl get pods -n ${NAMESPACE} \
215 --ignore-not-found=true \
217 --selector="job-name=${1}")
# get_resources_for_component <release>: parse 'helm status' output and print
# one "Kind/name" pair per line for every resource owned by the release.
# Relies on the helm v2 plain-text status format (RESOURCES: section with
# "==> Kind" headers) — will not work against helm v3 default output.
222 get_resources_for_component()
225 helm status $1 | awk -f <(cat - <<-'EOD'
# Start collecting only after the RESOURCES: marker line.
233 if ( $1 == "RESOURCES:" ) {
# Data rows: skip column headers, "==> Kind" lines, key:value lines and blanks.
246 $1 != "NAME" && $1 != "==>" && work == "yes" && $1 !~ ":" && $1 != "" {
247 printf "%s/%s\n", kind, $1
# Split "Kind/name" into its two halves via parameter expansion.
258 local _kind="${_resource%/*}"
259 local _name="${_resource#*/}"
# Only act if the resource survived the helm undeploy.
262 if kubectl get ${_resource} >/dev/null 2>&1; then
263 msg "${_resource} has not been removed with helm undeploy, manual removal is required. Proceeding"
264 kubectl delete ${_resource} -n ${NAMESPACE} \
# Swallow only "not found" noise from the delete; real errors still surface.
268 2>&1 | grep -iv 'not[[:space:]]*found'
270 # wait for resource to be deleted
# Poll once per second until kubectl no longer sees the object.
272 while [ -n "$_output" ] && sleep 1 ; do
273 _output=$(kubectl get ${_kind} ${_name} -n ${NAMESPACE} \
274 --ignore-not-found=true \
# Nuke the entire namespace and everything inside it.
283 msg "Delete the whole namespace: ${NAMESPACE}"
284 kubectl delete namespace \
290 # wait for namespace to be deleted
# Namespace deletion is asynchronous — poll until no object remains.
292 while [ -n "$_output" ] && sleep 1 ; do
293 _output=$(kubectl get all -n ${NAMESPACE} \
294 --ignore-not-found=true \
# delete_persistent_volume <PersistentVolume/name>: remove a PV left behind by
# helm undeploy by deleting its bound claim (PVC) first.
299 delete_persistent_volume()
301 _persistent_volume=$1
302 if kubectl get ${_persistent_volume} >/dev/null 2>&1; then
303 msg "${_persistent_volume} has not been removed with helm undeploy, manual removal is required. Proceeding"
304 # Very often k8s hangs in Terminating state for a PV due to a still-active
# PVC, so the bound claim is deleted directly instead of the PV.
305 _claim=$(kubectl get ${_persistent_volume} -o jsonpath='{ .spec.claimRef.name}')
306 delete_resource PersistentVolumeClaim/${_claim}
310 # arg: [optional: directory]
# Pick the first worker node's address (column 6 of 'kubectl get nodes -o wide',
# presumably — TODO confirm the elided output flag) to run the deletion on;
# the storage is assumed to be shared (e.g. NFS) so one node suffices.
313 _node=$(kubectl get nodes \
314 --selector=node-role.kubernetes.io/worker \
316 --no-headers=true | \
317 awk '{print $6}' | head -n 1)
319 if [ -z "$_node" ] ; then
320 error "Could not list kubernetes nodes - SKIPPING DELETION"
# With an explicit directory argument delete just that subtree…
322 if [ -n "$1" ] ; then
323 msg "Delete directory '${1}' on $_node"
324 ssh $_node "rm -rf '${1}'"
# …otherwise wipe the CONTENT of the storage root, keeping the root itself.
326 msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
327 ssh $_node "find '${VOLUME_STORAGE}' -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;"
# Collect addresses of ALL worker nodes for the docker prune fan-out.
334 _nodes=$(kubectl get nodes \
335 --selector=node-role.kubernetes.io/worker \
337 --no-headers=true | \
340 if [ -z "$_nodes" ] ; then
341 error "Could not list kubernetes nodes - SKIPPING docker cleanup"
# Run the prune on every node in parallel (backgrounded ssh);
# the matching 'wait' barrier is on an elided line after line 350.
345 for _node in $_nodes ; do
346 msg "Docker cleanup on $_node"
347 ssh $_node "docker system prune --force --all --volumes" >/dev/null &
350 msg "We are waiting now for docker cleanup to finish on all nodes..."
# is_helm_serve_running: probe the local helm v2 chart repository
# ('helm serve' on 127.0.0.1:8879); success iff it answers HTTP 200.
354 is_helm_serve_running()
356 # healthy result: HTTP/1.1 200 OK
# -w %{http_code} prints only the status code; the body goes to /dev/null.
357 _helm_serve_result=$(curl -w %{http_code} --silent --connect-timeout 3 http://127.0.0.1:8879/ -o /dev/null)
359 if [ "$_helm_serve_result" == "200" ] ; then
366 # arg: <release name>
371 # Because Helm undeploy is not reliable: gathering resources assigned to the
# component to track and remove orphans later.
372 _component_resources=($(get_resources_for_component ${_component}))
374 declare -a _persistent_volumes
376 declare -a _unknown_kinds
# Triage every "Kind/name" into standard / PV / unknown buckets
# (the enclosing 'case' keyword is on an elided line).
378 for resource in ${_component_resources[@]}; do
380 CronJob/* | Job/* | Secret/* | ConfigMap/* | Pod/* | Service/* | Deployment/* | StatefulSet/*)
381 _standard+=(${resource});;
382 # Ignoring PVCs: they are handled along with their PV because 'helm status'
# does not return them for some components.
383 PersistentVolumeClaim/*)
386 _persistent_volumes+=(${resource});;
388 _unknown_kinds+=(${resource})
393 # Gather the physical (hostPath) directories of the persistent volumes so
# they can be deleted from the node after undeploy.
394 declare -a _physical_locations
395 for volume in ${_persistent_volumes[@]}; do
396 _physical_locations+=($(kubectl get ${volume} -o jsonpath='{ .spec.hostPath.path}' ))
# Best-effort helm removal first, then manual sweeps below.
399 helm_undeploy ${_component}
401 # Manual removal of anything helm left behind, in dependency-safe order:
# standard resources, then PVs (+claims), then their on-disk directories.
402 for resource in ${_standard[@]}; do
403 delete_resource ${resource}
406 for volume in ${_persistent_volumes[@]}; do
407 delete_persistent_volume ${volume}
410 for subdir in ${_physical_locations[@]}; do
411 delete_storage ${subdir}
# Anything of an unrecognized kind is reported loudly but still deleted.
414 if [ "${#_unknown_kinds[@]}" -ne 0 ] ; then
415 for resource in ${_unknown_kinds[@]}; do
416 error "Untracked resource kind present: ${resource}, attempting to delete it..."
417 delete_resource ${resource}
423 # arg: <release name>
426 # TODO: until I can verify that this does the same for this component as helm deploy
427 #msg "Redeployment of the component ${1}..."
428 #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
# Deliberate stub: per-component redeploy is unimplemented; the script falls
# back to a full 'helm deploy' at the end instead (see main flow).
429 error "NOT IMPLEMENTED"
# Hand-rolled long-option parser; each option records an arg_* variable and
# rejects duplicate or mutually exclusive usage.
# NOTE(review): "Duplicit" in the user-facing messages below should read
# "Duplicate" — left untouched here since these are runtime strings.
447 while [ -n "$1" ] ; do
467 --no-storage-deletion)
# --no-storage-deletion conflicts with --storage…
468 if [ -n "$arg_storage" ] ; then
469 error "Usage of storage argument together with no storage deletion option!"
472 elif [ -z "$arg_nostorage" ] ; then
473 arg_nostorage=nostorage
475 error "Duplicit argument for no storage option! (IGNORING)"
# …and --component conflicts with --delete-all (both directions checked).
479 if [ -n "$arg_deleteall" ] ; then
480 error "'Delete all components' used already - argument mismatch"
487 if [ -n "$arg_components" ] ; then
488 error "Explicit component(s) provided already - argument mismatch"
491 elif [ -z "$arg_deleteall" ] ; then
492 arg_deleteall=deleteall
494 error "Duplicit argument for 'delete all' option! (IGNORING)"
501 if [ -z "$arg_cleanonly" ] ; then
502 arg_cleanonly=cleanonly
504 error "Duplicit argument for 'clean only' option! (IGNORING)"
508 error "Unknown parameter: $1"
515 if [ -z "$arg_namespace" ] ; then
519 error "Duplicit argument for namespace!"
# Override files must exist; they are accumulated as repeated '-f <file>'
# helm arguments (whitespace-unsafe by design — see the help text caveat).
525 if ! [ -f "$1" ] ; then
526 error "Wrong filename for override file: $1"
530 arg_overrides="${arg_overrides} -f $1"
# Components accumulate into a space-separated list.
534 arg_components="${arg_components} $1"
538 if [ -z "$arg_prefix" ] ; then
542 error "Duplicit argument for release prefix!"
# Timeout must be a plain integer (a unit suffix is appended later for helm v3).
548 if [ -z "$arg_timeout" ] ; then
549 if ! echo "$1" | grep -q '^[0-9]\+$' ; then
550 error "Timeout must be an integer: $1"
557 error "Duplicit argument for timeout!"
563 if [ -n "$arg_nostorage" ] ; then
564 error "Usage of storage argument together with no storage deletion option!"
567 elif [ -z "$arg_storage" ] ; then
571 error "Duplicit argument for storage!"
# Promote validated arg_* values into the script-level configuration globals.
# Namespace is always mandatory.
582 if [ -z "$arg_namespace" ] ; then
583 error "Missing namespace"
587 NAMESPACE="$arg_namespace"
# At least one override file is required unless we only clean up.
590 if [ -z "$arg_overrides" ] && [ -z "$arg_cleanonly" ] ; then
591 error "Missing override file(s) or use '--clean-only'"
595 OVERRIDES="$arg_overrides"
# Optional overrides of the defaults set earlier in the file.
598 if [ -n "$arg_prefix" ] ; then
599 RELEASE_PREFIX="$arg_prefix"
602 if [ -n "$arg_timeout" ] ; then
603 HELM_TIMEOUT="$arg_timeout"
# Either a storage path or an explicit opt-out must be given.
606 if [ -n "$arg_storage" ] ; then
607 VOLUME_STORAGE="$arg_storage"
608 elif [ -z "$arg_nostorage" ] ; then
609 error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
614 if [ -n "$arg_components" ] ; then
615 HELM_CHART_RELEASE_NAME="$arg_components"
# Mode flags consumed by the main flow (assignments on elided lines).
618 if [ -n "$arg_deleteall" ] ; then
622 if [ -n "$arg_cleanonly" ] ; then
626 # If running with helm v3 a time unit has to be appended to HELM_TIMEOUT
627 if [[ $(helm version --template "{{.Version}}") =~ ^v3 ]];then
628 HELM_TIMEOUT="${HELM_TIMEOUT}s"
635 # set trap for this script cleanup
636 trap on_exit INT QUIT TERM EXIT
638 # More sanity checks: required external tools must be on PATH.
# NOTE(review): 'command -v' is the portable replacement for 'which'.
639 for tool in helm kubectl curl ; do
640 if ! which "$tool" >/dev/null 2>&1 ; then
641 error "Missing '${tool}' command"
# The local helm v2 chart repo must be serving (see is_helm_serve_running).
646 if ! is_helm_serve_running ; then
647 error "'helm serve' is not running (http://localhost:8879)"
651 # if --delete-all is used then redeploy all components (the current namespace is deleted)
652 if [ -n "$HELM_DELETE_ALL" ] ; then
653 # undeploy helm release (prefix)
654 helm_undeploy "$RELEASE_PREFIX"
656 # we will delete the whole namespace
659 # we will cleanup docker on each node
662 # we will delete the content of storage (volumes)
663 if [ -n "$VOLUME_STORAGE" ] ; then
666 # delete and redeploy explicit or failed components...
668 # if a helm chart release name was given then just redeploy said component and quit
669 if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
670 msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
671 _COMPONENTS="$HELM_CHART_RELEASE_NAME"
672 # simple heuristics: redeploy only failed components
674 msg "Delete successfully completed jobs..."
677 msg "Find failed components..."
678 _COMPONENTS=$(get_failed_labels)
# Undeploy each targeted component, but only if it carries the expected
# release prefix — protects against wiping an unrelated release.
681 for _component in ${_COMPONENTS} ; do
682 if echo "$_component" | grep -q "^${RELEASE_PREFIX}-" ; then
683 msg "Redeploy component: ${_component}"
684 undeploy_component ${_component}
686 error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
# Final step: full redeploy via the ONAP helm 'deploy' plugin — skipped when
# --clean-only was requested. The first 'msg helm deploy …' line only echoes
# the command for the log; the second line actually runs it.
691 if [ -z "$HELM_SKIP_DEPLOY" ] ; then
692 # TODO: this is suboptimal - find a way how to deploy only the affected component...
693 msg "Redeploy onap..."
694 msg helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
695 helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
697 msg "Clean only option used: Skipping redeploy..."