# Pin the command search path to the standard system directories.
3 PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Starts empty; it is later set from the explicitly requested component
# list when one is given on the command line.
11 HELM_CHART_RELEASE_NAME=
# ANSI escape sequences (bold red / bold green) used to colorize the
# messages printed with 'echo -e'.
23 COLOR_ON_RED='\033[0;31;1m'
24 COLOR_ON_GREEN='\033[0;32;1m'
35 ${CMD} - simple tool for fixing onap helm deployment
38 This script does nothing smart or special - it just tries to
39 redeploy onap components. It can fix only problems related to
40 race conditions or timeouts. Nothing else. It will not fix
41 broken ONAP - there is no such ambition - that effort should
42 be directed in the upstream.
48 ${CMD} -n|--namespace <namespace>
49 (-f|--file <override>)...
50 (-s|--storage <directory>)|--no-storage-deletion
51 [-p|--release-prefix <release prefix>]
53 [(-c|--component <component release name>)...|
57 Usage 1 (simple heuristics - redeploy failed components):
58 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs
60 Usage 2 (redeploy ONLY explicit listed components):
61 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \
62 -c onap-aaf -c onap-sdc -c onap-portal
64 Usage 3 (delete EVERYTHING and redeploy):
65 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \
68 Usage 4 (just clean - do not redeploy)
69 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \
70 --delete-all --clean-only
72 Namespace argument and at least one override file are mandatory
73 for this script to execute. Also you must provide the path to the
74 storage or explicitly request not to delete the file storage of the
77 Storage should be the directory where the persistent volume resides. It
78 will work only if component created a persistent volume with the
79 same filename as its release name. Otherwise no effect. The
80 exception is when '--delete-all' is used - in that case all
81 content of the storage is deleted (because ONAP is not consistent
82 with the volume directory names - eg.: sdnc).
84 CAUTION 1: filename of an override file cannot contain whitespace!
85 This is actually helm/onap deploy plugin issue which does not
86 handle such files. So I dropped the more complicated version of
87 this script, since there is no reason to support something that
88 helm deploy will choke on anyway.
90 '--release-prefix' is the helm release argument - it is actually the
91 prefix when you list the helm releases - helm is a little confusing here.
93 CAUTION 2: By default release prefix is 'onap' - if you deployed
94 release 'onap' and now run this script with different prefix then
95 it will skip all 'onap-*' components and will deploy a new release
96 with new prefix - BEWARE TO USE PROPER RELEASE PREFIX!
98 Timeout sets the waiting time for helm deploy per component.
100 '--component' refers to the release name of the chart which you
101 want to redeploy explicitly - otherwise 'ALL FAILED' components
102 will be redeployed. You can target more than one component at once
103 - just use the argument multiple times.
105 Component option is mutually exclusive with the '--delete-all'
106 which will delete all components - healthy or not. Actually it will
107 delete the whole NAMESPACE and everything in it.
109 '--clean-only' can be used with any usage: heuristics, explicit
110 component list or with '--delete-all'. It basically just skips the
111 last step - the actual redeploy.
117 echo -e "${COLOR_ON_GREEN}INFO: $@ ${COLOR_OFF}"
122 echo -e "${COLOR_ON_RED}ERROR: $@ ${COLOR_OFF}"
125 # remove all successfully completed jobs
128 kubectl get jobs -n ${NAMESPACE} \
129 --ignore-not-found=true \
130 --no-headers=true | \
131 while read -r _job _completion _duration _age ; do
132 _done=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $1;}')
133 _desired=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $2;}')
134 if [ "$_desired" -eq "$_done" ] ; then
142 get_labels 'status.phase==Failed'
145 # arg: [optional: selector]
148 if [ -n "$1" ] ; then
149 _selector="--field-selector=${1}"
154 kubectl get pods -n ${NAMESPACE} \
156 --include-uninitialized=true \
158 --ignore-not-found=true \
159 --no-headers=true | \
160 while read -r _pod _ready _status _restart _age _labels ; do
161 [ -z "$_labels" ] && break
162 for _label in $(echo "$_labels" | tr ',' ' ') ; do
165 _label=$(echo "$_label" | sed 's/release=//')
173 # arg: <release name>
176 msg "Undeploy helm release name: ${1}"
177 helm undeploy ${1} --purge
183 kubectl delete job -n ${NAMESPACE} \
186 --include-uninitialized=true \
190 # wait for job to be deleted
192 while [ -n "$_output" ] && sleep 1 ; do
193 _output=$(kubectl get pods -n ${NAMESPACE} \
194 --ignore-not-found=true \
196 --selector="job-name=${1}")
200 # arg: <resource> <release name>
206 msg "Delete ${_resource} for ${_release}..."
208 kubectl get ${_resource} -n ${NAMESPACE} \
209 --ignore-not-found=true \
210 --selector="release=${_release}" \
213 # this is due to the missing "release" label on some pods
214 # grep to the rescue...
215 kubectl get ${_resource} -n ${NAMESPACE} \
216 --no-headers=true | grep "^${_release}"
217 } | awk '{print $1}' | sort -u | while read -r _name _rest ; do
218 echo "Deleting '${_name}'"
219 kubectl delete ${_resource} -n ${NAMESPACE} \
222 --include-uninitialized=true \
225 2>&1 | grep -iv 'not[[:space:]]*found'
227 # wait for resource to be deleted
229 while [ -n "$_output" ] && sleep 1 ; do
230 _output=$(kubectl get ${_resource} -n ${NAMESPACE} \
231 --ignore-not-found=true \
233 --field-selector="metadata.name=${_name}")
240 msg "Delete the whole namespace: ${NAMESPACE}"
241 kubectl delete namespace \
244 --include-uninitialized=true \
248 # wait for namespace to be deleted
250 while [ -n "$_output" ] && sleep 1 ; do
251 _output=$(kubectl get all -n ${NAMESPACE} \
252 --ignore-not-found=true \
257 # arg: [optional: subdir]
260 _node=$(kubectl get nodes \
261 --selector=node-role.kubernetes.io/worker \
263 --no-headers=true | \
264 awk '{print $6}' | head -n 1)
266 if [ -z "$_node" ] ; then
267 error "Could not list kubernetes nodes - SKIPPING DELETION"
269 if [ -n "$1" ] ; then
270 msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
272 rm -rf "${VOLUME_STORAGE}/${1}"
275 msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
277 find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
283 # arg: <release name>
286 _chart=$(echo "$1" | sed 's/[^-]*-//')
288 # TODO: does deleted secret per component break something?
289 for x in jobs deployments pods pvc pv ; do
290 delete_resource ${x} ${1}
293 if [ -n "$VOLUME_STORAGE" ] ; then
294 msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
298 # TODO: until I can verify that this does the same for this component as helm deploy
299 #msg "Redeployment of the component ${1}..."
300 #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
318 while [ -n "$1" ] ; do
338 --no-storage-deletion)
339 if [ -n "$arg_storage" ] ; then
340 error "Usage of storage argument together with no storage deletion option!"
342 elif [ -z "$arg_nostorage" ] ; then
343 arg_nostorage=nostorage
345 error "Duplicit argument for no storage option! (IGNORING)"
349 if [ -n "$arg_deleteall" ] ; then
350 error "'Delete all components' used already - argument mismatch"
356 if [ -n "$arg_components" ] ; then
357 error "Explicit component(s) provided already - argument mismatch"
359 elif [ -z "$arg_deleteall" ] ; then
360 arg_deleteall=deleteall
362 error "Duplicit argument for 'delete all' option! (IGNORING)"
369 if [ -z "$arg_cleanonly" ] ; then
370 arg_cleanonly=cleanonly
372 error "Duplicit argument for 'clean only' option! (IGNORING)"
376 error "Unknown parameter: $1"
382 if [ -z "$arg_namespace" ] ; then
386 error "Duplicit argument for namespace!"
391 if ! [ -f "$1" ] ; then
392 error "Wrong filename for override file: $1"
395 arg_overrides="${arg_overrides} -f $1"
399 arg_components="${arg_components} $1"
403 if [ -z "$arg_prefix" ] ; then
407 error "Duplicit argument for release prefix!"
412 if [ -z "$arg_timeout" ] ; then
413 if ! echo "$1" | grep -q '^[0-9]\+$' ; then
414 error "Timeout must be an integer: $1"
420 error "Duplicit argument for timeout!"
425 if [ -n "$arg_nostorage" ] ; then
426 error "Usage of storage argument together with no storage deletion option!"
428 elif [ -z "$arg_storage" ] ; then
432 error "Duplicit argument for storage!"
441 if [ -z "$arg_namespace" ] ; then
442 error "Missing namespace"
446 NAMESPACE="$arg_namespace"
449 if [ -z "$arg_overrides" ] ; then
450 error "Missing override file(s)"
454 OVERRIDES="$arg_overrides"
457 if [ -n "$arg_prefix" ] ; then
458 RELEASE_PREFIX="$arg_prefix"
461 if [ -n "$arg_timeout" ] ; then
462 HELM_TIMEOUT="$arg_timeout"
465 if [ -n "$arg_storage" ] ; then
466 VOLUME_STORAGE="$arg_storage"
467 elif [ -z "$arg_nostorage" ] ; then
468 error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
472 if [ -n "$arg_components" ] ; then
473 HELM_CHART_RELEASE_NAME="$arg_components"
476 if [ -n "$arg_deleteall" ] ; then
480 if [ -n "$arg_cleanonly" ] ; then
489 # if --delete-all is used then redeploy all components (the current namespace is deleted)
490 if [ -n "$HELM_DELETE_ALL" ] ; then
491 # undeploy helm release (prefix)
492 helm_undeploy "$RELEASE_PREFIX"
494 # we will delete the whole namespace
497 if [ -n "$VOLUME_STORAGE" ] ; then
500 # delete and redeploy explicit or failed components...
502 # if a helm chart release name was given then just redeploy said component and quit
503 if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
504 msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
505 _COMPONENTS="$HELM_CHART_RELEASE_NAME"
506 # simple heuristics: redeploy only failed components
508 msg "Delete successfully completed jobs..."
511 msg "Find failed components..."
512 _COMPONENTS=$(get_failed_labels)
515 for _component in ${_COMPONENTS} ; do
516 if echo "$_component" | grep -q "^${RELEASE_PREFIX}-" ; then
517 msg "Redeploy component: ${_component}"
518 redeploy_component ${_component}
520 error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
525 if [ -z "$HELM_SKIP_DEPLOY" ] ; then
526 # TODO: this is suboptimal - find a way to deploy only the affected component...
527 msg "Redeploy onap..."
528 msg helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
529 helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
531 msg "Clean only option used: Skipping redeploy..."