3 PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
11 HELM_CHART_RELEASE_NAME=
23 COLOR_ON_RED='\033[0;31;1m'
24 COLOR_ON_GREEN='\033[0;32;1m'
35 ${CMD} - simple tool for fixing onap helm deployment
38 This script does nothing smart or special - it just tries to
39 redeploy onap component. It can fix only problems related to
40 race conditions or timeouts. Nothing else. It will not fix
41 broken ONAP - there is no such ambition - that effort should
42 be directed in the upstream.
48 ${CMD} -n|--namespace <namespace>
49 (-f|--file <override>)...
50 (-s|--storage <directory>)|--no-storage-deletion
51 [-p|--release-prefix <release prefix>]
53 [(-c|--component <component release name>)...|
59 Usage 1: (simple heuristics - redeploy failed components):
60 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs
62 Usage 2: (redeploy ONLY explicitly listed components):
63 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
64 -c onap-aaf -c onap-sdc -c onap-portal
66 Usage 3: (delete EVERYTHING and redeploy):
67 ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs --delete-all
69 Usage 4: (delete EVERYTHING and DO NOT redeploy - clean env.)
70 ${CMD} -n onap -s /dockerdata-nfs --delete-all --clean-only
74 Namespace argument (always) and at least one override file (if you don't
75 use '--clean-only') are mandatory for this script to execute. Also you must
76 provide path to the storage ('--storage') OR explicitly request to not
77 delete file storage of the component ('--no-storage-deletion').
79 The storage should be a directory where persistent volume resides. It will
80 work only if the component created the persistent volume with the same
81 filename as its release name. Otherwise no files are deleted. The exception
82 is when '--delete-all' is used - in that case all content of the storage is
83 deleted (because ONAP is not consistent with the volume directory names
86 '--file' can be used multiple times and it is used for override files
87 which are passed on to helm. The order is significant because if two
88 override files modify one value the latest one is used. This option is
89 ignored if '--clean-only' is used.
91 CAUTION 1: filename of an override file cannot contain whitespace! This is
92 actually a helm/onap deploy plugin issue - it cannot handle such files. So
93 I dropped the more complicated version of this script, as there is no
94 reason to support something that helm deploy will choke on anyway.
96 '--release-prefix' option is the helm release argument - it is actually a prefix when you
97 list the helm releases - helm is a little confusing here.
99 CAUTION 2: By default release prefix is 'onap' - if you deployed release
100 'onap' and now run this script with different prefix then it will skip all
101 'onap-*' components and will deploy a new release with new prefix - BEWARE
102 TO USE PROPER RELEASE PREFIX!
104 Timeout sets the waiting time for helm deploy per component.
106 '--component' refers to the release name of the chart which you want to
107 redeploy explicitly - otherwise 'ALL FAILED' components will be
108 redeployed. You can target more than one component at once - just use the
109 argument multiple times.
111 Component option is mutually exclusive with the '--delete-all' which will
112 delete all components - healthy or not. Actually it will delete the whole
113 NAMESPACE and everything in it. Also to be sure it will cleanup all
114 orphaned images and volumes on all kubernetes nodes.
116 '--clean-only' can be used with any usage: heuristics, explicit component
117 list or with '--delete-all'. It basically just skips the last step - the
124 printf "Try help: ${CMD} --help\n"
129 printf "${COLOR_ON_GREEN}INFO: $@ ${COLOR_OFF}\n"
134 printf "${COLOR_ON_RED}ERROR: $@ ${COLOR_OFF}\n"
137 # remove all successfully completed jobs
140 kubectl get jobs -n ${NAMESPACE} \
141 --ignore-not-found=true \
142 --no-headers=true | \
143 while read -r _job _completion _duration _age ; do
144 _done=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $1;}')
145 _desired=$(echo ${_completion} | awk 'BEGIN {FS="/";} {print $2;}')
146 if [ "$_desired" -eq "$_done" ] ; then
154 get_labels 'status.phase==Failed'
157 # arg: [optional: selector]
160 if [ -n "$1" ] ; then
161 _selector="--field-selector=${1}"
166 kubectl get pods -n ${NAMESPACE} \
169 --ignore-not-found=true \
170 --no-headers=true | \
171 while read -r _pod _ready _status _restart _age _labels ; do
172 [ -z "$_labels" ] && break
173 for _label in $(echo "$_labels" | tr ',' ' ') ; do
176 _label=$(echo "$_label" | sed 's/release=//')
184 # arg: <release name>
187 msg "Undeploy helm release name: ${1}"
188 helm undeploy ${1} --purge
194 kubectl delete job -n ${NAMESPACE} \
200 # wait for job to be deleted
202 while [ -n "$_output" ] && sleep 1 ; do
203 _output=$(kubectl get pods -n ${NAMESPACE} \
204 --ignore-not-found=true \
206 --selector="job-name=${1}")
210 # arg: <resource> <release name>
216 msg "Delete ${_resource} for ${_release}..."
218 kubectl get ${_resource} -n ${NAMESPACE} \
219 --ignore-not-found=true \
220 --selector="release=${_release}" \
223 # this is due to missing "release" label in some pods
224 # grep for the rescue...
225 kubectl get ${_resource} -n ${NAMESPACE} \
226 --no-headers=true | grep "^${_release}[-]"
227 } | awk '{print $1}' | sort -u | while read -r _name _rest ; do
228 echo "Deleting '${_name}'"
229 kubectl delete ${_resource} -n ${NAMESPACE} \
234 2>&1 | grep -iv 'not[[:space:]]*found'
236 # wait for resource to be deleted
238 while [ -n "$_output" ] && sleep 1 ; do
239 _output=$(kubectl get ${_resource} -n ${NAMESPACE} \
240 --ignore-not-found=true \
242 --field-selector="metadata.name=${_name}")
249 msg "Delete the whole namespace: ${NAMESPACE}"
250 kubectl delete namespace \
256 # wait for namespace to be deleted
258 while [ -n "$_output" ] && sleep 1 ; do
259 _output=$(kubectl get all -n ${NAMESPACE} \
260 --ignore-not-found=true \
265 # arg: [optional: subdir]
268 _node=$(kubectl get nodes \
269 --selector=node-role.kubernetes.io/worker \
271 --no-headers=true | \
272 awk '{print $6}' | head -n 1)
274 if [ -z "$_node" ] ; then
275 error "Could not list kubernetes nodes - SKIPPING DELETION"
277 if [ -n "$1" ] ; then
278 msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
280 rm -rf "${VOLUME_STORAGE}/${1}"
283 msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
285 find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
293 _nodes=$(kubectl get nodes \
294 --selector=node-role.kubernetes.io/worker \
296 --no-headers=true | \
299 if [ -z "$_nodes" ] ; then
300 error "Could not list kubernetes nodes - SKIPPING docker cleanup"
304 for _node in $_nodes ; do
305 msg "Docker cleanup on $_node"
307 ssh -T $_node >/dev/null <<EOF
308 if which docker >/dev/null ; then
309 docker system prune --force --all --volumes
315 msg "We are waiting now for docker cleanup to finish on all nodes..."
319 # arg: <release name>
322 _chart=$(echo "$1" | sed 's/[^-]*-//')
325 # for all kubernetes resources: kubectl api-resources
326 # TODO: does deleted secret per component break something?
338 delete_resource ${x} ${1}
341 if [ -n "$VOLUME_STORAGE" ] ; then
342 msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
347 # arg: <release name>
350 # TODO: until I can verify that this does the same for this component as helm deploy
351 #msg "Redeployment of the component ${1}..."
352 #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
353 error "NOT IMPLEMENTED"
371 while [ -n "$1" ] ; do
391 --no-storage-deletion)
392 if [ -n "$arg_storage" ] ; then
393 error "Usage of storage argument together with no storage deletion option!"
396 elif [ -z "$arg_nostorage" ] ; then
397 arg_nostorage=nostorage
399 error "Duplicit argument for no storage option! (IGNORING)"
403 if [ -n "$arg_deleteall" ] ; then
404 error "'Delete all components' used already - argument mismatch"
411 if [ -n "$arg_components" ] ; then
412 error "Explicit component(s) provided already - argument mismatch"
415 elif [ -z "$arg_deleteall" ] ; then
416 arg_deleteall=deleteall
418 error "Duplicit argument for 'delete all' option! (IGNORING)"
425 if [ -z "$arg_cleanonly" ] ; then
426 arg_cleanonly=cleanonly
428 error "Duplicit argument for 'clean only' option! (IGNORING)"
432 error "Unknown parameter: $1"
439 if [ -z "$arg_namespace" ] ; then
443 error "Duplicit argument for namespace!"
449 if ! [ -f "$1" ] ; then
450 error "Wrong filename for override file: $1"
454 arg_overrides="${arg_overrides} -f $1"
458 arg_components="${arg_components} $1"
462 if [ -z "$arg_prefix" ] ; then
466 error "Duplicit argument for release prefix!"
472 if [ -z "$arg_timeout" ] ; then
473 if ! echo "$1" | grep -q '^[0-9]\+$' ; then
474 error "Timeout must be an integer: $1"
481 error "Duplicit argument for timeout!"
487 if [ -n "$arg_nostorage" ] ; then
488 error "Usage of storage argument together with no storage deletion option!"
491 elif [ -z "$arg_storage" ] ; then
495 error "Duplicit argument for storage!"
506 if [ -z "$arg_namespace" ] ; then
507 error "Missing namespace"
511 NAMESPACE="$arg_namespace"
514 if [ -z "$arg_overrides" ] && [ -z "$arg_cleanonly" ] ; then
515 error "Missing override file(s) or use '--clean-only'"
519 OVERRIDES="$arg_overrides"
522 if [ -n "$arg_prefix" ] ; then
523 RELEASE_PREFIX="$arg_prefix"
526 if [ -n "$arg_timeout" ] ; then
527 HELM_TIMEOUT="$arg_timeout"
530 if [ -n "$arg_storage" ] ; then
531 VOLUME_STORAGE="$arg_storage"
532 elif [ -z "$arg_nostorage" ] ; then
533 error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
538 if [ -n "$arg_components" ] ; then
539 HELM_CHART_RELEASE_NAME="$arg_components"
542 if [ -n "$arg_deleteall" ] ; then
546 if [ -n "$arg_cleanonly" ] ; then
555 # if --delete-all is used then redeploy all components (the current namespace is deleted)
556 if [ -n "$HELM_DELETE_ALL" ] ; then
557 # undeploy helm release (prefix)
558 helm_undeploy "$RELEASE_PREFIX"
560 # we will delete the whole namespace
563 # we will cleanup docker on each node
566 # we will delete the content of storage (volumes)
567 if [ -n "$VOLUME_STORAGE" ] ; then
570 # delete and redeploy explicit or failed components...
572 # if a helm chart release name was given then just redeploy said component and quit
573 if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
574 msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
575 _COMPONENTS="$HELM_CHART_RELEASE_NAME"
576 # simple heuristics: redeploy only failed components
578 msg "Delete successfully completed jobs..."
581 msg "Find failed components..."
582 _COMPONENTS=$(get_failed_labels)
585 for _component in ${_COMPONENTS} ; do
586 if echo "$_component" | grep -q "^${RELEASE_PREFIX}-" ; then
587 msg "Redeploy component: ${_component}"
588 undeploy_component ${_component}
590 error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
595 if [ -z "$HELM_SKIP_DEPLOY" ] ; then
596 # TODO: this is suboptimal - find a way how to deploy only the affected component...
597 msg "Redeploy onap..."
598 msg helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
599 helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
601 msg "Clean only option used: Skipping redeploy..."