# Copyright © 2019 Intel Corporation Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# StatefulSet running an etcd cluster. Pods discover each other through the
# headless service ({{ include "common.servicename" . }}) and scale-up/down is
# handled by the preStop hook and the startup script below.
#
# apps/v1beta1 was removed in Kubernetes 1.16; apps/v1 additionally requires an
# explicit spec.selector that matches the pod template labels.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ include "common.fullname" . }}
  labels:
    heritage: "{{ .Release.Service }}"
    release: "{{ .Release.Name }}"
    chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
    app: {{ include "common.name" . }}
spec:
  serviceName: {{ include "common.servicename" . }}
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: {{ include "common.name" . }}
      release: "{{ .Release.Name }}"
  template:
    metadata:
      labels:
        heritage: "{{ .Release.Service }}"
        release: "{{ .Release.Name }}"
        chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
        app: {{ include "common.name" . }}
    spec:
{{- if .Values.affinity }}
      affinity:
{{ toYaml .Values.affinity | indent 8 }}
{{- end }}
{{- if .Values.nodeSelector }}
      nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{- if .Values.tolerations }}
      tolerations:
{{ toYaml .Values.tolerations | indent 8 }}
{{- end }}
      containers:
      - name: {{ include "common.fullname" . }}
        image: "{{ .Values.repository }}/{{ .Values.image }}"
        imagePullPolicy: "{{ .Values.pullPolicy }}"
        ports:
        - containerPort: {{ .Values.service.peerInternalPort }}
          name: {{ .Values.service.peerPortName }}
        - containerPort: {{ .Values.service.clientInternalPort }}
          name: {{ .Values.service.clientPortName }}
        {{- if eq .Values.liveness.enabled true }}
        livenessProbe:
          tcpSocket:
            port: {{ .Values.service.clientInternalPort }}
          initialDelaySeconds: {{ .Values.liveness.initialDelaySeconds }}
          periodSeconds: {{ .Values.liveness.periodSeconds }}
          timeoutSeconds: {{ .Values.liveness.timeoutSeconds }}
        {{- end }}
        resources:
{{ include "common.resources" . | indent 10 }}
        env:
        - name: INITIAL_CLUSTER_SIZE
          value: {{ .Values.replicaCount | quote }}
        - name: SET_NAME
          value: {{ include "common.fullname" . }}
        - name: SERVICE_NAME
          value: {{ include "common.servicename" . }}
{{- if .Values.extraEnv }}
{{ toYaml .Values.extraEnv | indent 8 }}
{{- end }}
        lifecycle:
          preStop:
            exec:
              command:
                - "/bin/sh"
                - "-ec"
                - |
                  EPS=""
                  for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
                      EPS="${EPS}${EPS:+,}http://${SET_NAME}-${i}.${SERVICE_NAME}:2379"
                  done

                  HOSTNAME=$(hostname)

                  member_hash() {
                      etcdctl member list | grep http://${HOSTNAME}.${SERVICE_NAME}:2380 | cut -d':' -f1 | cut -d'[' -f1
                  }

                  SET_ID=${HOSTNAME##*[^0-9]}

                  # Only ordinals beyond the initial cluster size are removed on
                  # shutdown: those pods are being scaled away, not restarted.
                  if [ "${SET_ID}" -ge ${INITIAL_CLUSTER_SIZE} ]; then
                      echo "Removing ${HOSTNAME} from etcd cluster"
                      ETCDCTL_ENDPOINT=${EPS} etcdctl member remove $(member_hash)
                      if [ $? -eq 0 ]; then
                          # Remove everything otherwise the cluster will no longer scale-up
                          rm -rf /var/run/etcd/*
                      fi
                  fi
        command:
          - "/bin/sh"
          - "-ec"
          - |
            HOSTNAME=$(hostname)

            # store member id into PVC for later member replacement
            collect_member() {
                # NOTE: the script runs under /bin/sh; use POSIX redirection
                # (bash's "&>/dev/null" backgrounds the command in ash/dash).
                while ! etcdctl member list > /dev/null 2>&1; do sleep 1; done
                etcdctl member list | grep http://${HOSTNAME}.${SERVICE_NAME}:2380 | cut -d':' -f1 | cut -d'[' -f1 > /var/run/etcd/member_id
                exit 0
            }

            eps() {
                EPS=""
                for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
                    EPS="${EPS}${EPS:+,}http://${SET_NAME}-${i}.${SERVICE_NAME}:2379"
                done
                echo ${EPS}
            }

            member_hash() {
                etcdctl member list | grep http://${HOSTNAME}.${SERVICE_NAME}:2380 | cut -d':' -f1 | cut -d'[' -f1
            }

            # we should wait for other pods to be up before trying to join
            # otherwise we got "no such host" errors when trying to resolve other members
            for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
                while true; do
                    echo "Waiting for ${SET_NAME}-${i}.${SERVICE_NAME} to come up"
                    ping -W 1 -c 1 ${SET_NAME}-${i}.${SERVICE_NAME} > /dev/null && break
                    sleep 1s
                done
            done

            # re-joining after failure?
            if [ -e /var/run/etcd/default.etcd ] && [ -f /var/run/etcd/member_id ]; then
                echo "Re-joining etcd member"
                member_id=$(cat /var/run/etcd/member_id)

                # re-join member; "|| true" keeps "set -e" from aborting when
                # the update is a no-op or the cluster is briefly unreachable
                ETCDCTL_ENDPOINT=$(eps) etcdctl member update ${member_id} http://${HOSTNAME}.${SERVICE_NAME}:2380 || true
                exec etcd --name ${HOSTNAME} \
                    --listen-peer-urls http://0.0.0.0:2380 \
                    --listen-client-urls http://0.0.0.0:2379 \
                    --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
                    --data-dir /var/run/etcd/default.etcd
            fi

            # etcd-SET_ID
            SET_ID=${HOSTNAME##*[^0-9]}

            # adding a new member to existing cluster (assuming all initial pods are available)
            if [ "${SET_ID}" -ge ${INITIAL_CLUSTER_SIZE} ]; then
                export ETCDCTL_ENDPOINT=$(eps)

                # member already added?
                MEMBER_HASH=$(member_hash)
                if [ -n "${MEMBER_HASH}" ]; then
                    # the member hash exists but for some reason etcd failed
                    # as the datadir has not be created, we can remove the member
                    # and retrieve new hash
                    etcdctl member remove ${MEMBER_HASH}
                fi

                echo "Adding new member"
                etcdctl member add ${HOSTNAME} http://${HOSTNAME}.${SERVICE_NAME}:2380 | grep "^ETCD_" > /var/run/etcd/new_member_envs

                if [ $? -ne 0 ]; then
                    echo "Exiting"
                    rm -f /var/run/etcd/new_member_envs
                    exit 1
                fi

                cat /var/run/etcd/new_member_envs
                source /var/run/etcd/new_member_envs

                collect_member &

                exec etcd --name ${HOSTNAME} \
                    --listen-peer-urls http://0.0.0.0:2380 \
                    --listen-client-urls http://0.0.0.0:2379 \
                    --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
                    --data-dir /var/run/etcd/default.etcd \
                    --initial-advertise-peer-urls http://${HOSTNAME}.${SERVICE_NAME}:2380 \
                    --initial-cluster ${ETCD_INITIAL_CLUSTER} \
                    --initial-cluster-state ${ETCD_INITIAL_CLUSTER_STATE}
            fi

            PEERS=""
            for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
                PEERS="${PEERS}${PEERS:+,}${SET_NAME}-${i}=http://${SET_NAME}-${i}.${SERVICE_NAME}:2380"
            done

            collect_member &

            # join member
            exec etcd --name ${HOSTNAME} \
                --initial-advertise-peer-urls http://${HOSTNAME}.${SERVICE_NAME}:2380 \
                --listen-peer-urls http://0.0.0.0:2380 \
                --listen-client-urls http://0.0.0.0:2379 \
                --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
                --initial-cluster-token etcd-cluster-1 \
                --initial-cluster ${PEERS} \
                --initial-cluster-state new \
                --data-dir /var/run/etcd/default.etcd
        volumeMounts:
        - name: {{ include "common.fullname" . }}-data
          mountPath: /var/run/etcd
  {{- if .Values.persistence.enabled }}
  volumeClaimTemplates:
  - metadata:
      name: {{ include "common.fullname" . }}-data
    spec:
      accessModes:
        - "{{ .Values.persistence.accessMode }}"
      resources:
        requests:
          # upstream recommended max is 700M
          storage: "{{ .Values.persistence.storage }}"
      storageClassName: {{ include "common.fullname" . }}-data
  {{- else }}
      volumes:
      - name: {{ include "common.fullname" . }}-data
      {{- if .Values.memoryMode }}
        emptyDir:
          medium: Memory
      {{- else }}
        emptyDir: {}
      {{- end }}
  {{- end }}