Cluster Distributed lock service integration with OOM.
author Sebastien Premont-Tendland <sebastien.premont@bell.ca>
Mon, 17 Feb 2020 16:32:15 +0000 (11:32 -0500)
committer Sebastien Premont-Tendland <sebastien.premont@bell.ca>
Tue, 18 Feb 2020 18:49:56 +0000 (18:49 +0000)
Disabled by default. To enable the cluster, replicaCount must be
higher than 2 and useScriptCompileCache must be set to false.

We need to disable the script compile cache, otherwise there are
issues updating a CBA when running multiple replicas of the
blueprint processor.
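As an illustrative sketch (not part of this change), clustering would be
enabled at deploy time with a values override along these lines; the file
name and the replica count shown are hypothetical:

    # values-cluster.yaml (hypothetical override file)
    replicaCount: 3                  # must be higher than 2, i.e. at least 3
    config:
      useScriptCompileCache: false   # required, see note above
    cluster:
      enabled: true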

Issue-ID: CCSDK-2011
Signed-off-by: Sebastien Premont-Tendland <sebastien.premont@bell.ca>
Change-Id: I6f6071556eb499832f9a765ba4c27100497c6e88

kubernetes/cds/charts/cds-blueprints-processor/resources/config/hazelcast.yaml [new file with mode: 0755]
kubernetes/cds/charts/cds-blueprints-processor/templates/deployment.yaml
kubernetes/cds/charts/cds-blueprints-processor/templates/service.yaml
kubernetes/cds/charts/cds-blueprints-processor/values.yaml

diff --git a/kubernetes/cds/charts/cds-blueprints-processor/resources/config/hazelcast.yaml b/kubernetes/cds/charts/cds-blueprints-processor/resources/config/hazelcast.yaml
new file mode 100755 (executable)
index 0000000..3a3a1ce
--- /dev/null
@@ -0,0 +1,35 @@
+hazelcast:
+  cp-subsystem:
+    cp-member-count: {{ .Values.replicaCount }}
+    group-size: {{ .Values.cluster.groupSize }}
+    session-time-to-live-seconds: 10
+    session-heartbeat-interval-seconds: 5
+    missing-cp-member-auto-removal-seconds: 120
+    fail-on-indeterminate-operation-state: false
+    raft-algorithm:
+      leader-election-timeout-in-millis: 2000
+      leader-heartbeat-period-in-millis: 5000
+      max-missed-leader-heartbeat-count: 5
+      append-request-max-entry-count: 50
+      commit-index-advance-count-to-snapshot: 1000
+      uncommitted-entry-count-to-reject-new-appends: 100
+      append-request-backoff-timeout-in-millis: 100
+  network:
+    enabled: true
+    rest-api:
+      enabled: true
+      endpoint-groups:
+        HEALTH_CHECK:
+          enabled: true
+        CP:
+          enabled: true
+    join:
+      multicast:
+        enabled: false
+      kubernetes:
+        enabled: true
+        namespace: {{ include "common.namespace" . }}
+        service-name: {{ include "common.servicename" . }}-cluster
+        resolve-not-ready-addresses: true
+        # service-label-name: MY-SERVICE-LABEL-NAME
+        # service-label-value: MY-SERVICE-LABEL-VALUE
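For illustration only: with replicaCount set to 3, groupSize left at its
default of 3, and assuming the common.* helpers resolve the namespace to
onap and the service name to cds-blueprints-processor (both resolved names
are assumptions), the discovery-related parts of this template would
render roughly as:

    hazelcast:
      cp-subsystem:
        cp-member-count: 3
        group-size: 3
      network:
        join:
          multicast:
            enabled: false
          kubernetes:
            enabled: true
            namespace: onap                                  # assumed resolved value
            service-name: cds-blueprints-processor-cluster   # assumed resolved value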
diff --git a/kubernetes/cds/charts/cds-blueprints-processor/templates/deployment.yaml b/kubernetes/cds/charts/cds-blueprints-processor/templates/deployment.yaml
index a90e4d7..749e9a4 100755 (executable)
@@ -24,6 +24,18 @@ metadata:
     heritage: {{ .Release.Service }}
 spec:
   replicas: {{ .Values.replicaCount }}
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      # This allows a new pod to become ready before the old one is
+      # terminated, causing no downtime even when replicas is set to 1
+      maxUnavailable: 0
+
+      # Setting maxSurge to 1 is very important for the Hazelcast
+      # integration: we only want one pod at a time to restart, not
+      # several at once, which would break the Hazelcast cluster.
+      # We should not use a percentage maxSurge value.
+      # ref: https://hazelcast.com/blog/rolling-upgrade-hazelcast-imdg-on-kubernetes/
+      maxSurge: 1
   template:
     metadata:
       labels:
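To spell out the rollout arithmetic the comments above rely on: with
maxUnavailable: 0 and maxSurge: 1, a rollout over N replicas temporarily
runs N+1 pods and replaces members strictly one at a time, so a Hazelcast
quorum is never lost. A sketch of the rendered block for a hypothetical
replicaCount of 3:

    spec:
      replicas: 3
      strategy:
        type: RollingUpdate
        rollingUpdate:
          maxUnavailable: 0   # never take a ready pod away first
          maxSurge: 1         # add exactly one new pod at a time (3 -> 4 -> 3)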
@@ -56,9 +68,23 @@ spec:
           env:
           - name: APP_CONFIG_HOME
             value: {{ .Values.config.appConfigDir }}
+          - name: USE_SCRIPT_COMPILE_CACHE
+            value: {{ .Values.config.useScriptCompileCache | quote }}
+          # The cluster should only be enabled when replicaCount is higher than 2 and useScriptCompileCache is set to false; otherwise it won't work properly
+          - name: CLUSTER_ENABLED
+            value: {{ if and (gt (int (.Values.replicaCount)) 2) (not .Values.config.useScriptCompileCache) }} {{ .Values.cluster.enabled | quote }} {{ else }} "false" {{ end }}
+          - name: CLUSTER_ID
+            value: {{ .Values.cluster.clusterName }}
+          - name: CLUSTER_NODE_ID
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: CLUSTER_CONFIG_FILE
+            value: {{ .Values.config.appConfigDir }}/hazelcast.yaml
           ports:
           - containerPort: {{ .Values.service.http.internalPort }}
           - containerPort: {{ .Values.service.grpc.internalPort }}
+          - containerPort: {{ .Values.service.cluster.internalPort }}
           # disable liveness probe when breakpoints set in debugger
           # so K8s doesn't restart unresponsive container
           {{ if .Values.liveness.enabled }}
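To make the CLUSTER_ENABLED conditional concrete: it only forwards
.Values.cluster.enabled when both guards hold, and hard-codes "false"
otherwise. A sketch of the rendered env entries, assuming replicaCount: 3,
useScriptCompileCache: false and cluster.enabled: true (the pod name in
the comment is illustrative):

    env:
    - name: USE_SCRIPT_COMPILE_CACHE
      value: "false"
    - name: CLUSTER_ENABLED
      value: "true"     # renders "false" if replicaCount <= 2 or the compile cache is on
    - name: CLUSTER_ID
      value: cds-cluster
    - name: CLUSTER_NODE_ID
      valueFrom:
        fieldRef:
          fieldPath: metadata.name   # each pod uses its own name as node id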
@@ -93,6 +119,9 @@ spec:
           - mountPath: {{ .Values.config.appConfigDir }}/logback.xml
             name: {{ include "common.fullname" . }}-config
             subPath: logback.xml
+          - mountPath: {{ .Values.config.appConfigDir }}/hazelcast.yaml
+            name: {{ include "common.fullname" . }}-config
+            subPath: hazelcast.yaml
 
           - mountPath: {{ .Values.config.appConfigDir }}/ONAP_RootCA.cer
             name: {{ include "common.fullname" . }}-config
@@ -122,6 +151,8 @@ spec:
               path: application.properties
             - key: logback.xml
               path: logback.xml
+            - key: hazelcast.yaml
+              path: hazelcast.yaml
             - key: ONAP_RootCA.cer
               path: ONAP_RootCA.cer
         - name: {{ include "common.fullname" . }}-blueprints
diff --git a/kubernetes/cds/charts/cds-blueprints-processor/templates/service.yaml b/kubernetes/cds/charts/cds-blueprints-processor/templates/service.yaml
index 411df86..a6a21a6 100755 (executable)
@@ -56,3 +56,27 @@ spec:
   selector:
     app: {{ include "common.name" . }}
     release: {{ include "common.release" . }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "common.servicename" . }}-cluster
+  namespace: {{ include "common.namespace" . }}
+  labels:
+    app: {{ include "common.name" . }}
+    chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+  annotations:
+spec:
+  type: {{ .Values.service.cluster.type }}
+  ports:
+    - port: {{ .Values.service.cluster.externalPort }}
+      targetPort: {{ .Values.service.cluster.internalPort }}
+      {{- if eq .Values.service.cluster.type "NodePort"}}
+      nodePort: {{ .Values.global.nodePortPrefixExt | default .Values.nodePortPrefixExt }}{{ .Values.service.cluster.nodePort }}
+      {{- end}}
+      name: {{ .Values.service.cluster.portName | default "cluster" }}
+  selector:
+    app: {{ include "common.name" . }}
+    release: {{ .Release.Name }}
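This -cluster Service is what the Hazelcast Kubernetes discovery block in
hazelcast.yaml points at via common.servicename. As a sketch, assuming
common.servicename resolves to cds-blueprints-processor (an assumption)
and the chart defaults below, the rendered manifest would look roughly
like:

    apiVersion: v1
    kind: Service
    metadata:
      name: cds-blueprints-processor-cluster   # assumed resolved name
    spec:
      type: ClusterIP
      ports:
        - port: 5701
          targetPort: 5701
          name: blueprints-processor-cluster
      selector:
        app: cds-blueprints-processor          # assumed resolved name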
diff --git a/kubernetes/cds/charts/cds-blueprints-processor/values.yaml b/kubernetes/cds/charts/cds-blueprints-processor/values.yaml
index 781b538..35661c2 100755 (executable)
@@ -47,6 +47,7 @@ debugEnabled: false
 # application configuration
 config:
   appConfigDir: /opt/app/onap/config
+  useScriptCompileCache: true
 
 # default number of instances
 replicaCount: 1
@@ -85,6 +86,11 @@ service:
     portName: blueprints-processor-grpc
     internalPort: 9111
     externalPort: 9111
+  cluster:
+    type: ClusterIP
+    portName: blueprints-processor-cluster
+    internalPort: 5701
+    externalPort: 5701
 
 persistence:
   volumeReclaimPolicy: Retain
@@ -94,6 +100,17 @@ persistence:
   mountSubPath: cds/blueprints/deploy
   deployedBlueprint: /opt/app/onap/blueprints/deploy
 
+cluster:
+  # The cluster can only be enabled when replicaCount is at least 3
+  # AND the config value useScriptCompileCache is set to false
+  enabled: false
+
+  clusterName: cds-cluster
+
+  # Defines the number of nodes that take part in the CP subsystem / Raft
+  # algorithm. This value should be between 3 and 7 only.
+  groupSize: 3
+
 ingress:
   enabled: false
   service:
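To make the groupSize bounds concrete: a Raft group of N members needs a
majority (floor(N/2) + 1) to stay up, so it tolerates floor((N-1)/2)
failed members. This is standard Raft arithmetic, not something specific
to this chart:

    # group-size: 3 -> majority 2 -> tolerates 1 failed member
    # group-size: 5 -> majority 3 -> tolerates 2 failed members
    # group-size: 7 -> majority 4 -> tolerates 3 failed members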