Fix intermittent failures of on-demand KPI job 57/141057/1
author halil.cakal <halil.cakal@est.tech>
Wed, 28 May 2025 11:15:37 +0000 (12:15 +0100)
committer halil.cakal <halil.cakal@est.tech>
Fri, 30 May 2025 10:37:25 +0000 (11:37 +0100)
- On-demand KPI jobs fail frequently because cps-and-ncmp sometimes gets a
  "Connection refused" error, which means PostgreSQL isn't fully ready
  yet.
- Give the db more time to start (210s); previously it was 30s
- Change healthcheck interval from 10s to 2s
- Reduce the timeout of healthchecks from 10s to 1s
- Store db container logs to investigate further
- cps-and-ncmp waits until the db is fully ready before making a connection

Issue-ID: CPS-2694

Change-Id: Ie6362741f98222eec58892734190b1ee0fff148e
Signed-off-by: halil.cakal <halil.cakal@est.tech>
docker-compose/cps-base.yml
k6-tests/make-logs.sh

index 2391fb2..5fcec49 100644 (file)
@@ -47,10 +47,10 @@ services:
           memory: 3G
     healthcheck:
       test: pg_isready || exit 1 # This command runs inside the container, returning 0 for success, non-zero for failure.
-      timeout: 10s               # Time-out of the above test command.
-      interval: 10s              # How often the health is run.
-      retries: 3                 # If 3 health checks fail, the container is unhealthy.
-      start_period: 30s          # Ignore failed health checks for first 30 seconds, to give system time to start
+      timeout: 1s               # Time-out of the above test command.
+      interval: 2s              # How often the health is run.
+      retries: 100              # If 100 health checks fail, the container is unhealthy.
+      start_period: 210s        # Ignore failed health checks for first 210 seconds, to give system time to start
       ### Full start-up time allowed = 30 seconds start period + 3 tries * 10 seconds interval = 60 seconds
 
   cps-and-ncmp-template:
@@ -99,7 +99,8 @@ services:
     ports:
       - ${CPS_INSTANCE_0_REST_PORT:-8698}:8080
     depends_on:
-      - dbpostgresql
+        dbpostgresql:
+            condition: service_healthy
 
   ### DEBUG: For easier debugging use just 1 instance and comment out below
   cps-and-ncmp-1:
@@ -112,7 +113,8 @@ services:
     ports:
       - ${CPS_INSTANCE_1_REST_PORT:-8699}:8080
     depends_on:
-      - dbpostgresql
+      dbpostgresql:
+        condition: service_healthy
 
   nginx:
     container_name: ${NGINX_CONTAINER_NAME:-nginx-loadbalancer}
index 6097624..f3343b6 100644 (file)
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright 2025 Nordix Foundation.
+# Copyright 2025 OpenInfra Foundation Europe. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-SERVICE_NAME="cps-and-ncmp"
+SERVICE_NAMES=("cps-and-ncmp" "dbpostgresql")
 TIMESTAMP=$(date +"%Y%m%d%H%M%S")
 LOG_DIR="${WORKSPACE:-.}/logs"
-TEMP_DIR="$LOG_DIR/temp_$TIMESTAMP"
-ZIP_FILE="$LOG_DIR/${SERVICE_NAME}_logs_$TIMESTAMP.zip"
-
 mkdir -p "$LOG_DIR"
-mkdir -p "$TEMP_DIR"
-
-# Store logs for cps-and-ncmp containers to temp directory
-CONTAINER_IDS=$(docker ps --filter "name=$SERVICE_NAME" --format "{{.ID}}")
-for CONTAINER_ID in $CONTAINER_IDS; do
-    CONTAINER_NAME=$(docker inspect --format="{{.Name}}" "$CONTAINER_ID" | sed 's/\///g')
-    LOG_FILE="$TEMP_DIR/${CONTAINER_NAME}_logs_$TIMESTAMP.log"
-    docker logs "$CONTAINER_ID" > "$LOG_FILE"
+# Store logs for each service's containers and zip them individually
+for SERVICE_NAME in "${SERVICE_NAMES[@]}"; do
+    TEMP_DIR="$LOG_DIR/temp_${SERVICE_NAME}_$TIMESTAMP"
+    ZIP_FILE="$LOG_DIR/logs_${SERVICE_NAME}_$TIMESTAMP.zip"
+    mkdir -p "$TEMP_DIR"
+    CONTAINER_IDS=$(docker ps --filter "name=$SERVICE_NAME" --format "{{.ID}}")
+    for CONTAINER_ID in $CONTAINER_IDS; do
+        CONTAINER_NAME=$(docker inspect --format="{{.Name}}" "$CONTAINER_ID" | sed 's/\///g')
+        LOG_FILE="$TEMP_DIR/${CONTAINER_NAME}_logs_$TIMESTAMP.log"
+        docker logs "$CONTAINER_ID" > "$LOG_FILE"
+    done
+    # Zip the logs for the current service
+    zip -r "$ZIP_FILE" "$TEMP_DIR"
+    echo "Logs for service $SERVICE_NAME saved to $ZIP_FILE"
+    # Clean temp files for the current service
+    rm -r "$TEMP_DIR"
 done
-
-# Zip the logs
-zip -r "$ZIP_FILE" "$TEMP_DIR"
-echo "Logs saved to $ZIP_FILE inside workspace"
-
-# Clean temp files
-rm -r "$TEMP_DIR"
-
 # Delete logs older than 2 weeks
-find "$LOG_DIR" -name "${SERVICE_NAME}_logs_*.zip" -mtime +14 -delete
+find "$LOG_DIR" -name "logs_*.zip" -mtime +14 -delete
\ No newline at end of file