[CCSDK-1985]GR Toolkit Refactor
[ccsdk/sli/plugins.git] / grToolkit / provider / src / main / java / org / onap / ccsdk / sli / plugins / grtoolkit / resolver / SixNodeHealthResolver.java
1 /*-
2  * ============LICENSE_START=======================================================
3  * openECOMP : SDN-C
4  * ================================================================================
5  * Copyright (C) 2019 AT&T Intellectual Property. All rights
6  *                      reserved.
7  * ================================================================================
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  * ============LICENSE_END=========================================================
20  */
21
22 package org.onap.ccsdk.sli.plugins.grtoolkit.resolver;
23
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import org.onap.ccsdk.sli.core.dblib.DbLibService;
import org.onap.ccsdk.sli.plugins.grtoolkit.connection.ConnectionManager;
import org.onap.ccsdk.sli.plugins.grtoolkit.connection.ConnectionResponse;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.AdminHealth;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.ClusterActor;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.ClusterHealth;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.DatabaseHealth;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.FailoverStatus;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.Health;
import org.onap.ccsdk.sli.plugins.grtoolkit.data.SiteHealth;

import org.opendaylight.yang.gen.v1.org.onap.ccsdk.sli.plugins.gr.toolkit.rev180926.FailoverInput;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.stream.Collectors;
50
51 /**
52  * Implementation of {@code HealthResolver} for a six node controller
53  * architecture, where three nodes are located in one data center, and the
54  * other three nodes are located in another. The sites are assumed to be in an
55  * Active/Standby configuration, with the Active site nodes voting and the
56  * Standby site notes non-voting.
57  *
58  * @author Anthony Haddox
59  * @see HealthResolver
60  */
61 public class SixNodeHealthResolver extends HealthResolver {
62     private final Logger log = LoggerFactory.getLogger(SixNodeHealthResolver.class);
63
64     /**
65      * Constructs the health resolver used by the {@code GrToolkitProvider} to
66      * determine the health of the application components.
67      *
68      * @param map a HashMap containing all of the nodes in the akka cluster
69      * @param properties the properties passed ino the provider
70      * @param dbLib a reference to the {@code DbLibService} of the provider
71      * @see HealthResolver
72      * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
73      */
74     public SixNodeHealthResolver(Map<String, ClusterActor> map, Properties properties, DbLibService dbLib) {
75         super(map, properties, dbLib);
76         resolveSites();
77     }
78
79     /**
80      * Implementation of {@code getClusterHealth()}. Uses the
81      * {@code ShardResolver} to gather health information about the controller.
82      * If 4 of 6 members are healthy, the cluster is deemed healthy.
83      *
84      * @return an {@code ClusterHealth} object with health of the akka cluster
85      * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
86      * @see HealthResolver
87      * @see ClusterHealth
88      * @see ShardResolver
89      */
90     @Override
91     public ClusterHealth getClusterHealth() {
92         log.info("getClusterHealth(): Getting cluster health...");
93         shardResolver.getControllerHealth(memberMap);
94         long healthyMembers = memberMap.values().stream().filter(member -> member.isUp() && ! member.isUnreachable()).count();
95         return (healthyMembers > 4) ? new ClusterHealth().withHealth(Health.HEALTHY) : new ClusterHealth().withHealth(Health.FAULTY);
96     }
97
98     /**
99      * Implementation of {@code getSiteHealth()}. Gathers health information on
100      * all of the contollers, then separates the nodes into voting and
101      * non-voting sites. Each site is then checked for its health and the
102      * result is returned as a List.
103      *
104      * @return a List of {@code SiteHealth} objects with health of the site
105      * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
106      * @see HealthResolver
107      * @see SiteHealth
108      * @see ShardResolver
109      */
110     @Override
111     public List<SiteHealth> getSiteHealth() {
112         log.info("getSiteHealth(): Getting site health...");
113
114         // Get cluster health to populate memberMap with necessary values
115         getClusterHealth();
116         List<ClusterActor> votingActors = memberMap.values().stream().filter(ClusterActor::isVoting).collect(Collectors.toList());
117         List<ClusterActor> nonVotingActors = memberMap.values().stream().filter(member -> !member.isVoting()).collect(Collectors.toList());
118
119         SiteHealth votingSiteHealth = getSiteHealth(votingActors).withRole("ACTIVE");
120         SiteHealth nonVotingSiteHealth = getSiteHealth(nonVotingActors).withRole("STANDBY");
121         return Arrays.asList(votingSiteHealth, nonVotingSiteHealth);
122     }
123
124     /**
125      * Gathers the site identifier, admin health, and database health of a
126      * site.
127      *
128      * @return a {@code SiteHealth} object with health of the site
129      * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
130      * @see ClusterActor
131      * @see SiteHealth
132      * @see ConnectionManager
133      */
134     public SiteHealth getSiteHealth(List<ClusterActor> actorList) {
135         AdminHealth adminHealth = null;
136         DatabaseHealth databaseHealth = null;
137         String siteId = null;
138         int healthyMembers = 0;
139
140         for(ClusterActor actor : actorList) {
141             if(actor.isUp() && !actor.isUnreachable()) {
142                 healthyMembers++;
143             }
144             if(siteId == null) {
145                 try {
146                     String content = ConnectionManager.getConnectionResponse(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:site-identifier", ConnectionManager.HttpMethod.POST, null, credentials).content;
147                     siteId = new JSONObject(content).getJSONObject(OUTPUT).getString("id");
148                 } catch(IOException e) {
149                     log.error("getSiteHealth(): Error getting site identifier from {}", actor.getNode());
150                     log.error("getSiteHealth(): IOException", e);
151                 }
152             }
153             if(adminHealth == null) {
154                 try {
155                     boolean isAdminHealthy  = isRemoteComponentHealthy(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:admin-health");
156                     if(isAdminHealthy) {
157                         adminHealth = new AdminHealth(Health.HEALTHY, 200);
158                     }
159                 } catch(IOException e) {
160                     log.error("getSiteHealth(): Error getting admin health from {}", actor.getNode());
161                     log.error("getSiteHealth(): IOException", e);
162                 }
163             }
164             if(databaseHealth == null) {
165                 try {
166                     boolean isDatabaseHealthy = isRemoteComponentHealthy(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:database-health");
167                     if(isDatabaseHealthy) {
168                         databaseHealth = new DatabaseHealth(Health.HEALTHY);
169                     }
170                 } catch(IOException e) {
171                     log.error("getSiteHealth(): Error getting database health from {}", actor.getNode());
172                     log.error("getSiteHealth(): IOException", e);
173                 }
174             }
175         }
176
177         if(siteId == null) {
178             siteId = "UNKNOWN SITE";
179         }
180         if(adminHealth == null) {
181             adminHealth = new AdminHealth(Health.FAULTY, 500);
182         }
183         if(databaseHealth == null) {
184             databaseHealth = new DatabaseHealth(Health.FAULTY);
185         }
186         SiteHealth health = new SiteHealth()
187                                     .withAdminHealth(adminHealth)
188                                     .withDatabaseHealth(databaseHealth)
189                                     .withId(siteId);
190         if(isHealthy(adminHealth.getHealth()) && isHealthy(databaseHealth.getHealth()) && healthyMembers > 1) {
191             health.setHealth(Health.HEALTHY);
192         }
193
194         return health;
195     }
196
197     /**
198      * Implementation of {@code tryFailover()}. Performs a preliminary call to
199      * {@code getClusterHealth} to populate information about the cluster. If
200      * no voting members can be found, the method terminates immediately. The
201      * nodes are separated into voting and non-voting sites, and a driving
202      * operator is selected from the non-voting nodes to perform requests
203      * against. A payload to swap voting between sites is sent to the operator
204      * to perform a controller-level failover.
205      *
206      * @return an {@code SiteHealth} object with health of the site
207      * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
208      * @see HealthResolver
209      * @see FailoverStatus
210      * @see FailoverInput
211      */
212     @Override
213     public FailoverStatus tryFailover(FailoverInput input) {
214         // Get Cluster Health to populate the memberMap with the necessary values
215         log.info("tryFailover(): Performing preliminary health check...");
216         getClusterHealth();
217         FailoverStatus status = new FailoverStatus();
218         ConnectionResponse votingResponse = null;
219         List<ClusterActor> votingActors = memberMap.values().stream().filter(ClusterActor::isVoting).collect(Collectors.toList());
220         List<ClusterActor> nonVotingActors = memberMap.values().stream().filter(member -> !member.isVoting()).collect(Collectors.toList());
221
222         if(nonVotingActors.size() == 0) {
223             status.setStatusCode(500);
224             status.setMessage("No nonvoting members found. Cannot perform voting switch.");
225             return status;
226         }
227
228         ClusterActor operator;
229         try {
230             operator = nonVotingActors.stream().filter(this::isControllerHealthy).findFirst().get();
231         } catch(NoSuchElementException e) {
232             log.error("tryFailover(): Could not find any healthy members.", e);
233             status.setStatusCode(500);
234             status.setMessage("Could not find any healthy members.");
235             return status;
236         }
237
238         // Assuming two 3 node sites, 3 voting and 3 non voting
239         if(votingActors.size() < 3 || nonVotingActors.size() < 3) {
240             log.warn("tryFailover(): Sites do not contain an equal amount of voting and nonvoting members: Voting: {} | NonVoting: {}", votingActors.size(), nonVotingActors.size());
241         }
242         log.info("tryFailover(): Swapping voting...");
243         try {
244             JSONObject votingInput = new JSONObject();
245             JSONObject inputBlock = new JSONObject();
246             JSONArray votingStateArray = new JSONArray();
247             JSONObject memberVotingState;
248             for(ClusterActor actor : votingActors) {
249                 memberVotingState = new JSONObject();
250                 memberVotingState.put("member-name", actor.getMember());
251                 memberVotingState.put("voting", false);
252                 votingStateArray.put(memberVotingState);
253             }
254             for(ClusterActor actor : nonVotingActors) {
255                 memberVotingState = new JSONObject();
256                 memberVotingState.put("member-name", actor.getMember());
257                 memberVotingState.put("voting", true);
258                 votingStateArray.put(memberVotingState);
259             }
260             inputBlock.put("member-voting-state", votingStateArray);
261             votingInput.put("input", inputBlock);
262             log.debug("tryFailover(): {}", votingInput);
263             // Change voting all shards
264             votingResponse = ConnectionManager.getConnectionResponse(httpProtocol + operator.getNode() + ":" + controllerPort + "/restconf/operations/cluster-admin:change-member-voting-states-for-all-shards", ConnectionManager.HttpMethod.POST, votingInput.toString(), credentials);
265         } catch(IOException e) {
266             log.error("tryFailover(): Failure changing voting", e);
267         }
268         if(votingResponse != null) {
269             if(votingResponse.statusCode != 200) {
270                 status.setStatusCode(votingResponse.statusCode);
271                 status.setMessage("Failed to swap voting.");
272             } else {
273                 status.setStatusCode(200);
274                 status.setMessage("Failover complete.");
275             }
276         } else {
277             status.setStatusCode(500);
278             status.setMessage("Failed to swap voting.");
279         }
280
281         return status;
282     }
283
284     /**
285      * Implementation of {@code resolveSites()}. Calls
286      * {@code resolveSiteForMember()} to resolve which site a member belongs to.
287      *
288      * @see HealthResolver
289      */
290     @Override
291     public void resolveSites() {
292         log.info("Map contains {} entries", memberMap.size());
293         memberMap.forEach((key, value) -> resolveSiteForMember(value));
294     }
295
296     /**
297      * Resolves which site a member belongs to. Members 1-3 are assumed to be
298      * <i>Site 1</i> while members 4-6 are assumed to be <i>Site 2</i>.
299      *
300      * @see HealthResolver
301      */
302     private void resolveSiteForMember(ClusterActor actor) {
303         try {
304             int memberNumber = Integer.parseInt(actor.getMember().split("-")[1]);
305             if(memberNumber < 4) {
306                 actor.setSite("Site 1");
307             } else {
308                 actor.setSite("Site 2");
309             }
310             log.info("resolveSiteForMember(): {} belongs to {}", actor.getNode(), actor.getSite());
311         } catch (NumberFormatException e) {
312             log.error("resolveSiteForMember(): Could not parse member number for {}. Defaulting to Site 1.", actor.getNode());
313             actor.setSite("resolveSiteForMember(): Site 1");
314         }
315     }
316 }