2 * ============LICENSE_START=======================================================
4 * ================================================================================
5 * Copyright (C) 2019 AT&T Intellectual Property. All rights
7 * ================================================================================
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 * ============LICENSE_END=========================================================
22 package org.onap.ccsdk.sli.plugins.grtoolkit.resolver;
24 import org.json.JSONArray;
25 import org.json.JSONObject;
27 import org.onap.ccsdk.sli.core.dblib.DbLibService;
28 import org.onap.ccsdk.sli.plugins.grtoolkit.connection.ConnectionManager;
29 import org.onap.ccsdk.sli.plugins.grtoolkit.connection.ConnectionResponse;
30 import org.onap.ccsdk.sli.plugins.grtoolkit.data.AdminHealth;
31 import org.onap.ccsdk.sli.plugins.grtoolkit.data.ClusterActor;
32 import org.onap.ccsdk.sli.plugins.grtoolkit.data.ClusterHealth;
33 import org.onap.ccsdk.sli.plugins.grtoolkit.data.DatabaseHealth;
34 import org.onap.ccsdk.sli.plugins.grtoolkit.data.FailoverStatus;
35 import org.onap.ccsdk.sli.plugins.grtoolkit.data.Health;
36 import org.onap.ccsdk.sli.plugins.grtoolkit.data.SiteHealth;
38 import org.opendaylight.yang.gen.v1.org.onap.ccsdk.sli.plugins.gr.toolkit.rev180926.FailoverInput;
40 import org.slf4j.Logger;
41 import org.slf4j.LoggerFactory;
43 import java.io.IOException;
44 import java.util.Arrays;
45 import java.util.List;
47 import java.util.NoSuchElementException;
48 import java.util.Properties;
49 import java.util.stream.Collectors;
52 * Implementation of {@code HealthResolver} for a six node controller
53 * architecture, where three nodes are located in one data center, and the
54 * other three nodes are located in another. The sites are assumed to be in an
55 * Active/Standby configuration, with the Active site nodes voting and the
56 * Standby site nodes non-voting.
58 * @author Anthony Haddox
61 public class SixNodeHealthResolver extends HealthResolver {
62 private final Logger log = LoggerFactory.getLogger(SixNodeHealthResolver.class);
65 * Constructs the health resolver used by the {@code GrToolkitProvider} to
66 * determine the health of the application components.
68 * @param map a HashMap containing all of the nodes in the akka cluster
69 * @param properties the properties passed into the provider
70 * @param dbLib a reference to the {@code DbLibService} of the provider
72 * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
// Builds the resolver by handing the cluster member map, provider properties
// and DbLibService reference to the HealthResolver superclass constructor.
// NOTE(review): any statement that followed the super(...) call is not
// visible in this view -- confirm against the full file before editing.
74 public SixNodeHealthResolver(Map<String, ClusterActor> map, Properties properties, DbLibService dbLib) {
75 super(map, properties, dbLib);
80 * Implementation of {@code getClusterHealth()}. Uses the
81 * {@code ShardResolver} to gather health information about the controller.
82 * If 4 of 6 members are healthy, the cluster is deemed healthy.
84 * @return an {@code ClusterHealth} object with health of the akka cluster
85 * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
91 public ClusterHealth getClusterHealth() {
92 log.info("getClusterHealth(): Getting cluster health...");
93 shardResolver.getControllerHealth(memberMap);
94 long healthyMembers = memberMap.values().stream().filter(member -> member.isUp() && ! member.isUnreachable()).count();
95 return (healthyMembers > 4) ? new ClusterHealth().withHealth(Health.HEALTHY) : new ClusterHealth().withHealth(Health.FAULTY);
99 * Implementation of {@code getSiteHealth()}. Gathers health information on
100 * all of the contollers, then separates the nodes into voting and
101 * non-voting sites. Each site is then checked for its health and the
102 * result is returned as a List.
104 * @return a List of {@code SiteHealth} objects with health of the site
105 * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
106 * @see HealthResolver
111 public List<SiteHealth> getSiteHealth() {
112 log.info("getSiteHealth(): Getting site health...");
114 // Get cluster health to populate memberMap with necessary values
116 List<ClusterActor> votingActors = memberMap.values().stream().filter(ClusterActor::isVoting).collect(Collectors.toList());
117 List<ClusterActor> nonVotingActors = memberMap.values().stream().filter(member -> !member.isVoting()).collect(Collectors.toList());
119 SiteHealth votingSiteHealth = getSiteHealth(votingActors).withRole("ACTIVE");
120 SiteHealth nonVotingSiteHealth = getSiteHealth(nonVotingActors).withRole("STANDBY");
121 return Arrays.asList(votingSiteHealth, nonVotingSiteHealth);
125 * Gathers the site identifier, admin health, and database health of a site.
128 * @return a {@code SiteHealth} object with health of the site
129 * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
132 * @see ConnectionManager
// Builds the SiteHealth for one site from the given list of cluster actors:
// queries the actors over RESTCONF for a site identifier, admin health and
// database health, and falls back to "UNKNOWN SITE" / FAULTY values for
// anything that could not be obtained.
// NOTE(review): several lines of this method are not visible in this view
// (loop/branch closers, the healthyMembers update, the tail of the builder
// chain and the return) -- confirm against the full file before changing logic.
134 public SiteHealth getSiteHealth(List<ClusterActor> actorList) {
135 AdminHealth adminHealth = null;
136 DatabaseHealth databaseHealth = null;
137 String siteId = null;
// Compared against 1 below; presumably counts actors that are up and
// reachable -- the increment itself is not visible here (TODO confirm).
138 int healthyMembers = 0;
140 for(ClusterActor actor : actorList) {
141 if(actor.isUp() && !actor.isUnreachable()) {
// Ask this actor for the site identifier; the "id" field of the response's
// OUTPUT object becomes siteId.
146 String content = ConnectionManager.getConnectionResponse(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:site-identifier", ConnectionManager.HttpMethod.POST, null, credentials).content;
147 siteId = new JSONObject(content).getJSONObject(OUTPUT).getString("id");
148 } catch(IOException e) {
// An I/O failure is logged and not rethrown here.
149 log.error("getSiteHealth(): Error getting site identifier from {}", actor.getNode());
150 log.error("getSiteHealth(): IOException", e);
// Admin health is only queried while no result has been recorded yet.
153 if(adminHealth == null) {
155 boolean isAdminHealthy = isRemoteComponentHealthy(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:admin-health");
157 adminHealth = new AdminHealth(Health.HEALTHY, 200);
159 } catch(IOException e) {
160 log.error("getSiteHealth(): Error getting admin health from {}", actor.getNode());
161 log.error("getSiteHealth(): IOException", e);
// Database health follows the same pattern as admin health.
164 if(databaseHealth == null) {
166 boolean isDatabaseHealthy = isRemoteComponentHealthy(httpProtocol + actor.getNode() + ":" + controllerPort + "/restconf/operations/gr-toolkit:database-health");
167 if(isDatabaseHealthy) {
168 databaseHealth = new DatabaseHealth(Health.HEALTHY);
170 } catch(IOException e) {
171 log.error("getSiteHealth(): Error getting database health from {}", actor.getNode());
172 log.error("getSiteHealth(): IOException", e);
// Fallbacks for values that were not obtained from any actor.
178 siteId = "UNKNOWN SITE";
180 if(adminHealth == null) {
181 adminHealth = new AdminHealth(Health.FAULTY, 500);
183 if(databaseHealth == null) {
184 databaseHealth = new DatabaseHealth(Health.FAULTY);
186 SiteHealth health = new SiteHealth()
187 .withAdminHealth(adminHealth)
188 .withDatabaseHealth(databaseHealth)
// The site is HEALTHY only when admin and database are both healthy and
// more than one member was counted as healthy.
190 if(isHealthy(adminHealth.getHealth()) && isHealthy(databaseHealth.getHealth()) && healthyMembers > 1) {
191 health.setHealth(Health.HEALTHY);
198 * Implementation of {@code tryFailover()}. Performs a preliminary call to
199 * {@code getClusterHealth} to populate information about the cluster. If
200 * no voting members can be found, the method terminates immediately. The
201 * nodes are separated into voting and non-voting sites, and a driving
202 * operator is selected from the non-voting nodes to perform requests
203 * against. A payload to swap voting between sites is sent to the operator
204 * to perform a controller-level failover.
206 * @return an {@code SiteHealth} object with health of the site
207 * @see org.onap.ccsdk.sli.plugins.grtoolkit.GrToolkitProvider
208 * @see HealthResolver
209 * @see FailoverStatus
213 public FailoverStatus tryFailover(FailoverInput input) {
214 // Get Cluster Health to populate the memberMap with the necessary values
215 log.info("tryFailover(): Performing preliminary health check...");
217 FailoverStatus status = new FailoverStatus();
218 ConnectionResponse votingResponse = null;
219 List<ClusterActor> votingActors = memberMap.values().stream().filter(ClusterActor::isVoting).collect(Collectors.toList());
220 List<ClusterActor> nonVotingActors = memberMap.values().stream().filter(member -> !member.isVoting()).collect(Collectors.toList());
222 if(nonVotingActors.size() == 0) {
223 status.setStatusCode(500);
224 status.setMessage("No nonvoting members found. Cannot perform voting switch.");
228 ClusterActor operator;
230 operator = nonVotingActors.stream().filter(this::isControllerHealthy).findFirst().get();
231 } catch(NoSuchElementException e) {
232 log.error("tryFailover(): Could not find any healthy members.", e);
233 status.setStatusCode(500);
234 status.setMessage("Could not find any healthy members.");
238 // Assuming two 3 node sites, 3 voting and 3 non voting
239 if(votingActors.size() < 3 || nonVotingActors.size() < 3) {
240 log.warn("tryFailover(): Sites do not contain an equal amount of voting and nonvoting members: Voting: {} | NonVoting: {}", votingActors.size(), nonVotingActors.size());
242 log.info("tryFailover(): Swapping voting...");
244 JSONObject votingInput = new JSONObject();
245 JSONObject inputBlock = new JSONObject();
246 JSONArray votingStateArray = new JSONArray();
247 JSONObject memberVotingState;
248 for(ClusterActor actor : votingActors) {
249 memberVotingState = new JSONObject();
250 memberVotingState.put("member-name", actor.getMember());
251 memberVotingState.put("voting", false);
252 votingStateArray.put(memberVotingState);
254 for(ClusterActor actor : nonVotingActors) {
255 memberVotingState = new JSONObject();
256 memberVotingState.put("member-name", actor.getMember());
257 memberVotingState.put("voting", true);
258 votingStateArray.put(memberVotingState);
260 inputBlock.put("member-voting-state", votingStateArray);
261 votingInput.put("input", inputBlock);
262 log.debug("tryFailover(): {}", votingInput);
263 // Change voting all shards
264 votingResponse = ConnectionManager.getConnectionResponse(httpProtocol + operator.getNode() + ":" + controllerPort + "/restconf/operations/cluster-admin:change-member-voting-states-for-all-shards", ConnectionManager.HttpMethod.POST, votingInput.toString(), credentials);
265 } catch(IOException e) {
266 log.error("tryFailover(): Failure changing voting", e);
268 if(votingResponse != null) {
269 if(votingResponse.statusCode != 200) {
270 status.setStatusCode(votingResponse.statusCode);
271 status.setMessage("Failed to swap voting.");
273 status.setStatusCode(200);
274 status.setMessage("Failover complete.");
277 status.setStatusCode(500);
278 status.setMessage("Failed to swap voting.");
285 * Implementation of {@code resolveSites()}. Calls
286 * {@code resolveSiteForMember()} to resolve which site a member belongs to.
288 * @see HealthResolver
291 public void resolveSites() {
292 log.info("Map contains {} entries", memberMap.size());
293 memberMap.forEach((key, value) -> resolveSiteForMember(value));
297 * Resolves which site a member belongs to. Members 1-3 are assumed to be
298 * <i>Site 1</i> while members 4-6 are assumed to be <i>Site 2</i>.
300 * @see HealthResolver
302 private void resolveSiteForMember(ClusterActor actor) {
304 int memberNumber = Integer.parseInt(actor.getMember().split("-")[1]);
305 if(memberNumber < 4) {
306 actor.setSite("Site 1");
308 actor.setSite("Site 2");
310 log.info("resolveSiteForMember(): {} belongs to {}", actor.getNode(), actor.getSite());
311 } catch (NumberFormatException e) {
312 log.error("resolveSiteForMember(): Could not parse member number for {}. Defaulting to Site 1.", actor.getNode());
313 actor.setSite("resolveSiteForMember(): Site 1");