1 /*******************************************************************************
2 * ============LICENSE_START=======================================================
4 * ================================================================================
5 * Copyright © 2017 AT&T Intellectual Property. All rights reserved.
6 * ================================================================================
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 * ============LICENSE_END=========================================================
19 * ECOMP is a trademark and service mark of AT&T Intellectual Property.
21 *******************************************************************************/
22 package com.att.nsa.cambria.backends.kafka;
24 import java.io.IOException;
25 import java.util.Collection;
26 import java.util.LinkedList;
27 import java.util.List;
28 import java.util.Map.Entry;
29 import java.util.concurrent.ConcurrentHashMap;
30 import java.util.concurrent.Executors;
31 import java.util.concurrent.ScheduledExecutorService;
32 import java.util.concurrent.TimeUnit;
34 import org.I0Itec.zkclient.exception.ZkException;
35 import org.I0Itec.zkclient.exception.ZkInterruptedException;
36 import org.apache.curator.framework.CuratorFramework;
37 import org.apache.curator.framework.imps.CuratorFrameworkState;
38 import org.apache.curator.framework.recipes.cache.ChildData;
39 import org.apache.curator.framework.recipes.cache.PathChildrenCache;
40 import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
41 import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
42 import org.apache.curator.framework.state.ConnectionState;
43 import org.apache.curator.framework.state.ConnectionStateListener;
44 import org.apache.curator.utils.EnsurePath;
45 import org.apache.curator.utils.ZKPaths;
46 import org.apache.http.annotation.NotThreadSafe;
47 import org.apache.zookeeper.KeeperException;
48 import org.apache.zookeeper.KeeperException.NoNodeException;
49 //import org.slf4j.Logger;
50 //import org.slf4j.LoggerFactory;
51 import com.att.eelf.configuration.EELFLogger;
52 import com.att.eelf.configuration.EELFManager;
53 import com.att.ajsc.filemonitor.AJSCPropertiesMap;
54 import com.att.nsa.cambria.backends.Consumer;
55 import com.att.nsa.cambria.backends.MetricsSet;
56 import com.att.nsa.cambria.constants.CambriaConstants;
57 import com.att.nsa.cambria.utils.ConfigurationReader;
58 import com.att.nsa.drumlin.till.nv.rrNvReadable;
61 * @NotThreadSafe but expected to be used within KafkaConsumerFactory, which
67 public class KafkaConsumerCache {
69 private static final String kSetting_ConsumerHandoverWaitMs = "cambria.consumer.cache.handoverWaitMs";
70 private static final int kDefault_ConsumerHandoverWaitMs = 500;
72 private static final String kSetting_SweepEverySeconds = "cambria.consumer.cache.sweepFreqSeconds";
73 private static final String kSetting_TouchEveryMs = "cambria.consumer.cache.touchFreqMs";
75 private static final String kSetting_ZkBasePath = "cambria.consumer.cache.zkBasePath";
76 private static final String kDefault_ZkBasePath = CambriaConstants.kDefault_ZkRoot + "/consumerCache";
78 // kafka defaults to timing out a client after 6 seconds of inactivity, but
79 // it heartbeats even when the client isn't fetching. Here, we don't
80 // want to prematurely rebalance the consumer group. Assuming clients are
82 // the server at least every 30 seconds, timing out after 2 minutes should
84 // FIXME: consider allowing the client to specify its expected call rate?
85 private static final long kDefault_MustTouchEveryMs = 1000 * 60 * 2;
87 // check for expirations pretty regularly
88 private static final long kDefault_SweepEverySeconds = 15;
91 NOT_STARTED, CONNECTED, DISCONNECTED, SUSPENDED
/**
 * Checked exception reported by the Kafka consumer cache, either wrapping an
 * underlying failure or carrying a plain description.
 */
public class KafkaConsumerCacheException extends Exception {
    private static final long serialVersionUID = 1L;

    /**
     * Builds an exception that wraps the failure which caused it.
     *
     * @param t the underlying cause
     */
    KafkaConsumerCacheException(Throwable t) {
        super(t);
    }

    /**
     * Builds an exception carrying only a message.
     *
     * @param s description of the problem
     */
    public KafkaConsumerCacheException(String s) {
        super(s);
    }
}
/**
 * Creates a KafkaConsumerCache object. Before it is used, you must call
 * startCache().
 *
 * @param apiId   identifier of this API node; written to ZooKeeper when claiming consumers
 * @param metrics receiver of the cache hit / miss / timeout / claim notifications
 * @throws IllegalArgumentException if apiId is null
 */
public KafkaConsumerCache(String apiId, MetricsSet metrics) {
    if (null == apiId) {
        throw new IllegalArgumentException("API Node ID must be specified.");
    }

    fApiId = apiId;
    fMetrics = metrics;

    // The ZooKeeper base path is configurable; fall back to the default when unset.
    final String configuredBasePath =
            AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop, kSetting_ZkBasePath);
    fBaseZkPath = (configuredBasePath != null) ? configuredBasePath : kDefault_ZkBasePath;

    fConsumers = new ConcurrentHashMap<String, KafkaConsumer>();
    fSweepScheduler = Executors.newScheduledThreadPool(1);

    curatorConsumerCache = null;

    status = Status.NOT_STARTED;

    // Translates ZooKeeper connection-state changes into cache state transitions.
    listener = new ConnectionStateListener() {
        public void stateChanged(CuratorFramework client, ConnectionState newState) {
            if (newState == ConnectionState.LOST) {
                log.info("ZooKeeper connection expired");
                handleConnectionLoss();
                return;
            }
            if (newState == ConnectionState.READ_ONLY) {
                log.warn("ZooKeeper connection set to read only mode.");
                return;
            }
            if (newState == ConnectionState.RECONNECTED) {
                log.info("ZooKeeper connection re-established");
                handleReconnection();
                return;
            }
            if (newState == ConnectionState.SUSPENDED) {
                log.warn("ZooKeeper connection has been suspended.");
                handleConnectionSuspended();
            }
        }
    };
}
168 * Start the cache service. This must be called before any get/put
174 * @throws IOException
175 * @throws KafkaConsumerCacheException
177 public void startCache(String mode, CuratorFramework curator) throws KafkaConsumerCacheException {
180 // CuratorFramework curator = null;
182 // Changed the class from where we are initializing the curator
184 if (mode != null && mode.equals(CambriaConstants.CAMBRIA)) {
185 curator = ConfigurationReader.getCurator();
186 } else if (mode != null && mode.equals(CambriaConstants.DMAAP)) {
187 curator = getCuratorFramework(curator);
190 curator.getConnectionStateListenable().addListener(listener);
192 setStatus(Status.CONNECTED);
194 curatorConsumerCache = new PathChildrenCache(curator, fBaseZkPath, true);
195 curatorConsumerCache.start();
197 curatorConsumerCache.getListenable().addListener(new PathChildrenCacheListener() {
198 public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) throws Exception {
199 switch (event.getType()) {
201 final String apiId = new String(event.getData().getData());
202 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
204 log.info(apiId + " started consumer " + consumer);
207 case CHILD_UPDATED: {
208 final String apiId = new String(event.getData().getData());
209 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
211 if (fConsumers.containsKey(consumer)) {
212 log.info(apiId + " claimed consumer " + consumer + " from " + fApiId);
214 dropClaimedConsumer(consumer);
219 case CHILD_REMOVED: {
220 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
222 if (fConsumers.containsKey(consumer)) {
223 log.info("Someone wanted consumer " + consumer + " gone; removing it from the cache");
224 dropConsumer(consumer, false);
235 // initialize the ZK path
236 EnsurePath ensurePath = new EnsurePath(fBaseZkPath);
237 ensurePath.ensure(curator.getZookeeperClient());
239 //final long freq = fSettings.getLong(kSetting_SweepEverySeconds, kDefault_SweepEverySeconds);
240 long freq = kDefault_SweepEverySeconds;
241 String strkSetting_SweepEverySeconds = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_SweepEverySeconds);
242 if(null==strkSetting_SweepEverySeconds) strkSetting_SweepEverySeconds = kDefault_SweepEverySeconds+"";
244 freq = Long.parseLong(strkSetting_SweepEverySeconds);
246 fSweepScheduler.scheduleAtFixedRate(new sweeper(), freq, freq, TimeUnit.SECONDS);
247 log.info("KafkaConsumerCache started");
248 log.info("sweeping cached clients every " + freq + " seconds");
249 } catch (ZkException e) {
250 throw new KafkaConsumerCacheException(e);
251 } catch (Exception e) {
252 throw new KafkaConsumerCacheException(e);
257 * Getting the curator oject to start the zookeeper connection estabished
260 * @return curator object
262 public static CuratorFramework getCuratorFramework(CuratorFramework curator) {
263 if (curator.getState() == CuratorFrameworkState.LATENT) {
267 curator.blockUntilConnected();
268 } catch (InterruptedException e) {
270 log.error("error while setting curator framework :" + e.getMessage());
278 * Stop the cache service.
280 public void stopCache() {
281 setStatus(Status.DISCONNECTED);
283 final CuratorFramework curator = ConfigurationReader.getCurator();
285 if (curator != null) {
287 curator.getConnectionStateListenable().removeListener(listener);
288 curatorConsumerCache.close();
289 log.info("Curator client closed");
290 } catch (ZkInterruptedException e) {
291 log.warn("Curator client close interrupted: " + e.getMessage());
292 } catch (IOException e) {
293 log.warn("Error while closing curator PathChildrenCache for KafkaConsumerCache" + e.getMessage());
296 curatorConsumerCache = null;
299 if (fSweepScheduler != null) {
300 fSweepScheduler.shutdownNow();
301 log.info("cache sweeper stopped");
304 if (fConsumers != null) {
309 setStatus(Status.NOT_STARTED);
311 log.info("Consumer cache service stopped");
315 * Get a cached consumer by topic, group, and id, if it exists (and remains
316 * valid) In addition, this method waits for all other consumer caches in
317 * the cluster to release their ownership and delete their version of this
321 * @param consumerGroupId
323 * @return a consumer, or null
325 public KafkaConsumer getConsumerFor(String topic, String consumerGroupId, String clientId)
326 throws KafkaConsumerCacheException {
327 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
328 throw new KafkaConsumerCacheException("The cache service is unavailable.");
330 final String consumerKey = makeConsumerKey(topic, consumerGroupId, clientId);
331 final KafkaConsumer kc = fConsumers.get(consumerKey);
334 log.debug("Consumer cache hit for [" + consumerKey + "], last was at " + kc.getLastTouch());
336 fMetrics.onKafkaConsumerCacheHit();
338 log.debug("Consumer cache miss for [" + consumerKey + "]");
339 fMetrics.onKafkaConsumerCacheMiss();
346 * Put a consumer into the cache by topic, group and ID
349 * @param consumerGroupId
352 * @throws KafkaConsumerCacheException
354 public void putConsumerFor(String topic, String consumerGroupId, String consumerId, KafkaConsumer consumer)
355 throws KafkaConsumerCacheException {
356 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
357 throw new KafkaConsumerCacheException("The cache service is unavailable.");
359 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
360 fConsumers.put(consumerKey, consumer);
363 public Collection<? extends Consumer> getConsumers() {
364 return new LinkedList<KafkaConsumer>(fConsumers.values());
368 * This method is to drop all the consumer
370 public void dropAllConsumers() {
371 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
372 dropConsumer(entry.getKey(), true);
375 // consumers should be empty here
376 if (fConsumers.size() > 0) {
377 log.warn("During dropAllConsumers, the consumer map is not empty.");
383 * Drop a consumer from our cache due to a timeout
387 private void dropTimedOutConsumer(String key) {
388 fMetrics.onKafkaConsumerTimeout();
390 if (!fConsumers.containsKey(key)) {
391 log.warn("Attempted to drop a timed out consumer which was not in our cache: " + key);
395 // First, drop this consumer from our cache
396 dropConsumer(key, true);
398 final CuratorFramework curator = ConfigurationReader.getCurator();
401 curator.delete().guaranteed().forPath(fBaseZkPath + "/" + key);
402 } catch (NoNodeException e) {
403 log.warn("A consumer was deleted from " + fApiId
404 + "'s cache, but no Cambria API node had ownership of it in ZooKeeper");
405 } catch (Exception e) {
406 log.debug("Unexpected exception while deleting consumer: " + e.getMessage());
409 log.info("Dropped " + key + " consumer due to timeout");
413 * Drop a consumer from our cache due to another API node claiming it as
418 private void dropClaimedConsumer(String key) {
419 // if the consumer is still in our cache, it implies a claim.
420 if (fConsumers.containsKey(key)) {
421 fMetrics.onKafkaConsumerClaimed();
422 log.info("Consumer [" + key + "] claimed by another node.");
425 dropConsumer(key, false);
429 * Removes the consumer from the cache and closes its connection to the
433 * @param dueToTimeout
435 private void dropConsumer(String key, boolean dueToTimeout) {
436 final KafkaConsumer kc = fConsumers.remove(key);
439 log.info("closing Kafka consumer " + key);
444 // private final rrNvReadable fSettings;
445 private final MetricsSet fMetrics;
446 private final String fBaseZkPath;
447 private final ScheduledExecutorService fSweepScheduler;
448 private final String fApiId;
449 private final ConnectionStateListener listener;
451 private ConcurrentHashMap<String, KafkaConsumer> fConsumers;
452 private PathChildrenCache curatorConsumerCache;
454 private volatile Status status;
456 private void handleReconnection() {
458 log.info("Reading current cache data from ZK and synchronizing local cache");
460 final List<ChildData> cacheData = curatorConsumerCache.getCurrentData();
462 // Remove all the consumers in this API nodes cache that now belong to
464 for (ChildData cachedConsumer : cacheData) {
465 final String consumerId = ZKPaths.getNodeFromPath(cachedConsumer.getPath());
466 final String owningApiId = (cachedConsumer.getData() != null) ? new String(cachedConsumer.getData())
469 if (!fApiId.equals(owningApiId)) {
470 fConsumers.remove(consumerId);
474 setStatus(Status.CONNECTED);
477 private void handleConnectionSuspended() {
478 log.info("Suspending cache until ZK connection is re-established");
480 setStatus(Status.SUSPENDED);
483 private void handleConnectionLoss() {
484 log.info("Clearing consumer cache (shutting down all Kafka consumers on this node)");
486 setStatus(Status.DISCONNECTED);
488 closeAllCachedConsumers();
492 private void closeAllCachedConsumers() {
493 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
494 entry.getValue().close();
498 private static String makeConsumerKey(String topic, String consumerGroupId, String clientId) {
499 return topic + "::" + consumerGroupId + "::" + clientId;
503 * This method is to get a lock
506 * @param consumerGroupId
508 * @throws KafkaConsumerCacheException
510 public void signalOwnership(final String topic, final String consumerGroupId, final String consumerId)
511 throws KafkaConsumerCacheException {
512 // get a lock at <base>/<topic>::<consumerGroupId>::<consumerId>
513 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
516 final String consumerPath = fBaseZkPath + "/" + consumerKey;
518 log.debug(fApiId + " attempting to claim ownership of consumer " + consumerKey);
520 final CuratorFramework curator = ConfigurationReader.getCurator();
523 curator.setData().forPath(consumerPath, fApiId.getBytes());
524 } catch (KeeperException.NoNodeException e) {
525 curator.create().creatingParentsIfNeeded().forPath(consumerPath, fApiId.getBytes());
528 log.info(fApiId + " successfully claimed ownership of consumer " + consumerKey);
529 } catch (Exception e) {
530 log.error(fApiId + " failed to claim ownership of consumer " + consumerKey);
531 throw new KafkaConsumerCacheException(e);
534 log.info("Backing off to give the Kafka broker time to clean up the ZK data for this consumer");
537 int kSetting_ConsumerHandoverWaitMs = kDefault_ConsumerHandoverWaitMs;
538 String strkSetting_ConsumerHandoverWaitMs= AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_ConsumerHandoverWaitMs+"");
539 if(strkSetting_ConsumerHandoverWaitMs!=null) kSetting_ConsumerHandoverWaitMs = Integer.parseInt(strkSetting_ConsumerHandoverWaitMs);
541 Thread.sleep(kSetting_ConsumerHandoverWaitMs);
542 //Thread.sleep(fSettings.getInt(kSetting_ConsumerHandoverWaitMs, kDefault_ConsumerHandoverWaitMs));
543 } catch (InterruptedException e) {
548 private void sweep() {
549 final LinkedList<String> removals = new LinkedList<String>();
550 long mustTouchEveryMs = kDefault_MustTouchEveryMs;
551 String strkSetting_TouchEveryMs = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_TouchEveryMs);
552 //if(null!=strkSetting_TouchEveryMs) strkSetting_TouchEveryMs = kDefault_MustTouchEveryMs+"";
553 if(null!=strkSetting_TouchEveryMs)
555 mustTouchEveryMs = Long.parseLong(strkSetting_TouchEveryMs);
558 //final long mustTouchEveryMs = fSettings.getLong(kSetting_TouchEveryMs, kDefault_MustTouchEveryMs);
559 final long oldestAllowedTouchMs = System.currentTimeMillis() - mustTouchEveryMs;
561 for (Entry<String, KafkaConsumer> e : fConsumers.entrySet()) {
562 final long lastTouchMs = e.getValue().getLastTouch();
564 log.debug("consumer " + e.getKey() + " last touched at " + lastTouchMs);
566 if (lastTouchMs < oldestAllowedTouchMs) {
567 log.info("consumer " + e.getKey() + " has expired");
568 removals.add(e.getKey());
572 for (String key : removals) {
573 dropTimedOutConsumer(key);
578 * Creating a thread to run the sweep method
583 private class sweeper implements Runnable {
593 * This method is to drop consumer
596 * @param consumerGroup
599 public void dropConsumer(String topic, String consumerGroup, String clientId) {
600 dropConsumer(makeConsumerKey(topic, consumerGroup, clientId), false);
603 private Status getStatus() {
607 private void setStatus(Status status) {
608 this.status = status;
611 private static final EELFLogger log = EELFManager.getInstance().getLogger(KafkaConsumerCache.class);
612 //private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCache.class);