1 /*******************************************************************************
2 * ============LICENSE_START=======================================================
4 * ================================================================================
5 * Copyright © 2017 AT&T Intellectual Property. All rights reserved.
6 * ================================================================================
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 * ============LICENSE_END=========================================================
19 * ECOMP is a trademark and service mark of AT&T Intellectual Property.
21 *******************************************************************************/
22 package com.att.nsa.cambria.backends.kafka;
24 import java.io.IOException;
25 import java.util.Collection;
26 import java.util.LinkedList;
27 import java.util.List;
28 import java.util.Map.Entry;
29 import java.util.concurrent.ConcurrentHashMap;
30 import java.util.concurrent.Executors;
31 import java.util.concurrent.ScheduledExecutorService;
32 import java.util.concurrent.TimeUnit;
34 import org.I0Itec.zkclient.exception.ZkException;
35 import org.I0Itec.zkclient.exception.ZkInterruptedException;
36 import org.apache.curator.framework.CuratorFramework;
37 import org.apache.curator.framework.imps.CuratorFrameworkState;
38 import org.apache.curator.framework.recipes.cache.ChildData;
39 import org.apache.curator.framework.recipes.cache.PathChildrenCache;
40 import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
41 import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
42 import org.apache.curator.framework.state.ConnectionState;
43 import org.apache.curator.framework.state.ConnectionStateListener;
44 import org.apache.curator.utils.EnsurePath;
45 import org.apache.curator.utils.ZKPaths;
46 import org.apache.http.annotation.NotThreadSafe;
47 import org.apache.zookeeper.KeeperException;
48 import org.apache.zookeeper.KeeperException.NoNodeException;
50 //import org.slf4j.LoggerFactory;
51 import com.att.eelf.configuration.EELFLogger;
52 import com.att.eelf.configuration.EELFManager;
53 import com.att.ajsc.filemonitor.AJSCPropertiesMap;
54 import com.att.nsa.cambria.backends.Consumer;
55 import com.att.nsa.cambria.backends.MetricsSet;
56 import com.att.nsa.cambria.constants.CambriaConstants;
57 import com.att.nsa.cambria.utils.ConfigurationReader;
58 import com.att.nsa.drumlin.till.nv.rrNvReadable;
/**
 * Cache of the Kafka consumers held by this API node. Not thread-safe on its
 * own, but expected to be used within KafkaConsumerFactory.
 */
67 public class KafkaConsumerCache {
69 private static final String kSetting_ConsumerHandoverWaitMs = "cambria.consumer.cache.handoverWaitMs";
70 private static final int kDefault_ConsumerHandoverWaitMs = 500;
72 private static final String kSetting_SweepEverySeconds = "cambria.consumer.cache.sweepFreqSeconds";
73 private static final String kSetting_TouchEveryMs = "cambria.consumer.cache.touchFreqMs";
75 private static final String kSetting_ZkBasePath = "cambria.consumer.cache.zkBasePath";
76 private static final String kDefault_ZkBasePath = CambriaConstants.kDefault_ZkRoot + "/consumerCache";
78 // kafka defaults to timing out a client after 6 seconds of inactivity, but
79 // it heartbeats even when the client isn't fetching. Here, we don't
80 // want to prematurely rebalance the consumer group. Assuming clients are
82 // the server at least every 30 seconds, timing out after 2 minutes should
84 // FIXME: consider allowing the client to specify its expected call rate?
85 private static final long kDefault_MustTouchEveryMs = (long)1000 * 60 * 2;
87 // check for expirations pretty regularly
88 private static final long kDefault_SweepEverySeconds = 15;
91 NOT_STARTED, CONNECTED, DISCONNECTED, SUSPENDED
/**
 * Checked exception raised when a consumer cache operation cannot be
 * completed.
 */
public class KafkaConsumerCacheException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * Wraps the failure that interrupted a cache operation.
     *
     * @param t the underlying cause
     */
    KafkaConsumerCacheException(Throwable t) {
        super(t);
    }

    /**
     * Reports a cache failure described only by a message.
     *
     * @param s description of the failure
     */
    public KafkaConsumerCacheException(String s) {
        super(s);
    }
}
/**
 * Creates a KafkaConsumerCache object. Before it is used, you must call
 * startCache().
 *
 * @param apiId   identifier of this API node; must not be null
 * @param metrics receiver of the cache hit, miss, timeout and claim
 *                notifications issued by this cache
 * @throws IllegalArgumentException if apiId is null
 */
public KafkaConsumerCache(String apiId, MetricsSet metrics) {
    if (null == apiId) {
        throw new IllegalArgumentException("API Node ID must be specified.");
    }

    fApiId = apiId;
    fMetrics = metrics;

    // Use the configured ZooKeeper base path, or the built-in default when
    // the property is absent.
    final String configuredBasePath = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,
            kSetting_ZkBasePath);
    fBaseZkPath = (configuredBasePath != null) ? configuredBasePath : kDefault_ZkBasePath;

    fConsumers = new ConcurrentHashMap<String, KafkaConsumer>();
    fSweepScheduler = Executors.newScheduledThreadPool(1);
    curatorConsumerCache = null;
    status = Status.NOT_STARTED;

    // Translate ZooKeeper connection state changes into cache state changes.
    listener = new ConnectionStateListener() {
        public void stateChanged(CuratorFramework client, ConnectionState newState) {
            if (ConnectionState.LOST == newState) {
                log.info("ZooKeeper connection expired");
                handleConnectionLoss();
            } else if (ConnectionState.READ_ONLY == newState) {
                log.warn("ZooKeeper connection set to read only mode.");
            } else if (ConnectionState.RECONNECTED == newState) {
                log.info("ZooKeeper connection re-established");
                handleReconnection();
            } else if (ConnectionState.SUSPENDED == newState) {
                log.warn("ZooKeeper connection has been suspended.");
                handleConnectionSuspended();
            }
        }
    };
}
168 * Start the cache service. This must be called before any get/put
174 * @throws IOException
175 * @throws KafkaConsumerCacheException
177 public void startCache(String mode, CuratorFramework curator) throws KafkaConsumerCacheException {
180 // CuratorFramework curator = null;
182 // Changed the class from where we are initializing the curator
184 if (mode != null && mode.equals(CambriaConstants.CAMBRIA)) {
185 curator = ConfigurationReader.getCurator();
186 } else if (mode != null && mode.equals(CambriaConstants.DMAAP)) {
187 curator = getCuratorFramework(curator);
190 curator.getConnectionStateListenable().addListener(listener);
192 setStatus(Status.CONNECTED);
194 curatorConsumerCache = new PathChildrenCache(curator, fBaseZkPath, true);
195 curatorConsumerCache.start();
197 curatorConsumerCache.getListenable().addListener(new PathChildrenCacheListener() {
198 public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) throws Exception {
199 switch (event.getType()) {
201 final String apiId = new String(event.getData().getData());
202 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
204 log.info(apiId + " started consumer " + consumer);
207 case CHILD_UPDATED: {
208 final String apiId = new String(event.getData().getData());
209 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
211 if (fConsumers.containsKey(consumer)) {
212 log.info(apiId + " claimed consumer " + consumer + " from " + fApiId);
214 dropClaimedConsumer(consumer);
219 case CHILD_REMOVED: {
220 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
222 if (fConsumers.containsKey(consumer)) {
223 log.info("Someone wanted consumer " + consumer + " gone; removing it from the cache");
224 dropConsumer(consumer, false);
235 // initialize the ZK path
236 EnsurePath ensurePath = new EnsurePath(fBaseZkPath);
237 ensurePath.ensure(curator.getZookeeperClient());
239 //final long freq = fSettings.getLong(kSetting_SweepEverySeconds, kDefault_SweepEverySeconds);
240 long freq = kDefault_SweepEverySeconds;
241 String strkSetting_SweepEverySeconds = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_SweepEverySeconds);
242 if(null==strkSetting_SweepEverySeconds) {
243 strkSetting_SweepEverySeconds = kDefault_SweepEverySeconds+"";
246 freq = Long.parseLong(strkSetting_SweepEverySeconds);
248 fSweepScheduler.scheduleAtFixedRate(new sweeper(), freq, freq, TimeUnit.SECONDS);
249 log.info("KafkaConsumerCache started");
250 log.info("sweeping cached clients every " + freq + " seconds");
251 } catch (ZkException e) {
252 throw new KafkaConsumerCacheException(e);
253 } catch (Exception e) {
254 throw new KafkaConsumerCacheException(e);
259 * Getting the curator oject to start the zookeeper connection estabished
262 * @return curator object
264 public static CuratorFramework getCuratorFramework(CuratorFramework curator) {
265 if (curator.getState() == CuratorFrameworkState.LATENT) {
269 curator.blockUntilConnected();
270 } catch (InterruptedException e) {
272 log.error("error while setting curator framework :" + e.getMessage());
273 Thread.currentThread().interrupt();
281 * Stop the cache service.
283 public void stopCache() {
284 setStatus(Status.DISCONNECTED);
286 final CuratorFramework curator = ConfigurationReader.getCurator();
288 if (curator != null) {
290 curator.getConnectionStateListenable().removeListener(listener);
291 curatorConsumerCache.close();
292 log.info("Curator client closed");
293 } catch (ZkInterruptedException e) {
294 log.error("Curator client close interrupted: " + e);
295 } catch (IOException e) {
296 log.error("Error while closing curator PathChildrenCache for KafkaConsumerCache" + e);
299 curatorConsumerCache = null;
302 if (fSweepScheduler != null) {
303 fSweepScheduler.shutdownNow();
304 log.info("cache sweeper stopped");
307 if (fConsumers != null) {
312 setStatus(Status.NOT_STARTED);
314 log.info("Consumer cache service stopped");
318 * Get a cached consumer by topic, group, and id, if it exists (and remains
319 * valid) In addition, this method waits for all other consumer caches in
320 * the cluster to release their ownership and delete their version of this
324 * @param consumerGroupId
326 * @return a consumer, or null
328 public KafkaConsumer getConsumerFor(String topic, String consumerGroupId, String clientId)
329 throws KafkaConsumerCacheException {
330 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
331 throw new KafkaConsumerCacheException("The cache service is unavailable.");
333 final String consumerKey = makeConsumerKey(topic, consumerGroupId, clientId);
334 final KafkaConsumer kc = fConsumers.get(consumerKey);
337 log.debug("Consumer cache hit for [" + consumerKey + "], last was at " + kc.getLastTouch());
339 fMetrics.onKafkaConsumerCacheHit();
341 log.debug("Consumer cache miss for [" + consumerKey + "]");
342 fMetrics.onKafkaConsumerCacheMiss();
349 * Put a consumer into the cache by topic, group and ID
352 * @param consumerGroupId
355 * @throws KafkaConsumerCacheException
357 public void putConsumerFor(String topic, String consumerGroupId, String consumerId, KafkaConsumer consumer)
358 throws KafkaConsumerCacheException {
359 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
360 throw new KafkaConsumerCacheException("The cache service is unavailable.");
362 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
363 fConsumers.put(consumerKey, consumer);
366 public Collection<? extends Consumer> getConsumers() {
367 return new LinkedList<KafkaConsumer>(fConsumers.values());
371 * This method is to drop all the consumer
373 public void dropAllConsumers() {
374 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
375 dropConsumer(entry.getKey(), true);
378 // consumers should be empty here
379 if (fConsumers.size() > 0) {
380 log.warn("During dropAllConsumers, the consumer map is not empty.");
386 * Drop a consumer from our cache due to a timeout
390 private void dropTimedOutConsumer(String key) {
391 fMetrics.onKafkaConsumerTimeout();
393 if (!fConsumers.containsKey(key)) {
394 log.warn("Attempted to drop a timed out consumer which was not in our cache: " + key);
398 // First, drop this consumer from our cache
399 dropConsumer(key, true);
401 final CuratorFramework curator = ConfigurationReader.getCurator();
404 curator.delete().guaranteed().forPath(fBaseZkPath + "/" + key);
405 } catch (NoNodeException e) {
406 log.error("Exception at : " + e);
407 } catch (Exception e) {
408 log.error("Unexpected exception while deleting consumer: " + e);
411 log.info("Dropped " + key + " consumer due to timeout");
415 * Drop a consumer from our cache due to another API node claiming it as
420 private void dropClaimedConsumer(String key) {
421 // if the consumer is still in our cache, it implies a claim.
422 if (fConsumers.containsKey(key)) {
423 fMetrics.onKafkaConsumerClaimed();
424 log.info("Consumer [" + key + "] claimed by another node.");
427 dropConsumer(key, false);
431 * Removes the consumer from the cache and closes its connection to the
435 * @param dueToTimeout
437 private void dropConsumer(String key, boolean dueToTimeout) {
438 final KafkaConsumer kc = fConsumers.remove(key);
441 log.info("closing Kafka consumer " + key);
446 // private final rrNvReadable fSettings;
447 private final MetricsSet fMetrics;
448 private final String fBaseZkPath;
449 private final ScheduledExecutorService fSweepScheduler;
450 private final String fApiId;
451 private final ConnectionStateListener listener;
453 private ConcurrentHashMap<String, KafkaConsumer> fConsumers;
454 private PathChildrenCache curatorConsumerCache;
456 private volatile Status status;
458 private void handleReconnection() {
460 log.info("Reading current cache data from ZK and synchronizing local cache");
462 final List<ChildData> cacheData = curatorConsumerCache.getCurrentData();
464 // Remove all the consumers in this API nodes cache that now belong to
466 for (ChildData cachedConsumer : cacheData) {
467 final String consumerId = ZKPaths.getNodeFromPath(cachedConsumer.getPath());
468 final String owningApiId = (cachedConsumer.getData() != null) ? new String(cachedConsumer.getData())
471 if (!fApiId.equals(owningApiId)) {
472 fConsumers.remove(consumerId);
476 setStatus(Status.CONNECTED);
479 private void handleConnectionSuspended() {
480 log.info("Suspending cache until ZK connection is re-established");
482 setStatus(Status.SUSPENDED);
485 private void handleConnectionLoss() {
486 log.info("Clearing consumer cache (shutting down all Kafka consumers on this node)");
488 setStatus(Status.DISCONNECTED);
490 closeAllCachedConsumers();
494 private void closeAllCachedConsumers() {
495 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
496 entry.getValue().close();
500 private static String makeConsumerKey(String topic, String consumerGroupId, String clientId) {
501 return topic + "::" + consumerGroupId + "::" + clientId;
505 * This method is to get a lock
508 * @param consumerGroupId
510 * @throws KafkaConsumerCacheException
512 public void signalOwnership(final String topic, final String consumerGroupId, final String consumerId)
513 throws KafkaConsumerCacheException {
514 // get a lock at <base>/<topic>::<consumerGroupId>::<consumerId>
515 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
518 final String consumerPath = fBaseZkPath + "/" + consumerKey;
520 log.debug(fApiId + " attempting to claim ownership of consumer " + consumerKey);
522 final CuratorFramework curator = ConfigurationReader.getCurator();
525 curator.setData().forPath(consumerPath, fApiId.getBytes());
526 } catch (KeeperException.NoNodeException e) {
527 log.error(e.toString());
528 curator.create().creatingParentsIfNeeded().forPath(consumerPath, fApiId.getBytes());
531 log.info(fApiId + " successfully claimed ownership of consumer " + consumerKey);
532 } catch (Exception e) {
533 log.error(fApiId + " failed to claim ownership of consumer " + consumerKey);
534 throw new KafkaConsumerCacheException(e);
537 log.info("Backing off to give the Kafka broker time to clean up the ZK data for this consumer");
540 int kSetting_ConsumerHandoverWaitMs = kDefault_ConsumerHandoverWaitMs;
541 String strkSetting_ConsumerHandoverWaitMs= AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_ConsumerHandoverWaitMs+"");
542 if(strkSetting_ConsumerHandoverWaitMs!=null) kSetting_ConsumerHandoverWaitMs = Integer.parseInt(strkSetting_ConsumerHandoverWaitMs);
544 Thread.sleep(kSetting_ConsumerHandoverWaitMs);
545 //Thread.sleep(fSettings.getInt(kSetting_ConsumerHandoverWaitMs, kDefault_ConsumerHandoverWaitMs));
546 } catch (InterruptedException e) {
547 log.error(e.toString());
548 Thread.currentThread().interrupt();
552 private void sweep() {
553 final LinkedList<String> removals = new LinkedList<String>();
554 long mustTouchEveryMs = kDefault_MustTouchEveryMs;
555 String strkSetting_TouchEveryMs = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_TouchEveryMs);
556 //if(null!=strkSetting_TouchEveryMs) strkSetting_TouchEveryMs = kDefault_MustTouchEveryMs+"";
557 if(null!=strkSetting_TouchEveryMs)
559 mustTouchEveryMs = Long.parseLong(strkSetting_TouchEveryMs);
562 //final long mustTouchEveryMs = fSettings.getLong(kSetting_TouchEveryMs, kDefault_MustTouchEveryMs);
563 final long oldestAllowedTouchMs = System.currentTimeMillis() - mustTouchEveryMs;
565 for (Entry<String, KafkaConsumer> e : fConsumers.entrySet()) {
566 final long lastTouchMs = e.getValue().getLastTouch();
568 log.debug("consumer " + e.getKey() + " last touched at " + lastTouchMs);
570 if (lastTouchMs < oldestAllowedTouchMs) {
571 log.info("consumer " + e.getKey() + " has expired");
572 removals.add(e.getKey());
576 for (String key : removals) {
577 dropTimedOutConsumer(key);
582 * Creating a thread to run the sweep method
587 private class sweeper implements Runnable {
597 * This method is to drop consumer
600 * @param consumerGroup
603 public void dropConsumer(String topic, String consumerGroup, String clientId) {
604 dropConsumer(makeConsumerKey(topic, consumerGroup, clientId), false);
607 private Status getStatus() {
611 private void setStatus(Status status) {
612 this.status = status;
615 private static final EELFLogger log = EELFManager.getInstance().getLogger(KafkaConsumerCache.class);
616 //private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCache.class);