Sonar critical issues
[dmaap/messagerouter/msgrtr.git] / src / main / java / com / att / nsa / cambria / backends / kafka / KafkaConsumerCache.java
1 /*******************************************************************************
2  *  ============LICENSE_START=======================================================
3  *  org.onap.dmaap
4  *  ================================================================================
5  *  Copyright © 2017 AT&T Intellectual Property. All rights reserved.
6  *  ================================================================================
7  *  Licensed under the Apache License, Version 2.0 (the "License");
8  *  you may not use this file except in compliance with the License.
9  *  You may obtain a copy of the License at
10  *        http://www.apache.org/licenses/LICENSE-2.0
11  *  
12  *  Unless required by applicable law or agreed to in writing, software
13  *  distributed under the License is distributed on an "AS IS" BASIS,
14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  *  See the License for the specific language governing permissions and
16  *  limitations under the License.
17  *  ============LICENSE_END=========================================================
18  *
19  *  ECOMP is a trademark and service mark of AT&T Intellectual Property.
20  *  
21  *******************************************************************************/
22 package com.att.nsa.cambria.backends.kafka;
23
24 import java.io.IOException;
25 import java.util.Collection;
26 import java.util.LinkedList;
27 import java.util.List;
28 import java.util.Map.Entry;
29 import java.util.concurrent.ConcurrentHashMap;
30 import java.util.concurrent.Executors;
31 import java.util.concurrent.ScheduledExecutorService;
32 import java.util.concurrent.TimeUnit;
33
34 import org.I0Itec.zkclient.exception.ZkException;
35 import org.I0Itec.zkclient.exception.ZkInterruptedException;
36 import org.apache.curator.framework.CuratorFramework;
37 import org.apache.curator.framework.imps.CuratorFrameworkState;
38 import org.apache.curator.framework.recipes.cache.ChildData;
39 import org.apache.curator.framework.recipes.cache.PathChildrenCache;
40 import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent;
41 import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener;
42 import org.apache.curator.framework.state.ConnectionState;
43 import org.apache.curator.framework.state.ConnectionStateListener;
44 import org.apache.curator.utils.EnsurePath;
45 import org.apache.curator.utils.ZKPaths;
46 import org.apache.http.annotation.NotThreadSafe;
47 import org.apache.zookeeper.KeeperException;
48 import org.apache.zookeeper.KeeperException.NoNodeException;
49 //import org.slf4j.Logger;
50 //import org.slf4j.LoggerFactory;
51 import com.att.eelf.configuration.EELFLogger;
52 import com.att.eelf.configuration.EELFManager;
53 import com.att.ajsc.filemonitor.AJSCPropertiesMap;
54 import com.att.nsa.cambria.backends.Consumer;
55 import com.att.nsa.cambria.backends.MetricsSet;
56 import com.att.nsa.cambria.constants.CambriaConstants;
57 import com.att.nsa.cambria.utils.ConfigurationReader;
58 import com.att.nsa.drumlin.till.nv.rrNvReadable;
59
60 /**
61  * @NotThreadSafe but expected to be used within KafkaConsumerFactory, which
62  *                must be
63  * @author author
64  *
65  */
66 @NotThreadSafe
67 public class KafkaConsumerCache {
68
69         private static final String kSetting_ConsumerHandoverWaitMs = "cambria.consumer.cache.handoverWaitMs";
70         private static final int kDefault_ConsumerHandoverWaitMs = 500;
71
72         private static final String kSetting_SweepEverySeconds = "cambria.consumer.cache.sweepFreqSeconds";
73         private static final String kSetting_TouchEveryMs = "cambria.consumer.cache.touchFreqMs";
74
75         private static final String kSetting_ZkBasePath = "cambria.consumer.cache.zkBasePath";
76         private static final String kDefault_ZkBasePath = CambriaConstants.kDefault_ZkRoot + "/consumerCache";
77
78         // kafka defaults to timing out a client after 6 seconds of inactivity, but
79         // it heartbeats even when the client isn't fetching. Here, we don't
80         // want to prematurely rebalance the consumer group. Assuming clients are
81         // hitting
82         // the server at least every 30 seconds, timing out after 2 minutes should
83         // be okay.
84         // FIXME: consider allowing the client to specify its expected call rate?
85         private static final long kDefault_MustTouchEveryMs = (long)1000 * 60 * 2;
86
87         // check for expirations pretty regularly
88         private static final long kDefault_SweepEverySeconds = 15;
89
90         private enum Status {
91                 NOT_STARTED, CONNECTED, DISCONNECTED, SUSPENDED
92         }
93
94         /**
95          * User defined exception class for kafka consumer cache
96          * 
97          * @author author
98          *
99          */
100         public class KafkaConsumerCacheException extends Exception {
101                 /**
102                  * To throw the exception
103                  * 
104                  * @param t
105                  */
106                 KafkaConsumerCacheException(Throwable t) {
107                         super(t);
108                 }
109
110                 /**
111                  * 
112                  * @param s
113                  */
114                 public KafkaConsumerCacheException(String s) {
115                         super(s);
116                 }
117
118                 private static final long serialVersionUID = 1L;
119         }
120
121         /**
122          * Creates a KafkaConsumerCache object. Before it is used, you must call
123          * startCache()
124          * 
125          * @param apiId
126          * @param s
127          * @param metrics
128          */
129         public KafkaConsumerCache(String apiId,  MetricsSet metrics) {
130
131                 if (apiId == null) {
132                         throw new IllegalArgumentException("API Node ID must be specified.");
133                 }
134
135                 fApiId = apiId;
136         //      fSettings = s;
137                 fMetrics = metrics;
138                 String strkSetting_ZkBasePath= AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_ZkBasePath);
139                 if(null==strkSetting_ZkBasePath)strkSetting_ZkBasePath = kDefault_ZkBasePath;
140                 fBaseZkPath = strkSetting_ZkBasePath;
141
142                 fConsumers = new ConcurrentHashMap<String, KafkaConsumer>();
143                 fSweepScheduler = Executors.newScheduledThreadPool(1);
144
145                 curatorConsumerCache = null;
146
147                 status = Status.NOT_STARTED;
148
149                 listener = new ConnectionStateListener() {
150                         public void stateChanged(CuratorFramework client, ConnectionState newState) {
151                                 if (newState == ConnectionState.LOST) {
152                                         log.info("ZooKeeper connection expired");
153                                         handleConnectionLoss();
154                                 } else if (newState == ConnectionState.READ_ONLY) {
155                                         log.warn("ZooKeeper connection set to read only mode.");
156                                 } else if (newState == ConnectionState.RECONNECTED) {
157                                         log.info("ZooKeeper connection re-established");
158                                         handleReconnection();
159                                 } else if (newState == ConnectionState.SUSPENDED) {
160                                         log.warn("ZooKeeper connection has been suspended.");
161                                         handleConnectionSuspended();
162                                 }
163                         }
164                 };
165         }
166
167         /**
168          * Start the cache service. This must be called before any get/put
169          * operations.
170          * 
171          * @param mode
172          *            DMAAP or cambria
173          * @param curator
174          * @throws IOException
175          * @throws KafkaConsumerCacheException
176          */
177         public void startCache(String mode, CuratorFramework curator) throws KafkaConsumerCacheException {
178                 try {
179
180                         // CuratorFramework curator = null;
181
182                         // Changed the class from where we are initializing the curator
183                         // framework
184                         if (mode != null && mode.equals(CambriaConstants.CAMBRIA)) {
185                                 curator = ConfigurationReader.getCurator();
186                         } else if (mode != null && mode.equals(CambriaConstants.DMAAP)) {
187                                 curator = getCuratorFramework(curator);
188                         }
189
190                         curator.getConnectionStateListenable().addListener(listener);
191
192                         setStatus(Status.CONNECTED);
193
194                         curatorConsumerCache = new PathChildrenCache(curator, fBaseZkPath, true);
195                         curatorConsumerCache.start();
196
197                         curatorConsumerCache.getListenable().addListener(new PathChildrenCacheListener() {
198                                 public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) throws Exception {
199                                         switch (event.getType()) {
200                                         case CHILD_ADDED: {
201                                                 final String apiId = new String(event.getData().getData());
202                                                 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
203
204                                                 log.info(apiId + " started consumer " + consumer);
205                                                 break;
206                                         }
207                                         case CHILD_UPDATED: {
208                                                 final String apiId = new String(event.getData().getData());
209                                                 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
210
211                                                 if (fConsumers.containsKey(consumer)) {
212                                                         log.info(apiId + " claimed consumer " + consumer + " from " + fApiId);
213
214                                                         dropClaimedConsumer(consumer);
215                                                 }
216
217                                                 break;
218                                         }
219                                         case CHILD_REMOVED: {
220                                                 final String consumer = ZKPaths.getNodeFromPath(event.getData().getPath());
221
222                                                 if (fConsumers.containsKey(consumer)) {
223                                                         log.info("Someone wanted consumer " + consumer + " gone;  removing it from the cache");
224                                                         dropConsumer(consumer, false);
225                                                 }
226
227                                                 break;
228                                         }
229                                         default:
230                                                 break;
231                                         }
232                                 }
233                         });
234
235                         // initialize the ZK path
236                         EnsurePath ensurePath = new EnsurePath(fBaseZkPath);
237                         ensurePath.ensure(curator.getZookeeperClient());
238
239                         //final long freq = fSettings.getLong(kSetting_SweepEverySeconds, kDefault_SweepEverySeconds);
240                         long freq = kDefault_SweepEverySeconds;
241                         String strkSetting_SweepEverySeconds = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_SweepEverySeconds);
242                         if(null==strkSetting_SweepEverySeconds) strkSetting_SweepEverySeconds = kDefault_SweepEverySeconds+"";
243                         
244                           freq = Long.parseLong(strkSetting_SweepEverySeconds);
245                                         
246                         fSweepScheduler.scheduleAtFixedRate(new sweeper(), freq, freq, TimeUnit.SECONDS);
247                         log.info("KafkaConsumerCache started");
248                         log.info("sweeping cached clients every " + freq + " seconds");
249                 } catch (ZkException e) {
250                         throw new KafkaConsumerCacheException(e);
251                 } catch (Exception e) {
252                         throw new KafkaConsumerCacheException(e);
253                 }
254         }
255
256         /**
257          * Getting the curator oject to start the zookeeper connection estabished
258          * 
259          * @param curator
260          * @return curator object
261          */
262         public static CuratorFramework getCuratorFramework(CuratorFramework curator) {
263                 if (curator.getState() == CuratorFrameworkState.LATENT) {
264                         curator.start();
265
266                         try {
267                                 curator.blockUntilConnected();
268                         } catch (InterruptedException e) {
269                                 // Ignore
270                                 log.error("error while setting curator framework :" + e.getMessage());
271                                 Thread.currentThread().interrupt();
272                         }
273                 }
274
275                 return curator;
276         }
277
278         /**
279          * Stop the cache service.
280          */
281         public void stopCache() {
282                 setStatus(Status.DISCONNECTED);
283
284                 final CuratorFramework curator = ConfigurationReader.getCurator();
285
286                 if (curator != null) {
287                         try {
288                                 curator.getConnectionStateListenable().removeListener(listener);
289                                 curatorConsumerCache.close();
290                                 log.info("Curator client closed");
291                         } catch (ZkInterruptedException e) {
292                                 log.error("Curator client close interrupted: " + e);
293                         } catch (IOException e) {
294                                 log.error("Error while closing curator PathChildrenCache for KafkaConsumerCache" + e);
295                         }
296
297                         curatorConsumerCache = null;
298                 }
299
300                 if (fSweepScheduler != null) {
301                         fSweepScheduler.shutdownNow();
302                         log.info("cache sweeper stopped");
303                 }
304
305                 if (fConsumers != null) {
306                         fConsumers.clear();
307                         fConsumers = null;
308                 }
309
310                 setStatus(Status.NOT_STARTED);
311
312                 log.info("Consumer cache service stopped");
313         }
314
315         /**
316          * Get a cached consumer by topic, group, and id, if it exists (and remains
317          * valid) In addition, this method waits for all other consumer caches in
318          * the cluster to release their ownership and delete their version of this
319          * consumer.
320          * 
321          * @param topic
322          * @param consumerGroupId
323          * @param clientId
324          * @return a consumer, or null
325          */
326         public KafkaConsumer getConsumerFor(String topic, String consumerGroupId, String clientId)
327                         throws KafkaConsumerCacheException {
328                 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
329                         throw new KafkaConsumerCacheException("The cache service is unavailable.");
330
331                 final String consumerKey = makeConsumerKey(topic, consumerGroupId, clientId);
332                 final KafkaConsumer kc = fConsumers.get(consumerKey);
333
334                 if (kc != null) {
335                         log.debug("Consumer cache hit for [" + consumerKey + "], last was at " + kc.getLastTouch());
336                         kc.touch();
337                         fMetrics.onKafkaConsumerCacheHit();
338                 } else {
339                         log.debug("Consumer cache miss for [" + consumerKey + "]");
340                         fMetrics.onKafkaConsumerCacheMiss();
341                 }
342
343                 return kc;
344         }
345
346         /**
347          * Put a consumer into the cache by topic, group and ID
348          * 
349          * @param topic
350          * @param consumerGroupId
351          * @param consumerId
352          * @param consumer
353          * @throws KafkaConsumerCacheException
354          */
355         public void putConsumerFor(String topic, String consumerGroupId, String consumerId, KafkaConsumer consumer)
356                         throws KafkaConsumerCacheException {
357                 if (getStatus() != KafkaConsumerCache.Status.CONNECTED)
358                         throw new KafkaConsumerCacheException("The cache service is unavailable.");
359
360                 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
361                 fConsumers.put(consumerKey, consumer);
362         }
363
364         public Collection<? extends Consumer> getConsumers() {
365                 return new LinkedList<KafkaConsumer>(fConsumers.values());
366         }
367
368         /**
369          * This method is to drop all the consumer
370          */
371         public void dropAllConsumers() {
372                 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
373                         dropConsumer(entry.getKey(), true);
374                 }
375
376                 // consumers should be empty here
377                 if (fConsumers.size() > 0) {
378                         log.warn("During dropAllConsumers, the consumer map is not empty.");
379                         fConsumers.clear();
380                 }
381         }
382
383         /**
384          * Drop a consumer from our cache due to a timeout
385          * 
386          * @param key
387          */
388         private void dropTimedOutConsumer(String key) {
389                 fMetrics.onKafkaConsumerTimeout();
390
391                 if (!fConsumers.containsKey(key)) {
392                         log.warn("Attempted to drop a timed out consumer which was not in our cache: " + key);
393                         return;
394                 }
395
396                 // First, drop this consumer from our cache
397                 dropConsumer(key, true);
398
399                 final CuratorFramework curator = ConfigurationReader.getCurator();
400
401                 try {
402                         curator.delete().guaranteed().forPath(fBaseZkPath + "/" + key);
403                 } catch (NoNodeException e) {
404                         log.error("Exception at : " + e);
405                 } catch (Exception e) {
406                         log.debug("Unexpected exception while deleting consumer: " + e.getMessage());
407                 }
408
409                 log.info("Dropped " + key + " consumer due to timeout");
410         }
411
412         /**
413          * Drop a consumer from our cache due to another API node claiming it as
414          * their own.
415          * 
416          * @param key
417          */
418         private void dropClaimedConsumer(String key) {
419                 // if the consumer is still in our cache, it implies a claim.
420                 if (fConsumers.containsKey(key)) {
421                         fMetrics.onKafkaConsumerClaimed();
422                         log.info("Consumer [" + key + "] claimed by another node.");
423                 }
424
425                 dropConsumer(key, false);
426         }
427
428         /**
429          * Removes the consumer from the cache and closes its connection to the
430          * kafka broker(s).
431          * 
432          * @param key
433          * @param dueToTimeout
434          */
435         private void dropConsumer(String key, boolean dueToTimeout) {
436                 final KafkaConsumer kc = fConsumers.remove(key);
437
438                 if (kc != null) {
439                         log.info("closing Kafka consumer " + key);
440                         kc.close();
441                 }
442         }
443
444 //      private final rrNvReadable fSettings;
445         private final MetricsSet fMetrics;
446         private final String fBaseZkPath;
447         private final ScheduledExecutorService fSweepScheduler;
448         private final String fApiId;
449         private final ConnectionStateListener listener;
450
451         private ConcurrentHashMap<String, KafkaConsumer> fConsumers;
452         private PathChildrenCache curatorConsumerCache;
453
454         private volatile Status status;
455
456         private void handleReconnection() {
457
458                 log.info("Reading current cache data from ZK and synchronizing local cache");
459
460                 final List<ChildData> cacheData = curatorConsumerCache.getCurrentData();
461
462                 // Remove all the consumers in this API nodes cache that now belong to
463                 // other API nodes.
464                 for (ChildData cachedConsumer : cacheData) {
465                         final String consumerId = ZKPaths.getNodeFromPath(cachedConsumer.getPath());
466                         final String owningApiId = (cachedConsumer.getData() != null) ? new String(cachedConsumer.getData())
467                                         : "undefined";
468
469                         if (!fApiId.equals(owningApiId)) {
470                                 fConsumers.remove(consumerId);
471                         }
472                 }
473
474                 setStatus(Status.CONNECTED);
475         }
476
477         private void handleConnectionSuspended() {
478                 log.info("Suspending cache until ZK connection is re-established");
479
480                 setStatus(Status.SUSPENDED);
481         }
482
483         private void handleConnectionLoss() {
484                 log.info("Clearing consumer cache (shutting down all Kafka consumers on this node)");
485
486                 setStatus(Status.DISCONNECTED);
487
488                 closeAllCachedConsumers();
489                 fConsumers.clear();
490         }
491
492         private void closeAllCachedConsumers() {
493                 for (Entry<String, KafkaConsumer> entry : fConsumers.entrySet()) {
494                         entry.getValue().close();
495                 }
496         }
497
498         private static String makeConsumerKey(String topic, String consumerGroupId, String clientId) {
499                 return topic + "::" + consumerGroupId + "::" + clientId;
500         }
501
502         /**
503          * This method is to get a lock
504          * 
505          * @param topic
506          * @param consumerGroupId
507          * @param consumerId
508          * @throws KafkaConsumerCacheException
509          */
510         public void signalOwnership(final String topic, final String consumerGroupId, final String consumerId)
511                         throws KafkaConsumerCacheException {
512                 // get a lock at <base>/<topic>::<consumerGroupId>::<consumerId>
513                 final String consumerKey = makeConsumerKey(topic, consumerGroupId, consumerId);
514
515                 try {
516                         final String consumerPath = fBaseZkPath + "/" + consumerKey;
517
518                         log.debug(fApiId + " attempting to claim ownership of consumer " + consumerKey);
519
520                         final CuratorFramework curator = ConfigurationReader.getCurator();
521
522                         try {
523                                 curator.setData().forPath(consumerPath, fApiId.getBytes());
524                         } catch (KeeperException.NoNodeException e) {
525                                 curator.create().creatingParentsIfNeeded().forPath(consumerPath, fApiId.getBytes());
526                         }
527
528                         log.info(fApiId + " successfully claimed ownership of consumer " + consumerKey);
529                 } catch (Exception e) {
530                         log.error(fApiId + " failed to claim ownership of consumer " + consumerKey);
531                         throw new KafkaConsumerCacheException(e);
532                 }
533
534                 log.info("Backing off to give the Kafka broker time to clean up the ZK data for this consumer");
535
536                 try {
537                         int kSetting_ConsumerHandoverWaitMs = kDefault_ConsumerHandoverWaitMs;
538                         String strkSetting_ConsumerHandoverWaitMs= AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_ConsumerHandoverWaitMs+"");
539                         if(strkSetting_ConsumerHandoverWaitMs!=null) kSetting_ConsumerHandoverWaitMs = Integer.parseInt(strkSetting_ConsumerHandoverWaitMs);
540                         
541                                         Thread.sleep(kSetting_ConsumerHandoverWaitMs);
542                         //Thread.sleep(fSettings.getInt(kSetting_ConsumerHandoverWaitMs, kDefault_ConsumerHandoverWaitMs));
543                 } catch (InterruptedException e) {
544                         // Ignore
545                 }
546         }
547
548         private void sweep() {
549                 final LinkedList<String> removals = new LinkedList<String>();
550                 long mustTouchEveryMs = kDefault_MustTouchEveryMs;
551                 String strkSetting_TouchEveryMs = AJSCPropertiesMap.getProperty(CambriaConstants.msgRtr_prop,kSetting_TouchEveryMs);
552                 //if(null!=strkSetting_TouchEveryMs) strkSetting_TouchEveryMs = kDefault_MustTouchEveryMs+"";
553                 if(null!=strkSetting_TouchEveryMs)
554                 {
555                   mustTouchEveryMs = Long.parseLong(strkSetting_TouchEveryMs);  
556                 }
557
558                 //final long mustTouchEveryMs = fSettings.getLong(kSetting_TouchEveryMs, kDefault_MustTouchEveryMs);
559                 final long oldestAllowedTouchMs = System.currentTimeMillis() - mustTouchEveryMs;
560
561                 for (Entry<String, KafkaConsumer> e : fConsumers.entrySet()) {
562                         final long lastTouchMs = e.getValue().getLastTouch();
563
564                         log.debug("consumer " + e.getKey() + " last touched at " + lastTouchMs);
565
566                         if (lastTouchMs < oldestAllowedTouchMs) {
567                                 log.info("consumer " + e.getKey() + " has expired");
568                                 removals.add(e.getKey());
569                         }
570                 }
571
572                 for (String key : removals) {
573                         dropTimedOutConsumer(key);
574                 }
575         }
576
577         /**
578          * Creating a thread to run the sweep method
579          * 
580          * @author author
581          *
582          */
583         private class sweeper implements Runnable {
584                 /**
585                  * run method
586                  */
587                 public void run() {
588                         sweep();
589                 }
590         }
591
592         /**
593          * This method is to drop consumer
594          * 
595          * @param topic
596          * @param consumerGroup
597          * @param clientId
598          */
599         public void dropConsumer(String topic, String consumerGroup, String clientId) {
600                 dropConsumer(makeConsumerKey(topic, consumerGroup, clientId), false);
601         }
602
603         private Status getStatus() {
604                 return this.status;
605         }
606
607         private void setStatus(Status status) {
608                 this.status = status;
609         }
610
611         private static final EELFLogger log = EELFManager.getInstance().getLogger(KafkaConsumerCache.class);
612         //private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCache.class);
613 }