Skip to content

Commit 61c8a08

Browse files
authored
Fix potential deadlock with curator ZK store (slackhq#1010)
Co-authored-by: Bryan Burkholder <bryanlb@users.noreply.github.com>
1 parent f947fe6 commit 61c8a08

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

astra/src/main/java/com/slack/astra/metadata/core/AstraMetadataStore.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import static com.slack.astra.server.AstraConfig.DEFAULT_ZK_TIMEOUT_SECS;
44

5+
import com.google.common.util.concurrent.ThreadFactoryBuilder;
56
import com.slack.astra.util.RuntimeHalterImpl;
67
import java.io.Closeable;
78
import java.util.List;
@@ -11,6 +12,8 @@
1112
import java.util.concurrent.ConcurrentHashMap;
1213
import java.util.concurrent.CountDownLatch;
1314
import java.util.concurrent.ExecutionException;
15+
import java.util.concurrent.ExecutorService;
16+
import java.util.concurrent.Executors;
1417
import java.util.concurrent.TimeUnit;
1518
import java.util.concurrent.TimeoutException;
1619
import org.apache.curator.x.async.AsyncCuratorFramework;
@@ -44,6 +47,9 @@ public class AstraMetadataStore<T extends AstraMetadata> implements Closeable {
4447
private final Map<AstraMetadataStoreChangeListener<T>, ModeledCacheListener<T>> listenerMap =
4548
new ConcurrentHashMap<>();
4649

50+
private final ExecutorService cacheInitializedService;
51+
private final ModeledCacheListener<T> initializedListener = getCacheInitializedListener();
52+
4753
public AstraMetadataStore(
4854
AsyncCuratorFramework curator,
4955
CreateMode createMode,
@@ -64,11 +70,15 @@ public AstraMetadataStore(
6470
modeledClient = ModeledFramework.wrap(curator, modelSpec);
6571

6672
if (shouldCache) {
73+
cacheInitializedService =
74+
Executors.newSingleThreadExecutor(
75+
new ThreadFactoryBuilder().setNameFormat("cache-initialized-service-%d").build());
6776
cachedModeledFramework = modeledClient.cached();
68-
cachedModeledFramework.listenable().addListener(getCacheInitializedListener());
77+
cachedModeledFramework.listenable().addListener(initializedListener, cacheInitializedService);
6978
cachedModeledFramework.start();
7079
} else {
7180
cachedModeledFramework = null;
81+
cacheInitializedService = null;
7282
}
7383
}
7484

@@ -204,7 +214,12 @@ public void removeListener(AstraMetadataStoreChangeListener<T> watcher) {
204214

205215
private void awaitCacheInitialized() {
206216
try {
207-
cacheInitialized.await();
217+
if (!cacheInitialized.await(30, TimeUnit.SECONDS)) {
218+
// in the event we deadlock, go ahead and time this out at 30s and restart the pod
219+
new RuntimeHalterImpl()
220+
.handleFatal(
221+
new TimeoutException("Timed out waiting for Zookeeper cache to initialize"));
222+
}
208223
} catch (InterruptedException e) {
209224
new RuntimeHalterImpl().handleFatal(e);
210225
}
@@ -221,6 +236,14 @@ public void accept(Type type, ZPath path, Stat stat, T model) {
221236
public void initialized() {
222237
ModeledCacheListener.super.initialized();
223238
cacheInitialized.countDown();
239+
240+
// after it's initialized, we no longer need the listener or executor
241+
if (cachedModeledFramework != null) {
242+
cachedModeledFramework.listenable().removeListener(initializedListener);
243+
}
244+
if (cacheInitializedService != null) {
245+
cacheInitializedService.shutdown();
246+
}
224247
}
225248
};
226249
}

0 commit comments

Comments
 (0)