Skip to content

Commit

Permalink
Tolerate missing DNS entry when completing host replacement
Browse files Browse the repository at this point in the history
patch by Chris Lohfink; reviewed by Brandon Williams for CASSANDRA-16873

Co-authored by Chris Lohfink <clohfink@apple.com>
Co-authored by Josh McKenzie <jmckenzie@apache.org>
  • Loading branch information
clohfink authored and jmckenzie-dev committed Aug 27, 2021
1 parent 2b6799a commit f59411f
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
4.0.1
* Tolerate missing DNS entry when completing a host replacement (CASSANDRA-16873)
* Harden PrunableArrayQueue against Pruner implementations that might throw exceptions (CASSANDRA-16866)
* Move RepairedDataInfo to the execution controller rather than the ReadCommand to avoid unintended sharing (CASSANDRA-16721)
* Bump zstd-jni version to 1.5.0-4 (CASSANDRA-16884)
Expand Down
37 changes: 35 additions & 2 deletions src/java/org/apache/cassandra/service/StorageService.java
Original file line number Diff line number Diff line change
Expand Up @@ -2719,6 +2719,38 @@ else if (Gossiper.instance.compareEndpointStartup(endpoint, currentOwner) > 0)
SystemKeyspace.updateTokens(endpoint, tokensToUpdateInSystemKeyspace);
}

/**
 * Checks whether this node is replacing a host that has both the same address as this node
 * and the supplied host id.
 *
 * The result is {@code true} only when {@code isReplacingSameAddress()} holds, gossip has an
 * endpoint state for the configured replace address, and the host id gossip reports for that
 * address equals {@code hostId}.
 *
 * @param hostId the host id to compare against the host id gossip holds for the replace address
 * @return {@code true} if all of the above conditions hold; {@code false} if they do not, or if
 *         the check failed with a {@link RuntimeException} caused by an {@link UnknownHostException}
 * @throws RuntimeException any runtime failure whose direct cause is not an {@link UnknownHostException}
 */
@VisibleForTesting
public boolean isReplacingSameHostAddressAndHostId(UUID hostId)
{
    try
    {
        return isReplacingSameAddress() &&
               Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()) != null
               && hostId.equals(Gossiper.instance.getHostId(DatabaseDescriptor.getReplaceAddress()));
    }
    catch (RuntimeException ex)
    {
        // If a host is decommissioned and the DNS entry is removed before the
        // bootstrap completes, when it completes and advertises NORMAL state to other nodes, they will be unable
        // to resolve it to an InetAddress unless it happens to be cached. This could happen on nodes
        // storing large amounts of data or with long index rebuild times or if new instances have been added
        // to the cluster through expansion or additional host replacement.
        //
        // The original host replacement must have been able to resolve the replacing address on startup
        // when setting StorageService.replacing, so if it is impossible to resolve now it is probably
        // decommissioned and did not have the same IP address or host id. Allow the handleStateNormal
        // handling to proceed, otherwise gossip state will be inconsistent with some nodes believing the
        // replacement host to be normal, and nodes unable to resolve the hostname will be left in JOINING.
        //
        // instanceof is null-safe and also matches subclasses of UnknownHostException, so any
        // resolution failure wrapped by the callee is tolerated rather than only the exact class.
        if (ex.getCause() instanceof UnknownHostException)
        {
            logger.info("Suppressed exception while checking isReplacingSameHostAddressAndHostId({}). Original host was probably decommissioned. ({})",
                        hostId, ex.getMessage());
            return false;
        }
        throw ex; // otherwise rethrow
    }
}

/**
* Handle node move to normal state. That is, node is entering token ring and participating
* in reads.
Expand Down Expand Up @@ -2764,9 +2796,10 @@ private void handleStateNormal(final InetAddressAndPort endpoint, final String s
// Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300).
UUID hostId = Gossiper.instance.getHostId(endpoint);
InetAddressAndPort existing = tokenMetadata.getEndpointForHostId(hostId);
if (replacing && isReplacingSameAddress() && Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()) != null
&& (hostId.equals(Gossiper.instance.getHostId(DatabaseDescriptor.getReplaceAddress()))))
if (replacing && isReplacingSameHostAddressAndHostId(hostId))
{
logger.warn("Not updating token metadata for {} because I am replacing it", endpoint);
}
else
{
if (existing != null && !existing.equals(endpoint))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import org.apache.cassandra.db.SystemKeyspace;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
Expand Down Expand Up @@ -697,4 +699,34 @@ public void testAuditLogEnableLoggerTransitions() throws Exception
assertTrue(AuditLogManager.instance.isEnabled());
StorageService.instance.disableAuditLog();
}
}

/**
 * Exercises StorageService.isReplacingSameHostAddressAndHostId across four situations:
 * no replace address configured, replace address matching the local node with the same host id,
 * the same address with a different host id, and a replace address that cannot be resolved.
 * The replace_address system property is always cleared afterwards.
 */
@Test
public void isReplacingSameHostAddressAndHostIdTest() throws UnknownHostException
{
    final String replaceAddressProperty = "cassandra.replace_address";
    try
    {
        // With no replace address configured, an arbitrary host id must not match.
        UUID unrelatedHostId = UUID.randomUUID();
        Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(unrelatedHostId));

        // Register the local node in gossip under its own host id.
        final String localAddress = FBUtilities.getBroadcastAddressAndPort().getHostAddress(false);
        UUID ownHostId = SystemKeyspace.getLocalHostId();
        Gossiper.instance.initializeNodeUnsafe(FBUtilities.getBroadcastAddressAndPort(), ownHostId, 1);

        // Same address and same host id: expected to be detected.
        System.setProperty(replaceAddressProperty, localAddress);
        Assert.assertTrue(StorageService.instance.isReplacingSameHostAddressAndHostId(ownHostId));

        // Same address but a different host id: expected to be rejected.
        System.setProperty(replaceAddressProperty, localAddress);
        Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(unrelatedHostId));

        // The DNS entry for the replace address has disappeared: expected to be tolerated.
        System.setProperty(replaceAddressProperty, "unresolvable.host.local.");
        Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(unrelatedHostId));
    }
    finally
    {
        System.clearProperty(replaceAddressProperty);
    }
}
}

0 comments on commit f59411f

Please sign in to comment.