Skip to content

Commit f1ed40d

Browse files
committed
Prefer replicas on remote nodes
Signed-off-by: Lakshya Taragi <lakshya.taragi@gmail.com>
1 parent 6bc04b4 commit f1ed40d

File tree

2 files changed

+178
-5
lines changed

2 files changed

+178
-5
lines changed

server/src/main/java/org/opensearch/cluster/routing/RoutingNodes.java

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.opensearch.common.Randomness;
4545
import org.opensearch.common.annotation.PublicApi;
4646
import org.opensearch.common.collect.Tuple;
47+
import org.opensearch.common.settings.ClusterSettings;
4748
import org.opensearch.core.Assertions;
4849
import org.opensearch.core.index.Index;
4950
import org.opensearch.core.index.shard.ShardId;
@@ -61,12 +62,15 @@
6162
import java.util.NoSuchElementException;
6263
import java.util.Objects;
6364
import java.util.Queue;
65+
import java.util.Random;
6466
import java.util.Set;
6567
import java.util.function.Function;
6668
import java.util.function.Predicate;
6769
import java.util.stream.Collectors;
6870
import java.util.stream.Stream;
6971

72+
import static org.opensearch.node.remotestore.RemoteStoreNodeService.isMigratingToRemoteStore;
73+
7074
/**
7175
* {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
7276
* It can be either initialized as mutable or immutable (see {@link #RoutingNodes(ClusterState, boolean)}), allowing
@@ -418,6 +422,29 @@ public ShardRouting activeReplicaWithOldestVersion(ShardId shardId) {
418422
.orElse(null);
419423
}
420424

425+
/**
426+
* Returns one active replica shard on a remote node for the given shard id or <code>null</code> if
427+
* no such replica is found.
428+
* <p>
429+
* Since we aim to continue moving forward during remote store migration, replicas already migrated to remote nodes
430+
* are preferred for primary promotion
431+
*/
432+
public ShardRouting activeReplicaOnRemoteNode(ShardId shardId) {
433+
List<ShardRouting> replicaShardsOnRemote = assignedShards(shardId).stream()
434+
.filter(shr -> !shr.primary() && shr.active())
435+
.filter((shr) -> {
436+
RoutingNode nd = node(shr.currentNodeId());
437+
return (nd != null && nd.node().isRemoteStoreNode());
438+
})
439+
.collect(Collectors.toList());
440+
ShardRouting replicaShard = null;
441+
if (replicaShardsOnRemote.isEmpty() == false) {
442+
Random rand = Randomness.get();
443+
replicaShard = replicaShardsOnRemote.get(rand.nextInt(replicaShardsOnRemote.size()));
444+
}
445+
return replicaShard;
446+
}
447+
421448
/**
422449
* Returns <code>true</code> iff all replicas are active for the given shard routing. Otherwise <code>false</code>
423450
*/
@@ -735,11 +762,17 @@ private void unassignPrimaryAndPromoteActiveReplicaIfExists(
735762
RoutingChangesObserver routingChangesObserver
736763
) {
737764
assert failedShard.primary();
738-
ShardRouting activeReplica;
739-
if (metadata.isSegmentReplicationEnabled(failedShard.getIndexName())) {
740-
activeReplica = activeReplicaWithOldestVersion(failedShard.shardId());
741-
} else {
742-
activeReplica = activeReplicaWithHighestVersion(failedShard.shardId());
765+
ShardRouting activeReplica = null;
766+
if (isMigratingToRemoteStore(new ClusterSettings(metadata.settings(), ClusterSettings.BUILT_IN_CLUSTER_SETTINGS))) {
767+
// we might not find any replica on remote node
768+
activeReplica = activeReplicaOnRemoteNode(failedShard.shardId());
769+
}
770+
if (activeReplica == null) {
771+
if (metadata.isSegmentReplicationEnabled(failedShard.getIndexName())) {
772+
activeReplica = activeReplicaWithOldestVersion(failedShard.shardId());
773+
} else {
774+
activeReplica = activeReplicaWithHighestVersion(failedShard.shardId());
775+
}
743776
}
744777
if (activeReplica == null) {
745778
moveToUnassigned(failedShard, unassignedInfo);

server/src/test/java/org/opensearch/cluster/routing/allocation/FailedShardsRoutingTests.java

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,27 +40,37 @@
4040
import org.opensearch.cluster.metadata.IndexMetadata;
4141
import org.opensearch.cluster.metadata.Metadata;
4242
import org.opensearch.cluster.node.DiscoveryNode;
43+
import org.opensearch.cluster.node.DiscoveryNodeRole;
4344
import org.opensearch.cluster.node.DiscoveryNodes;
4445
import org.opensearch.cluster.routing.RoutingNodes;
4546
import org.opensearch.cluster.routing.RoutingTable;
4647
import org.opensearch.cluster.routing.ShardRouting;
4748
import org.opensearch.cluster.routing.allocation.command.AllocationCommands;
4849
import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand;
4950
import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
51+
import org.opensearch.common.UUIDs;
5052
import org.opensearch.common.settings.Settings;
53+
import org.opensearch.common.util.FeatureFlags;
5154
import org.opensearch.core.index.shard.ShardId;
5255
import org.opensearch.indices.replication.common.ReplicationType;
56+
import org.opensearch.node.remotestore.RemoteStoreNodeService;
5357
import org.opensearch.test.VersionUtils;
5458

5559
import java.util.ArrayList;
5660
import java.util.HashSet;
61+
import java.util.List;
62+
import java.util.Map;
5763
import java.util.Set;
5864

5965
import static org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
6066
import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING;
6167
import static org.opensearch.cluster.routing.ShardRoutingState.RELOCATING;
6268
import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
6369
import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED;
70+
import static org.opensearch.common.util.FeatureFlags.REMOTE_STORE_MIGRATION_EXPERIMENTAL;
71+
import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY;
72+
import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING;
73+
import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING;
6474
import static org.hamcrest.Matchers.anyOf;
6575
import static org.hamcrest.Matchers.equalTo;
6676
import static org.hamcrest.Matchers.lessThan;
@@ -812,4 +822,134 @@ private void testReplicaIsPromoted(boolean isSegmentReplicationEnabled) {
812822
}
813823
}
814824
}
825+
826+
public void testPreferReplicaOnRemoteNodeForPrimaryPromotion() {
827+
FeatureFlags.initializeFeatureFlags(Settings.builder().put(REMOTE_STORE_MIGRATION_EXPERIMENTAL, "true").build());
828+
AllocationService allocation = createAllocationService(Settings.builder().build());
829+
830+
// segment replication enabled
831+
Settings.Builder settingsBuilder = settings(Version.CURRENT).put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT);
832+
833+
// remote store migration metadata settings
834+
Metadata metadata = Metadata.builder()
835+
.put(IndexMetadata.builder("test").settings(settingsBuilder).numberOfShards(1).numberOfReplicas(4))
836+
.persistentSettings(
837+
Settings.builder()
838+
.put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.MIXED.mode)
839+
.put(MIGRATION_DIRECTION_SETTING.getKey(), RemoteStoreNodeService.Direction.REMOTE_STORE.direction)
840+
.build()
841+
)
842+
.build();
843+
844+
RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build();
845+
846+
ClusterState clusterState = ClusterState.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
847+
.metadata(metadata)
848+
.routingTable(initialRoutingTable)
849+
.build();
850+
851+
ShardId shardId = new ShardId(metadata.index("test").getIndex(), 0);
852+
853+
// add a remote node and start primary shard
854+
Map<String, String> remoteStoreNodeAttributes = Map.of(
855+
REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY,
856+
"REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_VALUE"
857+
);
858+
DiscoveryNode remoteNode1 = new DiscoveryNode(
859+
UUIDs.base64UUID(),
860+
buildNewFakeTransportAddress(),
861+
remoteStoreNodeAttributes,
862+
DiscoveryNodeRole.BUILT_IN_ROLES,
863+
Version.CURRENT
864+
);
865+
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(remoteNode1)).build();
866+
clusterState = ClusterState.builder(clusterState).routingTable(allocation.reroute(clusterState, "reroute").routingTable()).build();
867+
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
868+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(4));
869+
870+
clusterState = startInitializingShardsAndReroute(allocation, clusterState);
871+
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(1));
872+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(4));
873+
874+
// add remote and non-remote nodes and start replica shards
875+
DiscoveryNode remoteNode2 = new DiscoveryNode(
876+
UUIDs.base64UUID(),
877+
buildNewFakeTransportAddress(),
878+
remoteStoreNodeAttributes,
879+
DiscoveryNodeRole.BUILT_IN_ROLES,
880+
Version.CURRENT
881+
);
882+
DiscoveryNode remoteNode3 = new DiscoveryNode(
883+
UUIDs.base64UUID(),
884+
buildNewFakeTransportAddress(),
885+
remoteStoreNodeAttributes,
886+
DiscoveryNodeRole.BUILT_IN_ROLES,
887+
Version.CURRENT
888+
);
889+
DiscoveryNode nonRemoteNode1 = new DiscoveryNode(UUIDs.base64UUID(), buildNewFakeTransportAddress(), Version.CURRENT);
890+
DiscoveryNode nonRemoteNode2 = new DiscoveryNode(UUIDs.base64UUID(), buildNewFakeTransportAddress(), Version.CURRENT);
891+
List<DiscoveryNode> replicaShardNodes = List.of(remoteNode2, remoteNode3, nonRemoteNode1, nonRemoteNode2);
892+
893+
for (int i = 0; i < 4; i++) {
894+
clusterState = ClusterState.builder(clusterState)
895+
.nodes(DiscoveryNodes.builder(clusterState.nodes()).add(replicaShardNodes.get(i)))
896+
.build();
897+
898+
clusterState = allocation.reroute(clusterState, "reroute");
899+
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(1 + i));
900+
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
901+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(4 - (i + 1)));
902+
903+
clusterState = startInitializingShardsAndReroute(allocation, clusterState);
904+
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(1 + (i + 1)));
905+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(4 - (i + 1)));
906+
}
907+
908+
// fail primary shard
909+
ShardRouting primaryShard0 = clusterState.routingTable().index("test").shard(0).primaryShard();
910+
ClusterState newState = allocation.applyFailedShard(clusterState, primaryShard0, randomBoolean());
911+
assertNotEquals(clusterState, newState);
912+
clusterState = newState;
913+
914+
// verify that promoted replica exists on a remote node
915+
assertEquals(4, clusterState.getRoutingNodes().shardsWithState(STARTED).size());
916+
ShardRouting primaryShardRouting1 = clusterState.routingTable().index("test").shard(0).primaryShard();
917+
assertNotEquals(primaryShard0, primaryShardRouting1);
918+
assertTrue(
919+
primaryShardRouting1.currentNodeId().equals(remoteNode2.getId())
920+
|| primaryShardRouting1.currentNodeId().equals(remoteNode3.getId())
921+
);
922+
923+
// fail primary shard again
924+
newState = allocation.applyFailedShard(clusterState, primaryShardRouting1, randomBoolean());
925+
assertNotEquals(clusterState, newState);
926+
clusterState = newState;
927+
928+
// verify that promoted replica again exists on a remote node
929+
assertEquals(3, clusterState.getRoutingNodes().shardsWithState(STARTED).size());
930+
ShardRouting primaryShardRouting2 = clusterState.routingTable().index("test").shard(0).primaryShard();
931+
assertNotEquals(primaryShardRouting1, primaryShardRouting2);
932+
assertTrue(
933+
primaryShardRouting2.currentNodeId().equals(remoteNode2.getId())
934+
|| primaryShardRouting2.currentNodeId().equals(remoteNode3.getId())
935+
);
936+
assertNotEquals(primaryShardRouting1.currentNodeId(), primaryShardRouting2.currentNodeId());
937+
938+
ShardRouting expectedCandidateForSegRep = clusterState.getRoutingNodes().activeReplicaWithOldestVersion(shardId);
939+
940+
// fail primary shard again
941+
newState = allocation.applyFailedShard(clusterState, primaryShardRouting2, randomBoolean());
942+
assertNotEquals(clusterState, newState);
943+
clusterState = newState;
944+
945+
// verify that promoted replica exists on a non-remote node
946+
assertEquals(2, clusterState.getRoutingNodes().shardsWithState(STARTED).size());
947+
ShardRouting primaryShardRouting3 = clusterState.routingTable().index("test").shard(0).primaryShard();
948+
assertNotEquals(primaryShardRouting2, primaryShardRouting3);
949+
assertTrue(
950+
primaryShardRouting3.currentNodeId().equals(nonRemoteNode1.getId())
951+
|| primaryShardRouting3.currentNodeId().equals(nonRemoteNode2.getId())
952+
);
953+
assertEquals(expectedCandidateForSegRep.allocationId(), primaryShardRouting3.allocationId());
954+
}
815955
}

0 commit comments

Comments
 (0)