[CELEBORN-1388] Use finer grained locks in changePartitionManager #2462

Closed
wants to merge 13 commits
@@ -47,6 +47,8 @@ class ChangePartitionManager(
// shuffleId -> (partitionId -> set of ChangePartition)
private val changePartitionRequests =
JavaUtils.newConcurrentHashMap[Int, ConcurrentHashMap[Integer, JSet[ChangePartitionRequest]]]()
private val locks = Array.fill(conf.batchHandleChangePartitionParallelism)(new AnyRef())

// shuffleId -> set of partition id
private val inBatchPartitions = JavaUtils.newConcurrentHashMap[Int, JSet[Integer]]()

@@ -79,14 +81,18 @@ class ChangePartitionManager(
batchHandleChangePartitionExecutors.submit {
new Runnable {
override def run(): Unit = {
val distinctPartitions = requests.synchronized {
// For each partition only need handle one request
requests.asScala.filter { case (partitionId, _) =>
!inBatchPartitions.get(shuffleId).contains(partitionId)
}.map { case (partitionId, request) =>
inBatchPartitions.get(shuffleId).add(partitionId)
request.asScala.toArray.maxBy(_.epoch)
}.toArray
val distinctPartitions = {
val requestSet = inBatchPartitions.get(shuffleId)
requests.asScala.map { case (partitionId, request) =>
locks(partitionId % locks.length).synchronized {
if (!inBatchPartitions.contains(partitionId)) {
@waitinfuture (Contributor) commented on Apr 19, 2024:

Do you mean requestSet.contains(partitionId) here?

Since requestSet is just a util.HashSet, it's not thread safe. Multiple threads can concurrently modify requestSet if they are processing different partition ids, which I think may cause undefined behavior. Maybe we need to change it to ConcurrentHashMap.
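A minimal sketch of the suggested fix (not code from this PR; names are illustrative). The JDK's ConcurrentHashMap.newKeySet() returns a Set view backed by a ConcurrentHashMap, so concurrent add/contains calls from different handler threads are safe:

    import java.util.{Set => JSet}
    import java.util.concurrent.ConcurrentHashMap

    // Thread-safe replacement for the plain util.HashSet used as the
    // per-shuffle "in batch" partition set.
    val requestSet: JSet[Integer] = ConcurrentHashMap.newKeySet[Integer]()
    requestSet.add(42)                      // true only for the first thread that adds it
    val alreadyBatched = requestSet.contains(42)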

Contributor Author replied:

yeah, I meant requestSet

requestSet.add(partitionId)
Some(request.asScala.toArray.maxBy(_.epoch))
} else {
None
}
}
}.filter(_.isDefined).map(_.get).toArray
@mridulm (Contributor) commented on Apr 17, 2024:

I am not so sure of this change - it feels like it will have a lot more contention, as each entry in the requests map will need to acquire a lock. While uncontended locks are cheap to acquire, they are still not zero cost.

If you have a test bed to validate perf, how about this?

                      val batchPartitions = inBatchPartitions.get(shuffleId)
                      val distinctPartitions = requests.synchronized {
                        // For each partition only need handle one request
                        requests.asScala.filter { case (partitionId, _) =>
                          !batchPartitions.contains(partitionId)
                        }.map { case (partitionId, request) =>
                          batchPartitions.add(partitionId)
                          request.asScala.maxBy(_.epoch)
                        }.toArray
                      }

Essentially minimize the time within the synchronized block itself by removing unnecessary costs.

Contributor Author replied:

Based on our observations in our production system, it won't bring more lock contention.

If we use requests.synchronized, all celeborn-dispatcher threads plus all celeborn-client-life-cycle-manager-change-partition-executor threads compete for the same object for locking, even though they are likely working on different partitions; see the following screenshots.

[screenshot: thread stack traces showing contention on the shared lock]

[screenshot: thread stack traces showing contention on the shared lock]

After this change, with a huge Spark application with 300TB of shuffle data, I don't see such intensive lock contention anymore.

@mridulm (Contributor) replied on Apr 17, 2024:

Fair enough. Thanks for sharing the stack trace.
I would suggest that the changes I gave are relevant irrespective of the locking strategy, as they will minimize the time spent within the critical section.

A Contributor commented:

> it feels like it will have a lot more contention - as each entry in the requests map will need to acquire a lock

To reduce the frequency of acquiring locks, I think we can first compute the lock bucket for each partition id, group the partition ids by bucket, and then acquire each bucket's lock once and process its whole group (iterating buckets in random order). Though I'm not sure how beneficial this would be; a rough sketch of the idea follows.
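Under assumed names (the locks array and the handle callback are illustrative, not code from this PR), the grouping could look like:

    import scala.util.Random

    val locks: Array[AnyRef] = Array.fill(256)(new AnyRef)

    // Acquire each bucket's lock once and handle its whole group under it,
    // instead of locking per partition id.
    def handleGrouped(partitionIds: Seq[Int])(handle: Int => Unit): Unit = {
      val byBucket = partitionIds.groupBy(id => id % locks.length)
      // Randomized bucket order so concurrent callers are less likely to
      // queue up on the same bucket at the same time.
      Random.shuffle(byBucket.toSeq).foreach { case (bucket, ids) =>
        locks(bucket).synchronized {
          ids.foreach(handle) // one lock acquisition per bucket, not per partition
        }
      }
    }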

}
if (distinctPartitions.nonEmpty) {
handleRequestPartitions(
@@ -151,15 +157,21 @@ class ChangePartitionManager(
oldPartition,
cause)

requests.synchronized {
if (requests.containsKey(partitionId)) {
requests.get(partitionId).add(changePartition)
locks(partitionId % locks.length).synchronized {
A Contributor commented:

@CodingCat I think the partition ids of different shuffles can repeat. In the previous implementation the lock is per shuffleId, but in your new implementation the same lock can be contended by the same partition id of different stages. Although a Spark application won't run too many stages concurrently, the Spark Thrift Server might run many stages.

The locks variable can be changed to avoid lock contention between different stages:

    private val locks = JavaUtils.newConcurrentHashMap[Int, Array[AnyRef]]()

I think creating an array of AnyRef per shuffle won't cost more than the contended locks; 256 AnyRef objects consume about 2 KB of memory, so this suggestion won't introduce memory pressure.
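A sketch of that suggestion (illustrative only; the PR itself uses JavaUtils.newConcurrentHashMap): keep one array of lock objects per shuffle, so the same partition id in different shuffles never shares a lock.

    import java.util.concurrent.ConcurrentHashMap

    val bucketsPerShuffle = 256
    // shuffleId -> array of lock objects for that shuffle
    val locks = new ConcurrentHashMap[Int, Array[AnyRef]]()

    def lockFor(shuffleId: Int, partitionId: Int): AnyRef = {
      val shuffleLocks = locks.computeIfAbsent(
        shuffleId,
        new java.util.function.Function[Int, Array[AnyRef]] {
          override def apply(k: Int): Array[AnyRef] =
            Array.fill(bucketsPerShuffle)(new AnyRef)
        })
      shuffleLocks(partitionId % shuffleLocks.length)
    }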

Contributor Author replied:

excellent point! just changed the code

var newEntry = false
val set = requests.computeIfAbsent(
partitionId,
new java.util.function.Function[Integer, util.Set[ChangePartitionRequest]] {
override def apply(t: Integer): util.Set[ChangePartitionRequest] = {
newEntry = true
new util.HashSet[ChangePartitionRequest]()
}
})

if (newEntry) {
logTrace(s"[handleRequestPartitionLocation] For $shuffleId, request for same partition" +
A Contributor commented:

We could replace this with a computeIfAbsent? Something like:

    requests.synchronized {
      var newEntry = false
      val set = requests.computeIfAbsent(partitionId, (v1: Integer) => {
        // If new slot for the partition has been allocated, reply and return.
        // Else register and allocate for it.
        getLatestPartition(shuffleId, partitionId, oldEpoch).foreach { latestLoc =>
          context.reply(
            partitionId,
            StatusCode.SUCCESS,
            Some(latestLoc),
            lifecycleManager.workerStatusTracker.workerAvailable(oldPartition))
          logDebug(s"New partition found, old partition $partitionId-$oldEpoch return it." +
            s" shuffleId: $shuffleId $latestLoc")
          return
        }
        newEntry = true
        new util.HashSet[ChangePartitionRequest]()
      })

      set.add(changePartition)
      if (!newEntry) {
        logTrace(s"[handleRequestPartitionLocation] For $shuffleId, request for same partition" +
          s"$partitionId-$oldEpoch exists, register context.")
      }
    }

Contributor Author replied:

If I understand the suggested code correctly, you essentially create one set in requests for each partition and keep adding requests to it.

I thought the same when iterating on the PR; however, it turns out we cannot do it.

Basically it is not what the original code was doing: the original code always adds a new set containing a single request to the hash map, i.e. lines 178-179.

@mridulm (Contributor) replied on Apr 17, 2024:

The original code is doing the same.
If the partition exists, it adds to the set; else it creates a new set and adds the entry.
(Removing other parts of the code, it is essentially:)

 if (requests.containsKey(partitionId)) {
     requests.get(partitionId).add(changePartition)
 } else {
    // an early exit condition, followed by:
    val set = new util.HashSet[ChangePartitionRequest]()
    set.add(changePartition)
    requests.put(partitionId, set)
 }

It is probing the map multiple times though, which is something we can avoid.

(the return in the getLatestPartition case I suggested looks wrong though - we should return null and exit if set is null)

Contributor Author replied:

requests.putIfAbsent(partitionId, set)
requests.get(partitionId).synchronized {
  getLatestPartition(shuffleId, partitionId, oldEpoch).foreach { latestLoc =>
    context.reply(
      partitionId,
      StatusCode.SUCCESS,
      Some(latestLoc),
      lifecycleManager.workerStatusTracker.workerAvailable(oldPartition))
    logDebug(s"New partition found, old partition $partitionId-$oldEpoch return it." +
      s" shuffleId: $shuffleId $latestLoc")
    return
  }
  requests.get(partitionId).add(changePartition)
}

This was my original code; somehow it makes the application get stuck. That's why I feel this putIfAbsent approach changed the original semantics in a stealthy way.

@mridulm (Contributor) replied on Apr 18, 2024:

This is strictly not the same as what exists in the main branch - I have not analyzed it in greater detail, but the critical sections are different.
Note that the changes I proposed above are meant to remove avoidable probes into the map and improve performance while not changing the critical sections. But if the version I proposed does cause deadlocks/hangs, I would be very curious to know why! (A stack trace would definitely help.) Thanks.

Contributor Author replied:

I have updated the code, will run more tests in our environment.

s"$partitionId-$oldEpoch exists, register context.")
return
} else {
// If new slot for the partition has been allocated, reply and return.
// Else register and allocate for it.
getLatestPartition(shuffleId, partitionId, oldEpoch).foreach { latestLoc =>
context.reply(
partitionId,
@@ -170,10 +182,8 @@ class ChangePartitionManager(
s" shuffleId: $shuffleId $latestLoc")
return
}
val set = new util.HashSet[ChangePartitionRequest]()
set.add(changePartition)
requests.put(partitionId, set)
}
set.add(changePartition)
}
if (!batchHandleChangePartitionEnabled) {
handleRequestPartitions(shuffleId, Array(changePartition))
@@ -914,6 +914,8 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
PartitionSplitMode.valueOf(get(SHUFFLE_PARTITION_SPLIT_MODE))
def shufflePartitionSplitThreshold: Long = get(SHUFFLE_PARTITION_SPLIT_THRESHOLD)
def batchHandleChangePartitionEnabled: Boolean = get(CLIENT_BATCH_HANDLE_CHANGE_PARTITION_ENABLED)
def batchHandleChangePartitionParallelism: Int =
get(CLIENT_BATCH_HANDLE_CHANGE_PARTITION_PARALLELISM)
def batchHandleChangePartitionNumThreads: Int = get(CLIENT_BATCH_HANDLE_CHANGE_PARTITION_THREADS)
def batchHandleChangePartitionRequestInterval: Long =
get(CLIENT_BATCH_HANDLE_CHANGE_PARTITION_INTERVAL)
Expand Down Expand Up @@ -3899,6 +3901,14 @@ object CelebornConf extends Logging {
.booleanConf
.createWithDefault(true)

val CLIENT_BATCH_HANDLE_CHANGE_PARTITION_PARALLELISM: ConfigEntry[Int] =
buildConf("celeborn.client.shuffle.batchHandleChangePartition.parallelism")
A Contributor commented:

Maybe celeborn.client.shuffle.batchHandleChangePartition.partitionBuckets is better?

Contributor Author replied:

updated

.categories("client")
.doc("Max number of change partition requests which can be concurrently processed ")
.version("0.5.0")
.intConf
.createWithDefault(256)
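For reference, a hedged usage sketch of this knob from the Spark side, assuming the usual spark.-prefixed pass-through of Celeborn client configs (the key shown is the name at this revision; a rename to partitionBuckets is discussed above):

    import org.apache.spark.SparkConf

    // Forward the Celeborn client config through the Spark configuration.
    val sparkConf = new SparkConf()
      .set("spark.celeborn.client.shuffle.batchHandleChangePartition.parallelism", "256")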

val CLIENT_BATCH_HANDLE_CHANGE_PARTITION_THREADS: ConfigEntry[Int] =
buildConf("celeborn.client.shuffle.batchHandleChangePartition.threads")
.withAlternative("celeborn.shuffle.batchHandleChangePartition.threads")
1 change: 1 addition & 0 deletions docs/configuration/client.md
@@ -81,6 +81,7 @@ license: |
| celeborn.client.rpc.reserveSlots.askTimeout | <value of celeborn.rpc.askTimeout> | false | Timeout for LifecycleManager request reserve slots. | 0.3.0 | |
| celeborn.client.rpc.shared.threads | 16 | false | Number of shared rpc threads in LifecycleManager. | 0.3.2 | |
| celeborn.client.shuffle.batchHandleChangePartition.interval | 100ms | false | Interval for LifecycleManager to schedule handling change partition requests in batch. | 0.3.0 | celeborn.shuffle.batchHandleChangePartition.interval |
| celeborn.client.shuffle.batchHandleChangePartition.parallelism | 256 | false | Max number of change partition requests which can be concurrently processed | 0.5.0 | |
| celeborn.client.shuffle.batchHandleChangePartition.threads | 8 | false | Threads number for LifecycleManager to handle change partition request in batch. | 0.3.0 | celeborn.shuffle.batchHandleChangePartition.threads |
| celeborn.client.shuffle.batchHandleCommitPartition.interval | 5s | false | Interval for LifecycleManager to schedule handling commit partition requests in batch. | 0.3.0 | celeborn.shuffle.batchHandleCommitPartition.interval |
| celeborn.client.shuffle.batchHandleCommitPartition.threads | 8 | false | Threads number for LifecycleManager to handle commit partition request in batch. | 0.3.0 | celeborn.shuffle.batchHandleCommitPartition.threads |