keep old api in vllm-integration

doujiang24 · doujiang24 · commit 594d518c0f13 · 2025-01-16T21:16:14.000+08:00
Signed-off-by: doujiang24 <doujiang24@gmail.com>
diff --git a/doc/en/vllm-integration.md b/doc/en/vllm-integration.md
@@ -37,6 +37,7 @@ pip3 install -e .
  - If you encounter any problems that you cannot solve, please refer to the [vLLM official compilation guide](https://docs.vllm.ai/en/v0.6.4.post1/getting_started/installation.html#install-the-latest-code).
 
 ## Configuration
+### Prepare configuration file to Run Example over RDMA
 
 - Prepare a _**mooncake.json**_ file for both Prefill and Decode instances
 - **You don't need to change the `prefill_url` and `decode_url` of the config file in the decode side, please use the identical config file.**
@@ -45,14 +46,33 @@ pip3 install -e .
 {
   "prefill_url": "192.168.0.137:13003",
   "decode_url": "192.168.0.139:13003",
-  "metadata_server": "192.168.0.139:2379"
+  "metadata_server": "192.168.0.139:2379",
+  "protocol": "rdma",
+  "device_name": "erdma_0"
 }
 ```
 - "prefill_url": The IP address and port of the Prefill node.
   - The port in the URL is used to communicate with etcd server for metadata.
 - "decode_url": The IP address and port of the Decode node.
   - The port in the URL is used to communicate with etcd server for metadata.
 - "metadata_server": The etcd server of mooncake transfer engine.
+- "protocol": The protocol to be used for data transmission ("rdma" or "tcp").
+- "device_name": The device to be used for data transmission, required when "protocol" is set to "rdma". If multiple NIC devices are used, separate them with commas, such as "erdma_0,erdma_1". Please note that there must be no spaces between them.
+
+
+### Prepare configuration file to Run Example over TCP
+
+- Prepare a _**mooncake.json**_ file for both Prefill and Decode instances
+```json
+{
+  "prefill_url": "192.168.0.137:13003",
+  "decode_url": "192.168.0.139:13003",
+  "metadata_server": "192.168.0.139:2379",
+  "protocol": "tcp",
+  "device_name": ""
+}
+```
+
 
 ## Run Example
  - Please change the IP addresses and ports in the following guide according to your env.
diff --git a/doc/zh/vllm-integration.md b/doc/zh/vllm-integration.md
@@ -37,6 +37,7 @@ pip3 install -e .
  - 如果遇到任何无法解决的问题，请参照[vLLM官方的编译指南](https://docs.vllm.ai/en/v0.6.4.post1/getting_started/installation.html#install-the-latest-code)。
 
 ## 配置
+### 使用 RDMA 运行示例所需配置文件
 
 - 为预填充和解码实例准备一个 mooncake.json 文件
 - **在解码实例侧，你无须更改配置文件里的`prefill_url` 与 `decode_url`，使用完全相同的配置文件即可。**
@@ -45,14 +46,33 @@ pip3 install -e .
 {
   "prefill_url": "192.168.0.137:13003",
   "decode_url": "192.168.0.139:13003",
-  "metadata_server": "192.168.0.139:2379"
+  "metadata_server": "192.168.0.139:2379",
+  "protocol": "rdma",
+  "device_name": "erdma_0"
 }
 ```
 - "prefill_url": 预填充节点的 IP 地址和端口。
   - URL 中的端口用于与 etcd 服务器通信以获取元数据。
 - "decode_url": 解码节点的 IP 地址和端口。
   - URL 中的端口用于与 etcd 服务器通信以获取元数据。
 - "metadata_server": mooncake 传输引擎的 etcd 服务器。
+- "protocol": 数据传输协议（"rdma" 或 "tcp"）。
+- "device_name": 用于数据传输的设备，当 "protocol" 设置为 "rdma" 时必填。如果使用多个 NIC 设备，它们可以用逗号分隔，如 "erdma_0,erdma_1"。请注意它们之间不能有空格。
+
+
+### 使用 TCP 运行示例所需配置文件
+
+- 为预填充和解码实例准备一个 mooncake.json 文件
+```json
+{
+  "prefill_url": "192.168.0.137:13003",
+  "decode_url": "192.168.0.139:13003",
+  "metadata_server": "192.168.0.139:2379",
+  "protocol": "tcp",
+  "device_name": ""
+}
+```
+
 
 ## 运行示例
  - 请根据您的环境更改以下指南中的 IP 地址和端口。
diff --git a/mooncake-integration/vllm/vllm_adaptor.cpp b/mooncake-integration/vllm/vllm_adaptor.cpp
@@ -28,6 +28,24 @@ VLLMAdaptor::~VLLMAdaptor() {
     large_buffer_list_.clear();
 }
 
+std::string formatDeviceNames(const std::string &device_names) {  // e.g. a,b -> "a","b" (each name double-quoted)
+    std::stringstream ss(device_names);
+    std::string item;
+    std::vector<std::string> tokens;
+    while (getline(ss, item, ',')) {  // split on ','; items are kept verbatim (no whitespace trimming)
+        tokens.push_back(item);
+    }
+
+    std::string formatted;  // stays empty when device_names is empty (no tokens)
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        formatted += "\"" + tokens[i] + "\"";  // wrap the name in double quotes
+        if (i < tokens.size() - 1) {  // comma between entries, none after the last one
+            formatted += ",";
+        }
+    }
+    return formatted;  // initializeExt embeds this between [ and ] when building nic_priority_matrix
+}
+
 std::pair<std::string, std::string> parseConnectionString(
     const std::string &conn_string) {
     std::pair<std::string, std::string> result;
@@ -48,26 +66,46 @@ std::pair<std::string, std::string> parseConnectionString(
 }
 
 int VLLMAdaptor::initialize(const char *local_hostname,
-                            const char *metadata_server) {
+                            const char *metadata_server, const char *protocol,
+                            const char *device_name) {  // protocol and device_name are forwarded unchanged to initializeExt
     auto conn_string = parseConnectionString(metadata_server);
-    return initializeExt(local_hostname, conn_string.second.c_str(),
-                         conn_string.first.c_str());
+    return initializeExt(local_hostname, conn_string.second.c_str(), protocol,
+                         device_name, conn_string.first.c_str());  // .second -> metadata_server, .first -> metadata_type
 }
 
 int VLLMAdaptor::initializeExt(const char *local_hostname,
                                const char *metadata_server,
+                               const char *protocol, const char *device_name,
                                const char *metadata_type) {
     std::string conn_string = metadata_server;
     if (conn_string.find("://") == std::string::npos)
         conn_string =
             std::string(metadata_type) + "://" + std::string(metadata_server);
 
-    engine_ = std::make_unique<TransferEngine>();
+    // TODO: remove `false` in the future; it is only passed to keep the same
+    // API in vllm.
+    engine_ = std::make_unique<TransferEngine>(false);
     auto hostname_port = parseHostNameWithPort(local_hostname);
     int ret = engine_->init(conn_string, local_hostname,
                             hostname_port.first.c_str(), hostname_port.second);
     if (ret) return -1;
 
+    xport_ = nullptr;
+    if (strcmp(protocol, "rdma") == 0) {
+        auto device_names = formatDeviceNames(device_name);
+        std::string nic_priority_matrix =
+            "{\"cpu:0\": [[" + device_names + "], []]}";
+        void **args = (void **)malloc(2 * sizeof(void *));
+        args[0] = (void *)nic_priority_matrix.c_str();
+        args[1] = nullptr;
+        xport_ = engine_->installTransport("rdma", args);
+    } else if (strcmp(protocol, "tcp") == 0) {
+        xport_ = engine_->installTransport("tcp", nullptr);
+    } else {
+        LOG(ERROR) << "Unsupported protocol";
+        return -1;
+    }
+
+    if (!xport_) return -1;
     free_list_.resize(kSlabSizeKBTabLen);
     doBuddyAllocate(kMaxClassId);
     return 0;
@@ -202,4 +240,4 @@ PYBIND11_MODULE(mooncake_vllm_adaptor, m) {
         .def("readBytesFromBuffer", &VLLMAdaptor::readBytesFromBuffer)
         .def("expRegisterMemory", &VLLMAdaptor::expRegisterMemory)
         .def("expUnregisterMemory", &VLLMAdaptor::expUnregisterMemory);
-}
+}
diff --git a/mooncake-integration/vllm/vllm_adaptor.h b/mooncake-integration/vllm/vllm_adaptor.h
@@ -44,9 +44,11 @@ class VLLMAdaptor {
 
     ~VLLMAdaptor();
 
-    int initialize(const char *local_hostname, const char *metadata_server);
+    int initialize(const char *local_hostname, const char *metadata_server,
+                   const char *protocol, const char *device_name);
 
     int initializeExt(const char *local_hostname, const char *metadata_server,
+                      const char *protocol, const char *device_name,
                       const char *metadata_type);
 
     uintptr_t allocateManagedBuffer(size_t length);
@@ -84,6 +86,7 @@ class VLLMAdaptor {
 
    private:
     std::shared_ptr<TransferEngine> engine_;
+    Transport *xport_;
 
     std::mutex mutex_;
     std::vector<std::stack<char *>> free_list_;