Skip to content

Commit f89c207

Browse files
authored
[CI/Build] Always build with EP in CI (#922)
* Always build with EP in CI
* Add missing LIBRARY_PATH
* Do not build tests and examples in the final steps, to save disk space
* Remove the requirement of USE_CUDA=ON for transfer-engine (without EP)
1 parent 31cc17d commit f89c207

File tree

8 files changed

+174
-201
lines changed

8 files changed

+174
-201
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ jobs:
171171
matrix:
172172
python-version: ['3.10', '3.12']
173173
env:
174+
BUILD_WITH_EP: "1"
174175
SCCACHE_GHA_ENABLED: "true"
175176

176177
steps:
@@ -193,7 +194,8 @@ jobs:
193194
cuda: '12.8.1'
194195
linux-local-args: '["--toolkit"]'
195196
method: 'network'
196-
sub-packages: '["nvcc"]'
197+
sub-packages: '["nvcc", "nvrtc-dev"]'
198+
non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
197199

198200
- name: Run sccache-cache
199201
uses: mozilla-actions/sccache-action@v0.0.9
@@ -213,6 +215,7 @@ jobs:
213215
run: |
214216
sudo apt update -y
215217
sudo bash -x dependencies.sh -y
218+
pip install torch==2.8.0
216219
shell: bash
217220

218221
- name: Build transfer engine only
@@ -231,7 +234,7 @@ jobs:
231234
run: |
232235
mkdir build
233236
cd build
234-
cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
237+
cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
235238
shell: bash
236239
# TODO: lack USE_NVMEOF,USE_CUDA,USE_MNNVL
237240

@@ -259,7 +262,8 @@ jobs:
259262
- name: Configure project
260263
run: |
261264
cd build
262-
cmake .. -DUSE_HTTP=ON -DENABLE_SCCACHE=ON
265+
rm -r */tests
266+
cmake .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON
263267
shell: bash
264268

265269
- name: Build project
@@ -294,100 +298,6 @@ jobs:
294298
name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
295299
path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
296300

297-
build-with-ep:
298-
runs-on: ubuntu-22.04
299-
strategy:
300-
matrix:
301-
python-version: ['3.10', '3.12']
302-
env:
303-
BUILD_WITH_EP: "1"
304-
SCCACHE_GHA_ENABLED: "true"
305-
SCCACHE_CACHE_SIZE: "2G"
306-
307-
steps:
308-
- uses: actions/checkout@v4
309-
310-
- name: Set up Python ${{ matrix.python-version }}
311-
uses: actions/setup-python@v5
312-
with:
313-
python-version: ${{ matrix.python-version }}
314-
315-
- name: Free up disk space
316-
run: |
317-
sudo rm -rf /usr/share/dotnet
318-
sudo rm -rf /opt/ghc
319-
sudo rm -rf /opt/hostedtoolcache/CodeQL
320-
321-
- name: Install CUDA Toolkit
322-
uses: Jimver/cuda-toolkit@v0.2.24
323-
with:
324-
cuda: '12.8.1'
325-
linux-local-args: '["--toolkit"]'
326-
method: 'network'
327-
sub-packages: '["nvcc", "nvrtc-dev"]'
328-
non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
329-
330-
- name: Run sccache-cache
331-
uses: mozilla-actions/sccache-action@v0.0.9
332-
333-
- name: Configure sccache
334-
uses: actions/github-script@v7
335-
with:
336-
script: |
337-
core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
338-
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
339-
340-
- name: Run sccache stat for check
341-
shell: bash
342-
run: ${SCCACHE_PATH} --show-stats
343-
344-
- name: Install dependencies
345-
run: |
346-
sudo apt update -y
347-
sudo bash -x dependencies.sh -y
348-
pip install toml-cli # for updating the version
349-
pip install --no-cache-dir torch==2.8.0
350-
shell: bash
351-
352-
- name: Build transfer engine with EP
353-
run: |
354-
mkdir build
355-
cd build
356-
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
357-
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:LIBRARY_PATH
358-
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
359-
cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=ON -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
360-
make -j
361-
sudo make install
362-
shell: bash
363-
364-
- name: Build nvlink_allocator.so
365-
run: |
366-
mkdir -p build/mooncake-transfer-engine/nvlink-allocator
367-
cd mooncake-transfer-engine/nvlink-allocator
368-
bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
369-
shell: bash
370-
371-
- name: Generate Python version tag
372-
id: generate_tag_flags
373-
run: |
374-
echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
375-
shell: bash
376-
377-
- name: Build Python wheel
378-
run: |
379-
BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
380-
toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep"
381-
# Build wheel with specific Python version
382-
PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
383-
shell: bash
384-
385-
- name: Upload Python wheel artifact
386-
uses: actions/upload-artifact@v4
387-
with:
388-
name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}+ep
389-
path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
390-
391301
build-docker:
392302
name: Build Docker Image
393303
runs-on: ubuntu-22.04

.github/workflows/release.yaml

Lines changed: 3 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -15,93 +15,6 @@ jobs:
1515
strategy:
1616
matrix:
1717
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
18-
steps:
19-
- name: Checkout source
20-
uses: actions/checkout@v4
21-
22-
- name: Set up Python ${{ matrix.python-version }}
23-
uses: actions/setup-python@v5
24-
with:
25-
python-version: ${{ matrix.python-version }}
26-
27-
- name: Free up disk space
28-
run: |
29-
sudo rm -rf /usr/share/dotnet
30-
sudo rm -rf /opt/ghc
31-
sudo rm -rf /opt/hostedtoolcache/CodeQL
32-
33-
- name: Install CUDA Toolkit
34-
uses: Jimver/cuda-toolkit@v0.2.24
35-
with:
36-
cuda: '12.8.1'
37-
linux-local-args: '["--toolkit"]'
38-
method: 'network'
39-
sub-packages: '["nvcc"]'
40-
41-
- name: Run sccache-cache
42-
uses: mozilla-actions/sccache-action@v0.0.9
43-
44-
- name: Configure sccache
45-
uses: actions/github-script@v7
46-
with:
47-
script: |
48-
core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
49-
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
50-
51-
- name: Run sccache stat for check
52-
shell: bash
53-
run: ${SCCACHE_PATH} --show-stats
54-
55-
- name: Configure project
56-
run: |
57-
sudo apt update -y
58-
sudo bash -x dependencies.sh -y
59-
mkdir build
60-
cd build
61-
cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
62-
shell: bash
63-
64-
- name: Build project
65-
run: |
66-
cd build
67-
make -j
68-
sudo make install
69-
shell: bash
70-
71-
- name: Build nvlink_allocator.so
72-
run: |
73-
mkdir -p build/mooncake-transfer-engine/nvlink-allocator
74-
cd mooncake-transfer-engine/nvlink-allocator
75-
bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
76-
shell: bash
77-
78-
- name: Generate Python version tag
79-
id: generate_tag_release
80-
run: |
81-
echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
82-
shell: bash
83-
84-
- name: Build Python wheel
85-
run: |
86-
# Set LD_LIBRARY_PATH for wheel building
87-
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
88-
PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_release.outputs.python_version_tag }} ./scripts/build_wheel.sh
89-
env:
90-
VERSION: ${{ env.VERSION }}
91-
92-
- name: Upload Python wheel artifact
93-
uses: actions/upload-artifact@v4
94-
with:
95-
name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}
96-
path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl
97-
98-
build-with-ep:
99-
runs-on: ubuntu-22.04
100-
permissions:
101-
contents: write
102-
strategy:
103-
matrix:
104-
python-version: ['3.10', '3.12']
10518
env:
10619
BUILD_WITH_EP: "1"
10720
steps:
@@ -146,11 +59,10 @@ jobs:
14659
run: |
14760
sudo apt update -y
14861
sudo bash -x dependencies.sh -y
149-
pip install toml-cli # for updating the version
15062
pip install torch==2.8.0
15163
mkdir build
15264
cd build
153-
cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
65+
cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
15466
shell: bash
15567

15668
- name: Build project
@@ -175,8 +87,6 @@ jobs:
17587

17688
- name: Build Python wheel
17789
run: |
178-
BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
179-
toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep"
18090
# Set LD_LIBRARY_PATH for wheel building
18191
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
18292
PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_release.outputs.python_version_tag }} ./scripts/build_wheel.sh
@@ -186,13 +96,11 @@ jobs:
18696
- name: Upload Python wheel artifact
18797
uses: actions/upload-artifact@v4
18898
with:
189-
name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}+ep
99+
name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}
190100
path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl
191101

192102
publish-release:
193-
needs:
194-
- build
195-
- build-with-ep
103+
needs: build
196104
runs-on: ubuntu-22.04
197105
permissions:
198106
contents: write

doc/en/build.md

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,6 @@ This document describes how to build Mooncake from source.
66
```bash
77
pip3 install mooncake-transfer-engine --upgrade
88
```
9-
- To install with the Mooncake Backend and Mooncake EP support, use the following command:
10-
```bash
11-
# replace torch2.8.0 with the corresponding version
12-
pip3 install mooncake-transfer-engine==0.3.7+ep --upgrade
13-
```
149

1510
## Automatic
1611

mooncake-ep/include/mooncake_backend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
namespace mooncake {
1010

11+
std::string getCudaTopologyJson(const std::vector<std::string>& filter);
12+
1113
class MooncakeBackend final : public ::c10d::Backend {
1214
public:
1315
struct MooncakeBackendOptions final : ::c10d::Backend::Options {
@@ -61,6 +63,7 @@ class MooncakeBackend final : public ::c10d::Backend {
6163
static void setHostIp(const std::string& hostIp) { hostIp_ = hostIp; }
6264

6365
static void setDeviceFilter(std::vector<std::string> filters) {
66+
hca_filters_ = filters;
6467
engine_.setWhitelistFilters(std::move(filters));
6568
}
6669

@@ -72,6 +75,7 @@ class MooncakeBackend final : public ::c10d::Backend {
7275
private:
7376
static TransferEngine engine_;
7477
static Transport* transport_;
78+
static std::vector<std::string> hca_filters_;
7579
static int backendIndex_;
7680
bool isCpu_{false};
7781
static std::string hostIp_;

mooncake-ep/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
add_library(mooncake_ep mooncake_backend.cpp mooncake_ep_buffer.cpp mooncake_ep_kernel.cu mooncake_worker.cu mooncake_worker_thread.cpp mooncake_ibgda/mlx5gda.cpp)
1+
add_library(mooncake_ep mooncake_backend.cpp mooncake_cuda_topology.cpp mooncake_ep_buffer.cpp mooncake_ep_kernel.cu mooncake_worker.cu mooncake_worker_thread.cpp mooncake_ibgda/mlx5gda.cpp)
22

33
set_target_properties(mooncake_ep PROPERTIES POSITION_INDEPENDENT_CODE ON)
44
target_link_libraries(mooncake_ep PUBLIC ${TORCH_LIBRARIES} transfer_engine ibverbs mlx5)

mooncake-ep/src/mooncake_backend.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ constexpr const char* SPARSE_ERROR_MSG = "Sparse op not supported.";
1616
constexpr const char* REDUCE_DTYPE_ERROR_MSG = "Unsupported reduce dtype: ";
1717

1818
std::string MooncakeBackend::hostIp_ = "127.0.0.1";
19-
TransferEngine MooncakeBackend::engine_ = TransferEngine(true);
19+
TransferEngine MooncakeBackend::engine_ = TransferEngine();
2020
Transport* MooncakeBackend::transport_ = nullptr;
21+
std::vector<std::string> MooncakeBackend::hca_filters_;
2122
int MooncakeBackend::backendIndex_ = 0;
2223
MooncakeWorker MooncakeBackend::worker_;
2324

@@ -33,7 +34,11 @@ MooncakeBackend::MooncakeBackend(
3334
// Initialize transfer engine
3435
if (!transport_) {
3536
engine_.init(P2PHANDSHAKE, hostIp_);
36-
transport_ = engine_.installTransport("rdma", nullptr);
37+
std::string topology = getCudaTopologyJson(hca_filters_);
38+
void** args = (void**)malloc(2 * sizeof(void*));
39+
args[0] = (void*)topology.c_str();
40+
args[1] = nullptr;
41+
transport_ = engine_.installTransport("rdma", args);
3742
TORCH_CHECK(transport_ != nullptr,
3843
c10::str("Failed to install transport"));
3944
}

0 commit comments

Comments
 (0)