Skip to content

Commit 1e08ee7

Browse files
authored
[CI] Modify 4-card container startup config and move test case (#7363)
1 parent 31e2a8b commit 1e08ee7

2 files changed

Lines changed: 14 additions & 4 deletions

File tree

.github/workflows/_gpu_4cards_case_test.yml

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -181,11 +181,17 @@ jobs:
181181
docker rm -f ${runner_name} || true
182182
fi
183183
184+
export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
185+
184186
docker run --rm --net=host \
185-
--shm-size=64g \
186187
--sysctl kernel.msgmax=1048576 \
187188
--sysctl kernel.msgmnb=268435456 \
188189
--name ${runner_name} \
190+
--cap-add=SYS_PTRACE --cap-add=IPC_LOCK \
191+
--shm-size=64G \
192+
${RDMA_DEVICES} \
193+
--device=/dev/infiniband/rdma_cm \
194+
--ulimit memlock=-1:-1 \
189195
-v $(pwd):/workspace -w /workspace \
190196
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
191197
-v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -197,6 +203,10 @@ jobs:
197203
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
198204
-e "FLASK_PORT=${FLASK_PORT}" \
199205
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
206+
-e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \
207+
-e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \
208+
-e "FD_RDMA_PORT=${FD_RDMA_PORT}" \
209+
-e "CLEAN_CUDA=1" \
200210
-e TZ="Asia/Shanghai" \
201211
-e "fd_wheel_url=${fd_wheel_url}" \
202212
-e "BASE_REF=${BASE_REF}" \

tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py renamed to tests/e2e/4cards_cases/test_ernie_03b_pd_router_v1_rdma_tp2.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626

2727
import pytest
2828
import requests
29-
from utils.serving_utils import (
29+
from e2e.utils.serving_utils import (
3030
FD_API_PORT,
3131
FD_CACHE_QUEUE_PORT,
3232
FD_ENGINE_QUEUE_PORT,
@@ -90,7 +90,7 @@ def setup_and_run_server():
9090

9191
# get rdma nics
9292
current_dir = os.path.dirname(os.path.abspath(__file__))
93-
shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh")
93+
shell_path = os.path.join(current_dir, "../utils/get_rdma_nics.sh")
9494
output = subprocess.check_output(["bash", shell_path, "gpu"], text=True)
9595
_, rdma_nics = output.split("=")
9696
print(f"shell_path: {shell_path}, rdma_nics: {rdma_nics}")
@@ -171,7 +171,7 @@ def setup_and_run_server():
171171
# decode实例
172172
print("start decode...")
173173
env_decode = os.environ.copy()
174-
env_decode["CUDA_VISIBLE_DEVICES"] = "1"
174+
env_decode["CUDA_VISIBLE_DEVICES"] = "2"
175175
env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode")
176176
# env_decode["KVCACHE_RDMA_NICS"] = rdma_nics
177177

0 commit comments

Comments (0)