Skip to content

Commit be635db

Browse files
committed
Update container creation during deployment
1 parent 57c5dc4 commit be635db

6 files changed

Lines changed: 40 additions & 41 deletions

File tree

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,8 @@ def pull_image_with_policy(self, image_pull_policy, image_name, client=None):
227227
raise Exception(f"Unsupported image pull policy: {image_pull_policy}")
228228

229229
class ContainerMetrics:
230-
def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes, network_sent_megabytes,
231-
blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat
232-
):
230+
def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes,
231+
network_sent_megabytes, blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat):
233232
self.cpu_percent = cpu_percent
234233
self.mem_used_megabytes = mem_used_megabytes
235234
self.mem_avail_megabytes = mem_avail_megabytes

python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def get_gpu_cards() -> List[GPUCard]:
5252

5353
@staticmethod
5454
@abstractmethod
55-
def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
55+
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
5656
raise NotImplementedError
5757

5858
@staticmethod

python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memo
2828
return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)
2929

3030
@staticmethod
31-
def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
32-
if gpu_ids and len(gpu_ids):
31+
def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]:
32+
if gpu_ids is not None and len(gpu_ids):
3333
gpu_id_list = list(map(lambda x: str(x), gpu_ids))
3434
return {"device_requests": [docker.types.DeviceRequest(device_ids=gpu_id_list, capabilities=[["gpu"]])]}
35-
return None
35+
else:
36+
return {"device_requests": [docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']])]}
3637

3738
@staticmethod
3839
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:

python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memo
5555
return list(map(lambda card: card.id, gpu_cards))
5656

5757
@staticmethod
58-
def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
59-
if gpu_ids and len(gpu_ids):
58+
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
59+
if gpu_ids is not None and len(gpu_ids):
6060
return {
6161
"devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}" for gpu_id
6262
in gpu_ids]}

python/fedml/computing/scheduler/comm_utils/hardware_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ def get_available_gpu_ids(order: str = "memory", limit: int = 1, max_load: float
4242
return gpu_util.get_available_gpu_card_ids(order, limit, max_load, max_memory) if gpu_util is not None else []
4343

4444
@staticmethod
45-
def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
45+
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
4646
gpu_util = HardwareUtil.__get_util()
4747
if gpu_util is not None:
48-
return gpu_util.get_docker_gpu_device_mapping(gpu_ids)
48+
return gpu_util.get_docker_gpu_device_mapping(gpu_ids, num_gpus)
4949
return None
5050

5151
@staticmethod

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import fedml
1919
from fedml.computing.scheduler.comm_utils import sys_utils, security_utils
2020
from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
21+
from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
2122
from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
2223

2324
for type_name in collections.abc.__all__:
@@ -231,24 +232,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
231232
except docker.errors.APIError:
232233
raise Exception("Failed to get the container object")
233234

234-
# Allocate the GPU
235-
# TODO: Make sure no competition for each replica in a single deployment
236-
if exist_container_obj is not None:
237-
client.api.remove_container(exist_container_obj.id, v=True, force=True)
238-
device_requests = []
239-
if no_real_gpu_allocation is not None:
240-
use_gpu = not no_real_gpu_allocation
241-
if use_gpu:
242-
logging.info("Number of GPUs: {}".format(num_gpus))
243-
if gpu_ids is not None:
244-
gpu_id_list = map(lambda x: str(x), gpu_ids)
245-
device_requests.append(
246-
docker.types.DeviceRequest(device_ids=list(gpu_id_list), capabilities=[['gpu']]))
247-
else:
248-
device_requests.append(
249-
docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']]))
250-
logging.info(f"device_requests: {device_requests}")
251-
252235
# Pull the inference image
253236
logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}")
254237
ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name)
@@ -306,6 +289,32 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
306289
}
307290
environment["MAIN_ENTRY"] = relative_entry
308291

292+
host_config_dict = {
293+
"binds": binds,
294+
"port_bindings": {
295+
port_inside_container: usr_indicated_worker_port
296+
},
297+
"shm_size": shm_size,
298+
"storage_opt": storage_opt,
299+
"tmpfs": tmpfs,
300+
"cpu_count": cpus,
301+
"mem_limit": memory
302+
}
303+
304+
# Allocate the GPU
305+
# TODO: Make sure no competition for each replica in a single deployment
306+
if exist_container_obj is not None:
307+
client.api.remove_container(exist_container_obj.id, v=True, force=True)
308+
device_requests = {}
309+
if no_real_gpu_allocation is not None:
310+
use_gpu = not no_real_gpu_allocation
311+
if use_gpu:
312+
logging.info("Number of GPUs: {}".format(num_gpus))
313+
device_requests = HardwareUtil.get_docker_gpu_device_mapping(gpu_ids, num_gpus)
314+
logging.info(f"device_requests: {device_requests}")
315+
316+
host_config_dict.update(device_requests)
317+
309318
# Environment variables
310319
if not enable_custom_image:
311320
# For some image, the default user is root. Unified to fedml.
@@ -325,24 +334,14 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
325334
environment[key] = extra_envs[key]
326335

327336
try:
337+
host_config = client.api.create_host_config(**host_config_dict)
328338
new_container = client.api.create_container(
329339
image=inference_image_name,
330340
name=default_server_container_name,
331341
volumes=volumns,
332342
ports=[port_inside_container], # port open inside the container
333343
environment=environment,
334-
host_config=client.api.create_host_config(
335-
binds=binds,
336-
port_bindings={
337-
port_inside_container: usr_indicated_worker_port # Could be either None or a port number
338-
},
339-
device_requests=device_requests,
340-
shm_size=shm_size,
341-
storage_opt=storage_opt,
342-
tmpfs=tmpfs,
343-
cpu_count=cpus,
344-
mem_limit=memory,
345-
),
344+
host_config=host_config,
346345
detach=True,
347346
command=customized_image_entry_cmd if enable_custom_image else None,
348347
entrypoint=customized_image_entry_cmd if enable_custom_image else None

0 commit comments

Comments (0)