 import fedml
 from fedml.computing.scheduler.comm_utils import sys_utils, security_utils
 from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
+from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
 from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
 
 for type_name in collections.abc.__all__:
@@ -231,24 +232,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
     except docker.errors.APIError:
         raise Exception("Failed to get the container object")
 
-    # Allocate the GPU
-    # TODO: Make sure no competition for each replica in a single deployment
-    if exist_container_obj is not None:
-        client.api.remove_container(exist_container_obj.id, v=True, force=True)
-    device_requests = []
-    if no_real_gpu_allocation is not None:
-        use_gpu = not no_real_gpu_allocation
-    if use_gpu:
-        logging.info("Number of GPUs: {}".format(num_gpus))
-        if gpu_ids is not None:
-            gpu_id_list = map(lambda x: str(x), gpu_ids)
-            device_requests.append(
-                docker.types.DeviceRequest(device_ids=list(gpu_id_list), capabilities=[['gpu']]))
-        else:
-            device_requests.append(
-                docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']]))
-    logging.info(f"device_requests: {device_requests}")
-
     # Pull the inference image
     logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}")
     ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name)
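
For context on the deleted block: it built docker-py `DeviceRequest` objects inline and handed them straight to the host config. A brief sketch of that SDK API, with illustrative values not taken from this PR:

```python
# Illustrative use of docker-py's GPU request API (the same API the removed
# block above relied on); the device IDs and count here are invented.
import docker

client = docker.from_env()

# Pin the container to specific GPUs, like `docker run --gpus '"device=0,1"'`:
specific = docker.types.DeviceRequest(device_ids=["0", "1"], capabilities=[["gpu"]])

# Or request a number of GPUs; count=-1 means all available, like `--gpus all`:
by_count = docker.types.DeviceRequest(count=2, capabilities=[["gpu"]])

host_config = client.api.create_host_config(device_requests=[specific])
```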
@@ -306,6 +289,32 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
     }
     environment["MAIN_ENTRY"] = relative_entry
 
+    host_config_dict = {
+        "binds": binds,
+        "port_bindings": {
+            port_inside_container: usr_indicated_worker_port
+        },
+        "shm_size": shm_size,
+        "storage_opt": storage_opt,
+        "tmpfs": tmpfs,
+        "cpu_count": cpus,
+        "mem_limit": memory
+    }
+
+    # Allocate the GPU
+    # TODO: Make sure no competition for each replica in a single deployment
+    if exist_container_obj is not None:
+        client.api.remove_container(exist_container_obj.id, v=True, force=True)
+    device_requests = {}
+    if no_real_gpu_allocation is not None:
+        use_gpu = not no_real_gpu_allocation
+    if use_gpu:
+        logging.info("Number of GPUs: {}".format(num_gpus))
+        device_requests = HardwareUtil.get_docker_gpu_device_mapping(gpu_ids, num_gpus)
+    logging.info(f"device_requests: {device_requests}")
+
+    host_config_dict.update(device_requests)
+
     # Environment variables
     if not enable_custom_image:
         # For some image, the default user is root. Unified to fedml.
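
`HardwareUtil.get_docker_gpu_device_mapping` is introduced by this PR and its body is not shown in this hunk; the call site implies it returns a dict of host-config kwargs that `host_config_dict.update(...)` can merge. A hypothetical reconstruction of that contract, mirroring the deleted block's behavior for the NVIDIA case:

```python
# Hypothetical sketch of the assumed contract; the real HardwareUtil may
# differ (e.g. it may emit "devices" mappings for non-NVIDIA GPUs).
import docker


def get_docker_gpu_device_mapping(gpu_ids, num_gpus):
    """Return kwargs that can be merged into create_host_config(**kwargs)."""
    if gpu_ids:
        ids = [str(gpu_id) for gpu_id in gpu_ids]
        return {"device_requests": [
            docker.types.DeviceRequest(device_ids=ids, capabilities=[["gpu"]])]}
    if num_gpus:
        return {"device_requests": [
            docker.types.DeviceRequest(count=num_gpus, capabilities=[["gpu"]])]}
    return {}
```

Returning a kwargs dict rather than a bare `device_requests` list lets the helper switch to a different host-config key for other hardware without touching this call site, which is presumably why the merge is done via `update()`.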
@@ -325,24 +334,14 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
             environment[key] = extra_envs[key]
 
     try:
+        host_config = client.api.create_host_config(**host_config_dict)
         new_container = client.api.create_container(
             image=inference_image_name,
             name=default_server_container_name,
             volumes=volumns,
             ports=[port_inside_container],  # port open inside the container
             environment=environment,
-            host_config=client.api.create_host_config(
-                binds=binds,
-                port_bindings={
-                    port_inside_container: usr_indicated_worker_port  # Could be either None or a port number
-                },
-                device_requests=device_requests,
-                shm_size=shm_size,
-                storage_opt=storage_opt,
-                tmpfs=tmpfs,
-                cpu_count=cpus,
-                mem_limit=memory,
-            ),
+            host_config=host_config,
             detach=True,
             command=customized_image_entry_cmd if enable_custom_image else None,
             entrypoint=customized_image_entry_cmd if enable_custom_image else None
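
Putting the hunks together: the host config is now assembled as a plain dict, optionally augmented with GPU kwargs, and expanded into `create_host_config` once. A minimal standalone sketch of that flow, where the image name, bind, and port values are placeholders:

```python
# Minimal sketch of the new flow; all concrete values are invented.
import docker

client = docker.from_env()

host_config_dict = {
    "binds": ["/host/models:/models"],
    "port_bindings": {2345: None},  # None lets Docker pick the host port
}
# GPU kwargs (possibly an empty dict) are merged in before expansion:
host_config_dict.update({"device_requests": [
    docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])]})

host_config = client.api.create_host_config(**host_config_dict)
new_container = client.api.create_container(
    image="fedml/fedml-inference:latest",  # placeholder image
    ports=[2345],
    host_config=host_config,
    detach=True,
)
client.api.start(container=new_container.get("Id"))
```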