 import fedml
 from fedml.computing.scheduler.comm_utils import sys_utils, security_utils
 from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
+from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
 from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
 
 for type_name in collections.abc.__all__:
@@ -231,24 +232,6 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
     except docker.errors.APIError:
         raise Exception("Failed to get the container object")
 
-    # Allocate the GPU
-    # TODO: Make sure no competition for each replica in a single deployment
-    if exist_container_obj is not None:
-        client.api.remove_container(exist_container_obj.id, v=True, force=True)
-    device_requests = []
-    if no_real_gpu_allocation is not None:
-        use_gpu = not no_real_gpu_allocation
-    if use_gpu:
-        logging.info("Number of GPUs: {}".format(num_gpus))
-        if gpu_ids is not None:
-            gpu_id_list = map(lambda x: str(x), gpu_ids)
-            device_requests.append(
-                docker.types.DeviceRequest(device_ids=list(gpu_id_list), capabilities=[['gpu']]))
-        else:
-            device_requests.append(
-                docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']]))
-    logging.info(f"device_requests: {device_requests}")
-
     # Pull the inference image
     logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}")
     ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name)
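
For context on the deleted block: it built docker-py `DeviceRequest` objects inline and handed them straight to the host config. A brief sketch of that SDK API, with illustrative values not taken from this PR:

```python
# Illustrative use of docker-py's GPU request API (the same API the removed
# block above relied on); the device IDs and count here are invented.
import docker

client = docker.from_env()

# Pin the container to specific GPUs, like `docker run --gpus '"device=0,1"'`:
specific = docker.types.DeviceRequest(device_ids=["0", "1"], capabilities=[["gpu"]])

# Or request a number of GPUs; count=-1 means all available, like `--gpus all`:
by_count = docker.types.DeviceRequest(count=2, capabilities=[["gpu"]])

host_config = client.api.create_host_config(device_requests=[specific])
```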
@@ -306,6 +289,32 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
     }
     environment["MAIN_ENTRY"] = relative_entry
 
+    host_config_dict = {
+        "binds": binds,
+        "port_bindings": {
+            port_inside_container: usr_indicated_worker_port
+        },
+        "shm_size": shm_size,
+        "storage_opt": storage_opt,
+        "tmpfs": tmpfs,
+        "cpu_count": cpus,
+        "mem_limit": memory
+    }
+
+    # Allocate the GPU
+    # TODO: Make sure no competition for each replica in a single deployment
+    if exist_container_obj is not None:
+        client.api.remove_container(exist_container_obj.id, v=True, force=True)
+    device_requests = {}
+    if no_real_gpu_allocation is not None:
+        use_gpu = not no_real_gpu_allocation
+    if use_gpu:
+        logging.info("Number of GPUs: {}".format(num_gpus))
+        device_requests = HardwareUtil.get_docker_gpu_device_mapping(gpu_ids, num_gpus)
+    logging.info(f"device_requests: {device_requests}")
+
+    host_config_dict.update(device_requests)
+
     # Environment variables
     if not enable_custom_image:
         # For some image, the default user is root. Unified to fedml.
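
`HardwareUtil.get_docker_gpu_device_mapping` is introduced by this PR and its body is not shown in this hunk; the call site implies it returns a dict of host-config kwargs that `host_config_dict.update(...)` can merge. A hypothetical reconstruction of that contract, mirroring the deleted block's behavior for the NVIDIA case:

```python
# Hypothetical sketch of the assumed contract; the real HardwareUtil may
# differ (e.g. it may emit "devices" mappings for non-NVIDIA GPUs).
import docker


def get_docker_gpu_device_mapping(gpu_ids, num_gpus):
    """Return kwargs that can be merged into create_host_config(**kwargs)."""
    if gpu_ids:
        ids = [str(gpu_id) for gpu_id in gpu_ids]
        return {"device_requests": [
            docker.types.DeviceRequest(device_ids=ids, capabilities=[["gpu"]])]}
    if num_gpus:
        return {"device_requests": [
            docker.types.DeviceRequest(count=num_gpus, capabilities=[["gpu"]])]}
    return {}
```

Returning a kwargs dict rather than a bare `device_requests` list lets the helper switch to a different host-config key for other hardware without touching this call site, which is presumably why the merge is done via `update()`.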
@@ -325,24 +334,14 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
             environment[key] = extra_envs[key]
 
     try:
+        host_config = client.api.create_host_config(**host_config_dict)
         new_container = client.api.create_container(
             image=inference_image_name,
             name=default_server_container_name,
             volumes=volumns,
             ports=[port_inside_container],  # port open inside the container
             environment=environment,
-            host_config=client.api.create_host_config(
-                binds=binds,
-                port_bindings={
-                    port_inside_container: usr_indicated_worker_port  # Could be either None or a port number
-                },
-                device_requests=device_requests,
-                shm_size=shm_size,
-                storage_opt=storage_opt,
-                tmpfs=tmpfs,
-                cpu_count=cpus,
-                mem_limit=memory,
-            ),
+            host_config=host_config,
             detach=True,
             command=customized_image_entry_cmd if enable_custom_image else None,
             entrypoint=customized_image_entry_cmd if enable_custom_image else None
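
Putting the hunks together: the host config is now assembled as a plain dict, optionally augmented with GPU kwargs, and expanded into `create_host_config` once. A minimal standalone sketch of that flow, where the image name, bind, and port values are placeholders:

```python
# Minimal sketch of the new flow; all concrete values are invented.
import docker

client = docker.from_env()

host_config_dict = {
    "binds": ["/host/models:/models"],
    "port_bindings": {2345: None},  # None lets Docker pick the host port
}
# GPU kwargs (possibly an empty dict) are merged in before expansion:
host_config_dict.update({"device_requests": [
    docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])]})

host_config = client.api.create_host_config(**host_config_dict)
new_container = client.api.create_container(
    image="fedml/fedml-inference:latest",  # placeholder image
    ports=[2345],
    host_config=host_config,
    detach=True,
)
client.api.start(container=new_container.get("Id"))
```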