Skip to content

Commit 38e4453

Browse files
authored
Merge pull request #2137 from FedML-AI/dev/v0.7.0
Dev/v0.7.0
2 parents f4c49c9 + 28e4af4 commit 38e4453

9 files changed

Lines changed: 23 additions & 9 deletions

File tree

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def get_instance():
2626

2727
def get_docker_client(self):
2828
try:
29-
client = docker.from_env(timeout=5, version="auto")
29+
client = docker.from_env()
3030
except Exception:
3131
logging.error("Failed to connect to the docker daemon, please ensure that you have "
3232
"installed Docker Desktop or Docker Engine, and the docker is running")
@@ -180,7 +180,7 @@ def get_container_rank_same_model(prefix: str):
180180
running_model_name = hash("model_endpoint_id_{}_name_{}_model_id_{}_name_{}_ver_{}")
181181
"""
182182
try:
183-
client = docker.from_env(timeout=5, version="auto")
183+
client = docker.from_env()
184184
except Exception:
185185
logging.error("Failed to connect to the docker daemon, please ensure that you have "
186186
"installed Docker Desktop or Docker Engine, and the docker is running")

python/fedml/computing/scheduler/comm_utils/job_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ def get_run_container_name(run_id: int) -> str:
570570
@staticmethod
571571
def get_docker_client(docker_args: DockerArgs) -> DockerClient:
572572
try:
573-
client = docker.from_env(timeout=5, version="auto")
573+
client = docker.from_env()
574574
if docker_args.username != "" and docker_args.registry != "":
575575
client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
576576
except Exception as e:

python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ def filter_by_timestamp(cls,
5050
filtered = metrics
5151
if before_now_minutes:
5252
less_than_ts = \
53-
str(pd.Timestamp.now() - pd.Timedelta(minutes=before_now_minutes))
53+
str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=before_now_minutes))
5454
filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
5555
if before_now_seconds:
5656
less_than_ts = \
57-
str(pd.Timestamp.now() - pd.Timedelta(seconds=before_now_seconds))
57+
str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(seconds=before_now_seconds))
5858
filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
5959
return filtered
6060

@@ -151,6 +151,7 @@ def scale_operation_query_concurrency(cls,
151151

152152
# Otherwise, we proceed as normal.
153153
queries_num = period_data.shape[0]
154+
logging.info(f"Detect {queries_num} of requests in {concurrent_query_policy.window_size_secs} seconds")
154155

155156
try:
156157
# QSR: Queries per Second per Replica: (Number of Queries / Number of Current Replicas) / Window Size

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
210210
infer_host = "127.0.0.1"
211211

212212
try:
213-
client = docker.from_env(timeout=5, version="auto")
213+
client = docker.from_env()
214214
if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \
215215
and docker_registry != "":
216216
client.login(username=docker_registry_user_name, password=docker_registry_user_password,
@@ -467,7 +467,7 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type,
467467
logging.info(f"Attempt: {deploy_attempt} / {deploy_attempt_threshold} ...")
468468

469469
try:
470-
client = docker.from_env(timeout=5, version="auto")
470+
client = docker.from_env()
471471
except Exception:
472472
logging.error("Failed to connect to the docker daemon, please ensure that you have "
473473
"installed Docker Desktop or Docker Engine, and the docker is running")

python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict):
6767
def calc_total_gpu_num(self):
6868
total_gpu_num = 0
6969
for device_id, gpu_num in self.devices_avail_gpus.items():
70-
total_gpu_num += gpu_num
70+
if type(gpu_num) is not int:
71+
logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.")
72+
total_gpu_num += int(gpu_num)
7173
return total_gpu_num
7274

7375
def init_id_replica_num(self):
@@ -77,6 +79,11 @@ def init_id_replica_num(self):
7779
"""
7880
id_replica_num = {}
7981
for id, avail_num in self.devices_avail_gpus.items():
82+
if type(avail_num) is not int:
83+
logging.warning(f"The value in gpu_topology should be int, "
84+
f"but got {type(avail_num)}. Try to convert it.")
85+
avail_num = int(avail_num)
86+
8087
if avail_num % self.gpu_per_replica != 0:
8188
raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica")
8289
id_replica_num[str(id)] = avail_num // self.gpu_per_replica

python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,4 @@ def generate_request_json_with_replica_num_diff(run_id, edge_id, request_json):
6464

6565
@staticmethod
6666
def generate_request_json_with_replica_version_diff(run_id, edge_id, request_json):
67-
return FedMLDeployMasterJobRunner.generate_request_json_with_replica_num_diff(run_id, edge_id, request_json)
67+
return FedMLDeployMasterJobRunner.generate_request_json_with_replica_version_diff(run_id, edge_id, request_json)

python/fedml/computing/scheduler/model_scheduler/modelops_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import certifi
66
import requests
7+
import cachetools.func
78

89
import fedml
910
from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -32,6 +33,7 @@ def get_instance(args):
3233
return ModelOpsConfigs._config_instance
3334

3435
@staticmethod
36+
@cachetools.func.ttl_cache(ttl=600)
3537
def get_request_params():
3638
url = fedml._get_backend_service()
3739
url = "{}/fedmlOpsServer/configs/fetch".format(url)

python/fedml/core/mlops/mlops_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import certifi
66
import requests
7+
import cachetools.func
78

89
import fedml
910
from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -41,6 +42,7 @@ def __init__(self):
4142
pass
4243

4344
@staticmethod
45+
@cachetools.func.ttl_cache(ttl=600)
4446
def get_request_params():
4547
url = fedml._get_backend_service()
4648
url = f"{url}/fedmlOpsServer/configs/fetch"

python/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ def finalize_options(self):
6464
'uvicorn',
6565
'wandb==0.13.2',
6666
'wget',
67+
# Need to pin this version due to breaking change released in python docker sdk
68+
'requests<2.32',
6769
]
6870

6971
requirements_extra_mpi = [

0 commit comments

Comments (0)