Skip to content

Commit 38e4453

Browse files
authored
Merge pull request #2137 from FedML-AI/dev/v0.7.0
Dev/v0.7.0
2 parents f4c49c9 + 28e4af4 commit 38e4453

9 files changed

Lines changed: 23 additions & 9 deletions

File tree

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def get_instance():
2626

2727
def get_docker_client(self):
2828
try:
29-
client = docker.from_env(timeout=5, version="auto")
29+
client = docker.from_env()
3030
except Exception:
3131
logging.error("Failed to connect to the docker daemon, please ensure that you have "
3232
"installed Docker Desktop or Docker Engine, and the docker is running")
@@ -180,7 +180,7 @@ def get_container_rank_same_model(prefix: str):
180180
running_model_name = hash("model_endpoint_id_{}_name_{}_model_id_{}_name_{}_ver_{}")
181181
"""
182182
try:
183-
client = docker.from_env(timeout=5, version="auto")
183+
client = docker.from_env()
184184
except Exception:
185185
logging.error("Failed to connect to the docker daemon, please ensure that you have "
186186
"installed Docker Desktop or Docker Engine, and the docker is running")

python/fedml/computing/scheduler/comm_utils/job_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ def get_run_container_name(run_id: int) -> str:
570570
@staticmethod
571571
def get_docker_client(docker_args: DockerArgs) -> DockerClient:
572572
try:
573-
client = docker.from_env(timeout=5, version="auto")
573+
client = docker.from_env()
574574
if docker_args.username != "" and docker_args.registry != "":
575575
client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
576576
except Exception as e:

python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ def filter_by_timestamp(cls,
5050
filtered = metrics
5151
if before_now_minutes:
5252
less_than_ts = \
53-
str(pd.Timestamp.now() - pd.Timedelta(minutes=before_now_minutes))
53+
str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=before_now_minutes))
5454
filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
5555
if before_now_seconds:
5656
less_than_ts = \
57-
str(pd.Timestamp.now() - pd.Timedelta(seconds=before_now_seconds))
57+
str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(seconds=before_now_seconds))
5858
filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
5959
return filtered
6060

@@ -151,6 +151,7 @@ def scale_operation_query_concurrency(cls,
151151

152152
# Otherwise, we proceed as normal.
153153
queries_num = period_data.shape[0]
154+
logging.info(f"Detect {queries_num} of requests in {concurrent_query_policy.window_size_secs} seconds")
154155

155156
try:
156157
# QSR: Queries per Second per Replica: (Number of Queries / Number of Current Replicas) / Window Size

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
210210
infer_host = "127.0.0.1"
211211

212212
try:
213-
client = docker.from_env(timeout=5, version="auto")
213+
client = docker.from_env()
214214
if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \
215215
and docker_registry != "":
216216
client.login(username=docker_registry_user_name, password=docker_registry_user_password,
@@ -467,7 +467,7 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type,
467467
logging.info(f"Attempt: {deploy_attempt} / {deploy_attempt_threshold} ...")
468468

469469
try:
470-
client = docker.from_env(timeout=5, version="auto")
470+
client = docker.from_env()
471471
except Exception:
472472
logging.error("Failed to connect to the docker daemon, please ensure that you have "
473473
"installed Docker Desktop or Docker Engine, and the docker is running")

python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict):
6767
def calc_total_gpu_num(self):
6868
total_gpu_num = 0
6969
for device_id, gpu_num in self.devices_avail_gpus.items():
70-
total_gpu_num += gpu_num
70+
if type(gpu_num) is not int:
71+
logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.")
72+
total_gpu_num += int(gpu_num)
7173
return total_gpu_num
7274

7375
def init_id_replica_num(self):
@@ -77,6 +79,11 @@ def init_id_replica_num(self):
7779
"""
7880
id_replica_num = {}
7981
for id, avail_num in self.devices_avail_gpus.items():
82+
if type(avail_num) is not int:
83+
logging.warning(f"The value in gpu_topology should be int, "
84+
f"but got {type(avail_num)}. Try to convert it.")
85+
avail_num = int(avail_num)
86+
8087
if avail_num % self.gpu_per_replica != 0:
8188
raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica")
8289
id_replica_num[str(id)] = avail_num // self.gpu_per_replica

python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,4 @@ def generate_request_json_with_replica_num_diff(run_id, edge_id, request_json):
6464

6565
@staticmethod
6666
def generate_request_json_with_replica_version_diff(run_id, edge_id, request_json):
67-
return FedMLDeployMasterJobRunner.generate_request_json_with_replica_num_diff(run_id, edge_id, request_json)
67+
return FedMLDeployMasterJobRunner.generate_request_json_with_replica_version_diff(run_id, edge_id, request_json)

python/fedml/computing/scheduler/model_scheduler/modelops_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import certifi
66
import requests
7+
import cachetools.func
78

89
import fedml
910
from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -32,6 +33,7 @@ def get_instance(args):
3233
return ModelOpsConfigs._config_instance
3334

3435
@staticmethod
36+
@cachetools.func.ttl_cache(ttl=600)
3537
def get_request_params():
3638
url = fedml._get_backend_service()
3739
url = "{}/fedmlOpsServer/configs/fetch".format(url)

python/fedml/core/mlops/mlops_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import certifi
66
import requests
7+
import cachetools.func
78

89
import fedml
910
from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -41,6 +42,7 @@ def __init__(self):
4142
pass
4243

4344
@staticmethod
45+
@cachetools.func.ttl_cache(ttl=600)
4446
def get_request_params():
4547
url = fedml._get_backend_service()
4648
url = f"{url}/fedmlOpsServer/configs/fetch"

python/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ def finalize_options(self):
6464
'uvicorn',
6565
'wandb==0.13.2',
6666
'wget',
67+
# Need to pin this version due to breaking change released in python docker sdk
68+
'requests<2.32',
6769
]
6870

6971
requirements_extra_mpi = [

0 commit comments

Comments (0)