Skip to content

Commit a24e962

Browse files
authored
Merge pull request #2132 from FedML-AI/raphael/hot-fix-deploy
[Deploy] Try to convert the gpu_topology value type to int.
2 parents de480be + 92b7e16 commit a24e962

1 file changed

Lines changed: 8 additions & 1 deletion

File tree

python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict):
6767
def calc_total_gpu_num(self):
    """Sum the GPUs available across all devices in the topology.

    The values of ``self.devices_avail_gpus`` are expected to be ``int``,
    but upstream request payloads sometimes deliver them as strings or
    floats; non-int values are coerced with ``int()`` after logging a
    warning (this mirrors the hot-fix intent of the original commit).

    Returns:
        int: total number of available GPUs across every device.

    Raises:
        ValueError: if a value cannot be converted by ``int()``
            (e.g. a non-numeric string).
        TypeError: if a value is of a type ``int()`` cannot accept.
    """
    total_gpu_num = 0
    # Device ids are not needed for the sum, so iterate values only.
    for gpu_num in self.devices_avail_gpus.values():
        if not isinstance(gpu_num, int):
            # isinstance() is the idiomatic type check; lazy %-style
            # arguments avoid formatting work when WARNING is disabled.
            logging.warning(
                "The value in gpu_topology should be int, but got %s. Try to convert it.",
                type(gpu_num))
        total_gpu_num += int(gpu_num)
    return total_gpu_num
7274

7375
def init_id_replica_num(self):
@@ -77,6 +79,11 @@ def init_id_replica_num(self):
7779
"""
7880
id_replica_num = {}
7981
for id, avail_num in self.devices_avail_gpus.items():
82+
if type(avail_num) is not int:
83+
logging.warning(f"The value in gpu_topology should be int, "
84+
f"but got {type(avail_num)}. Try to convert it.")
85+
avail_num = int(avail_num)
86+
8087
if avail_num % self.gpu_per_replica != 0:
8188
raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica")
8289
id_replica_num[str(id)] = avail_num // self.gpu_per_replica

0 commit comments

Comments (0)