
Commit 41e62ac

Merge pull request #2115 from FedML-AI/alexleung/dev_v070_for_refactor
Alexleung/dev v070 for refactor
2 parents d7f2423 + 5354706, commit 41e62ac

4 files changed: 102 additions and 100 deletions


python/fedml/computing/scheduler/comm_utils/constants.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -83,7 +83,7 @@ class SchedulerConstants:
     TRAIN_PROVISIONING_TIMEOUT = 60 * 25
     TRAIN_STARTING_TIMEOUT = 60 * 15
     TRAIN_STOPPING_TIMEOUT = 60 * 5
-    TRAIN_RUNNING_TIMEOUT = 60 * 60 * 12
+    TRAIN_RUNNING_TIMEOUT = 60 * 60 * 24 * 2000
     TRAIN_INIT_TIMEOUT = 60 * 5

     PUBLIC_REDIS_PORT = 6379
```
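
Note: this raises TRAIN_RUNNING_TIMEOUT from 12 hours (60 * 60 * 12 seconds) to 60 * 60 * 24 * 2000 seconds, roughly 5.5 years, which effectively disables the running-run timeout. Below is a minimal sketch of the behavioral difference, assuming the constant feeds a periodic elapsed-time check; the actual consumer inside the scheduler may differ.

```python
import time

TRAIN_RUNNING_TIMEOUT_OLD = 60 * 60 * 12          # 12 hours (previous value)
TRAIN_RUNNING_TIMEOUT_NEW = 60 * 60 * 24 * 2000   # 2000 days: effectively unlimited

def is_run_timed_out(started_at, timeout, now=None):
    """Return True when a run has been RUNNING longer than `timeout` seconds."""
    now = time.time() if now is None else now
    return (now - started_at) > timeout

# A run that has already been going for 13 hours:
started = time.time() - 13 * 60 * 60
print(is_run_timed_out(started, TRAIN_RUNNING_TIMEOUT_OLD))  # True: old value would trip
print(is_run_timed_out(started, TRAIN_RUNNING_TIMEOUT_NEW))  # False: new value never trips in practice
```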

python/fedml/computing/scheduler/comm_utils/job_monitor.py

Lines changed: 12 additions & 11 deletions
```diff
@@ -354,15 +354,15 @@ def monitor_slave_run_process_status(self):
                 # Check if all processes of the specific run are exited
                 # FIXME: Proactively release the gpu ids when the run processes have not even started yet as the docker
                 # image is being pulled
-                run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id)
-                all_run_processes_exited = True if len(run_process_list) <= 0 else False
-                if all_run_processes_exited:
-                    if not self.released_runs.get(str(job.job_id), False):
-                        self.released_runs[str(job.job_id)] = True
-                        # Release the gpu ids
-                        print(
-                            f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.")
-                        JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
+                # run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id)
+                # all_run_processes_exited = True if len(run_process_list) <= 0 else False
+                # if all_run_processes_exited:
+                #     if not self.released_runs.get(str(job.job_id), False):
+                #         self.released_runs[str(job.job_id)] = True
+                #         # Release the gpu ids
+                #         print(
+                #             f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.")
+                #         JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)

                 # Get the timeout threshold
                 timeout_threshold = None
@@ -381,8 +381,9 @@ def monitor_slave_run_process_status(self):

                 # If the run processes have exited but run status is not completed and
                 # timeout is out of the range, then release gpu ids and report failed status to the master agent.
-                if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \
-                        timeout_threshold is not None and timeout > timeout_threshold:
+                # if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \
+                #         timeout_threshold is not None and timeout > timeout_threshold:
+                if timeout_threshold is not None and timeout > timeout_threshold:
                     # Report failed status to the master agent
                     mlops.log_training_failed_status(
                         run_id=job.job_id, edge_id=job.edge_id, enable_broadcast=True)
```
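
The commented-out block removes the proactive GPU release that fired as soon as a run's learning processes disappeared, and the failure check no longer depends on all_run_processes_exited or is_run_completed: a run is now reported failed on elapsed time alone. A condensed sketch of the new condition follows, assuming `timeout` is the seconds elapsed since the job's last recorded activity; `report_failed` is a hypothetical stand-in for mlops.log_training_failed_status.

```python
from typing import Optional

def should_report_failed(timeout: float, timeout_threshold: Optional[float]) -> bool:
    # Before this commit the check also required that all run processes had
    # exited and that the run was not already completed; now only elapsed
    # time is considered.
    return timeout_threshold is not None and timeout > timeout_threshold

assert should_report_failed(timeout=100.0, timeout_threshold=60.0)
assert not should_report_failed(timeout=100.0, timeout_threshold=None)
```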

python/fedml/computing/scheduler/scheduler_core/status_center.py

Lines changed: 11 additions & 65 deletions
```diff
@@ -1,8 +1,6 @@
 import logging
 import time

-from ..slave.client_constants import ClientConstants
-from ..master.server_constants import ServerConstants
 from enum import Enum, unique
 import multiprocessing
 from multiprocessing import Process, Queue
@@ -11,7 +9,6 @@
 from .message_center import FedMLMessageCenter
 import traceback
 from .status_manager_protocols import FedMLStatusManager
-from .compute_cache_manager import ComputeCacheManager


 @unique
@@ -87,11 +84,6 @@ class FedMLStatusCenter(object):

     def __init__(self, message_queue=None):
         self.status_queue = message_queue
-        self.job_status_in_slave = dict()
-        self.entire_job_status = None
-        self.job_status_in_master = dict()
-        self.slave_devices_status = dict()
-        self.master_devices_status = dict()
         self.status_center_process = None
         self.status_event = None
         self.status_sender_message_center_queue = None
@@ -108,50 +100,6 @@ def __repr__(self):
             attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()),
         )

-    def add_job_status_in_slave(self, device_id, status):
-        self.job_status_in_slave[device_id] = self._status_transition(status)
-
-    def add_job_status_in_master(self, device_id, status):
-        self.job_status_in_master[device_id] = self._status_transition(status)
-
-    def set_entire_job_status(self, status):
-        self.entire_job_status = status
-
-    def add_slave_device_status(self, device_id, status):
-        self.slave_devices_status[device_id] = self._status_transition(status)
-
-    def add_master_device_status(self, device_id, status):
-        self.master_devices_status[device_id] = self._status_transition(status)
-
-    def get_job_status_in_slave(self, device_id):
-        return self.job_status_in_slave.get(device_id, None)
-
-    def get_job_status_in_master(self, device_id):
-        return self.job_status_in_master.get(device_id, None)
-
-    def get_entire_job_status(self):
-        return self.entire_job_status
-
-    def get_slave_device_status(self, device_id):
-        return self.slave_devices_status.get(device_id, None)
-
-    def get_master_device_status(self, device_id):
-        return self.master_devices_status.get(device_id, None)
-
-    def _status_transition(self, status):
-        transition_status = status
-        if self.entire_job_status is not None:
-            if self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \
-                    self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED:
-                if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \
-                        status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \
-                        status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED:
-                    transition_status = status
-                else:
-                    transition_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED
-
-        return transition_status
-
     def get_status_runner(self):
         return None

@@ -205,16 +153,6 @@ def rebuild_message_center(self, message_center_queue):
     def rebuild_status_center(self, status_queue):
         pass

-    @staticmethod
-    def save_job_status(run_id, status):
-        ComputeCacheManager.get_instance().set_redis_params()
-        ComputeCacheManager.get_instance().get_status_cache().save_job_status(run_id, status)
-
-    @staticmethod
-    def save_device_status_in_job(run_id, device_id, status):
-        ComputeCacheManager.get_instance().set_redis_params()
-        ComputeCacheManager.get_instance().get_status_cache().save_device_status_in_job(run_id, device_id, status)
-
     def run_status_dispatcher(self, status_event, status_queue,
                               sender_message_center_queue,
                               listener_message_center_queue):
@@ -272,23 +210,31 @@ def run_status_dispatcher(self, status_event, status_queue,
                 else:
                     status_manager_instances[status_entity.run_id].edge_id = status_entity.edge_id

+                # if the job status is completed then continue
+                if status_manager_instances[status_entity.run_id].is_job_completed():
+                    continue
+
                 # Process the master and slave status.
                 if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_MASTER_STATUS_PREFIX):
                     # Process the job status
                     status_manager_instances[status_entity.run_id].status_center_process_master_status(
                         message_entity.topic, message_entity.payload)

                     # Save the job status
-                    FedMLStatusCenter.save_job_status(status_entity.run_id, self.get_entire_job_status())
+                    status_manager_instances[status_entity.run_id].save_job_status()
+
+                    # Popup the status manager instance when the job status is completed
+                    if status_manager_instances[status_entity.run_id].is_job_completed():
+                        status_manager_instances.pop(status_entity.run_id)
+                        continue

                 elif message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_PREFIX):
                     # Process the slave device status
                     status_manager_instances[status_entity.run_id].status_center_process_slave_status(
                         message_entity.topic, message_entity.payload)

                     # Save the device status in job
-                    FedMLStatusCenter.save_device_status_in_job(status_entity.run_id, status_entity.edge_id,
-                                                                self.get_job_status_in_slave(status_entity.edge_id))
+                    status_manager_instances[status_entity.run_id].save_device_status_in_job(status_entity.edge_id)

             except Exception as e:
                 if message_entity is not None:
```
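
Net effect of this file's changes: per-run status bookkeeping (entire_job_status, the per-device status dicts, _status_transition, and the Redis save helpers) leaves the FedMLStatusCenter singleton, and the dispatcher instead delegates to per-run status manager instances, skipping messages for completed jobs and popping an instance once its job reaches a terminal status. A minimal sketch of that lifecycle pattern, using illustrative stand-in names rather than the real FedML classes:

```python
COMPLETED = {"FAILED", "FINISHED", "KILLED"}

class RunStatusManager:
    """Illustrative stand-in for FedMLStatusManager; not the real API."""
    def __init__(self, run_id):
        self.run_id = run_id
        self.entire_job_status = None

    def set_entire_job_status(self, status):
        self.entire_job_status = status

    def is_job_completed(self):
        return self.entire_job_status in COMPLETED

def dispatch(status_messages):
    status_manager_instances = {}
    for run_id, status in status_messages:
        manager = status_manager_instances.setdefault(run_id, RunStatusManager(run_id))
        if manager.is_job_completed():
            continue  # skip messages for runs that already reached a terminal status
        manager.set_entire_job_status(status)
        if manager.is_job_completed():
            status_manager_instances.pop(run_id)  # drop per-run state once the job ends
    return status_manager_instances

print(dispatch([("run-1", "RUNNING"), ("run-1", "FINISHED")]))  # {} -- manager popped at FINISHED
```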

python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py

Lines changed: 78 additions & 23 deletions
```diff
@@ -12,6 +12,7 @@
 from ..master.server_data_interface import FedMLServerDataInterface
 from .message_common import LogArgs
 from .general_constants import GeneralConstants
+from ..scheduler_core.compute_cache_manager import ComputeCacheManager


 class FedMLStatusManager(object):
@@ -33,13 +34,81 @@ def __init__(self, run_id=None, edge_id=None, server_id=None,
         self.log_args = LogArgs(role="server", edge_id=self.edge_id,
                                 server_id=self.server_id, log_file_dir=ServerConstants.get_log_file_dir())

+        self.job_status_in_slave = dict()
+        self.entire_job_status = None
+        self.job_status_in_master = dict()
+        self.slave_devices_status = dict()
+        self.master_devices_status = dict()
+        self.completed_job_status_list = [ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED,
+                                          ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED,
+                                          ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED]
+
     def __repr__(self):
         return "<{klass} @{id:x} {attrs}>".format(
             klass=self.__class__.__name__,
             id=id(self) & 0xFFFFFF,
             attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()),
         )

+    def add_job_status_in_slave(self, device_id, status):
+        self.job_status_in_slave[device_id] = self._status_transition(status)
+
+    def add_job_status_in_master(self, device_id, status):
+        self.job_status_in_master[device_id] = self._status_transition(status)
+
+    def set_entire_job_status(self, status):
+        self.entire_job_status = status
+
+    def add_slave_device_status(self, device_id, status):
+        self.slave_devices_status[device_id] = self._status_transition(status)
+
+    def add_master_device_status(self, run_id, device_id, status):
+        self.master_devices_status[device_id] = self._status_transition(status)
+
+    def get_job_status_in_slave(self, device_id):
+        return self.job_status_in_slave.get(device_id, None)
+
+    def get_job_status_in_master(self, device_id):
+        return self.job_status_in_master.get(device_id, None)
+
+    def get_entire_job_status(self):
+        return self.entire_job_status
+
+    def get_slave_device_status(self, device_id):
+        return self.slave_devices_status.get(device_id, None)
+
+    def get_master_device_status(self, device_id):
+        return self.master_devices_status.get(device_id, None)
+
+    def is_job_completed(self):
+        if self.entire_job_status and self.entire_job_status in self.completed_job_status_list:
+            return True
+        return False
+
+    def _status_transition(self, status):
+        transition_status = status
+        if self.entire_job_status is not None:
+            if self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \
+                    self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED:
+                if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \
+                        status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \
+                        status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED:
+                    transition_status = status
+                else:
+                    transition_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED
+
+        return transition_status
+
+    def save_job_status(self):
+        ComputeCacheManager.get_instance().set_redis_params()
+        ComputeCacheManager.get_instance().get_status_cache().save_job_status(
+            self.run_id, self.get_entire_job_status())
+
+    def save_device_status_in_job(self, device_id):
+        ComputeCacheManager.get_instance().set_redis_params()
+        ComputeCacheManager.get_instance().get_status_cache().save_device_status_in_job(
+            self.run_id, device_id, self.get_job_status_in_slave(device_id))
+
     def process_job_completed_status(self, master_id, status):
         # Stop the system performance monitor
         try:
@@ -75,10 +144,8 @@ def process_job_completed_status(self, master_id, status):
             self.report_deployment_status(self.run_id, GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED)

     def process_job_exception_status(self, master_id, status):
-        # Send the exception status to slave devices.
-        self.report_exception_status(
-            self.edge_id_list, run_id=self.run_id, server_id=master_id,
-            status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED)
+        # Report exception job status
+        self.report_exception_status(status)

         # Save the job status to local storage
         FedMLServerDataInterface.get_instance().save_job_status(self.run_id, master_id, status, status)
@@ -113,9 +180,9 @@ def status_center_process_master_status(self, topic, payload):

     def process_job_status_consensus(self, run_id, master_id, status):
         # Set the master status in the job and entire job status
-        self.status_center.set_entire_job_status(status)
-        self.status_center.add_job_status_in_master(master_id, status)
-        status = self.status_center.get_entire_job_status()
+        self.set_entire_job_status(status)
+        self.add_job_status_in_master(master_id, status)
+        status = self.get_entire_job_status()

         # Set the device status based on the job status
         edge_id_status_dict = self.client_agent_active_list.get(f"{run_id}", {})
@@ -152,8 +219,8 @@ def get_device_consensus_status_in_job(job_status, device_status):
         return None

     def get_device_consensus_status_in_current_device(self, edge_id, status):
-        self.status_center.add_job_status_in_slave(edge_id, status)
-        consensus_status = self.status_center.get_job_status_in_slave(edge_id)
+        self.add_job_status_in_slave(edge_id, status)
+        consensus_status = self.get_job_status_in_slave(edge_id)
         consensus_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED \
             if consensus_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else consensus_status
         return consensus_status
@@ -275,25 +342,13 @@ def report_server_status(self, run_id, edge_id, server_id, status):
         self.status_reporter.report_server_id_status(
             run_id, status, edge_id=edge_id, server_id=server_id, server_agent_id=edge_id, update_db=False)

-    def report_exception_status(
-            self, edge_id_list, run_id=0, server_id=None, status=None, payload=None):
-        if payload is None:
-            payload_obj = {"runId": run_id, "edgeids": edge_id_list}
-            if server_id is not None:
-                payload_obj["serverId"] = server_id
-        else:
-            payload_obj = json.loads(payload)
-        payload_obj["run_status"] = ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION if status is None else status
-        topic_exception = "flserver_agent/" + str(self.edge_id) + "/stop_train"
-        self.message_reporter.send_message(topic_exception, json.dumps(payload_obj))
+    def report_exception_status(self, status):
+        self.status_reporter.report_job_status(self.run_id, status)

     def status_center_process_slave_status_to_master_in_slave_agent(self, topic, payload):
         # Forward the status message to the sender queue of message center.
         self.message_center.send_message(topic, payload)

-        # Post the status message to the listener queue of message center
-        #self.message_center.receive_message(GeneralConstants.MSG_TOPIC_REPORT_DEVICE_STATUS_IN_JOB, payload)
-
     def status_center_process_slave_status_to_mlops_in_slave_agent(self, topic, payload):
         # Forward the status message to message center.
         self.message_center.send_message(topic, payload)
```
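
This file receives the state and helpers removed from status_center.py: FedMLStatusManager now owns the per-run status dicts, the terminal-status list behind is_job_completed, the _status_transition rule, and instance-level save_job_status / save_device_status_in_job wrappers around ComputeCacheManager, while report_exception_status shrinks to a single report_job_status call. The transition rule can be restated compactly; the sketch below uses plain strings in place of the ServerConstants/ClientConstants values.

```python
# Once the entire job is FAILED or FINISHED, any per-device status that is not
# already terminal is coerced to KILLED; terminal device statuses pass through.
JOB_TERMINAL = {"FAILED", "FINISHED"}
DEVICE_TERMINAL = {"FAILED", "FINISHED", "KILLED"}

def status_transition(entire_job_status, device_status):
    if entire_job_status in JOB_TERMINAL and device_status not in DEVICE_TERMINAL:
        return "KILLED"
    return device_status

assert status_transition("FINISHED", "RUNNING") == "KILLED"   # non-terminal coerced
assert status_transition("FINISHED", "FAILED") == "FAILED"    # terminal kept as-is
assert status_transition(None, "RUNNING") == "RUNNING"        # job still active
```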
