Skip to content

Commit 3fbcc2c

Browse files
authored
Merge pull request #2126 from FedML-AI/alexleung/dev_v070_for_refactor
Alexleung/dev v070 for refactor
2 parents bfecbd9 + 84d6156 commit 3fbcc2c

12 files changed

Lines changed: 113 additions & 100 deletions

python/fedml/computing/scheduler/master/base_master_job_runner.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ def run_server_job_impl(self, process_event, completed_event,
285285
self.args.run_id = self.run_id
286286
MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO)
287287

288+
self.status_reporter.report_server_id_status(
289+
run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id,
290+
server_id=self.edge_id, server_agent_id=self.edge_id)
291+
288292
# get training params
289293
private_local_data_dir = data_config.get("privateLocalData", "")
290294
is_using_local_data = 0
@@ -562,7 +566,7 @@ def detect_edges_status(
562566
return True, active_edge_info_dict, inactivate_edges
563567

564568
def report_exception_status(self, run_id):
565-
self.status_reporter.report_job_status(run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION)
569+
self.mlops_metrics.report_job_status(run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION)
566570

567571
def callback_run_logs(self, topic, payload):
568572
run_id = str(topic).split('/')[-1]
@@ -618,12 +622,6 @@ def send_training_request_to_edges(self, request_json, active_edge_info_dict=Non
618622
f"request GPU count {request_num_gpus}"
619623
logging.error(err_info)
620624

621-
# Bug fix: This mqtt message needs to be sent so platform can clean up the failed run and change the
622-
# status from running to failed.
623-
self.mlops_metrics.report_server_training_status(
624-
run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id
625-
)
626-
627625
self.status_reporter.report_server_id_status(
628626
run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
629627
server_id=self.edge_id, server_agent_id=self.server_agent_id)

python/fedml/computing/scheduler/master/base_master_protocol_manager.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -267,19 +267,26 @@ def callback_stop_train(self, topic, payload, use_payload=None):
267267
server_id = request_json.get("serverId", None)
268268
if server_id is None:
269269
server_id = request_json.get("server_id", None)
270+
edge_ids = request_json.get("edgeids", None)
270271

271-
# Broadcast the job status to all edges
272-
self.rebuild_status_center(self.get_status_queue())
273-
self.status_reporter.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED)
272+
# Stop the job runner
273+
self._get_job_runner_manager().stop_job_runner(
274+
run_id, args=self.args, server_id=server_id, request_json=request_json,
275+
run_as_cloud_agent=self.run_as_cloud_agent)
274276

275277
# Cleanup the cached object
276278
if self.running_request_json.get(run_id_str, None) is not None:
277279
self.running_request_json.pop(run_id_str)
278280

279-
# Stop the job runner
280-
self._get_job_runner_manager().stop_job_runner(
281-
run_id, args=self.args, server_id=server_id, request_json=request_json,
282-
run_as_cloud_agent=self.run_as_cloud_agent)
281+
# Reset all edge status and server status
282+
for iter_edge_id in edge_ids:
283+
self.generate_status_report(run_id, iter_edge_id, server_agent_id=server_id).\
284+
report_client_id_status(iter_edge_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED,
285+
run_id=run_id, server_id=server_id)
286+
287+
# To be compatible with previous versions of edge devices, we still send the stopping-train message to edges.
288+
# The latest version of edge devices no longer needs to process the stopping-train message.
289+
self.send_training_stop_request_to_edges(edge_ids, payload=payload, run_id=run_id)
283290

284291
def callback_complete_job(self, topic, payload):
285292
# Parse the parameters.
@@ -505,13 +512,12 @@ def send_training_stop_request_to_edges(
505512
self, edge_id_list, payload=None, run_id=0):
506513
if payload is None:
507514
payload_obj = {"runId": run_id, "edgeids": edge_id_list}
508-
else:
509-
payload_obj = json.loads(payload)
515+
payload = json.dumps(payload_obj)
510516

511517
for edge_id in edge_id_list:
512518
topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train"
513519
logging.info("stop_train: send topic " + topic_stop_train)
514-
self.message_center.send_message(topic_stop_train, json.dumps(payload_obj))
520+
self.message_center.send_message(topic_stop_train, payload)
515521

516522
def send_training_stop_request_to_specific_edge(self, edge_id, payload):
517523
topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train"
@@ -536,7 +542,7 @@ def send_status_msg_to_edges(self, edge_id_list, run_id, server_id, context=None
536542
self.send_status_check_msg(run_id, edge_id, self.edge_id, context=context)
537543

538544
def report_exception_status(self, run_id):
539-
self.status_reporter.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION)
545+
self.mlops_metrics.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION)
540546

541547
@staticmethod
542548
def get_start_train_topic_with_edge_id(edge_id):

python/fedml/computing/scheduler/master/deploy_job_launcher.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def deploy_model(serving_devices, request_json, run_id):
4040
"", random_list[1], None,
4141
in_model_id=model_id, in_model_version=model_version,
4242
endpoint_name=endpoint_name, endpoint_id=endpoint_id, run_id=run_id)
43+
return endpoint_id
44+
return None
4345

4446
def check_model_device_ready_and_deploy(self, request_json, run_id, master_device_id,
4547
slave_device_id, run_edge_ids=None):
@@ -88,3 +90,6 @@ def check_model_device_ready_and_deploy(self, request_json, run_id, master_devic
8890

8991
# Start to deploy the model
9092
FedMLDeployJobLauncher.deploy_model(serving_devices, request_json, run_id=run_id)
93+
94+
95+

python/fedml/computing/scheduler/model_scheduler/master_job_runner.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id
5151
self.deployed_replica_payload = None
5252
self.slave_deployment_results_map = dict()
5353
self.deployment_result_queue = Queue()
54+
self.is_fresh_endpoint = True
5455

5556
# Override
5657
def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None, ):
@@ -75,6 +76,7 @@ def run_impl(
7576
inference_end_point_id, use_gpu, memory_size, model_version, inference_port = \
7677
FedMLDeployMasterJobRunner.parse_model_run_params(self.request_json)
7778
self.run_id = run_id
79+
self.is_fresh_endpoint = self.request_json.get("is_fresh_endpoint", True)
7880

7981
# Print request parameters.
8082
logging.info("model deployment request: {}".format(self.request_json))
@@ -246,7 +248,7 @@ def process_deployment_result_message(self, topic=None, payload=None):
246248
f"{self.request_json}")
247249
return
248250

249-
logging.info(f"End point {end_point_id}; Device {device_id}; replica {replica_no}; "
251+
logging.info(f"Endpoint {end_point_id}; Device {device_id}; replica {replica_no}; "
250252
f"run_operation {run_operation} model status {model_status}.")
251253

252254
# OPTIONAL DEBUG PARAMS
@@ -280,7 +282,7 @@ def process_deployment_result_message(self, topic=None, payload=None):
280282
logging.error(f"Unsupported model status {model_status}.")
281283

282284
# Avoid an endless loop: if the rollback also failed, report the failure to MLOps.
283-
if self.replica_controller.under_rollback:
285+
if self.replica_controller.under_rollback or self.is_fresh_endpoint:
284286
self.send_deployment_status(
285287
end_point_id, end_point_name, payload_json["model_name"], "",
286288
ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED,

python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .master_job_runner_manager import FedMLDeployJobRunnerManager
1212
from ..scheduler_core.general_constants import GeneralConstants
1313
from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol
14+
from ..scheduler_core.compute_cache_manager import ComputeCacheManager
1415

1516

1617
class FedMLDeployMasterProtocolManager(FedMLBaseMasterProtocolManager):
@@ -108,6 +109,10 @@ def callback_delete_deployment(self, topic, payload):
108109
# Parse payload as the model message object.
109110
model_msg_object = FedMLModelMsgObject(topic, payload)
110111

112+
# Get the launch job id
113+
ComputeCacheManager.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password)
114+
launch_job_id = ComputeCacheManager.get_instance().get_gpu_cache().get_endpoint_run_id_map(model_msg_object.run_id)
115+
111116
# Delete SQLite records
112117
FedMLServerDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id)
113118
FedMLModelDatabase.get_instance().delete_deployment_result(
@@ -135,6 +140,12 @@ def callback_delete_deployment(self, topic, payload):
135140
model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_id,
136141
model_msg_object.model_name, model_msg_object.model_version)
137142

143+
# Report the launch job status with killed status.
144+
if launch_job_id is not None:
145+
self.generate_status_report(model_msg_object.run_id, self.edge_id, server_agent_id=self.edge_id).\
146+
report_server_id_status(launch_job_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED,
147+
server_id=self.edge_id, server_agent_id=self.edge_id)
148+
138149
def callback_start_deployment(self, topic, payload):
139150
# noinspection PyBroadException
140151
try:
@@ -174,6 +185,11 @@ def callback_start_deployment(self, topic, payload):
174185
# Set redis config
175186
FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password)
176187

188+
# Query if the endpoint exists
189+
endpoint_device_info = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_device_info(
190+
request_json["end_point_id"])
191+
request_json["is_fresh_endpoint"] = True if endpoint_device_info is None else False
192+
177193
# Save the user setting (about replica number) of this run to Redis, if existed, update it
178194
FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num(
179195
end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version,

python/fedml/computing/scheduler/scheduler_core/message_common.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(self, topic=None, payload=None, status_msg_body: dict = None):
4949
self.payload = payload
5050
self.run_id = None
5151
self.edge_id = None
52+
self.server_id = None
5253
self.status = None
5354
if status_msg_body is not None:
5455
self.from_message_body(status_msg_body=status_msg_body)
@@ -61,6 +62,7 @@ def from_message_body(self, status_msg_body: dict = None):
6162
self.run_id = payload_json.get("run_id", None)
6263
self.run_id = payload_json.get("runId", None) if self.run_id is None else self.run_id
6364
self.edge_id = payload_json.get("edge_id", None)
65+
self.server_id = payload_json.get("server_id", None)
6466
self.status = payload_json.get("status", None)
6567

6668
def get_message_body(self):

python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def stop_job_runner(self, run_id):
3939
if self.job_runners.get(run_id_str, None) is not None:
4040
self.job_runners[run_id_str].trigger_stop_event()
4141

42+
def stop_all_job_runner(self):
43+
for run_id, job_runner in self.job_runners.items():
44+
job_runner.trigger_stop_event()
45+
4246
def complete_job_runner(self, run_id):
4347
run_id_str = str(run_id)
4448
if self.job_runners.get(run_id_str, None) is not None:

python/fedml/computing/scheduler/scheduler_core/status_center.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class FedMLStatusCenter(object):
8181
TOPIC_SLAVE_JOB_LAUNCH_SUFFIX = "/start_train"
8282
TOPIC_SLAVE_JOB_STOP_PREFIX = "flserver_agent/"
8383
TOPIC_SLAVE_JOB_STOP_SUFFIX = "/stop_train"
84+
ALLOWED_MAX_JOB_STATUS_CACHE_NUM = 1000
8485

8586
def __init__(self, message_queue=None):
8687
self.status_queue = message_queue
@@ -203,38 +204,43 @@ def run_status_dispatcher(self, status_event, status_queue,
203204
status_entity = FedMLStatusEntity(status_msg_body=message_body)
204205

205206
# Generate status manager instance
206-
if status_manager_instances.get(status_entity.run_id) is None:
207-
status_manager_instances[status_entity.run_id] = FedMLStatusManager(
208-
run_id=status_entity.run_id, edge_id=status_entity.edge_id, status_center=self,
207+
run_id_str = str(status_entity.run_id)
208+
run_id_int = int(status_entity.run_id)
209+
if status_manager_instances.get(run_id_str) is None:
210+
if len(status_manager_instances.keys()) >= FedMLStatusCenter.ALLOWED_MAX_JOB_STATUS_CACHE_NUM:
211+
for iter_run_id, iter_status_mgr in status_manager_instances.items():
212+
if iter_status_mgr.is_job_completed():
213+
status_manager_instances.pop(iter_run_id)
214+
break
215+
status_manager_instances[run_id_str] = FedMLStatusManager(
216+
run_id=run_id_int, edge_id=status_entity.edge_id,
217+
server_id=status_entity.server_id, status_center=self,
209218
message_center=message_center)
210219
else:
211-
status_manager_instances[status_entity.run_id].edge_id = status_entity.edge_id
220+
status_manager_instances[run_id_str].edge_id = status_entity.edge_id
221+
if status_entity.server_id is not None and str(status_entity.server_id) != "0":
222+
status_manager_instances[run_id_str].server_id = status_entity.server_id
212223

213224
# if the job status is completed then continue
214-
if status_manager_instances[status_entity.run_id].is_job_completed():
225+
if status_manager_instances[run_id_str].is_job_completed():
215226
continue
216227

217228
# Process the master and slave status.
218229
if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_MASTER_STATUS_PREFIX):
219230
# Process the job status
220-
status_manager_instances[status_entity.run_id].status_center_process_master_status(
231+
status_manager_instances[run_id_str].status_center_process_master_status(
221232
message_entity.topic, message_entity.payload)
222233

223234
# Save the job status
224-
status_manager_instances[status_entity.run_id].save_job_status()
225-
226-
# Popup the status manager instance when the job status is completed
227-
if status_manager_instances[status_entity.run_id].is_job_completed():
228-
status_manager_instances.pop(status_entity.run_id)
229-
continue
235+
status_manager_instances[run_id_str].save_job_status()
230236

231237
elif message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_PREFIX):
232238
# Process the slave device status
233-
status_manager_instances[status_entity.run_id].status_center_process_slave_status(
239+
status_manager_instances[run_id_str].status_center_process_slave_status(
234240
message_entity.topic, message_entity.payload)
235241

236242
# Save the device status in job
237-
status_manager_instances[status_entity.run_id].save_device_status_in_job(status_entity.edge_id)
243+
status_manager_instances[run_id_str].save_device_status_in_job(status_entity.edge_id)
238244

239245
except Exception as e:
240246
if message_entity is not None:
@@ -295,40 +301,49 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue,
295301
status_entity = FedMLStatusEntity(status_msg_body=message_body)
296302

297303
# Generate status manager instance
298-
if status_manager_instances.get(status_entity.run_id) is None:
299-
status_manager_instances[status_entity.run_id] = FedMLStatusManager(
300-
run_id=status_entity.run_id, edge_id=status_entity.edge_id, status_center=self,
304+
run_id_str = str(status_entity.run_id)
305+
run_id_int = int(status_entity.run_id)
306+
if status_manager_instances.get(run_id_str) is None:
307+
if len(status_manager_instances.keys()) >= FedMLStatusCenter.ALLOWED_MAX_JOB_STATUS_CACHE_NUM:
308+
for iter_run_id, iter_status_mgr in status_manager_instances.items():
309+
if iter_status_mgr.is_job_completed():
310+
status_manager_instances.pop(iter_run_id)
311+
break
312+
313+
status_manager_instances[run_id_str] = FedMLStatusManager(
314+
run_id=run_id_int, edge_id=status_entity.edge_id, status_center=self,
301315
message_center=message_center)
302316
else:
303-
status_manager_instances[status_entity.run_id].edge_id = status_entity.edge_id
317+
status_manager_instances[run_id_str].edge_id = status_entity.edge_id
304318

305319
# Process the slave status
306320
if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_PREFIX):
307321
# Report the slave status to master
308-
status_manager_instances[status_entity.run_id]. \
322+
status_manager_instances[run_id_str]. \
309323
status_center_process_slave_status_to_master_in_slave_agent(
310324
message_entity.topic, message_entity.payload
311325
)
312326
elif message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_TO_MLOPS_PREFIX):
313327
# Report slave status to mlops (Active/IDLE message)
314-
status_manager_instances[status_entity.run_id]. \
328+
status_manager_instances[run_id_str]. \
315329
status_center_process_slave_status_to_mlops_in_slave_agent(
316330
message_entity.topic, message_entity.payload
317331
)
318332
elif (message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_LAUNCH_PREFIX) and
319333
message_entity.topic.endswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_LAUNCH_SUFFIX)):
334+
pass
320335
# Async request the job status from master when launching the job
321-
job_launch_message_map[status_entity.run_id] = {"topic": message_entity.topic,
322-
"payload": message_entity.payload}
323-
# status_manager_instances[status_entity.run_id]. \
336+
# job_launch_message_map[run_id_str] = {"topic": message_entity.topic,
337+
# "payload": message_entity.payload}
338+
# status_manager_instances[run_id_str]. \
324339
# status_center_request_job_status_from_master_in_slave_agent(
325340
# message_entity.topic, message_entity.payload
326341
# )
327342
elif (message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_STOP_PREFIX) and
328343
message_entity.topic.endswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_STOP_SUFFIX)):
329344
# Cleanup when stopped the job
330-
if job_launch_message_map.get(status_entity.run_id, None) is not None:
331-
job_launch_message_map.pop(status_entity.run_id)
345+
if job_launch_message_map.get(run_id_str, None) is not None:
346+
job_launch_message_map.pop(run_id_str)
332347

333348
except Exception as e:
334349
if message_entity is not None:

0 commit comments

Comments
 (0)