Skip to content

Commit d7f2423

Browse files
authored
Merge pull request #2111 from FedML-AI/alexleung/dev_v070_for_refactor
Alexleung/dev v070 for refactor
2 parents 7c0e08c + 74a5634 commit d7f2423

6 files changed

Lines changed: 68 additions & 54 deletions

File tree

python/fedml/computing/scheduler/comm_utils/run_process_utils.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ def get_run_process_prefix(prefix, run_id):
1414
return f"{prefix}-run@{run_id}@pid@"
1515

1616
@staticmethod
17-
def cleanup_run_process(run_id, data_dir, info_dir,
18-
info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_RUNNER_PROCESS):
17+
def cleanup_run_process(
18+
run_id, data_dir, info_dir,
19+
info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_RUNNER_PROCESS, not_kill_subprocess=False
20+
):
1921
try:
2022
local_pkg_data_dir = data_dir
2123
run_process_dir = os.path.join(local_pkg_data_dir, info_dir)
@@ -43,12 +45,13 @@ def cleanup_run_process(run_id, data_dir, info_dir,
4345

4446
try:
4547
process = psutil.Process(int(process_id))
46-
child_processes = process.children(recursive=True)
47-
for sub_process in child_processes:
48-
if platform.system() == 'Windows':
49-
os.system("taskkill /PID {} /T /F".format(sub_process.pid))
50-
else:
51-
os.kill(sub_process.pid, signal.SIGKILL)
48+
if not not_kill_subprocess:
49+
child_processes = process.children(recursive=True)
50+
for sub_process in child_processes:
51+
if platform.system() == 'Windows':
52+
os.system("taskkill /PID {} /T /F".format(sub_process.pid))
53+
else:
54+
os.kill(sub_process.pid, signal.SIGKILL)
5255

5356
if process is not None:
5457
if platform.system() == 'Windows':

python/fedml/computing/scheduler/master/base_master_job_runner.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,13 @@ def run(
111111
if self.mlops_metrics is not None:
112112
self.mlops_metrics.stop_sys_perf()
113113
time.sleep(3)
114-
ServerConstants.cleanup_run_process(self.run_id)
114+
self.cleanup_runner_process(self.run_id)
115115
ServerConstants.cleanup_learning_process(self.run_id)
116116
ServerConstants.cleanup_bootstrap_process(self.run_id)
117117

118+
def cleanup_runner_process(self, run_id):
119+
ServerConstants.cleanup_run_process(run_id)
120+
118121
@debug
119122
@abstractmethod
120123
def run_impl(

python/fedml/computing/scheduler/master/server_constants.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,10 @@ def get_dataset_metadata_url():
268268
return get_dataset_metadata_url
269269

270270
@staticmethod
271-
def cleanup_run_process(run_id):
271+
def cleanup_run_process(run_id, not_kill_subprocess=False):
272272
RunProcessUtils.cleanup_run_process(
273-
run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME)
273+
run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME,
274+
not_kill_subprocess=not_kill_subprocess)
274275

275276
@staticmethod
276277
def save_run_process(run_id, process_id):

python/fedml/computing/scheduler/model_scheduler/device_model_inference.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,35 +26,35 @@
2626
pass
2727

2828

29-
class Settings(BaseSettings):
30-
redis_addr: str
31-
redis_port: str
32-
redis_password: str
33-
end_point_name: str
34-
model_name: str
35-
model_version: str
36-
model_infer_url: str
37-
version: str
38-
use_mqtt_inference: bool
39-
use_worker_gateway: bool
40-
ext_info: str
41-
42-
43-
settings = Settings()
44-
45-
# class settings:
46-
# redis_addr = "127.0.0.1"
47-
# redis_port = 6379
48-
# redis_password = "fedml_default"
49-
# end_point_name = ""
50-
# model_name = ""
51-
# model_version = ""
52-
# model_infer_url = "127.0.0.1"
53-
# version = "dev"
54-
# use_mqtt_inference = False
55-
# use_worker_gateway = False
56-
# ext_info = "2b34303961245c4f175f2236282d7a272c040b0904747579087f6a760112030109010c215d54505707140005190a051c347f365c4a430c020a7d39120e26032a78730f797f7c031f0901657e75"
29+
# class Settings(BaseSettings):
30+
# redis_addr: str
31+
# redis_port: str
32+
# redis_password: str
33+
# end_point_name: str
34+
# model_name: str
35+
# model_version: str
36+
# model_infer_url: str
37+
# version: str
38+
# use_mqtt_inference: bool
39+
# use_worker_gateway: bool
40+
# ext_info: str
5741
#
42+
#
43+
# settings = Settings()
44+
45+
class settings:
46+
redis_addr = "127.0.0.1"
47+
redis_port = 6379
48+
redis_password = "fedml_default"
49+
end_point_name = ""
50+
model_name = ""
51+
model_version = ""
52+
model_infer_url = "127.0.0.1"
53+
version = "dev"
54+
use_mqtt_inference = False
55+
use_worker_gateway = False
56+
ext_info = "2b34303961245c4f175f2236282d7a272c040b0904747579087f6a760112030109010c215d54505707140005190a051c347f365c4a430c020a7d39120e26032a78730f797f7c031f0901657e75"
57+
5858

5959
api = FastAPI()
6060

python/fedml/computing/scheduler/model_scheduler/device_server_constants.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,9 +295,10 @@ def get_public_ip():
295295
return ip
296296

297297
@staticmethod
298-
def cleanup_run_process(run_id):
298+
def cleanup_run_process(run_id, not_kill_subprocess=False):
299299
RunProcessUtils.cleanup_run_process(
300-
run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME)
300+
run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME,
301+
not_kill_subprocess=not_kill_subprocess)
301302

302303
@staticmethod
303304
def save_run_process(run_id, process_id):

python/fedml/computing/scheduler/model_scheduler/master_job_runner.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626

2727
class FedMLDeployMasterJobRunner(FedMLBaseMasterJobRunner, FedMLDeployJobRunnerMsgSender, ABC):
28-
2928
default_redis_addr = "local"
3029
default_redis_port = "6379"
3130
default_redis_password = "fedml_default"
@@ -54,7 +53,7 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id
5453
self.deployment_result_queue = Queue()
5554

5655
# Override
57-
def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None,):
56+
def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None, ):
5857
return FedMLDeployMasterJobRunner(
5958
args, run_id=run_id, request_json=request_json, agent_config=self.agent_config, edge_id=edge_id
6059
)
@@ -65,10 +64,10 @@ def _generate_extend_queue_list(self):
6564

6665
# Override
6766
def run_impl(
68-
self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue,
69-
run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue,
70-
run_extend_queue_list=None, sender_message_queue=None, listener_message_queue=None,
71-
status_center_queue=None
67+
self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue,
68+
run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue,
69+
run_extend_queue_list=None, sender_message_queue=None, listener_message_queue=None,
70+
status_center_queue=None
7271
):
7372
# Parse the model parameters.
7473
run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \
@@ -114,13 +113,13 @@ def run_impl(
114113
message_center=self.message_center)
115114

116115
# start unified inference server
117-
self.start_device_inference_gateway(
116+
FedMLDeployMasterJobRunner.start_device_inference_gateway(
118117
inference_port=inference_port, agent_config=self.agent_config)
119118

120119
# start inference monitor server
121-
self.stop_device_inference_monitor(
120+
FedMLDeployMasterJobRunner.stop_device_inference_monitor(
122121
run_id, end_point_name, model_id, model_name, model_version)
123-
self.start_device_inference_monitor(
122+
FedMLDeployMasterJobRunner.start_device_inference_monitor(
124123
run_id, end_point_name, model_id, model_name, model_version)
125124

126125
# Changed the status to "IDLE"
@@ -331,7 +330,7 @@ def process_deployment_result_message(self, topic=None, payload=None):
331330
elif run_operation == "UPDATE":
332331
# Overwrite the json with the rollback version diff
333332
rollback_version_diff = self.replica_controller.rollback_get_replica_version_diff(
334-
device_id_trigger=device_id, replica_no_trigger=replica_no)
333+
device_id_trigger=device_id, replica_no_trigger=replica_no)
335334

336335
# Change the target version to the start version
337336
self.replica_controller.rollback_setback_target_replica_version()
@@ -461,6 +460,9 @@ def process_deployment_result_message(self, topic=None, payload=None):
461460
time.sleep(3)
462461
self.trigger_completed_event()
463462

463+
def cleanup_runner_process(self, run_id):
464+
ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True)
465+
464466
@staticmethod
465467
def start_device_inference_gateway(
466468
inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT,
@@ -489,6 +491,7 @@ def start_device_inference_gateway(
489491
agent_config["mqtt_config"]["MQTT_USER"] + connect_str +
490492
agent_config["mqtt_config"]["MQTT_PWD"] + connect_str +
491493
str(agent_config["mqtt_config"]["MQTT_KEEPALIVE"]), "FEDML@9999GREAT")
494+
python_program = get_python_program()
492495
inference_gateway_process = ServerConstants.exec_console_with_script(
493496
"REDIS_ADDR=\"{}\" REDIS_PORT=\"{}\" REDIS_PASSWORD=\"{}\" "
494497
"END_POINT_NAME=\"{}\" "
@@ -503,6 +506,8 @@ def start_device_inference_gateway(
503506
should_capture_stdout=False, should_capture_stderr=False)
504507

505508
return inference_gateway_process
509+
else:
510+
return inference_gateway_pids[0]
506511

507512
return None
508513

@@ -546,8 +551,6 @@ def recover_inference_and_monitor():
546551
except Exception as e:
547552
pass
548553

549-
FedMLDeployMasterJobRunner.start_device_inference_gateway(agent_config=agent_config)
550-
551554
history_jobs = FedMLServerDataInterface.get_instance().get_history_jobs()
552555
for job in history_jobs.job_list:
553556
if job.running_json is None:
@@ -566,6 +569,9 @@ def recover_inference_and_monitor():
566569
if not is_activated:
567570
continue
568571

572+
FedMLDeployMasterJobRunner.start_device_inference_gateway(
573+
inference_port=inference_port, agent_config=agent_config)
574+
569575
FedMLDeployMasterJobRunner.stop_device_inference_monitor(
570576
run_id, end_point_name, model_id, model_name, model_version)
571577
FedMLDeployMasterJobRunner.start_device_inference_monitor(
@@ -778,7 +784,7 @@ def parse_model_run_params(running_json):
778784
model_version = model_config["model_version"]
779785
model_config_parameters = running_json.get("parameters", {})
780786

781-
inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway
787+
inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway
782788
ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)
783789
inference_port_external = model_config_parameters.get("server_external_port", inference_port)
784790

0 commit comments

Comments (0)