Skip to content

Commit 34fdba0

Browse files
authored
Merge pull request #2157 from FedML-AI/raphael/unify-connectivity
[Deploy] Report the worker's connectivity type when it finishes.
2 parents af026fb + 4a9622c commit 34fdba0

6 files changed

Lines changed: 97 additions & 53 deletions

File tree

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
import os

from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants


def return_this_device_connectivity_type() -> str:
    """Return this worker's connectivity type for inference routing.

    Reads the connection-type environment variable (whose key is
    ``ClientConstants.ENV_CONNECTION_TYPE_KEY``) and validates it against
    the known worker connectivity types.

    Returns:
        str: One of ``"http"``, ``"http_proxy"``, or ``"mqtt"``. Falls back
        to ``ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT`` when the
        variable is unset or holds an unrecognized value.
    """
    # Normalize to lower case so values such as "HTTP" are still accepted.
    env_conn_type = os.getenv(ClientConstants.ENV_CONNECTION_TYPE_KEY, "").lower()
    # Explicit whitelist of valid types; set membership is O(1) and makes
    # the accepted values obvious at a glance.
    valid_types = {
        ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP,
        ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY,
        ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT,
    }
    if env_conn_type in valid_types:
        return env_conn_type
    return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT

python/fedml/computing/scheduler/model_scheduler/device_client_constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ class ClientConstants(object):
9797
INFERENCE_INFERENCE_SERVER_VERSION = "v2"
9898
INFERENCE_REQUEST_TIMEOUT = 30
9999

100+
ENV_CONNECTION_TYPE_KEY = "FEDML_CONNECTION_TYPE"
101+
WORKER_CONNECTIVITY_TYPE_HTTP = "http"
102+
WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy"
103+
WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt"
104+
WORKER_CONNECTIVITY_TYPE_DEFAULT = WORKER_CONNECTIVITY_TYPE_HTTP
105+
100106
MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING"
101107
MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING"
102108
MSG_MODELOPS_DEPLOYMENT_STATUS_INFERRING = "INFERRING"

python/fedml/computing/scheduler/model_scheduler/device_model_cache.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,13 @@ def get_result_item_info(self, result_item):
344344
result_payload = result_item_json["result"]
345345
return device_id, replica_no, result_payload
346346

347-
def get_idle_device(self, end_point_id, end_point_name,
348-
model_name, model_version,
349-
check_end_point_status=True, limit_specific_model_version=False):
347+
def get_idle_device(self,
348+
end_point_id,
349+
end_point_name,
350+
model_name,
351+
model_version,
352+
check_end_point_status=True,
353+
limit_specific_model_version=False):
350354
# Deprecated the model status logic, query directly from the deployment result list
351355
idle_device_list = list()
352356

python/fedml/computing/scheduler/model_scheduler/device_model_inference.py

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,8 @@ async def _predict(
210210
return inference_response
211211

212212
# Found idle inference device
213-
idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url = \
213+
idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url,\
214+
connectivity_type = \
214215
found_idle_inference_device(in_end_point_id, in_end_point_name, in_model_name, in_model_version)
215216
if idle_device is None or idle_device == "":
216217
FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True)
@@ -235,13 +236,16 @@ async def _predict(
235236
stream_flag = input_json.get("stream", False)
236237
input_list["stream"] = input_list.get("stream", stream_flag)
237238
output_list = input_json.get("outputs", [])
239+
240+
# main execution of redirecting the inference request to the idle device
238241
inference_response = await send_inference_request(
239242
idle_device,
240243
end_point_id,
241244
inference_output_url,
242245
input_list,
243246
output_list,
244-
inference_type=in_return_type)
247+
inference_type=in_return_type,
248+
connectivity_type=connectivity_type)
245249

246250
# Calculate model metrics
247251
try:
@@ -304,37 +308,40 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_
304308
inference_host = ""
305309
inference_output_url = ""
306310
model_version = ""
311+
connectivity_type = ""
312+
307313
# Found idle device (TODO: optimize the algorithm to search best device for inference)
308314
payload, idle_device = FEDML_MODEL_CACHE. \
309315
get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version)
310-
if payload is not None:
311-
logging.info("found idle deployment result {}".format(payload))
312-
deployment_result = payload
313-
model_name = deployment_result["model_name"]
314-
model_version = deployment_result["model_version"]
315-
model_id = deployment_result["model_id"]
316-
end_point_id = deployment_result["end_point_id"]
317-
inference_output_url = deployment_result["model_url"]
316+
if payload:
317+
model_name = payload["model_name"]
318+
model_version = payload["model_version"]
319+
model_id = payload["model_id"]
320+
end_point_id = payload["end_point_id"]
321+
inference_output_url = payload["model_url"]
322+
connectivity_type = \
323+
payload.get("connectivity_type",
324+
ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT)
318325
url_parsed = urlparse(inference_output_url)
319326
inference_host = url_parsed.hostname
320327
else:
321328
logging.info("not found idle deployment result")
322329

323-
return idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url
330+
res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url,
331+
connectivity_type)
332+
logging.info(f"found idle device with metrics: {res}")
333+
334+
return res
324335

325336

326337
async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list,
327-
inference_type="default", has_public_ip=True):
338+
inference_type="default",
339+
connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT):
328340
request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \
329341
.get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT)
330342

331343
try:
332-
http_infer_available = os.getenv("FEDML_INFERENCE_HTTP_AVAILABLE", True)
333-
if not http_infer_available:
334-
if http_infer_available == "False" or http_infer_available == "false":
335-
http_infer_available = False
336-
337-
if http_infer_available:
344+
if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP:
338345
response_ok = await FedMLHttpInference.is_inference_ready(
339346
inference_url,
340347
timeout=request_timeout_sec)
@@ -347,22 +354,23 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input
347354
timeout=request_timeout_sec)
348355
logging.info(f"Use http inference. return {response_ok}")
349356
return inference_response
350-
351-
response_ok = await FedMLHttpProxyInference.is_inference_ready(
352-
inference_url,
353-
timeout=request_timeout_sec)
354-
if response_ok:
355-
response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request(
356-
end_point_id,
357+
elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY:
358+
logging.warning("Use http proxy inference.")
359+
response_ok = await FedMLHttpProxyInference.is_inference_ready(
357360
inference_url,
358-
input_list,
359-
output_list,
360-
inference_type=inference_type,
361361
timeout=request_timeout_sec)
362-
logging.info(f"Use http proxy inference. return {response_ok}")
363-
return inference_response
364-
365-
if not has_public_ip:
362+
if response_ok:
363+
response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request(
364+
end_point_id,
365+
inference_url,
366+
input_list,
367+
output_list,
368+
inference_type=inference_type,
369+
timeout=request_timeout_sec)
370+
logging.info(f"Use http proxy inference. return {response_ok}")
371+
return inference_response
372+
elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT:
373+
logging.warning("Use mqtt inference.")
366374
agent_config = {"mqtt_config": Settings.mqtt_config}
367375
mqtt_inference = FedMLMqttInference(
368376
agent_config=agent_config,
@@ -385,7 +393,8 @@ async def send_inference_request(idle_device, end_point_id, inference_url, input
385393

386394
logging.info(f"Use mqtt inference. return {response_ok}.")
387395
return inference_response
388-
return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."}
396+
else:
397+
return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."}
389398
except Exception as e:
390399
inference_response = {"error": True,
391400
"message": f"Exception when using http, http-proxy and mqtt "

python/fedml/computing/scheduler/model_scheduler/master_job_runner.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -250,14 +250,6 @@ def process_deployment_result_message(self, topic=None, payload=None):
250250
logging.info(f"Endpoint {end_point_id}; Device {device_id}; replica {replica_no}; "
251251
f"run_operation {run_operation} model status {model_status}.")
252252

253-
# OPTIONAL DEBUG PARAMS
254-
# this_run_controller = self.model_runner_mapping[run_id_str].replica_controller
255-
# logging.info(f"The current replica controller state is "
256-
# f"Total version diff num {this_run_controller.total_replica_version_diff_num}")
257-
# logging.info(f"self.request_json now {self.request_json}") # request_json will be deprecated
258-
# this_run_request_json = self.request_json
259-
# logging.info(f"self.request_json now {this_run_request_json}")
260-
261253
# Set redis + sqlite deployment result
262254
FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password)
263255

python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from abc import ABC
1010
import yaml
1111
from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
12+
from fedml.computing.scheduler.comm_utils.network_util import return_this_device_connectivity_type
13+
1214
from fedml.core.mlops import MLOpsRuntimeLog
1315
from fedml.computing.scheduler.comm_utils import file_utils
1416
from .device_client_constants import ClientConstants
@@ -234,8 +236,11 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
234236
running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \
235237
"", "", model_version, {}, {}
236238

239+
# ip and connectivity
240+
worker_ip = GeneralConstants.get_ip_address(self.request_json)
241+
connectivity = return_this_device_connectivity_type()
242+
237243
if op == "add":
238-
worker_ip = GeneralConstants.get_ip_address(self.request_json)
239244
for rank in range(prev_rank + 1, prev_rank + 1 + op_num):
240245
try:
241246
running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \
@@ -269,7 +274,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
269274
result_payload = self.send_deployment_results(
270275
end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
271276
model_id, model_name, inference_output_url, model_version, inference_port_external,
272-
inference_engine, model_metadata, model_config, replica_no=rank + 1)
277+
inference_engine, model_metadata, model_config, replica_no=rank + 1,
278+
connectivity=connectivity
279+
)
273280

274281
if inference_port_external != inference_port:
275282
# Save internal port to local db
@@ -278,7 +285,9 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
278285
result_payload = self.construct_deployment_results(
279286
end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
280287
model_id, model_name, inference_output_url, model_version, inference_port,
281-
inference_engine, model_metadata, model_config, replica_no=rank + 1)
288+
inference_engine, model_metadata, model_config, replica_no=rank + 1,
289+
connectivity=connectivity
290+
)
282291

283292
FedMLModelDatabase.get_instance().set_deployment_result(
284293
run_id, end_point_name, model_name, model_version, self.edge_id,
@@ -326,7 +335,6 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
326335
return True
327336
elif op == "update" or op == "rollback":
328337
# Update is combine of delete and add
329-
worker_ip = GeneralConstants.get_ip_address(self.request_json)
330338
for rank in replica_rank_to_update:
331339
# Delete a replica (container) if exists
332340
self.replica_handler.remove_replica(rank)
@@ -402,15 +410,19 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
402410
result_payload = self.send_deployment_results(
403411
end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
404412
model_id, model_name, inference_output_url, model_version, inference_port_external,
405-
inference_engine, model_metadata, model_config, replica_no=rank + 1)
413+
inference_engine, model_metadata, model_config, replica_no=rank + 1,
414+
connectivity=connectivity
415+
)
406416

407417
if inference_port_external != inference_port: # Save internal port to local db
408418
logging.info("inference_port_external {} != inference_port {}".format(
409419
inference_port_external, inference_port))
410420
result_payload = self.construct_deployment_results(
411421
end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
412422
model_id, model_name, inference_output_url, model_version, inference_port,
413-
inference_engine, model_metadata, model_config, replica_no=rank + 1)
423+
inference_engine, model_metadata, model_config, replica_no=rank + 1,
424+
connectivity=connectivity
425+
)
414426

415427
FedMLModelDatabase.get_instance().set_deployment_result(
416428
run_id, end_point_name, model_name, model_version, self.edge_id,
@@ -433,7 +445,8 @@ def run_impl(self, run_extend_queue_list, sender_message_center,
433445
def construct_deployment_results(self, end_point_name, device_id, model_status,
434446
model_id, model_name, model_inference_url,
435447
model_version, inference_port, inference_engine,
436-
model_metadata, model_config, replica_no=1):
448+
model_metadata, model_config, replica_no=1,
449+
connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT):
437450
deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name,
438451
"model_id": model_id, "model_name": model_name,
439452
"model_url": model_inference_url, "model_version": model_version,
@@ -444,6 +457,7 @@ def construct_deployment_results(self, end_point_name, device_id, model_status,
444457
"model_status": model_status,
445458
"inference_port": inference_port,
446459
"replica_no": replica_no,
460+
"connectivity_type": connectivity,
447461
}
448462
return deployment_results_payload
449463

@@ -466,15 +480,16 @@ def construct_deployment_status(self, end_point_name, device_id,
466480
def send_deployment_results(self, end_point_name, device_id, model_status,
467481
model_id, model_name, model_inference_url,
468482
model_version, inference_port, inference_engine,
469-
model_metadata, model_config, replica_no=1):
483+
model_metadata, model_config, replica_no=1,
484+
connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT):
470485
deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format(
471486
self.run_id, device_id)
472487

473488
deployment_results_payload = self.construct_deployment_results(
474489
end_point_name, device_id, model_status,
475490
model_id, model_name, model_inference_url,
476491
model_version, inference_port, inference_engine,
477-
model_metadata, model_config, replica_no=replica_no)
492+
model_metadata, model_config, replica_no=replica_no, connectivity=connectivity)
478493

479494
logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic,
480495
deployment_results_payload))

0 commit comments

Comments
 (0)