Skip to content

Commit f4332b1

Browse files
authored
Merge pull request #2099 from FedML-AI/alexleung/dev_v070_for_refactor
Alexleung/dev v070 for refactor
2 parents 69b3956 + e263f74 commit f4332b1

18 files changed

Lines changed: 175 additions & 117 deletions

python/examples/launch/serve_mnist/fedml_model_config.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
workspace: "./"
22
entry_point: "mnist_serve_main.py"
33

4+
auto_detect_public_ip: true
5+
46
data_cache_dir: ""
57
bootstrap: ""
68

python/fedml/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@
3434
_global_training_type = None
3535
_global_comm_backend = None
3636

37-
__version__ = "0.8.30"
37+
__version__ = "0.8.31"
3838

3939

4040
# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Possible values: local, dev, test, release

python/fedml/computing/scheduler/comm_utils/job_monitor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,7 @@ def monitor_replicas_number():
223223
curr_version = fedml.get_env_version()
224224
num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info"
225225
mlops_prefix = fedml._get_backend_service()
226-
url = f"{mlops_prefix}{num_replica_url_path}"
226+
url = f"{mlops_prefix}/{num_replica_url_path}"
227227

228228
cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id)
229229
if cached_token is None:

python/fedml/computing/scheduler/master/base_master_agent.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,18 +17,18 @@ def __init__(self):
1717
self.master_api_process = None
1818
self.mlops_metrics = MLOpsMetrics()
1919
self.status_reporter = None
20-
self.enable_simulation_cloud_agent = True
20+
self.enable_simulation_cloud_agent = False
2121
self.use_local_process_as_cloud_server = False
2222
self.protocol_mgr = None
2323

2424
def login(
2525
self, user_id, api_key=None, device_id=None,
26-
os_name=None, role=None
26+
os_name=None, role=None, runner_cmd=None
2727
):
2828
# Login account
2929
login_result = FedMLAccountManager.get_instance().login(
3030
user_id, api_key=api_key, device_id=device_id,
31-
os_name=os_name, role=role
31+
os_name=os_name, role=role, runner_cmd=runner_cmd
3232
)
3333
if login_result is not None:
3434
self.agent_args = login_result

python/fedml/computing/scheduler/master/base_master_job_runner.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,10 @@ def run_impl(
136136

137137
logging.info("Detect all status of Edge ids: " + str(edge_ids))
138138

139+
self.status_reporter.report_server_id_status(
140+
self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id,
141+
server_id=self.edge_id, server_agent_id=self.edge_id)
142+
139143
status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status(
140144
edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue,
141145
callback_when_edges_ready=self.send_training_request_to_edges)

python/fedml/computing/scheduler/master/base_master_job_runner_manager.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,28 @@ def start_job_runner(
3636
status_center_queue=status_center_queue
3737
)
3838

39+
def stop_job_runner(
40+
self, run_id, args=None, server_id=None, request_json=None,
41+
run_as_cloud_agent=False, run_as_cloud_server=False
42+
):
43+
super().stop_job_runner(run_id)
44+
45+
if run_as_cloud_agent or run_as_cloud_server:
46+
stopping_process = Process(
47+
target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config))
48+
stopping_process.start()
49+
50+
def complete_job_runner(
51+
self, run_id, args=None, server_id=None, request_json=None,
52+
run_as_cloud_agent=False, run_as_cloud_server=False
53+
):
54+
super().complete_job_runner(run_id)
55+
56+
if run_as_cloud_agent or run_as_cloud_server:
57+
stopping_process = Process(
58+
target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config))
59+
stopping_process.start()
60+
3961
def _start_cloud_server(
4062
self, args, run_id, request_json, edge_id=None,
4163
use_local_process_as_cloud_server=False

python/fedml/computing/scheduler/master/base_master_protocol_manager.py

Lines changed: 33 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def __init__(self, args, agent_config=None):
3333
self.agent_config = agent_config
3434
self.topic_start_train = None
3535
self.topic_stop_train = None
36+
self.topic_complete_job = None
3637
self.topic_report_status = None
3738
self.topic_ota_msg = None
3839
self.topic_response_device_info = None
@@ -44,7 +45,7 @@ def __init__(self, args, agent_config=None):
4445
self.run_as_cloud_server = False
4546
self.run_as_edge_server_and_agent = False
4647
self.run_as_cloud_server_and_agent = False
47-
self.enable_simulation_cloud_agent = True
48+
self.enable_simulation_cloud_agent = False
4849
self.use_local_process_as_cloud_server = False
4950
self.ota_upgrade = FedMLOtaUpgrade(edge_id=args.edge_id)
5051
self.running_request_json = dict()
@@ -61,6 +62,9 @@ def generate_topics(self):
6162
# The topic for stopping training
6263
self.topic_stop_train = "mlops/flserver_agent_" + str(self.edge_id) + "/stop_train"
6364

65+
# The topic for completing a job
66+
self.topic_complete_job = GeneralConstants.get_topic_complete_job(self.edge_id)
67+
6468
# The topic for reporting current device status.
6569
self.topic_report_status = "mlops/report_device_status"
6670

@@ -89,6 +93,7 @@ def generate_topics(self):
8993
self.subscribed_topics.clear()
9094
self.add_subscribe_topic(self.topic_start_train)
9195
self.add_subscribe_topic(self.topic_stop_train)
96+
self.add_subscribe_topic(self.topic_complete_job)
9297
self.add_subscribe_topic(self.topic_report_status)
9398
self.add_subscribe_topic(self.topic_ota_msg)
9499
self.add_subscribe_topic(self.topic_response_device_info)
@@ -103,6 +108,7 @@ def add_protocol_handler(self):
103108
# Add the message listeners for all topics
104109
self.add_message_listener(self.topic_start_train, self.callback_start_train)
105110
self.add_message_listener(self.topic_stop_train, self.callback_stop_train)
111+
self.add_message_listener(self.topic_complete_job, self.callback_complete_job)
106112
self.add_message_listener(self.topic_ota_msg, FedMLBaseMasterProtocolManager.callback_server_ota_msg)
107113
self.add_message_listener(self.topic_report_status, self.callback_report_current_status)
108114
self.add_message_listener(self.topic_response_device_info, self.callback_response_device_info)
@@ -140,12 +146,6 @@ def callback_start_train(self, topic=None, payload=None):
140146
except Exception:
141147
pass
142148

143-
# Parse the message when running in the cloud server mode.
144-
if self.run_as_cloud_server:
145-
message_bytes = payload.encode("ascii")
146-
base64_bytes = base64.b64decode(message_bytes)
147-
payload = base64_bytes.decode("ascii")
148-
149149
# Parse the parameters
150150
# [NOTES] Example Request JSON:
151151
# https://fedml-inc.larksuite.com/wiki/ScnIwUif9iupbjkYS0LuBrd6sod#WjbEdhYrvogmlGxKTOGu98C6sSb
@@ -264,6 +264,9 @@ def callback_stop_train(self, topic, payload, use_payload=None):
264264
run_id = request_json.get("runId", None)
265265
run_id = request_json.get("id", None) if run_id is None else run_id
266266
run_id_str = str(run_id)
267+
server_id = request_json.get("serverId", None)
268+
if server_id is None:
269+
server_id = request_json.get("server_id", None)
267270

268271
# Broadcast the job status to all edges
269272
self.rebuild_status_center(self.get_status_queue())
@@ -274,7 +277,24 @@ def callback_stop_train(self, topic, payload, use_payload=None):
274277
self.running_request_json.pop(run_id_str)
275278

276279
# Stop the job runner
277-
self._get_job_runner_manager().stop_job_runner(run_id)
280+
self._get_job_runner_manager().stop_job_runner(
281+
run_id, args=self.args, server_id=server_id, request_json=request_json,
282+
run_as_cloud_agent=self.run_as_cloud_agent)
283+
284+
def callback_complete_job(self, topic, payload):
285+
# Parse the parameters.
286+
request_json = json.loads(payload)
287+
run_id = request_json.get("runId", None)
288+
run_id = request_json.get("id", None) if run_id is None else run_id
289+
run_id_str = str(run_id)
290+
server_id = request_json.get("serverId", None)
291+
if server_id is None:
292+
server_id = request_json.get("server_id", None)
293+
294+
self._process_job_complete_status(run_id, server_id, request_json)
295+
296+
def _process_job_complete_status(self, run_id, server_id, complete_payload):
297+
pass
278298

279299
def callback_run_logs(self, topic, payload):
280300
run_id = str(topic).split('/')[-1]
@@ -498,6 +518,11 @@ def send_training_stop_request_to_specific_edge(self, edge_id, payload):
498518
logging.info("stop_train: send topic " + topic_stop_train)
499519
self.message_center.send_message(topic_stop_train, payload)
500520

521+
def send_training_stop_request_to_cloud_server(self, edge_id, payload):
522+
topic_stop_train = "mlops/flserver_agent_" + str(edge_id) + "/stop_train"
523+
logging.info("stop_train: send topic " + topic_stop_train)
524+
self.message_center.send_message(topic_stop_train, payload)
525+
501526
def send_status_check_msg(self, run_id, edge_id, server_id, context=None):
502527
topic_status_check = f"server/client/request_device_info/{edge_id}"
503528
payload = {"server_id": server_id, "run_id": run_id}

python/fedml/computing/scheduler/master/cloud_server_manager.py

Lines changed: 36 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,16 @@
33
import logging
44
import os
55
import traceback
6+
7+
import fedml
68
from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program
79

810

911
class FedMLCloudServerManager:
1012
FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-"
1113
LOCAL_RUNNER_INFO_DIR_NAME = 'runner_infos'
1214
STATUS_IDLE = "IDLE"
15+
FEDML_SERVER_BASE_IMAGE = "/fedml-device-image:"
1316

1417
def __init__(self, args, run_id=None, edge_id=None, request_json=None, agent_config=None, version=None):
1518
self.server_docker_image = None
@@ -18,11 +21,13 @@ def __init__(self, args, run_id=None, edge_id=None, request_json=None, agent_con
1821
self.edge_id = edge_id
1922
self.request_json = request_json
2023
self.agent_config = agent_config
24+
if version is None:
25+
version = fedml.get_env_version()
2126
self.version = version
2227
image_version = self.version
2328
if image_version == "local":
24-
image_version = "dev"
25-
self.server_docker_base_image = "/fedml-device-image:" + image_version
29+
image_version = "test"
30+
self.server_docker_base_image = FedMLCloudServerManager._get_server_base_image(image_version)
2631
self.cloud_server_name = None
2732

2833
@staticmethod
@@ -121,44 +126,52 @@ def start_cloud_server(self, packages_config):
121126
logging.info("start run with k8s: " + run_deployment_cmd)
122127
os.system(run_deployment_cmd)
123128

124-
def stop_cloud_server(self):
125-
self.cloud_server_name = FedMLCloudServerManager.FEDML_CLOUD_SERVER_PREFIX + str(self.run_id) \
126-
+ "-" + str(self.edge_id)
127-
self.server_docker_image = (
128-
self.agent_config["docker_config"]["registry_server"]
129-
+ self.agent_config["docker_config"]["registry_dir"]
130-
+ self.server_docker_base_image
129+
@staticmethod
130+
def stop_cloud_server(run_id, server_id, agent_config):
131+
cloud_server_name = FedMLCloudServerManager._get_cloud_server_name(run_id, server_id)
132+
server_docker_image = (
133+
agent_config["docker_config"]["registry_server"]
134+
+ agent_config["docker_config"]["registry_dir"]
135+
+ FedMLCloudServerManager._get_server_base_image(fedml.get_env_version())
131136
)
132137
delete_deployment_cmd = (
133138
"export FEDML_AGGREGATOR_NAME="
134-
+ self.cloud_server_name
139+
+ cloud_server_name
135140
+ ";export FEDML_AGGREGATOR_SVC="
136-
+ self.cloud_server_name
141+
+ cloud_server_name
137142
+ ";export FEDML_AGGREGATOR_VERSION="
138-
+ self.version
143+
+ fedml.get_env_version()
139144
+ ';export FEDML_AGGREGATOR_IMAGE_PATH="'
140-
+ self.server_docker_image
145+
+ server_docker_image
141146
+ '"'
142147
+ ";export FEDML_CONF_ID="
143-
+ self.cloud_server_name
148+
+ cloud_server_name
144149
+ ";export FEDML_DATA_PV_ID="
145-
+ self.cloud_server_name
150+
+ cloud_server_name
146151
+ ";export FEDML_DATA_PVC_ID="
147-
+ self.cloud_server_name
152+
+ cloud_server_name
148153
+ ";export FEDML_REGISTRY_SECRET_SUFFIX="
149-
+ self.cloud_server_name
154+
+ cloud_server_name
150155
+ ";kubectl -n fedml-devops-aggregator-"
151-
+ self.version
156+
+ fedml.get_env_version()
152157
+ " delete deployment "
153-
+ self.cloud_server_name
158+
+ cloud_server_name
154159
+ ";kubectl -n fedml-devops-aggregator-"
155-
+ self.version
160+
+ fedml.get_env_version()
156161
+ " delete svc "
157-
+ self.cloud_server_name
162+
+ cloud_server_name
158163
+ ";kubectl -n fedml-devops-aggregator-"
159-
+ self.version
164+
+ fedml.get_env_version()
160165
+ " delete secret secret-"
161-
+ self.cloud_server_name
166+
+ cloud_server_name
162167
)
163168
logging.info("stop run with k8s: " + delete_deployment_cmd)
164169
os.system(delete_deployment_cmd)
170+
171+
@staticmethod
172+
def _get_server_base_image(version):
173+
return f"{FedMLCloudServerManager.FEDML_SERVER_BASE_IMAGE}{version}"
174+
175+
@staticmethod
176+
def _get_cloud_server_name(run_id, server_id):
177+
return f"{FedMLCloudServerManager.FEDML_CLOUD_SERVER_PREFIX}{run_id}-{server_id}"

python/fedml/computing/scheduler/master/master_protocol_manager.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,3 +34,10 @@ def _init_extra_items(self):
3434
# Override
3535
def print_connected_info(self):
3636
super().print_connected_info()
37+
38+
# Override
39+
def _process_job_complete_status(self, run_id, server_id, complete_payload):
40+
# Complete the job runner
41+
self._get_job_runner_manager().complete_job_runner(
42+
run_id, args=self.args, server_id=server_id, request_json=complete_payload,
43+
run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server)

python/fedml/computing/scheduler/master/server_login.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,6 @@ def logout():
3939
master_agent = FedMLLaunchMasterAgent()
4040
if args.type == 'login':
4141
master_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id,
42-
os_name=args.os_name, role=args.role)
42+
os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd)
4343
else:
4444
master_agent.logout()

0 commit comments

Comments
 (0)