1212from ..master .server_data_interface import FedMLServerDataInterface
1313from .message_common import LogArgs
1414from .general_constants import GeneralConstants
15+ from ..scheduler_core .compute_cache_manager import ComputeCacheManager
1516
1617
1718class FedMLStatusManager (object ):
@@ -33,13 +34,81 @@ def __init__(self, run_id=None, edge_id=None, server_id=None,
3334 self .log_args = LogArgs (role = "server" , edge_id = self .edge_id ,
3435 server_id = self .server_id , log_file_dir = ServerConstants .get_log_file_dir ())
3536
37+ self .job_status_in_slave = dict ()
38+ self .entire_job_status = None
39+ self .job_status_in_master = dict ()
40+ self .slave_devices_status = dict ()
41+ self .master_devices_status = dict ()
42+ self .completed_job_status_list = [ServerConstants .MSG_MLOPS_SERVER_STATUS_FAILED ,
43+ ServerConstants .MSG_MLOPS_SERVER_STATUS_FINISHED ,
44+ ServerConstants .MSG_MLOPS_SERVER_STATUS_KILLED ]
45+
3646 def __repr__ (self ):
3747 return "<{klass} @{id:x} {attrs}>" .format (
3848 klass = self .__class__ .__name__ ,
3949 id = id (self ) & 0xFFFFFF ,
4050 attrs = " " .join ("{}={!r}" .format (k , v ) for k , v in self .__dict__ .items ()),
4151 )
4252
53+ def add_job_status_in_slave (self , device_id , status ):
54+ self .job_status_in_slave [device_id ] = self ._status_transition (status )
55+
56+ def add_job_status_in_master (self , device_id , status ):
57+ self .job_status_in_master [device_id ] = self ._status_transition (status )
58+
def set_entire_job_status(self, status):
    """Record ``status`` as the status of the entire job."""
    # Stored as-is; no transition rule is applied to the job-level status.
    self.entire_job_status = status
61+
62+ def add_slave_device_status (self , device_id , status ):
63+ self .slave_devices_status [device_id ] = self ._status_transition (status )
64+
65+ def add_master_device_status (self , run_id , device_id , status ):
66+ self .master_devices_status [device_id ] = self ._status_transition (status )
67+
def get_job_status_in_slave(self, device_id):
    """Return the stored job status for a slave device, or ``None`` if absent."""
    slave_job_statuses = self.job_status_in_slave
    return slave_job_statuses.get(device_id)
70+
def get_job_status_in_master(self, device_id):
    """Return the stored job status for a master device, or ``None`` if absent."""
    master_job_statuses = self.job_status_in_master
    return master_job_statuses.get(device_id)
73+
def get_entire_job_status(self):
    """Return the status currently recorded for the entire job."""
    current_job_status = self.entire_job_status
    return current_job_status
76+
def get_slave_device_status(self, device_id):
    """Return the stored device status for a slave device, or ``None`` if absent."""
    slave_statuses = self.slave_devices_status
    return slave_statuses.get(device_id)
79+
def get_master_device_status(self, device_id):
    """Return the stored device status for a master device, or ``None`` if absent."""
    master_statuses = self.master_devices_status
    return master_statuses.get(device_id)
82+
def is_job_completed(self):
    """Tell whether the entire job status is one of the completed statuses.

    Returns ``True`` only when ``entire_job_status`` is truthy and is
    contained in ``completed_job_status_list``; otherwise ``False``.
    """
    job_status = self.entire_job_status
    return bool(job_status) and job_status in self.completed_job_status_list
87+
88+ def _status_transition (self , status ):
89+ transition_status = status
90+ if self .entire_job_status is not None :
91+ if self .entire_job_status == ServerConstants .MSG_MLOPS_SERVER_STATUS_FAILED or \
92+ self .entire_job_status == ServerConstants .MSG_MLOPS_SERVER_STATUS_FINISHED :
93+ if status == ClientConstants .MSG_MLOPS_CLIENT_STATUS_FAILED or \
94+ status == ClientConstants .MSG_MLOPS_CLIENT_STATUS_FINISHED or \
95+ status == ClientConstants .MSG_MLOPS_CLIENT_STATUS_KILLED :
96+ transition_status = status
97+ else :
98+ transition_status = ClientConstants .MSG_MLOPS_CLIENT_STATUS_KILLED
99+
100+ return transition_status
101+
def save_job_status(self):
    """Write the entire job status for ``self.run_id`` to the status cache.

    Calls ``set_redis_params()`` on the ``ComputeCacheManager`` instance
    first, then saves through the object returned by ``get_status_cache()``.
    """
    ComputeCacheManager.get_instance().set_redis_params()
    status_cache = ComputeCacheManager.get_instance().get_status_cache()
    status_cache.save_job_status(self.run_id, self.get_entire_job_status())
106+
def save_device_status_in_job(self, device_id):
    """Write a slave device's job status for ``self.run_id`` to the status cache.

    Calls ``set_redis_params()`` on the ``ComputeCacheManager`` instance
    first, then saves the value of ``get_job_status_in_slave(device_id)``
    through the object returned by ``get_status_cache()``.
    """
    ComputeCacheManager.get_instance().set_redis_params()
    status_cache = ComputeCacheManager.get_instance().get_status_cache()
    status_cache.save_device_status_in_job(
        self.run_id, device_id, self.get_job_status_in_slave(device_id))
111+
43112 def process_job_completed_status (self , master_id , status ):
44113 # Stop the system performance monitor
45114 try :
@@ -75,10 +144,8 @@ def process_job_completed_status(self, master_id, status):
75144 self .report_deployment_status (self .run_id , GeneralConstants .MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED )
76145
77146 def process_job_exception_status (self , master_id , status ):
78- # Send the exception status to slave devices.
79- self .report_exception_status (
80- self .edge_id_list , run_id = self .run_id , server_id = master_id ,
81- status = ClientConstants .MSG_MLOPS_CLIENT_STATUS_FAILED )
147+ # Report exception job status
148+ self .report_exception_status (status )
82149
83150 # Save the job status to local storage
84151 FedMLServerDataInterface .get_instance ().save_job_status (self .run_id , master_id , status , status )
@@ -113,9 +180,9 @@ def status_center_process_master_status(self, topic, payload):
113180
114181 def process_job_status_consensus (self , run_id , master_id , status ):
115182 # Set the master status in the job and entire job status
116- self .status_center . set_entire_job_status (status )
117- self .status_center . add_job_status_in_master (master_id , status )
118- status = self .status_center . get_entire_job_status ()
183+ self .set_entire_job_status (status )
184+ self .add_job_status_in_master (master_id , status )
185+ status = self .get_entire_job_status ()
119186
120187 # Set the device status based on the job status
121188 edge_id_status_dict = self .client_agent_active_list .get (f"{ run_id } " , {})
@@ -152,8 +219,8 @@ def get_device_consensus_status_in_job(job_status, device_status):
152219 return None
153220
def get_device_consensus_status_in_current_device(self, edge_id, status):
    """Record a slave device's job status and return its consensus value.

    The status is stored with ``add_job_status_in_slave`` and read back with
    ``get_job_status_in_slave``. If the value read back is the client
    EXCEPTION status, the client FAILED status is returned instead.
    """
    self.add_job_status_in_slave(edge_id, status)
    stored_status = self.get_job_status_in_slave(edge_id)
    if stored_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION:
        return ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED
    return stored_status
@@ -275,25 +342,13 @@ def report_server_status(self, run_id, edge_id, server_id, status):
275342 self .status_reporter .report_server_id_status (
276343 run_id , status , edge_id = edge_id , server_id = server_id , server_agent_id = edge_id , update_db = False )
277344
def report_exception_status(self, status):
    """Report ``status`` as the job status of ``self.run_id``.

    The report is sent through ``self.status_reporter.report_job_status``.
    """
    current_run_id = self.run_id
    self.status_reporter.report_job_status(current_run_id, status)
289347
def status_center_process_slave_status_to_master_in_slave_agent(self, topic, payload):
    """Forward a status message unchanged via ``self.message_center.send_message``."""
    # The topic and payload are handed over exactly as received.
    center = self.message_center
    center.send_message(topic, payload)
293351
294- # Post the status message to the listener queue of message center
295- #self.message_center.receive_message(GeneralConstants.MSG_TOPIC_REPORT_DEVICE_STATUS_IN_JOB, payload)
296-
297352 def status_center_process_slave_status_to_mlops_in_slave_agent (self , topic , payload ):
298353 # Forward the status message to message center.
299354 self .message_center .send_message (topic , payload )
0 commit comments