@@ -81,6 +81,7 @@ class FedMLStatusCenter(object):
8181 TOPIC_SLAVE_JOB_LAUNCH_SUFFIX = "/start_train"
8282 TOPIC_SLAVE_JOB_STOP_PREFIX = "flserver_agent/"
8383 TOPIC_SLAVE_JOB_STOP_SUFFIX = "/stop_train"
84+ ALLOWED_MAX_JOB_STATUS_CACHE_NUM = 1000
8485
8586 def __init__ (self , message_queue = None ):
8687 self .status_queue = message_queue
@@ -203,38 +204,43 @@ def run_status_dispatcher(self, status_event, status_queue,
203204 status_entity = FedMLStatusEntity (status_msg_body = message_body )
204205
205206 # Generate status manager instance
206- if status_manager_instances .get (status_entity .run_id ) is None :
207- status_manager_instances [status_entity .run_id ] = FedMLStatusManager (
208- run_id = status_entity .run_id , edge_id = status_entity .edge_id , status_center = self ,
207+ run_id_str = str (status_entity .run_id )
208+ run_id_int = int (status_entity .run_id )
209+ if status_manager_instances .get (run_id_str ) is None :
210+ if len (status_manager_instances .keys ()) >= FedMLStatusCenter .ALLOWED_MAX_JOB_STATUS_CACHE_NUM :
211+ for iter_run_id , iter_status_mgr in status_manager_instances .items ():
212+ if iter_status_mgr .is_job_completed ():
213+ status_manager_instances .pop (iter_run_id )
214+ break
215+ status_manager_instances [run_id_str ] = FedMLStatusManager (
216+ run_id = run_id_int , edge_id = status_entity .edge_id ,
217+ server_id = status_entity .server_id , status_center = self ,
209218 message_center = message_center )
210219 else :
211- status_manager_instances [status_entity .run_id ].edge_id = status_entity .edge_id
220+ status_manager_instances [run_id_str ].edge_id = status_entity .edge_id
221+ if status_entity .server_id is not None and str (status_entity .server_id ) != "0" :
222+ status_manager_instances [run_id_str ].server_id = status_entity .server_id
212223
213224 # if the job status is completed then continue
214- if status_manager_instances [status_entity . run_id ].is_job_completed ():
225+ if status_manager_instances [run_id_str ].is_job_completed ():
215226 continue
216227
217228 # Process the master and slave status.
218229 if message_entity .topic .startswith (FedMLStatusCenter .TOPIC_MASTER_STATUS_PREFIX ):
219230 # Process the job status
220- status_manager_instances [status_entity . run_id ].status_center_process_master_status (
231+ status_manager_instances [run_id_str ].status_center_process_master_status (
221232 message_entity .topic , message_entity .payload )
222233
223234 # Save the job status
224- status_manager_instances [status_entity .run_id ].save_job_status ()
225-
226- # Popup the status manager instance when the job status is completed
227- if status_manager_instances [status_entity .run_id ].is_job_completed ():
228- status_manager_instances .pop (status_entity .run_id )
229- continue
235+ status_manager_instances [run_id_str ].save_job_status ()
230236
231237 elif message_entity .topic .startswith (FedMLStatusCenter .TOPIC_SLAVE_STATUS_PREFIX ):
232238 # Process the slave device status
233- status_manager_instances [status_entity . run_id ].status_center_process_slave_status (
239+ status_manager_instances [run_id_str ].status_center_process_slave_status (
234240 message_entity .topic , message_entity .payload )
235241
236242 # Save the device status in job
237- status_manager_instances [status_entity . run_id ].save_device_status_in_job (status_entity .edge_id )
243+ status_manager_instances [run_id_str ].save_device_status_in_job (status_entity .edge_id )
238244
239245 except Exception as e :
240246 if message_entity is not None :
@@ -295,40 +301,49 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue,
295301 status_entity = FedMLStatusEntity (status_msg_body = message_body )
296302
297303 # Generate status manager instance
298- if status_manager_instances .get (status_entity .run_id ) is None :
299- status_manager_instances [status_entity .run_id ] = FedMLStatusManager (
300- run_id = status_entity .run_id , edge_id = status_entity .edge_id , status_center = self ,
304+ run_id_str = str (status_entity .run_id )
305+ run_id_int = int (status_entity .run_id )
306+ if status_manager_instances .get (run_id_str ) is None :
307+ if len (status_manager_instances .keys ()) >= FedMLStatusCenter .ALLOWED_MAX_JOB_STATUS_CACHE_NUM :
308+ for iter_run_id , iter_status_mgr in status_manager_instances .items ():
309+ if iter_status_mgr .is_job_completed ():
310+ status_manager_instances .pop (iter_run_id )
311+ break
312+
313+ status_manager_instances [run_id_str ] = FedMLStatusManager (
314+ run_id = run_id_int , edge_id = status_entity .edge_id , status_center = self ,
301315 message_center = message_center )
302316 else :
303- status_manager_instances [status_entity . run_id ].edge_id = status_entity .edge_id
317+ status_manager_instances [run_id_str ].edge_id = status_entity .edge_id
304318
305319 # Process the slave status
306320 if message_entity .topic .startswith (FedMLStatusCenter .TOPIC_SLAVE_STATUS_PREFIX ):
307321 # Report the slave status to master
308- status_manager_instances [status_entity . run_id ]. \
322+ status_manager_instances [run_id_str ]. \
309323 status_center_process_slave_status_to_master_in_slave_agent (
310324 message_entity .topic , message_entity .payload
311325 )
312326 elif message_entity .topic .startswith (FedMLStatusCenter .TOPIC_SLAVE_STATUS_TO_MLOPS_PREFIX ):
313327 # Report slave status to mlops (Active/IDLE message)
314- status_manager_instances [status_entity . run_id ]. \
328+ status_manager_instances [run_id_str ]. \
315329 status_center_process_slave_status_to_mlops_in_slave_agent (
316330 message_entity .topic , message_entity .payload
317331 )
318332 elif (message_entity .topic .startswith (FedMLStatusCenter .TOPIC_SLAVE_JOB_LAUNCH_PREFIX ) and
319333 message_entity .topic .endswith (FedMLStatusCenter .TOPIC_SLAVE_JOB_LAUNCH_SUFFIX )):
334+ pass
320335 # Async request the job status from master when launching the job
321- job_launch_message_map [status_entity . run_id ] = {"topic" : message_entity .topic ,
322- "payload" : message_entity .payload }
323- # status_manager_instances[status_entity.run_id ]. \
336+ # job_launch_message_map[run_id_str ] = {"topic": message_entity.topic,
337+ # "payload": message_entity.payload}
338+ # status_manager_instances[run_id_str ]. \
324339 # status_center_request_job_status_from_master_in_slave_agent(
325340 # message_entity.topic, message_entity.payload
326341 # )
327342 elif (message_entity .topic .startswith (FedMLStatusCenter .TOPIC_SLAVE_JOB_STOP_PREFIX ) and
328343 message_entity .topic .endswith (FedMLStatusCenter .TOPIC_SLAVE_JOB_STOP_SUFFIX )):
329344 # Cleanup when stopped the job
330- if job_launch_message_map .get (status_entity . run_id , None ) is not None :
331- job_launch_message_map .pop (status_entity . run_id )
345+ if job_launch_message_map .get (run_id_str , None ) is not None :
346+ job_launch_message_map .pop (run_id_str )
332347
333348 except Exception as e :
334349 if message_entity is not None :
0 commit comments