Skip to content

Commit 28e4af4

Browse files
authored
Merge pull request #2136 from FedML-AI/alexleung/dev_v070_for_refactor
In order to make the inference logs work, we save the container inference logs to the single dir
2 parents dcac1e2 + f4c49c9 commit 28e4af4

3 files changed

Lines changed: 17 additions & 5 deletions

File tree

python/fedml/computing/scheduler/comm_utils/job_monitor.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
class JobMonitor(Singleton):
4949
ENDPOINT_CONTAINER_LOG_PREFIX = "endpoint"
5050
TIME_INTERVAL_FOR_INFERENCE_ON_GATEWAY = 60 * 10
51+
ENDPOINT_CONTAINER_LOG_SUBDIR = "monitor_endpoint_logs"
5152

5253
def __init__(self):
5354
if not hasattr(self, "endpoint_unavailable_counter"):
@@ -1055,8 +1056,11 @@ def monitor_endpoint_logs(self):
10551056
model_version = model_config.get("model_version", None)
10561057
endpoint_name = endpoint_json.get("end_point_name", None)
10571058

1059+
log_file_dir = os.path.join(
1060+
device_client_constants.ClientConstants.get_log_file_dir(),
1061+
JobMonitor.ENDPOINT_CONTAINER_LOG_SUBDIR)
10581062
log_file_path, program_prefix = MLOpsLoggingUtils.build_log_file_path_with_run_params(
1059-
job.job_id, int(job.edge_id), device_server_constants.ServerConstants.get_log_file_dir(), is_server=True,
1063+
job.job_id, int(job.edge_id), log_file_dir, is_server=False,
10601064
log_file_prefix=JobMonitor.ENDPOINT_CONTAINER_LOG_PREFIX,
10611065
)
10621066

@@ -1130,8 +1134,9 @@ def monitor_endpoint_logs(self):
11301134
nano_second_str = container_time.split(".")[1][:9]
11311135
t_datetime_obj = isoparse(container_time)
11321136

1133-
if t_sec_offset is not None:
1134-
t_datetime_obj = t_datetime_obj + datetime.timedelta(seconds=t_sec_offset)
1137+
# ISSUE: this will cause the timestamp is not correct.
1138+
#if t_sec_offset is not None:
1139+
# t_datetime_obj = t_datetime_obj + datetime.timedelta(seconds=t_sec_offset)
11351140
except Exception as e:
11361141
logging.error(f"Exception when parsing the container log time {e}")
11371142
t_datetime_obj = datetime.datetime.now()

python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from ..comm_utils.constants import SchedulerConstants
1313
from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs
1414
from ..scheduler_entry.constants import Constants
15-
from ....core.mlops import MLOpsMetrics
15+
from ....core.mlops import MLOpsMetrics, MLOpsRuntimeLogDaemon
1616
from ....core.mlops.mlops_device_perfs import MLOpsDevicePerfStats
1717
from ..comm_utils.yaml_utils import load_yaml_config
1818
from .general_constants import GeneralConstants
@@ -449,10 +449,16 @@ def trigger_stop_event(self):
449449
if self.run_process_event is not None:
450450
self.run_process_event.set()
451451

452+
time.sleep(1)
453+
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
454+
452455
def trigger_completed_event(self):
453456
if self.run_process_completed_event is not None:
454457
self.run_process_completed_event.set()
455458

459+
time.sleep(1)
460+
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
461+
456462
def execute_job_task(self, unzip_package_path, entry_file_full_path, conf_file_full_path, dynamic_args_config,
457463
fedml_config_object):
458464
run_config = self.request_json["run_config"]

python/fedml/core/mlops/mlops_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,11 @@ def get_edge_id_from_args(args):
150150
else:
151151
edge_id = 0
152152
else:
153-
if getattr(args, "client_id", None) is not None:
153+
if getattr(args, "edge_id", None) is not None:
154154
edge_id = args.edge_id
155155
else:
156156
edge_id = 0
157+
157158
return edge_id
158159

159160
@staticmethod

0 commit comments

Comments
 (0)