Skip to content

Commit 5bda8f2

Browse files
Enhance logging and telemetry across the application, including improved error handling and attribute tracking in OpenTelemetry spans
1 parent 18f3605 commit 5bda8f2

8 files changed

Lines changed: 90 additions & 73 deletions

File tree

infra/main.bicep

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,6 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.0' = if (en
285285
name: 'appi-${solutionSuffix}'
286286
location: location
287287
workspaceResourceId: logAnalyticsWorkspaceResourceId
288-
diagnosticSettings: [{ workspaceResourceId: logAnalyticsWorkspaceResourceId }]
289288
tags: allTags
290289
enableTelemetry: enableTelemetry
291290
retentionInDays: 365

src/backend/api/api_routes.py

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@ def record_exception_to_trace(e):
4444
span.set_status(Status(StatusCode.ERROR, str(e)))
4545

4646

47+
def set_span_attributes(**attributes):
48+
"""Set key-value attributes on the current OpenTelemetry span."""
49+
span = trace.get_current_span()
50+
if span and span.is_recording():
51+
for key, value in attributes.items():
52+
if value:
53+
span.set_attribute(key, str(value))
54+
55+
4756
# start processing the batch
4857
@router.post("/start-processing")
4958
async def start_processing(request: Request):
@@ -87,6 +96,7 @@ async def start_processing(request: Request):
8796
batch_id = payload.get("batch_id")
8897
translate_from = payload.get("translate_from")
8998
translate_to = payload.get("translate_to")
99+
set_span_attributes(batch_id=batch_id)
90100

91101
track_event_if_configured(
92102
"ProcessingStart",
@@ -103,6 +113,7 @@ async def start_processing(request: Request):
103113
"message": "Files processed",
104114
}
105115
except Exception as e:
116+
track_event_if_configured("ProcessingFailed", {"batch_id": batch_id, "error": str(e)})
106117
record_exception_to_trace(e)
107118
raise HTTPException(status_code=500, detail=str(e)) from e
108119

@@ -146,6 +157,7 @@ async def download_files(batch_id: str):
146157
# call batch_service get_batch_for_zip to get all files for batch_id
147158
batch_service = BatchService()
148159
await batch_service.initialize_database()
160+
set_span_attributes(batch_id=batch_id)
149161

150162
file_data = await batch_service.get_batch_for_zip(batch_id)
151163
if not file_data:
@@ -187,6 +199,7 @@ async def download_files(batch_id: str):
187199
)
188200
return Response(zip_data, media_type="application/zip", headers=headers)
189201
except Exception as e:
202+
track_event_if_configured("DownloadZipFailed", {"batch_id": batch_id, "error": str(e)})
190203
record_exception_to_trace(e)
191204
raise HTTPException(
192205
status_code=404, detail=f"Error creating ZIP file: {str(e)}"
@@ -270,7 +283,7 @@ async def batch_status_updates(
270283
await close_connection(batch_id)
271284
except Exception as e:
272285
record_exception_to_trace(e)
273-
logger.error("Error in WebSocket connection", error=str(e))
286+
logger.error("Error in WebSocket connection", error=str(e), batch_id=batch_id)
274287
await close_connection(batch_id)
275288

276289

@@ -373,15 +386,16 @@ async def get_batch_status(request: Request, batch_id: str):
373386
# Authenticate user
374387
authenticated_user = get_authenticated_user(request)
375388
user_id = authenticated_user.user_principal_id
389+
set_span_attributes(batch_id=batch_id, user_id=user_id)
376390
if not user_id:
377391
track_event_if_configured(
378-
"UserIdNotFound", {"status_code": 400, "detail": "no user"}
392+
"UserIdNotFound", {"status_code": 400, "detail": "no user", "batch_id": batch_id}
379393
)
380394
raise HTTPException(status_code=401, detail="User not authenticated")
381395

382396
# Validate batch_id format
383397
if not batch_service.is_valid_uuid(batch_id):
384-
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id})
398+
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id, "user_id": user_id})
385399
raise HTTPException(status_code=400, detail="Invalid batch_id format")
386400

387401
# Fetch batch details
@@ -399,7 +413,7 @@ async def get_batch_status(request: Request, batch_id: str):
399413
record_exception_to_trace(e)
400414
raise e
401415
except Exception as e:
402-
logger.error("Error retrieving batch history", error=str(e))
416+
logger.error("Error retrieving batch history", error=str(e), batch_id=batch_id)
403417
record_exception_to_trace(e)
404418
error_message = str(e)
405419
if "403" in error_message:
@@ -417,9 +431,10 @@ async def get_batch_summary(request: Request, batch_id: str):
417431
# Authenticate user
418432
authenticated_user = get_authenticated_user(request)
419433
user_id = authenticated_user.user_principal_id
434+
set_span_attributes(batch_id=batch_id, user_id=user_id)
420435
if not user_id:
421436
track_event_if_configured(
422-
"UserIdNotFound", {"status_code": 400, "detail": "no user"}
437+
"UserIdNotFound", {"status_code": 400, "detail": "no user", "batch_id": batch_id}
423438
)
424439
raise HTTPException(status_code=401, detail="User not authenticated")
425440

@@ -431,16 +446,16 @@ async def get_batch_summary(request: Request, batch_id: str):
431446
)
432447
raise HTTPException(status_code=404, detail="No batch summary found.")
433448
track_event_if_configured(
434-
"BatchSummaryRetrieved", {"batch_summary": batch_summary}
449+
"BatchSummaryRetrieved", {"batch_id": batch_id, "user_id": user_id, "batch_summary": batch_summary}
435450
)
436451
return batch_summary
437452

438453
except HTTPException as e:
439-
logger.error("Error fetching batch summary", error=str(e))
454+
logger.error("Error fetching batch summary", error=str(e), batch_id=batch_id)
440455
record_exception_to_trace(e)
441456
raise e
442457
except Exception as e:
443-
logger.error("Error fetching batch summary", error=str(e))
458+
logger.error("Error fetching batch summary", error=str(e), batch_id=batch_id)
444459
record_exception_to_trace(e)
445460
raise HTTPException(status_code=404, detail="Batch not found") from e
446461

@@ -538,16 +553,17 @@ async def upload_file(
538553
authenticated_user = get_authenticated_user(request)
539554
user_id = authenticated_user.user_principal_id
540555

556+
set_span_attributes(batch_id=batch_id, user_id=user_id)
541557
if not user_id:
542558
track_event_if_configured(
543-
"UserIdNotFound", {"status_code": 400, "detail": "no user"}
559+
"UserIdNotFound", {"status_code": 400, "detail": "no user", "batch_id": batch_id}
544560
)
545561
raise HTTPException(status_code=401, detail="User not authenticated")
546562

547563
# Validate batch_id format
548-
logger.info(f"batch_id: {batch_id}")
564+
logger.info("Upload request received", batch_id=batch_id, filename=str(file.filename))
549565
if not batch_service.is_valid_uuid(batch_id):
550-
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id})
566+
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id, "user_id": user_id})
551567
raise HTTPException(status_code=400, detail="Invalid batch_id format")
552568

553569
# Upload file via BatchService
@@ -660,6 +676,7 @@ async def get_file_details(request: Request, file_id: str):
660676
# Authenticate user
661677
authenticated_user = get_authenticated_user(request)
662678
user_id = authenticated_user.user_principal_id
679+
set_span_attributes(file_id=file_id, user_id=user_id)
663680
if not user_id:
664681
track_event_if_configured(
665682
"UserIdNotFound", {"endpoint": "get_file_details"}
@@ -668,22 +685,22 @@ async def get_file_details(request: Request, file_id: str):
668685

669686
# Validate file_id format
670687
if not batch_service.is_valid_uuid(file_id):
671-
track_event_if_configured("InvalidFileId", {"file_id": file_id})
688+
track_event_if_configured("InvalidFileId", {"file_id": file_id, "user_id": user_id})
672689
raise HTTPException(status_code=400, detail="Invalid file_id format")
673690

674691
# Fetch file details
675692
file_data = await batch_service.get_file_report(file_id)
676693
if not file_data:
677-
track_event_if_configured("FileNotFound", {"file_id": file_id})
694+
track_event_if_configured("FileNotFound", {"file_id": file_id, "user_id": user_id})
678695
raise HTTPException(status_code=404, detail="File not found")
679-
track_event_if_configured("FileDetailsRetrieved", {"file_data": file_data})
696+
track_event_if_configured("FileDetailsRetrieved", {"file_id": file_id, "user_id": user_id, "file_data": file_data})
680697
return file_data
681698

682699
except HTTPException as e:
683700
record_exception_to_trace(e)
684701
raise e
685702
except Exception as e:
686-
logger.error("Error retrieving file details", error=str(e))
703+
logger.error("Error retrieving file details", error=str(e), file_id=file_id)
687704
record_exception_to_trace(e)
688705
raise HTTPException(status_code=500, detail="Internal server error") from e
689706

@@ -721,15 +738,16 @@ async def delete_batch_details(request: Request, batch_id: str):
721738
# Authenticate user
722739
authenticated_user = get_authenticated_user(request)
723740
user_id = authenticated_user.user_principal_id
741+
set_span_attributes(batch_id=batch_id, user_id=user_id)
724742
if not user_id:
725743
track_event_if_configured(
726-
"UserIdNotFound", {"endpoint": "delete_batch_details"}
744+
"UserIdNotFound", {"batch_id": batch_id, "endpoint": "delete_batch_details"}
727745
)
728746
raise HTTPException(status_code=401, detail="User not authenticated")
729747

730748
# Validate batch_id format
731749
if not batch_service.is_valid_uuid(batch_id):
732-
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id})
750+
track_event_if_configured("InvalidBatchId", {"batch_id": batch_id, "user_id": user_id})
733751
raise HTTPException(
734752
status_code=400, detail=f"Invalid batch_id format: {batch_id}"
735753
)
@@ -745,7 +763,7 @@ async def delete_batch_details(request: Request, batch_id: str):
745763
record_exception_to_trace(e)
746764
raise e
747765
except Exception as e:
748-
logger.error("Failed to delete batch from database", error=str(e))
766+
logger.error("Failed to delete batch from database", error=str(e), batch_id=batch_id)
749767
record_exception_to_trace(e)
750768
raise HTTPException(status_code=500, detail="Database connection error") from e
751769

@@ -783,6 +801,7 @@ async def delete_file_details(request: Request, file_id: str):
783801
# Authenticate user
784802
authenticated_user = get_authenticated_user(request)
785803
user_id = authenticated_user.user_principal_id
804+
set_span_attributes(file_id=file_id, user_id=user_id)
786805
if not user_id:
787806
track_event_if_configured(
788807
"UserIdNotFound", {"endpoint": "delete_file_details"}
@@ -791,15 +810,15 @@ async def delete_file_details(request: Request, file_id: str):
791810

792811
# Validate file_id format
793812
if not batch_service.is_valid_uuid(file_id):
794-
track_event_if_configured("InvalidFileId", {"file_id": file_id})
813+
track_event_if_configured("InvalidFileId", {"file_id": file_id, "user_id": user_id})
795814
raise HTTPException(
796815
status_code=400, detail=f"Invalid file_id format: {file_id}"
797816
)
798817

799818
# Delete file
800819
file_delete = await batch_service.delete_file(file_id, user_id)
801820
if file_delete is None:
802-
track_event_if_configured("FileDeleteNotFound", {"file_id": file_id})
821+
track_event_if_configured("FileDeleteNotFound", {"file_id": file_id, "user_id": user_id})
803822
raise HTTPException(status_code=404, detail=f"File not found: {file_id}")
804823

805824
logger.info(f"File deleted successfully: {file_delete}")
@@ -810,7 +829,7 @@ async def delete_file_details(request: Request, file_id: str):
810829
record_exception_to_trace(e)
811830
raise e
812831
except Exception as e:
813-
logger.error("Failed to delete file from database", error=str(e))
832+
logger.error("Failed to delete file from database", error=str(e), file_id=file_id)
814833
record_exception_to_trace(e)
815834
raise HTTPException(status_code=500, detail="Database connection error") from e
816835

src/backend/app.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,17 @@
4747
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
4848
)
4949

50+
# Suppress noisy Azure SDK and OpenTelemetry internal loggers.
51+
logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
52+
logging.getLogger("azure.core.pipeline.policies._universal").setLevel(logging.WARNING)
53+
logging.getLogger("azure.cosmos").setLevel(logging.WARNING)
54+
logging.getLogger("opentelemetry.sdk").setLevel(logging.WARNING)
55+
logging.getLogger("azure.monitor.opentelemetry.exporter.export._base").setLevel(logging.WARNING)
56+
5057
# Package config: Azure loggers set to WARNING to suppress INFO
5158
for logger_name in AZURE_LOGGING_PACKAGES:
5259
logging.getLogger(logger_name).setLevel(getattr(logging, AZURE_PACKAGE_LOGGING_LEVEL, logging.WARNING))
5360

54-
# Suppress noisy OpenTelemetry and Azure Monitor logs
55-
# logging.getLogger("opentelemetry.sdk").setLevel(logging.ERROR)
56-
# logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
57-
# logging.getLogger("azure.monitor.opentelemetry.exporter.export._base").setLevel(logging.WARNING)
5861

5962
logger = AppLogger("app")
6063

@@ -70,7 +73,7 @@ async def lifespan(app: FastAPI):
7073

7174
# Startup
7275
try:
73-
logger.logger.info("Initializing SQL agents...")
76+
logger.info("Initializing SQL agents...")
7477

7578
# Create Azure credentials and client
7679
creds = get_azure_credential(app_config.azure_client_id)
@@ -91,20 +94,20 @@ async def lifespan(app: FastAPI):
9194

9295
# Set the global agents instance
9396
set_sql_agents(sql_agents)
94-
logger.logger.info("SQL agents initialized successfully.")
97+
logger.info("SQL agents initialized successfully.")
9598

9699
except Exception as exc:
97-
logger.logger.error("Failed to initialize SQL agents: %s", exc)
100+
logger.error("Failed to initialize SQL agents")
98101
# Don't raise the exception to allow the app to start even if agents fail
99102

100103
yield # Application runs here
101104

102105
# Shutdown
103106
try:
104107
if sql_agents:
105-
logger.logger.info("Application shutting down - cleaning up SQL agents...")
108+
logger.info("Application shutting down - cleaning up SQL agents...")
106109
await sql_agents.delete_agents()
107-
logger.logger.info("SQL agents cleaned up successfully.")
110+
logger.info("SQL agents cleaned up successfully.")
108111

109112
# Clear the global agents instance
110113
await clear_sql_agents()
@@ -113,7 +116,7 @@ async def lifespan(app: FastAPI):
113116
await azure_client.close()
114117

115118
except Exception as exc:
116-
logger.logger.error("Error during agent cleanup: %s", exc)
119+
logger.error("Error during agent cleanup")
117120

118121

119122
def create_app() -> FastAPI:
@@ -133,8 +136,7 @@ def create_app() -> FastAPI:
133136
# This must happen AFTER app creation but BEFORE route registration
134137
instrumentation_key = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
135138
if instrumentation_key:
136-
# Fix: Patch buggy instrumentors before Azure Monitor loads them
137-
# See: https://github.com/microsoft/semantic-kernel/issues/13715
139+
# Patch azure.ai.agents, azure.ai.projects instrumentor to handle dict response_format (prevents ValueError)
138140
patch_instrumentors()
139141

140142
# Configure Azure Monitor with FULL auto-instrumentation
@@ -149,9 +151,9 @@ def create_app() -> FastAPI:
149151
excluded_urls="health,socket,ws",
150152
)
151153

152-
logger.logger.info("Application Insights configured with full auto-instrumentation")
154+
logger.info("Application Insights configured with full auto-instrumentation")
153155
else:
154-
logger.logger.warning("No Application Insights connection string found. Telemetry disabled.")
156+
logger.warning("No Application Insights connection string found. Telemetry disabled.")
155157

156158
# Include routers with /api prefix
157159
app.include_router(backend_router, prefix="/api", tags=["backend"])

src/backend/common/logger/app_logger.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ def info(self, message: Any, **kwargs) -> None:
4343
def warning(self, message: Any, **kwargs) -> None:
4444
self.logger.warning(self._format_message(message, **kwargs))
4545

46-
def error(self, message: Any, **kwargs) -> None:
47-
self.logger.error(self._format_message(message, **kwargs))
46+
def error(self, message: Any, exc_info=True, **kwargs) -> None:
47+
self.logger.error(self._format_message(message, **kwargs), exc_info=exc_info)
4848

4949
def critical(self, message: Any, **kwargs) -> None:
5050
self.logger.critical(self._format_message(message, **kwargs))

src/backend/common/services/batch_service.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ async def upload_file_to_batch(self, batch_id: str, user_id: str, file: UploadFi
285285
content_type=file.content_type,
286286
metadata={"batch_id": batch_id, "user_id": user_id, "file_id": file_id},
287287
)
288+
self.logger.info("File uploaded to blob storage", filename=file.filename, batch_id=batch_id)
288289

289290
# Create file entry
290291
await self.database.add_file(batch_id, file_id, file.filename, blob_path)
@@ -370,6 +371,7 @@ async def delete_batch_and_files(self, batch_id: str, user_id: str):
370371
# Delete file and log entry from database
371372
await self.database.delete_file(user_id, str(file_record.file_id))
372373
await self.database.delete_batch(user_id, batch_id)
374+
self.logger.info("Batch and all files deleted successfully", batch_id=batch_id)
373375
return {"message": "Files deleted successfully"}
374376

375377
except (RuntimeError, ValueError, IOError) as e:
@@ -429,6 +431,7 @@ async def update_batch(self, batch_id: str, status: ProcessStatus):
429431
batch_record.status = status
430432
batch_record.updated_at = datetime.utcnow()
431433
await self.database.update_batch(batch_record)
434+
self.logger.info("Batch status updated", batch_id=batch_id, status=status.value)
432435

433436
async def create_candidate(self, file_id: str, candidate: str):
434437
"""Create a new candidate entry in the database and upload the candidate file to storage."""

0 commit comments

Comments
 (0)