fix: Cosmos record race condition and concurrent upsert lost-update

Dongbumlee · Dongbumlee · commit 215023d8a9c3 · 2026-03-28T15:11:30.000-07:00
Root cause: submit() enqueued the processing message BEFORE inserting
the Cosmos record.  ContentProcessor picked up the message instantly,
found no existing record, and created its own document.  The Workflow
then inserted a second document with the same process_id.  Subsequent
poll_status() always read the Workflow's stale 'processing' copy while
ContentProcessor updated the other one to 'Completed'.

Fixes:
- Reorder submit(): blob upload -> Cosmos insert -> queue enqueue,
  so ContentProcessor always finds and updates the pre-existing record.
- Add asyncio.Lock (_upsert_lock) around all Upsert_Content_Process
  calls to serialize read-modify-write on the shared Claim_Process
  document and prevent concurrent tasks from overwriting each other.
- Increase document processing semaphore from 2 to 4.
- Apply code-quality pass (structured docstrings, comment cleanup).
diff --git a/src/ContentProcessorWorkflow/src/services/content_process_service.py b/src/ContentProcessorWorkflow/src/services/content_process_service.py
@@ -102,7 +102,7 @@ def _get_queue_client(self) -> QueueClient:
         return self._queue_client
 
     # ------------------------------------------------------------------ #
-    # submit — replaces POST /contentprocessor/submit
+    # submit
     # ------------------------------------------------------------------ #
     async def submit(
         self,
@@ -112,13 +112,26 @@ async def submit(
         schema_id: str,
         metadata_id: str,
     ) -> str:
-        """Upload file to blob, enqueue processing message, create Cosmos record.
+        """Upload file to blob, insert Cosmos record, and enqueue processing.
 
-        Returns the generated process_id.
+        Steps:
+            1. Upload the file to blob storage.
+            2. Insert a Cosmos DB record so ContentProcessor finds it
+               on pickup (avoids duplicate-document race).
+            3. Enqueue a processing message to the extract queue.
+
+        Args:
+            file_bytes: Raw file content.
+            filename: Sanitized file name.
+            mime_type: Detected MIME type.
+            schema_id: Schema to apply during extraction.
+            metadata_id: Associated metadata identifier.
+
+        Returns:
+            The generated process_id (UUID string).
         """
         process_id = str(uuid.uuid4())
 
-        # 1. Upload file to blob: {cps-processes}/{process_id}/{filename}
         container_name = self._config.app_cps_processes
         blob_helper = self._get_blob_helper()
         await asyncio.to_thread(
@@ -128,7 +141,18 @@ async def submit(
             data=file_bytes,
         )
 
-        # 2. Enqueue processing message
+        # Insert Cosmos record BEFORE enqueuing so ContentProcessor
+        # finds this record (not creates a duplicate) when it starts.
+        record = ContentProcessRecord(
+            id=process_id,
+            process_id=process_id,
+            processed_file_name=filename,
+            processed_file_mime_type=mime_type,
+            status="processing",
+            imported_time=datetime.now(timezone.utc),
+        )
+        await self._process_repo.add_async(record)
+
         message = ContentProcessMessage(
             process_id=process_id,
             files=[
@@ -166,28 +190,21 @@ async def submit(
             self._get_queue_client().send_message, message.model_dump_json()
         )
 
-        # 3. Insert initial Cosmos record via sas-cosmosdb
-        record = ContentProcessRecord(
-            id=process_id,
-            process_id=process_id,
-            processed_file_name=filename,
-            processed_file_mime_type=mime_type,
-            status="processing",
-            imported_time=datetime.now(timezone.utc),
-        )
-        await self._process_repo.add_async(record)
-
         logger.info("Submitted process %s for file %s", process_id, filename)
         return process_id
 
     # ------------------------------------------------------------------ #
-    # get_status — replaces GET /contentprocessor/status/{id}
+    # get_status
     # ------------------------------------------------------------------ #
     async def get_status(self, process_id: str) -> dict | None:
         """Query Cosmos for process status.
 
-        Returns a dict with keys: status, process_id, file_name.
-        Returns None if not found.
+        Args:
+            process_id: The content process identifier.
+
+        Returns:
+            Dict with keys ``status``, ``process_id``, ``file_name``;
+            ``None`` if the record does not exist.
         """
         record = await self._process_repo.get_async(process_id)
         if record is None:
@@ -199,25 +216,33 @@ async def get_status(self, process_id: str) -> dict | None:
         }
 
     # ------------------------------------------------------------------ #
-    # get_processed — replaces GET /contentprocessor/processed/{id}
+    # get_processed
     # ------------------------------------------------------------------ #
     async def get_processed(self, process_id: str) -> dict | None:
         """Query Cosmos for the full processed content result.
 
-        Returns the full document dict, or None if not found.
+        Args:
+            process_id: The content process identifier.
+
+        Returns:
+            Full document dict, or ``None`` if not found.
         """
         record = await self._process_repo.get_async(process_id)
         if record is None:
             return None
         return record.model_dump(mode="json")
 
     # ------------------------------------------------------------------ #
-    # get_steps — replaces GET /contentprocessor/processed/{id}/steps
+    # get_steps
     # ------------------------------------------------------------------ #
     async def get_steps(self, process_id: str) -> list | None:
         """Download step_outputs.json from blob storage.
 
-        Returns parsed list of step output dicts, or None if not found.
+        Args:
+            process_id: The content process identifier.
+
+        Returns:
+            Parsed JSON list of step objects, or ``None`` if not found.
         """
         container_name = self._config.app_cps_processes
         blob_name = f"{process_id}/step_outputs.json"
@@ -234,7 +259,7 @@ async def get_steps(self, process_id: str) -> list | None:
             return None
 
     # ------------------------------------------------------------------ #
-    # poll_status — replaces the HTTP polling loop
+    # poll_status
     # ------------------------------------------------------------------ #
     async def poll_status(
         self,
@@ -243,16 +268,18 @@ async def poll_status(
         timeout_seconds: float = 600.0,
         on_poll: Callable[[dict], Awaitable[None] | None] | None = None,
     ) -> dict:
-        """Poll Cosmos for status until terminal state or timeout.
+        """Poll Cosmos for status until a terminal state or timeout.
 
         Args:
             process_id: The content process ID to poll.
             poll_interval_seconds: Delay between poll attempts.
-            timeout_seconds: Maximum time to wait for a terminal status.
-            on_poll: Optional callback invoked on each poll iteration with
+            timeout_seconds: Maximum elapsed time before giving up.
+            on_poll: Optional callback invoked on each iteration with
                 the current status dict.  Accepts sync or async callables.
 
-        Returns the final status dict with keys: status, process_id, file_name.
+        Returns:
+            Final status dict with keys ``status``, ``process_id``,
+            ``file_name``, and ``terminal``.
         """
         elapsed = 0.0
         result: dict | None = None
diff --git a/src/ContentProcessorWorkflow/src/steps/document_process/executor/document_process_executor.py b/src/ContentProcessorWorkflow/src/steps/document_process/executor/document_process_executor.py
@@ -157,9 +157,11 @@ async def handle_execute(
             )
         )
 
-        # Limit concurrency to avoid overwhelming the service
-        max_concurrency = 2
+        # Limit concurrency; serialize Cosmos writes via _upsert_lock to
+        # prevent lost-update races on the shared Claim_Process document.
+        max_concurrency = 4
         semaphore = asyncio.Semaphore(max_concurrency)
+        _upsert_lock = asyncio.Lock()
 
         async def _process_one(item) -> dict:
             async with semaphore:
@@ -184,7 +186,7 @@ async def _process_one(item) -> dict:
                         schema_id,
                     )
 
-                    # Direct submit: blob upload + queue enqueue + cosmos insert
+                    # Direct submit: blob upload + cosmos insert + queue enqueue
                     process_id = await content_process_service.submit(
                         file_bytes=file_bytes,
                         filename=filename,
@@ -193,21 +195,21 @@ async def _process_one(item) -> dict:
                         metadata_id=metadata_id,
                     )
 
-                    # Upsert initial status to claim process
-                    await claim_process_repository.Upsert_Content_Process(
-                        process_id=claim_id,
-                        content_process=Content_Process(
-                            process_id=process_id,
-                            file_name=str(item.file_name),
-                            mime_type=content_type or "application/octet-stream",
-                            status="processing",
-                        ),
-                    )
+                    # Upsert initial "processing" status to claim process
+                    async with _upsert_lock:
+                        await claim_process_repository.Upsert_Content_Process(
+                            process_id=claim_id,
+                            content_process=Content_Process(
+                                process_id=process_id,
+                                file_name=str(item.file_name),
+                                mime_type=content_type or "application/octet-stream",
+                                status="processing",
+                            ),
+                        )
 
-                    # Poll Cosmos directly until terminal status,
-                    # upserting intermediate status changes to the claim process.
-                    # Track the last status to avoid redundant writes that
-                    # create race conditions with concurrent tasks.
+                    # Poll until terminal status, upserting intermediate
+                    # changes. Skip duplicate and terminal statuses to
+                    # avoid clobbering the caller's richer final upsert.
                     _last_polled_status: str | None = None
 
                     async def _on_poll(poll_data: dict) -> None:
@@ -216,19 +218,20 @@ async def _on_poll(poll_data: dict) -> None:
                         if polled_status == _last_polled_status:
                             return
                         _last_polled_status = polled_status
-                        # Skip the final "Completed"/"Error" upsert here;
-                        # the caller does a richer upsert with scores.
+                        # Terminal statuses are handled by the caller with scores.
                         if polled_status in ("Completed", "Error"):
                             return
-                        await claim_process_repository.Upsert_Content_Process(
-                            process_id=claim_id,
-                            content_process=Content_Process(
-                                process_id=process_id,
-                                file_name=str(item.file_name),
-                                mime_type=content_type or "application/octet-stream",
-                                status=polled_status,
-                            ),
-                        )
+                        async with _upsert_lock:
+                            await claim_process_repository.Upsert_Content_Process(
+                                process_id=claim_id,
+                                content_process=Content_Process(
+                                    process_id=process_id,
+                                    file_name=str(item.file_name),
+                                    mime_type=content_type
+                                    or "application/octet-stream",
+                                    status=polled_status,
+                                ),
+                            )
 
                     poll_result = await content_process_service.poll_status(
                         process_id=process_id,
@@ -239,7 +242,6 @@ async def _on_poll(poll_data: dict) -> None:
 
                     status_text = poll_result.get("status", "Failed")
 
-                    # Fetch final processed result for scores
                     schema_score_f = 0.0
                     entity_score_f = 0.0
                     processed_time = ""
@@ -271,21 +273,23 @@ async def _on_poll(poll_data: dict) -> None:
                                 processed_time = ""
                             result_payload = final_payload
 
-                        # Final cosmos upsert with scores
-                        await claim_process_repository.Upsert_Content_Process(
-                            process_id=claim_id,
-                            content_process=Content_Process(
-                                process_id=process_id,
-                                file_name=str(item.file_name),
-                                mime_type=content_type or "application/octet-stream",
-                                status=status_text,
-                                schema_score=schema_score_f,
-                                entity_score=entity_score_f,
-                                processed_time=processed_time,
-                            ),
-                        )
-
-                    # Map status to HTTP-like code for downstream compatibility
+                        # Final upsert with scores
+                        async with _upsert_lock:
+                            await claim_process_repository.Upsert_Content_Process(
+                                process_id=claim_id,
+                                content_process=Content_Process(
+                                    process_id=process_id,
+                                    file_name=str(item.file_name),
+                                    mime_type=content_type
+                                    or "application/octet-stream",
+                                    status=status_text,
+                                    schema_score=schema_score_f,
+                                    entity_score=entity_score_f,
+                                    processed_time=processed_time,
+                                ),
+                            )
+
+                    # Map to HTTP-like code for downstream compatibility
                     if status_text == "Completed":
                         status_code = 302
                     elif status_text in ("Error", "Failed"):