Skip to content

Commit a4342b0

Browse files
Add filter to export only PDF and JSON files
1 parent 41f9c3d commit a4342b0

1 file changed

Lines changed: 6 additions & 3 deletions

File tree

Deployment/data_migration/migrate.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -789,15 +789,18 @@ def export_blob_storage(export_dir: Path, account_name: str) -> None:
789789
def _export_container(
790790
blob_service, container_name: str, blob_dir: Path
791791
) -> None:
792-
"""Download all blobs from a single container."""
792+
"""Download PDF and JSON blobs from a single container."""
793+
ALLOWED_EXTENSIONS = (".pdf", ".json")
794+
793795
container_client = blob_service.get_container_client(container_name)
794796
container_dir = blob_dir / container_name
795797
_long_path(container_dir).mkdir(parents=True, exist_ok=True)
796798

797799
# Collect blob names and content types to preserve metadata
798800
logger.info(" Listing blobs in container '%s'...", container_name)
799-
blob_list = list(container_client.list_blobs(include=["metadata"]))
800-
logger.info(" Found %d blobs.", len(blob_list))
801+
all_blobs = list(container_client.list_blobs(include=["metadata"]))
802+
blob_list = [b for b in all_blobs if b.name.lower().endswith(ALLOWED_EXTENSIONS)]
803+
logger.info(" Found %d blobs total, %d PDF/JSON files to export.", len(all_blobs), len(blob_list))
801804

802805
content_type_map: dict = {}
803806
blob_count = 0

0 commit comments

Comments
 (0)