2424
2525class PushEmbedder (EmbedderBase ):
2626 def __init__ (self , blob_client : AzureBlobStorageClient , env_helper : EnvHelper ):
27+ logger .info ("Initializing PushEmbedder" )
2728 self .env_helper = env_helper
2829 self .llm_helper = LLMHelper ()
2930 self .azure_search_helper = AzureSearchHelper ()
@@ -33,11 +34,14 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
3334 self .blob_client = blob_client
3435 self .config = ConfigHelper .get_active_config_or_default ()
3536 self .embedding_configs = {}
37+ logger .info ("Loading document processors" )
3638 for processor in self .config .document_processors :
3739 ext = processor .document_type .lower ()
3840 self .embedding_configs [ext ] = processor
41+ logger .info ("Document processors loaded" )
3942
4043 def embed_file (self , source_url : str , file_name : str ):
44+ logger .info (f"Embedding file: { file_name } from URL: { source_url } " )
4145 file_extension = file_name .split ("." )[- 1 ].lower ()
4246 embedding_config = self .embedding_configs .get (file_extension )
4347 self .__embed (
@@ -46,19 +50,22 @@ def embed_file(self, source_url: str, file_name: str):
4650 embedding_config = embedding_config ,
4751 )
4852 if file_extension != "url" :
53+ logger .info (f"Upserting blob metadata for file: { file_name } " )
4954 self .blob_client .upsert_blob_metadata (
5055 file_name , {"embeddings_added" : "true" }
5156 )
5257
5358 def __embed (
5459 self , source_url : str , file_extension : str , embedding_config : EmbeddingConfig
5560 ):
61+ logger .info (f"Processing embedding for file extension: { file_extension } " )
5662 documents_to_upload : List [SourceDocument ] = []
5763 if (
5864 embedding_config .use_advanced_image_processing
5965 and file_extension
6066 in self .config .get_advanced_image_processing_image_types ()
6167 ):
68+ logger .info (f"Using advanced image processing for: { source_url } " )
6269 caption = self .__generate_image_caption (source_url )
6370 caption_vector = self .llm_helper .generate_embeddings (caption )
6471
@@ -69,6 +76,7 @@ def __embed(
6976 )
7077 )
7178 else :
79+ logger .info (f"Loading documents from source: { source_url } " )
7280 documents : List [SourceDocument ] = self .document_loading .load (
7381 source_url , embedding_config .loading
7482 )
@@ -81,6 +89,7 @@ def __embed(
8189
8290 # Upload documents (which are chunks) to search index in batches
8391 if documents_to_upload :
92+ logger .info ("Uploading documents in batches" )
8493 batch_size = self .env_helper .AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
8594 search_client = self .azure_search_helper .get_search_client ()
8695 for i in range (0 , len (documents_to_upload ), batch_size ):
@@ -93,6 +102,7 @@ def __embed(
93102 logger .warning ("No documents to upload." )
94103
95104 def __generate_image_caption (self , source_url ):
105+ logger .info (f"Generating image caption for URL: { source_url } " )
96106 model = self .env_helper .AZURE_OPENAI_VISION_MODEL
97107 caption_system_message = """You are an assistant that generates rich descriptions of images.
98108You need to be accurate in the information you extract and detailed in the descriptons you generate.
@@ -116,9 +126,11 @@ def __generate_image_caption(self, source_url):
116126
117127 response = self .llm_helper .get_chat_completion (messages , model )
118128 caption = response .choices [0 ].message .content
129+ logger .info ("Caption generation completed" )
119130 return caption
120131
121132 def __convert_to_search_document (self , document : SourceDocument ):
133+ logger .info (f"Converting document ID { document .id } to search document format" )
122134 embedded_content = self .llm_helper .generate_embeddings (document .content )
123135 metadata = {
124136 self .env_helper .AZURE_SEARCH_FIELDS_ID : document .id ,
@@ -151,6 +163,7 @@ def __create_image_document(
151163 content : str ,
152164 content_vector : List [float ],
153165 ):
166+ logger .info (f"Creating image document for source URL: { source_url } " )
154167 parsed_url = urlparse (source_url )
155168
156169 file_url = parsed_url .scheme + "://" + parsed_url .netloc + parsed_url .path
0 commit comments