@@ -75,18 +75,72 @@ def test_get_markdown_url(azure_blob_service_mock):
7575 assert markdown_url == "[A title](http://example.com/path/to/file.txt_12345)"
7676
7777
def test_from_metadata_returns_empty_sas_placeholder():
    """Check that a look-alike blob host gets no SAS placeholder in its source.

    The document URL only embeds ``blob.core.windows.net`` as the leading
    labels of an unrelated domain (``example.com``). The test asserts that
    ``SourceDocument.from_metadata`` leaves ``source`` equal to the URL it was
    given, and that the remaining fields fall back to values derived from the
    URL and chunk index because ``metadata`` is empty.
    """
    # Given
    content = "Some content"
    metadata = {}
    # The host below is NOT Azure Blob Storage: "blob.core.windows.net" is only
    # a prefix of the real domain. Accepting it on a substring match would be
    # improper input validation (CWE-20), so no SAS placeholder is expected.
    document_url = "http://blob.core.windows.net.example.com/path/to/file.txt"
    expected_file_name = "/path/to/file.txt"
    idx = 0

    # When
    source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)

    # Then
    parsed_url = urlparse(document_url)
    file_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    # Expected id: "doc_" + SHA-1 of "<file_url>_<idx>".
    # NOTE(review): assumed to mirror the key scheme used by from_metadata;
    # confirm against the implementation.
    digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
    hash_key = f"doc_{digest}"

    assert source_document.id == hash_key
    assert source_document.content == content
    assert source_document.source == document_url
    assert source_document.title == expected_file_name
    assert source_document.chunk == idx
    assert source_document.offset is None
    assert source_document.page_number is None
103+
104+
def test_from_metadata_returns_sas_placeholder():
    """Check that a genuine blob storage URL gets the SAS placeholder appended.

    The document URL's host ends in ``blob.core.windows.net``. The test asserts
    that ``SourceDocument.from_metadata`` sets ``source`` to the URL (scheme,
    host and path) immediately followed by ``_SAS_TOKEN_PLACEHOLDER_``, and
    that the remaining fields fall back to values derived from the URL and
    chunk index because ``metadata`` is empty.
    """
    # Given
    content = "Some content"
    metadata = {}
    document_url = "http://example.blob.core.windows.net/path/to/file.txt"
    expected_file_name = "/path/to/file.txt"
    expected_sas_placeholder = "_SAS_TOKEN_PLACEHOLDER_"
    idx = 0

    # When
    source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)

    # Then
    parsed_url = urlparse(document_url)
    file_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    # Expected id: "doc_" + SHA-1 of "<file_url>_<idx>".
    # NOTE(review): assumed to mirror the key scheme used by from_metadata;
    # confirm against the implementation.
    digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
    hash_key = f"doc_{digest}"

    assert source_document.id == hash_key
    assert source_document.content == content
    # The placeholder is joined directly to the URL, with no separator.
    assert source_document.source == f"{file_url}{expected_sas_placeholder}"
    assert source_document.title == expected_file_name
    assert source_document.chunk == idx
    assert source_document.offset is None
    assert source_document.page_number is None
130+
131+
78132def test_from_metadata ():
79133 # Given
80134 content = "Some content"
81135 metadata = {
82136 "id" : "1" ,
83- "source" : "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_ " ,
137+ "source" : "http://example.com/path/to/file.txt " ,
84138 "title" : "A title" ,
85139 "chunk" : "A chunk" ,
86140 "offset" : "An offset" ,
87141 "page_number" : "1" ,
88142 }
89- document_url = "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_ "
143+ document_url = "http://example.com/path/to/file.txt "
90144 idx = 0
91145
92146 # When
@@ -98,15 +152,11 @@ def test_from_metadata():
98152 filename = parsed_url .path
99153 hash_key = hashlib .sha1 (f"{ file_url } _{ idx } " .encode ("utf-8" )).hexdigest ()
100154 hash_key = f"doc_{ hash_key } "
101- sas_placeholder = (
102- "_SAS_TOKEN_PLACEHOLDER_"
103- if "blob.core.windows.net" in parsed_url .netloc
104- else ""
105- )
155+
106156 expected_source_document = SourceDocument (
107157 id = metadata .get ("id" , hash_key ),
108158 content = content ,
109- source = metadata .get ("source" , f" { file_url } { sas_placeholder } " ),
159+ source = metadata .get ("source" , document_url ),
110160 title = metadata .get ("title" , filename ),
111161 chunk = metadata .get ("chunk" , idx ),
112162 offset = metadata .get ("offset" ),
0 commit comments