11// Copyright (c) 2024-2025, Decisym, LLC
22
3- use crate :: {
4- bitmap_triples:: BitmapTriples , convert:: convert_to_nt, dictionary:: FourSectionDictionary ,
5- vocab:: * ,
6- } ;
7- use hdt:: containers:: ControlType ;
3+ use super :: { bitmap_triples:: BitmapTriplesBuilder , dictionary:: FourSectDictBuilder } ;
4+ use crate :: { rdf_reader:: convert_to_nt, vocab:: * } ;
5+ use hdt:: containers:: { self , ControlType } ;
86use log:: { debug, error} ;
97use oxrdf:: { BlankNodeRef , Literal , NamedNodeRef , Triple , vocab:: rdf} ;
108use std:: {
@@ -14,13 +12,17 @@ use std::{
1412 io:: { BufWriter , Write } ,
1513} ;
1614
/// Settings that control how an HDT file is built.
///
/// The values are also recorded in the generated HDT header: `block_size` is
/// written as the dictionary block size and `order` as the triples order.
#[derive(Clone, Debug)]
pub struct Options {
    /// Dictionary block size; defaults to 16.
    pub block_size: usize,
    /// Triple component order written to the header; defaults to `"SPO"`.
    pub order: String,
}

impl Default for Options {
    /// Returns the default options: a `block_size` of 16 and an `order` of `"SPO"`.
    fn default() -> Self {
        Options {
            block_size: 16,
            order: "SPO".to_string(),
        }
    }
}
2628
@@ -39,6 +41,10 @@ pub fn build_hdt(
3941 }
4042
4143 let timer = std:: time:: Instant :: now ( ) ;
44+ // TODO
45+ // implement an RDF reader trait
46+ // 1. for larger datasets, read from the source files every time, since storing all triples in memory may get the process OOM-killed
47+ // 2. build Vec<Triple> in memory from source files
4248 let nt_file = if file_paths. len ( ) == 1 && file_paths[ 0 ] . ends_with ( ".nt" ) {
4349 file_paths[ 0 ] . clone ( )
4450 } else {
@@ -61,17 +67,17 @@ pub fn build_hdt(
6167
6268impl ConvertedHDT {
6369 fn load ( nt_file : & str , opts : Options ) -> Result < Self , Box < dyn Error > > {
64- let ( dictionary, encoded_triples) = FourSectionDictionary :: load ( nt_file, opts) ?;
70+ let ( dictionary, encoded_triples) = FourSectDictBuilder :: load ( nt_file, opts. clone ( ) ) ?;
6571 let num_triples = encoded_triples. len ( ) ;
66- let bmap_triples = BitmapTriples :: load ( encoded_triples) ?;
72+ let bmap_triples = BitmapTriplesBuilder :: load ( encoded_triples) ?;
6773
6874 let mut converted_hdt = ConvertedHDT {
6975 dict : dictionary,
7076 triples : bmap_triples,
7177 num_triples,
7278 ..Default :: default ( )
7379 } ;
74- converted_hdt. build_header ( nt_file) ?;
80+ converted_hdt. build_header ( nt_file, opts ) ?;
7581
7682 Ok ( converted_hdt)
7783 }
@@ -83,14 +89,14 @@ impl ConvertedHDT {
8389 let mut dest_writer = BufWriter :: new ( file) ;
8490
8591 // libhdt/src/hdt/BasicHDT.cpp::saveToHDT
86- let ci = hdt :: containers:: ControlInfo {
92+ let ci = containers:: ControlInfo {
8793 control_type : ControlType :: Global ,
8894 format : HDT_CONTAINER . to_string ( ) ,
8995 ..Default :: default ( )
9096 } ;
9197 ci. save ( & mut dest_writer) ?;
9298
93- let mut ci = hdt :: containers:: ControlInfo {
99+ let mut ci = containers:: ControlInfo {
94100 control_type : ControlType :: Header ,
95101 format : "ntriples" . to_string ( ) ,
96102 ..Default :: default ( )
@@ -113,7 +119,7 @@ impl ConvertedHDT {
113119 Ok ( ( ) )
114120 }
115121
116- fn build_header ( & mut self , source_file : & str ) -> Result < ( ) , Box < dyn Error > > {
122+ fn build_header ( & mut self , source_file : & str , opts : Options ) -> Result < ( ) , Box < dyn Error > > {
117123 let mut header = HashSet :: new ( ) ;
118124 // libhdt/src/hdt/BasicHDT.cpp::fillHeader()
119125
@@ -202,7 +208,7 @@ impl ConvertedHDT {
202208 header. insert ( Triple :: new (
203209 dict_id,
204210 HDT_DICT_BLOCK_SIZE ,
205- Literal :: new_simple_literal ( "16" ) , // TODO is this always 16?
211+ Literal :: new_simple_literal ( opts . block_size . to_string ( ) ) ,
206212 ) ) ;
207213
208214 // TRIPLES
@@ -218,7 +224,7 @@ impl ConvertedHDT {
218224 header. insert ( Triple :: new (
219225 triples_id,
220226 HDT_TRIPLES_ORDER ,
221- Literal :: new_simple_literal ( "SPO" ) ,
227+ Literal :: new_simple_literal ( opts . order ) ,
222228 ) ) ;
223229
224230 // // Sizes
@@ -258,25 +264,24 @@ impl ConvertedHDT {
258264
259265#[ derive( Default , Debug ) ]
260266pub struct ConvertedHDT {
261- pub dict : FourSectionDictionary ,
262- pub triples : BitmapTriples ,
267+ pub dict : FourSectDictBuilder ,
268+ pub triples : BitmapTriplesBuilder ,
263269 header : HashSet < oxrdf:: Triple > ,
264270 num_triples : usize ,
265271}
266272
267273#[ cfg( test) ]
268274mod tests {
269275
276+ use super :: * ;
277+ use hdt:: { Hdt , containers:: ControlInfo , four_sect_dict, header:: Header , triples} ;
278+ use std:: sync:: Arc ;
270279 use std:: {
271280 fs:: remove_file,
272281 io:: { BufReader , Read } ,
273282 path:: Path ,
274283 } ;
275284
276- use super :: * ;
277- use hdt:: { containers:: ControlInfo , header:: Header } ;
278- use std:: sync:: Arc ;
279-
280285 #[ test]
281286 fn test_build_hdt ( ) {
282287 let output_file = "test.hdt" ;
@@ -298,8 +303,8 @@ mod tests {
298303 let _ci = ControlInfo :: read ( & mut hdt_reader) . expect ( "failed to read HDT control info" ) ;
299304 let _h = Header :: read ( & mut hdt_reader) . expect ( "failed to read HDT Header" ) ;
300305
301- let unvalidated_dict = hdt :: four_sect_dict :: FourSectDict :: read ( & mut hdt_reader )
302- . expect ( "failed to read dictionary" ) ;
306+ let unvalidated_dict =
307+ four_sect_dict :: FourSectDict :: read ( & mut hdt_reader ) . expect ( "failed to read dictionary" ) ;
303308 let dict = unvalidated_dict
304309 . validate ( )
305310 . expect ( "invalid 4 section dictionary" ) ;
@@ -314,14 +319,14 @@ mod tests {
314319 ) ;
315320 assert_eq ! ( dict. shared. num_strings( ) , conv_hdt. dict. shared_terms. len( ) ) ;
316321
317- let _triples = hdt :: triples :: TriplesBitmap :: read_sect ( & mut hdt_reader )
318- . expect ( "invalid bitmap triples" ) ;
322+ let _triples =
323+ triples :: TriplesBitmap :: read_sect ( & mut hdt_reader ) . expect ( "invalid bitmap triples" ) ;
319324 let mut buffer = [ 0 ; 1024 ] ;
320325 assert ! ( hdt_reader. read( & mut buffer) . expect( "failed to read" ) == 0 ) ;
321326
322327 let source = std:: fs:: File :: open ( p) . expect ( "failed to open hdt file" ) ;
323328 let hdt_reader = BufReader :: new ( source) ;
324- let h = hdt :: Hdt :: new ( hdt_reader) . expect ( "failed to load HDT file" ) ;
329+ let h = Hdt :: new ( hdt_reader) . expect ( "failed to load HDT file" ) ;
325330 let t: Vec < ( Arc < str > , Arc < str > , Arc < str > ) > = h. triples ( ) . collect ( ) ;
326331 println ! ( "{:?}" , t) ;
327332 assert_eq ! ( t. len( ) , 9 ) ;
0 commit comments