Skip to content

Commit 4d3b56b

Browse files
committed
bug fixes, use latest hdt_rs lib
1 parent af548d2 commit 4d3b56b

11 files changed

Lines changed: 169 additions & 91 deletions

File tree

Cargo.lock

Lines changed: 64 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ chrono = "0.4.40"
88
clap = { version = "4.5", features = ["derive","cargo"] }
99
clap-verbosity-flag = "3.0"
1010
crc = "3.2.1"
11-
hdt = { git = "https://github.com/GregHanson/hdt/", branch = "more-public-vals" }
11+
env_logger = "0.11"
12+
hdt = { git = "https://github.com/KonradHoeffner/hdt/" }
1213
log = "0.4"
1314
oxrdf = "0.2"
1415
oxrdfio = "0.1"

src/bitmap_triples.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::{
66
vocab::HDT_TYPE_BITMAP,
77
};
88
use hdt::{
9-
containers::{ControlType, vbyte::encode_vbyte},
9+
containers::{self, ControlType, vbyte::encode_vbyte},
1010
triples::Order,
1111
};
1212
use log::debug;
@@ -18,7 +18,7 @@ use std::{
1818
};
1919

2020
#[derive(Default, Debug)]
21-
pub struct BitmapTriples {
21+
pub struct BitmapTriplesBuilder {
2222
y_vec: Vec<u32>,
2323
z_vec: Vec<u32>,
2424
bitmap_y: Vec<bool>,
@@ -27,7 +27,7 @@ pub struct BitmapTriples {
2727
num_triples: usize,
2828
}
2929

30-
impl BitmapTriples {
30+
impl BitmapTriplesBuilder {
3131
/// Creates a new BitmapTriplesBuilder from a list of sorted RDF triples
3232
pub fn load(mut triples: Vec<EncodedTripleId>) -> Result<Self, Box<dyn Error>> {
3333
// libhdt/src/triples/BitmapTriples.cpp:load()
@@ -96,7 +96,7 @@ impl BitmapTriples {
9696
z_bitmap.push(true);
9797
debug!("BitmapTriples build time: {:?}", timer.elapsed());
9898

99-
Ok(BitmapTriples {
99+
Ok(BitmapTriplesBuilder {
100100
bitmap_y: y_bitmap,
101101
bitmap_z: z_bitmap,
102102
y_vec: array_y,
@@ -107,7 +107,7 @@ impl BitmapTriples {
107107
}
108108

109109
pub fn save(&self, dest_writer: &mut BufWriter<File>) -> Result<(), Box<dyn Error>> {
110-
let mut ci = hdt::containers::ControlInfo {
110+
let mut ci = containers::ControlInfo {
111111
control_type: ControlType::Triples,
112112
format: HDT_TYPE_BITMAP.to_string(),
113113
..Default::default()

src/hdt.rs renamed to src/builder.rs

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
// Copyright (c) 2024-2025, Decisym, LLC
22

3-
use crate::{
4-
bitmap_triples::BitmapTriples, convert::convert_to_nt, dictionary::FourSectionDictionary,
5-
vocab::*,
6-
};
7-
use hdt::containers::ControlType;
3+
use super::{bitmap_triples::BitmapTriplesBuilder, dictionary::FourSectDictBuilder};
4+
use crate::{rdf_reader::convert_to_nt, vocab::*};
5+
use hdt::containers::{self, ControlType};
86
use log::{debug, error};
97
use oxrdf::{BlankNodeRef, Literal, NamedNodeRef, Triple, vocab::rdf};
108
use std::{
@@ -14,13 +12,17 @@ use std::{
1412
io::{BufWriter, Write},
1513
};
1614

17-
#[derive(Debug)]
15+
#[derive(Clone, Debug)]
1816
pub struct Options {
1917
pub block_size: usize,
18+
pub order: String,
2019
}
2120
impl Default for Options {
2221
fn default() -> Self {
23-
Options { block_size: 16 }
22+
Options {
23+
block_size: 16,
24+
order: "SPO".to_string(),
25+
}
2426
}
2527
}
2628

@@ -39,6 +41,10 @@ pub fn build_hdt(
3941
}
4042

4143
let timer = std::time::Instant::now();
44+
// TODO
45+
// implement an RDF reader trait
46+
// 1. for larger datasets, read from the source files every time, since storing all triples in memory may get the process OOM-killed
47+
// 2. build Vec<Triple> in memory from source files
4248
let nt_file = if file_paths.len() == 1 && file_paths[0].ends_with(".nt") {
4349
file_paths[0].clone()
4450
} else {
@@ -61,17 +67,17 @@ pub fn build_hdt(
6167

6268
impl ConvertedHDT {
6369
fn load(nt_file: &str, opts: Options) -> Result<Self, Box<dyn Error>> {
64-
let (dictionary, encoded_triples) = FourSectionDictionary::load(nt_file, opts)?;
70+
let (dictionary, encoded_triples) = FourSectDictBuilder::load(nt_file, opts.clone())?;
6571
let num_triples = encoded_triples.len();
66-
let bmap_triples = BitmapTriples::load(encoded_triples)?;
72+
let bmap_triples = BitmapTriplesBuilder::load(encoded_triples)?;
6773

6874
let mut converted_hdt = ConvertedHDT {
6975
dict: dictionary,
7076
triples: bmap_triples,
7177
num_triples,
7278
..Default::default()
7379
};
74-
converted_hdt.build_header(nt_file)?;
80+
converted_hdt.build_header(nt_file, opts)?;
7581

7682
Ok(converted_hdt)
7783
}
@@ -83,14 +89,14 @@ impl ConvertedHDT {
8389
let mut dest_writer = BufWriter::new(file);
8490

8591
// libhdt/src/hdt/BasicHDT.cpp::saveToHDT
86-
let ci = hdt::containers::ControlInfo {
92+
let ci = containers::ControlInfo {
8793
control_type: ControlType::Global,
8894
format: HDT_CONTAINER.to_string(),
8995
..Default::default()
9096
};
9197
ci.save(&mut dest_writer)?;
9298

93-
let mut ci = hdt::containers::ControlInfo {
99+
let mut ci = containers::ControlInfo {
94100
control_type: ControlType::Header,
95101
format: "ntriples".to_string(),
96102
..Default::default()
@@ -113,7 +119,7 @@ impl ConvertedHDT {
113119
Ok(())
114120
}
115121

116-
fn build_header(&mut self, source_file: &str) -> Result<(), Box<dyn Error>> {
122+
fn build_header(&mut self, source_file: &str, opts: Options) -> Result<(), Box<dyn Error>> {
117123
let mut header = HashSet::new();
118124
// libhdt/src/hdt/BasicHDT.cpp::fillHeader()
119125

@@ -202,7 +208,7 @@ impl ConvertedHDT {
202208
header.insert(Triple::new(
203209
dict_id,
204210
HDT_DICT_BLOCK_SIZE,
205-
Literal::new_simple_literal("16"), // TODO is this always 16?
211+
Literal::new_simple_literal(opts.block_size.to_string()),
206212
));
207213

208214
// TRIPLES
@@ -218,7 +224,7 @@ impl ConvertedHDT {
218224
header.insert(Triple::new(
219225
triples_id,
220226
HDT_TRIPLES_ORDER,
221-
Literal::new_simple_literal("SPO"),
227+
Literal::new_simple_literal(opts.order),
222228
));
223229

224230
// // Sizes
@@ -258,25 +264,24 @@ impl ConvertedHDT {
258264

259265
#[derive(Default, Debug)]
260266
pub struct ConvertedHDT {
261-
pub dict: FourSectionDictionary,
262-
pub triples: BitmapTriples,
267+
pub dict: FourSectDictBuilder,
268+
pub triples: BitmapTriplesBuilder,
263269
header: HashSet<oxrdf::Triple>,
264270
num_triples: usize,
265271
}
266272

267273
#[cfg(test)]
268274
mod tests {
269275

276+
use super::*;
277+
use hdt::{Hdt, containers::ControlInfo, four_sect_dict, header::Header, triples};
278+
use std::sync::Arc;
270279
use std::{
271280
fs::remove_file,
272281
io::{BufReader, Read},
273282
path::Path,
274283
};
275284

276-
use super::*;
277-
use hdt::{containers::ControlInfo, header::Header};
278-
use std::sync::Arc;
279-
280285
#[test]
281286
fn test_build_hdt() {
282287
let output_file = "test.hdt";
@@ -298,8 +303,8 @@ mod tests {
298303
let _ci = ControlInfo::read(&mut hdt_reader).expect("failed to read HDT control info");
299304
let _h = Header::read(&mut hdt_reader).expect("failed to read HDT Header");
300305

301-
let unvalidated_dict = hdt::four_sect_dict::FourSectDict::read(&mut hdt_reader)
302-
.expect("failed to read dictionary");
306+
let unvalidated_dict =
307+
four_sect_dict::FourSectDict::read(&mut hdt_reader).expect("failed to read dictionary");
303308
let dict = unvalidated_dict
304309
.validate()
305310
.expect("invalid 4 section dictionary");
@@ -314,14 +319,14 @@ mod tests {
314319
);
315320
assert_eq!(dict.shared.num_strings(), conv_hdt.dict.shared_terms.len());
316321

317-
let _triples = hdt::triples::TriplesBitmap::read_sect(&mut hdt_reader)
318-
.expect("invalid bitmap triples");
322+
let _triples =
323+
triples::TriplesBitmap::read_sect(&mut hdt_reader).expect("invalid bitmap triples");
319324
let mut buffer = [0; 1024];
320325
assert!(hdt_reader.read(&mut buffer).expect("failed to read") == 0);
321326

322327
let source = std::fs::File::open(p).expect("failed to open hdt file");
323328
let hdt_reader = BufReader::new(source);
324-
let h = hdt::Hdt::new(hdt_reader).expect("failed to load HDT file");
329+
let h = Hdt::new(hdt_reader).expect("failed to load HDT file");
325330
let t: Vec<(Arc<str>, Arc<str>, Arc<str>)> = h.triples().collect();
326331
println!("{:?}", t);
327332
assert_eq!(t.len(), 9);

src/common.rs

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
// Copyright (c) 2024-2025, Decisym, LLC
22

3-
use hdt::containers::vbyte::encode_vbyte;
4-
use log::warn;
53
use std::{
64
error::Error,
75
fs::File,
86
io::{BufWriter, Write},
97
};
108

9+
use hdt::containers::vbyte::encode_vbyte;
10+
1111
pub fn save_u32_vec(
1212
ints: &[u32],
1313
dest_writer: &mut BufWriter<File>,
@@ -43,6 +43,7 @@ pub fn save_u32_vec(
4343
Ok(())
4444
}
4545

46+
// TODO duplicate of containers/sequence.rs::save()
4647
fn pack_bits(data: &[u32], bits_per_entry: u8) -> Vec<u8> {
4748
assert!(bits_per_entry > 0 && bits_per_entry as usize <= std::mem::size_of::<usize>() * 8);
4849

@@ -81,14 +82,6 @@ fn pack_bits(data: &[u32], bits_per_entry: u8) -> Vec<u8> {
8182
output
8283
}
8384

84-
pub fn convert_vec_u32_to_vec_u8(ints: &[u32]) -> Vec<u8> {
85-
let mut bytes = Vec::with_capacity(ints.len() * 4);
86-
for offset in ints {
87-
bytes.extend_from_slice(&offset.to_le_bytes());
88-
}
89-
bytes
90-
}
91-
9285
pub fn byte_align_bitmap(bits: &[bool]) -> Vec<u8> {
9386
let mut byte = 0u8;
9487
let mut bit_index = 0;
@@ -113,17 +106,3 @@ pub fn byte_align_bitmap(bits: &[bool]) -> Vec<u8> {
113106
}
114107
byte_vec
115108
}
116-
117-
pub fn usize_to_u8_array(val: usize) -> [u8; 1] {
118-
if val > 255 {
119-
warn!("{val} greater than 255");
120-
}
121-
[(val & 0xFF) as u8] // Extracts the least significant byte
122-
}
123-
124-
pub fn bytes_for_bitmap(bits: usize) -> usize {
125-
if bits == 0 {
126-
return 1;
127-
}
128-
((bits - 1) >> 3) + 1
129-
}

0 commit comments

Comments
 (0)