Skip to content

Commit dc2da80

Browse files
committed
add support for .gz, .bz2 and .owl files. Update docs
1 parent b306bd8 commit dc2da80

5 files changed

Lines changed: 166 additions & 19 deletions

File tree

Cargo.lock

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ name = "benchmark"
1515
harness = false
1616

1717
[dependencies]
18+
bzip2 = "0.6"
1819
clap = { version = "4.5", features = ["derive","cargo"] }
1920
clap-verbosity-flag = "3.0"
2021
env_logger = "0.11"
22+
flate2 = "1"
2123
hdt = { version = "0.6", default-features = false, features = ["nt"]}
2224
log = "0.4"
2325
oxrdf = "0.3"

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ and then generates and saves the data as HDT. Implementation is based on the [HD
1212
and the output HDT is intended to be consumed by one of [hdt crate](https://github.com/KonradHoeffner/hdt), [hdt-cpp](https://github.com/rdfhdt/hdt-cpp),
1313
or [hdt-java](https://github.com/rdfhdt/hdt-java).
1414

15+
Inputs ending in `.gz` or `.bz2` are decompressed transparently, and `.owl` files are parsed as RDF/XML.
16+
1517
## Installation
1618

1719
Install `rdf2hdt` with `cargo`:
@@ -36,7 +38,7 @@ Options:
3638
-i, --input <INPUT>...
3739
Path to input RDF file(s).
3840

39-
Provide the path to one or more RDF files that will be parsed and converted. Support file formats: https://crates.io/crates/oxrdfio
41+
Provide the path to one or more RDF files that will be parsed and converted. RDF syntaxes supported: see https://crates.io/crates/oxrdfio. `.owl` files are parsed as RDF/XML. Inputs ending in `.gz` or `.bz2` are transparently decompressed.
4042

4143
-o, --output <OUTPUT>
4244
Path to output file.

src/main.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
//! ## Features
1313
//! - Parses RDF input and converts it to RDF triples
1414
//! - Converts N-Triples data into HDT format
15+
//! - Transparent gzip (`.gz`) and bzip2 (`.bz2`) decompression of input files
16+
//! - `.owl` files parsed as RDF/XML
1517
//!
1618
//! ## Usage
1719
//! Run the rdf2hdt converter from the command line. For detailed usage information, run:
@@ -56,7 +58,9 @@ enum Commands {
5658
/// Path to input RDF file(s).
5759
///
5860
/// Provide the path to one or more RDF files that will be parsed and converted.
59-
/// Support file formats: https://crates.io/crates/oxrdfio
61+
/// RDF syntaxes supported: see https://crates.io/crates/oxrdfio.
62+
/// `.owl` files are parsed as RDF/XML. Inputs ending in `.gz` or `.bz2`
63+
/// are transparently decompressed.
6064
#[arg(short, long, num_args = 1..)]
6165
input: Vec<String>,
6266

src/rdf_reader.rs

Lines changed: 113 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,51 +2,86 @@
22
// Licensed under the BSD 3-Clause License (see LICENSE file in the project root).
33

44
use crate::builder::Error;
5+
use bzip2::bufread::MultiBzDecoder;
6+
use flate2::bufread::MultiGzDecoder;
57
use log::{debug, error, warn};
68
use oxrdfio::RdfSerializer;
79
use oxrdfio::{
810
RdfFormat::{self, NTriples},
911
RdfParseError, RdfParser,
1012
};
13+
use std::fs::File;
1114
use std::io::Write;
1215
use std::{
13-
io::{self, BufReader, BufWriter},
14-
path::Path,
16+
io::{self, BufReader, BufWriter, Read},
17+
path::{Path, PathBuf},
1518
};
1619
use url::Url;
1720

1821
const IO_BUF: usize = 1 << 20;
1922

23+
fn open_rdf_reader(file: &Path) -> io::Result<Box<dyn Read>> {
24+
let fp = File::open(file)?;
25+
let buffered = BufReader::with_capacity(IO_BUF, fp);
26+
match file.extension().and_then(|e| e.to_str()) {
27+
Some(e) if e.eq_ignore_ascii_case("gz") => Ok(Box::new(BufReader::with_capacity(
28+
IO_BUF,
29+
MultiGzDecoder::new(buffered),
30+
))),
31+
Some(e) if e.eq_ignore_ascii_case("bz2") => Ok(Box::new(BufReader::with_capacity(
32+
IO_BUF,
33+
MultiBzDecoder::new(buffered),
34+
))),
35+
_ => Ok(Box::new(buffered)),
36+
}
37+
}
38+
39+
fn rdf_format_from_path(file: &Path) -> io::Result<RdfFormat> {
40+
let format_path: PathBuf = match file.extension().and_then(|e| e.to_str()) {
41+
Some(e) if e.eq_ignore_ascii_case("gz") || e.eq_ignore_ascii_case("bz2") => {
42+
file.with_extension("")
43+
}
44+
_ => file.to_path_buf(),
45+
};
46+
47+
let ext = format_path
48+
.extension()
49+
.and_then(|e| e.to_str())
50+
.ok_or_else(|| {
51+
io::Error::new(
52+
io::ErrorKind::InvalidInput,
53+
format!("file {} has no usable extension", file.display()),
54+
)
55+
})?;
56+
if ext.eq_ignore_ascii_case("owl") {
57+
return Ok(RdfFormat::RdfXml);
58+
}
59+
RdfFormat::from_extension(ext).ok_or_else(|| {
60+
error!("unrecognized file extension for {}", file.display());
61+
io::Error::new(
62+
io::ErrorKind::InvalidData,
63+
format!("unrecognized file extension for {}", file.display()),
64+
)
65+
})
66+
}
67+
2068
pub(crate) fn convert_to_nt<P: AsRef<Path>>(
2169
file_paths: &[P],
2270
output_file: std::fs::File,
2371
) -> Result<(), Error> {
2472
let mut dest_writer = BufWriter::with_capacity(IO_BUF, output_file);
2573
for file in file_paths {
2674
let file = file.as_ref();
27-
let source = std::fs::File::open(file).map_err(|e| {
75+
let source_reader = open_rdf_reader(file).map_err(|e| {
2876
error!("Error opening file {}: {e:?}", file.display());
2977
e
3078
})?;
31-
let source_reader = BufReader::with_capacity(IO_BUF, source);
3279

3380
debug!("converting {} to nt format", file.display());
3481

3582
let mut serializer = RdfSerializer::from_format(NTriples).for_writer(dest_writer.by_ref());
3683
let v = std::time::Instant::now();
37-
let ext = file.extension().and_then(|e| e.to_str()).ok_or_else(|| {
38-
io::Error::new(
39-
io::ErrorKind::InvalidInput,
40-
format!("file {} has no usable extension", file.display()),
41-
)
42-
})?;
43-
let rdf_format = RdfFormat::from_extension(ext).ok_or_else(|| {
44-
error!("unrecognized file extension for {}", file.display());
45-
io::Error::new(
46-
io::ErrorKind::InvalidData,
47-
format!("unrecognized file extension for {}", file.display()),
48-
)
49-
})?;
84+
let rdf_format = rdf_format_from_path(file)?;
5085
let abs_path = std::fs::canonicalize(file)?;
5186
let base_iri = Url::from_file_path(&abs_path).map_err(|_| {
5287
io::Error::new(
@@ -133,4 +168,65 @@ mod tests {
133168
assert!(quads.is_ok());
134169
assert_eq!(quads.unwrap().len(), 9)
135170
}
171+
172+
fn count_triples_in(nt_file: &tempfile::NamedTempFile) -> usize {
173+
let reader = BufReader::new(nt_file.reopen().expect("error opening tmp file"));
174+
RdfParser::from_format(NTriples)
175+
.for_reader(reader)
176+
.collect::<Result<Vec<_>, _>>()
177+
.expect("parse nt")
178+
.len()
179+
}
180+
181+
/// A gzip-compressed Turtle file (`.ttl.gz`) converts to the same nine
/// triples expected from the `apple.ttl` fixture.
#[test]
fn gzipped_ttl_input() -> Result<(), Error> {
    let work_dir = tempfile::tempdir()?;
    let compressed = work_dir.path().join("apple.ttl.gz");
    let turtle = std::fs::read("tests/resources/apple.ttl")?;

    // Write the fixture through a gzip encoder into the temp directory.
    let sink = File::create(&compressed)?;
    let mut encoder = flate2::write::GzEncoder::new(sink, flate2::Compression::default());
    encoder.write_all(&turtle)?;
    encoder.finish()?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&compressed], nt_out.reopen()?)?;

    let triples = count_triples_in(&nt_out);
    assert_eq!(triples, 9);
    Ok(())
}
196+
197+
/// A bzip2-compressed Turtle file (`.ttl.bz2`) converts to the same nine
/// triples expected from the `apple.ttl` fixture.
#[test]
fn bzipped_ttl_input() -> Result<(), Error> {
    let work_dir = tempfile::tempdir()?;
    let compressed = work_dir.path().join("apple.ttl.bz2");
    let turtle = std::fs::read("tests/resources/apple.ttl")?;

    // Write the fixture through a bzip2 encoder into the temp directory.
    let sink = File::create(&compressed)?;
    let mut encoder = bzip2::write::BzEncoder::new(sink, bzip2::Compression::default());
    encoder.write_all(&turtle)?;
    encoder.finish()?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&compressed], nt_out.reopen()?)?;

    let triples = count_triples_in(&nt_out);
    assert_eq!(triples, 9);
    Ok(())
}
212+
213+
/// A file with the `.owl` extension is parsed as RDF/XML: the single
/// `rdf:type` statement in the fixture yields exactly one triple.
#[test]
fn owl_as_rdfxml() -> Result<(), Error> {
    // Minimal RDF/XML document containing one statement.
    let tiny_owl = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="http://example.org/a">
<rdf:type rdf:resource="http://example.org/Thing"/>
</rdf:Description>
</rdf:RDF>"#;

    let work_dir = tempfile::tempdir()?;
    let owl_file = work_dir.path().join("tiny.owl");
    std::fs::write(&owl_file, tiny_owl)?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&owl_file], nt_out.reopen()?)?;

    let triples = count_triples_in(&nt_out);
    assert_eq!(triples, 1);
    Ok(())
}
136232
}

0 commit comments

Comments
 (0)