|
2 | 2 | // Licensed under the BSD 3-Clause License (see LICENSE file in the project root). |
3 | 3 |
|
4 | 4 | use crate::builder::Error; |
| 5 | +use bzip2::bufread::MultiBzDecoder; |
| 6 | +use flate2::bufread::MultiGzDecoder; |
5 | 7 | use log::{debug, error, warn}; |
6 | 8 | use oxrdfio::RdfSerializer; |
7 | 9 | use oxrdfio::{ |
8 | 10 | RdfFormat::{self, NTriples}, |
9 | 11 | RdfParseError, RdfParser, |
10 | 12 | }; |
| 13 | +use std::fs::File; |
11 | 14 | use std::io::Write; |
12 | 15 | use std::{ |
13 | | - io::{self, BufReader, BufWriter}, |
14 | | - path::Path, |
| 16 | + io::{self, BufReader, BufWriter, Read}, |
| 17 | + path::{Path, PathBuf}, |
15 | 18 | }; |
16 | 19 | use url::Url; |
17 | 20 |
|
18 | 21 | const IO_BUF: usize = 1 << 20; |
19 | 22 |
|
| 23 | +fn open_rdf_reader(file: &Path) -> io::Result<Box<dyn Read>> { |
| 24 | + let fp = File::open(file)?; |
| 25 | + let buffered = BufReader::with_capacity(IO_BUF, fp); |
| 26 | + match file.extension().and_then(|e| e.to_str()) { |
| 27 | + Some(e) if e.eq_ignore_ascii_case("gz") => Ok(Box::new(BufReader::with_capacity( |
| 28 | + IO_BUF, |
| 29 | + MultiGzDecoder::new(buffered), |
| 30 | + ))), |
| 31 | + Some(e) if e.eq_ignore_ascii_case("bz2") => Ok(Box::new(BufReader::with_capacity( |
| 32 | + IO_BUF, |
| 33 | + MultiBzDecoder::new(buffered), |
| 34 | + ))), |
| 35 | + _ => Ok(Box::new(buffered)), |
| 36 | + } |
| 37 | +} |
| 38 | + |
| 39 | +fn rdf_format_from_path(file: &Path) -> io::Result<RdfFormat> { |
| 40 | + let format_path: PathBuf = match file.extension().and_then(|e| e.to_str()) { |
| 41 | + Some(e) if e.eq_ignore_ascii_case("gz") || e.eq_ignore_ascii_case("bz2") => { |
| 42 | + file.with_extension("") |
| 43 | + } |
| 44 | + _ => file.to_path_buf(), |
| 45 | + }; |
| 46 | + |
| 47 | + let ext = format_path |
| 48 | + .extension() |
| 49 | + .and_then(|e| e.to_str()) |
| 50 | + .ok_or_else(|| { |
| 51 | + io::Error::new( |
| 52 | + io::ErrorKind::InvalidInput, |
| 53 | + format!("file {} has no usable extension", file.display()), |
| 54 | + ) |
| 55 | + })?; |
| 56 | + if ext.eq_ignore_ascii_case("owl") { |
| 57 | + return Ok(RdfFormat::RdfXml); |
| 58 | + } |
| 59 | + RdfFormat::from_extension(ext).ok_or_else(|| { |
| 60 | + error!("unrecognized file extension for {}", file.display()); |
| 61 | + io::Error::new( |
| 62 | + io::ErrorKind::InvalidData, |
| 63 | + format!("unrecognized file extension for {}", file.display()), |
| 64 | + ) |
| 65 | + }) |
| 66 | +} |
| 67 | + |
20 | 68 | pub(crate) fn convert_to_nt<P: AsRef<Path>>( |
21 | 69 | file_paths: &[P], |
22 | 70 | output_file: std::fs::File, |
23 | 71 | ) -> Result<(), Error> { |
24 | 72 | let mut dest_writer = BufWriter::with_capacity(IO_BUF, output_file); |
25 | 73 | for file in file_paths { |
26 | 74 | let file = file.as_ref(); |
27 | | - let source = std::fs::File::open(file).map_err(|e| { |
| 75 | + let source_reader = open_rdf_reader(file).map_err(|e| { |
28 | 76 | error!("Error opening file {}: {e:?}", file.display()); |
29 | 77 | e |
30 | 78 | })?; |
31 | | - let source_reader = BufReader::with_capacity(IO_BUF, source); |
32 | 79 |
|
33 | 80 | debug!("converting {} to nt format", file.display()); |
34 | 81 |
|
35 | 82 | let mut serializer = RdfSerializer::from_format(NTriples).for_writer(dest_writer.by_ref()); |
36 | 83 | let v = std::time::Instant::now(); |
37 | | - let ext = file.extension().and_then(|e| e.to_str()).ok_or_else(|| { |
38 | | - io::Error::new( |
39 | | - io::ErrorKind::InvalidInput, |
40 | | - format!("file {} has no usable extension", file.display()), |
41 | | - ) |
42 | | - })?; |
43 | | - let rdf_format = RdfFormat::from_extension(ext).ok_or_else(|| { |
44 | | - error!("unrecognized file extension for {}", file.display()); |
45 | | - io::Error::new( |
46 | | - io::ErrorKind::InvalidData, |
47 | | - format!("unrecognized file extension for {}", file.display()), |
48 | | - ) |
49 | | - })?; |
| 84 | + let rdf_format = rdf_format_from_path(file)?; |
50 | 85 | let abs_path = std::fs::canonicalize(file)?; |
51 | 86 | let base_iri = Url::from_file_path(&abs_path).map_err(|_| { |
52 | 87 | io::Error::new( |
@@ -133,4 +168,65 @@ mod tests { |
133 | 168 | assert!(quads.is_ok()); |
134 | 169 | assert_eq!(quads.unwrap().len(), 9) |
135 | 170 | } |
| 171 | + |
| 172 | + fn count_triples_in(nt_file: &tempfile::NamedTempFile) -> usize { |
| 173 | + let reader = BufReader::new(nt_file.reopen().expect("error opening tmp file")); |
| 174 | + RdfParser::from_format(NTriples) |
| 175 | + .for_reader(reader) |
| 176 | + .collect::<Result<Vec<_>, _>>() |
| 177 | + .expect("parse nt") |
| 178 | + .len() |
| 179 | + } |
| 180 | + |
/// A gzip-compressed Turtle file (`*.ttl.gz`) converts to the same nine
/// statements that are asserted for the uncompressed fixture.
#[test]
fn gzipped_ttl_input() -> Result<(), Error> {
    // Build the compressed fixture on the fly inside a scratch directory.
    let scratch_dir = tempfile::tempdir()?;
    let compressed = scratch_dir.path().join("apple.ttl.gz");
    let turtle_bytes = std::fs::read("tests/resources/apple.ttl")?;
    let sink = File::create(&compressed)?;
    let mut encoder = flate2::write::GzEncoder::new(sink, flate2::Compression::default());
    encoder.write_all(&turtle_bytes)?;
    // `finish` completes the stream; the returned file handle is dropped here.
    encoder.finish()?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&compressed], nt_out.reopen()?)?;
    assert_eq!(count_triples_in(&nt_out), 9);
    Ok(())
}
| 196 | + |
/// A bzip2-compressed Turtle file (`*.ttl.bz2`) converts to the same nine
/// statements that are asserted for the uncompressed fixture.
#[test]
fn bzipped_ttl_input() -> Result<(), Error> {
    // Build the compressed fixture on the fly inside a scratch directory.
    let scratch_dir = tempfile::tempdir()?;
    let compressed = scratch_dir.path().join("apple.ttl.bz2");
    let turtle_bytes = std::fs::read("tests/resources/apple.ttl")?;
    let sink = File::create(&compressed)?;
    let mut encoder = bzip2::write::BzEncoder::new(sink, bzip2::Compression::default());
    encoder.write_all(&turtle_bytes)?;
    // `finish` completes the stream; the returned file handle is dropped here.
    encoder.finish()?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&compressed], nt_out.reopen()?)?;
    assert_eq!(count_triples_in(&nt_out), 9);
    Ok(())
}
| 212 | + |
/// A file with the `.owl` extension is parsed as RDF/XML: the single
/// `rdf:type` statement in the document must appear in the N-Triples output.
#[test]
fn owl_as_rdfxml() -> Result<(), Error> {
    // Minimal RDF/XML document describing exactly one triple.
    let rdf_xml = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
 <rdf:Description rdf:about="http://example.org/a">
 <rdf:type rdf:resource="http://example.org/Thing"/>
 </rdf:Description>
</rdf:RDF>"#;

    let scratch_dir = tempfile::tempdir()?;
    let ontology = scratch_dir.path().join("tiny.owl");
    std::fs::write(&ontology, rdf_xml)?;

    let nt_out = tempfile::Builder::new().suffix(".nt").tempfile()?;
    convert_to_nt(&[&ontology], nt_out.reopen()?)?;
    assert_eq!(count_triples_in(&nt_out), 1);
    Ok(())
}
136 | 232 | } |
0 commit comments