diff --git a/Cargo.lock b/Cargo.lock index 2730abf..2ab1fb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,12 +142,13 @@ checksum = "574b0cd5e90ee2ba03a66d0611fc9a09c9a0c28b2ecc2dc8a181dd31a53ca5d7" [[package]] name = "json2rdf" -version = "0.1.1" +version = "0.2.0" dependencies = [ "clap", "oxrdf", "oxrdfio", "serde_json", + "thiserror", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 40e4ba1..05a5699 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json2rdf" -version = "0.1.1" +version = "0.2.0" authors = ["bharath181 ", "Greg Hanson "] edition = "2021" license = "BSD-3-Clause OR Apache-2.0" @@ -14,6 +14,7 @@ categories = ["command-line-utilities", "encoding", "parser-implementations", "s clap = { version = "4.6", features = ["derive"] } oxrdf = "0.3" serde_json = "1.0" +thiserror = "2" [dev-dependencies] oxrdfio = "0.2" \ No newline at end of file diff --git a/Makefile b/Makefile index 39a6b84..3db913a 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ # at your option. lint: - cargo install cargo-machete + @command -v cargo-machete >/dev/null 2>&1 || cargo install cargo-machete cargo fmt --check cargo machete cargo clippy --benches --tests --bins --no-deps --all-features diff --git a/README.md b/README.md index 8fe8b5c..7efdd21 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,13 @@ The conversion functionality can also be called directly in Rust. The library su use json2rdf::json_to_rdf; // capture conversion results to file -let results = json_to_rdf(&"tests/airplane.json".to_string(), &Some("http://example.com/ns#".to_string()), &Some("output.nt".to_string())); +let results = json_to_rdf(&["tests/airplane.json"], Some("http://example.com/ns#"), Some("output.nt")); // capture conversion results to an oxrdf::Graph -let results = json_to_rdf(&"tests/airplane.json".to_string(), &Some("http://example.com/ns#".to_string()), &None); +let results = json_to_rdf(&["tests/airplane.json"], Some("http://example.com/ns#"), None); + +// multiple input files are merged into a single graph +let results = json_to_rdf(&["a.json", "b.json"], Some("http://example.com/ns#"), None); ``` ## License diff --git a/src/lib.rs b/src/lib.rs index 137bb9e..26f7da3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,14 +18,37 @@ //! - Allows specifying a custom RDF namespace for generated predicates and objects. //! - Outputs the RDF data to a specified file or prints it to the console. -use clap::Error; use oxrdf::vocab::xsd; -use oxrdf::{BlankNode, Graph, Literal, NamedNodeRef, TripleRef}; +use oxrdf::{BlankNode, Graph, IriParseError, Literal, NamedNodeRef, TripleRef}; use serde_json::{Deserializer, Value}; -use std::collections::VecDeque; -use std::fs::{File, OpenOptions}; +use std::fs::File; use std::io::{BufReader, Write}; +use thiserror::Error; + +/// Errors that can occur while converting JSON to RDF. +#[derive(Debug, Error)] +pub enum Json2RdfError { + /// Failure opening, reading, or writing a file. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// Failure parsing the input JSON. + #[error("JSON parse error: {0}")] + Json(#[from] serde_json::Error), + + /// A JSON key produced a string that is not a valid IRI. + #[error("invalid IRI {iri:?} generated from JSON key: {source}")] + InvalidIri { + iri: String, + #[source] + source: IriParseError, + }, + + /// A root-level JSON value has no predicate context and cannot be converted to a triple. + #[error("unsupported root-level JSON {kind}; root must be an object or array")] + UnsupportedRootValue { kind: &'static str }, +} /// Converts JSON data to RDF format. /// @@ -34,95 +57,98 @@ use std::io::{BufReader, Write}; /// an output file for saving the generated RDF data. /// /// # Arguments -/// - `file_path`: Path to the JSON file. +/// - `file_paths`: One or more paths to input JSON files. All files are merged into a single graph. /// - `namespace`: Optional custom namespace for RDF predicates. /// - `output_file`: Optional output file path for writing RDF data. /// +/// # Errors +/// Returns [`Json2RdfError`] if any input file cannot be read, the JSON cannot be parsed, +/// the output file cannot be written, or a JSON key produces an invalid IRI. +/// /// # Example /// ```rust /// use json2rdf::json_to_rdf; /// -/// json_to_rdf(&"tests/airplane.json".to_string(), &Some("http://example.com/ns#".to_string()), &Some("output.nt".to_string())); +/// let graph = json_to_rdf( +/// &["tests/airplane.json"], +/// Some("http://example.com/ns#"), +/// None, +/// ) +/// .expect("conversion failed") +/// .expect("expected a graph"); +/// assert!(!graph.is_empty()); /// ``` pub fn json_to_rdf( - file_path: &String, - namespace: &Option, - output_file: &Option, -) -> Result, Error> { - let rdf_namespace: String = if namespace.is_some() { - namespace.clone().unwrap() - } else { - "https://decisym.ai/json2rdf/model".to_owned() - }; - - let file = File::open(file_path).unwrap(); - let reader = BufReader::new(file); - let stream = Deserializer::from_reader(reader).into_iter::(); + file_paths: &[&str], + namespace: Option<&str>, + output_file: Option<&str>, +) -> Result, Json2RdfError> { + let mut prefix: String = namespace + .map(str::to_owned) + .unwrap_or_else(|| "https://decisym.ai/json2rdf/model".to_owned()); + // Respect hash (`#`), slash (`/`), and colon (`:`) terminators; otherwise default to `/`. + if !prefix.ends_with(['#', '/', ':']) { + prefix.push('/'); + } let mut graph = Graph::default(); // oxrdf Graph object - - let mut subject_stack: VecDeque = VecDeque::new(); - let mut property: Option = None; - - for value in stream { - match value { - Ok(Value::Object(obj)) => { - let subject = BlankNode::default(); // Create a new blank node - subject_stack.push_back(subject.clone()); - - for (key, val) in obj { - property = Some(format!("{}/{}", rdf_namespace, key)); - process_value( - &mut subject_stack, - &property, - val, - &mut graph, - &rdf_namespace, - ); - } - - subject_stack.pop_back(); - } - Ok(Value::Array(arr)) => { - for val in arr { - process_value( - &mut subject_stack, - &property, - val, - &mut graph, - &rdf_namespace.clone(), - ); - } - } - Ok(other) => { - process_value( - &mut subject_stack, - &property, - other, - &mut graph, - &rdf_namespace.clone(), - ); - } - Err(e) => { - eprintln!("Error parsing JSON: {}", e); - } + let mut subject_stack: Vec = Vec::new(); + + for path in file_paths { + let file = File::open(path)?; + let reader = BufReader::new(file); + let stream = Deserializer::from_reader(reader).into_iter::(); + for value in stream { + process_top_level(&mut subject_stack, value?, &mut graph, &prefix)?; } } if let Some(output_path) = output_file { - let mut file = OpenOptions::new() - .create(true) - .append(true) - .open(output_path) - .expect("Error opening file"); - - writeln!(file, "{}", graph).expect("Error writing json2rdf data to file"); + let mut file = File::create(output_path)?; + writeln!(file, "{}", graph)?; Ok(None) } else { Ok(Some(graph)) } } +/// Processes a single top-level JSON value from the input stream. +/// +/// Each top-level value is handled independently: streamed values (NDJSON) do not +/// share predicate state with each other. Root-level primitives have no predicate +/// context and are rejected with [`Json2RdfError::UnsupportedRootValue`]. +fn process_top_level( + subject_stack: &mut Vec, + value: Value, + graph: &mut Graph, + prefix: &str, +) -> Result<(), Json2RdfError> { + match value { + Value::Object(obj) => { + let subject = BlankNode::default(); + subject_stack.push(subject); + + for (key, val) in obj { + let property = Some(format!("{}{}", prefix, key)); + process_value(subject_stack, &property, val, graph, prefix)?; + } + + subject_stack.pop(); + Ok(()) + } + Value::Array(arr) => { + for item in arr { + process_top_level(subject_stack, item, graph, prefix)?; + } + Ok(()) + } + Value::Bool(_) => Err(Json2RdfError::UnsupportedRootValue { kind: "boolean" }), + Value::Number(_) => Err(Json2RdfError::UnsupportedRootValue { kind: "number" }), + Value::String(_) => Err(Json2RdfError::UnsupportedRootValue { kind: "string" }), + Value::Null => Err(Json2RdfError::UnsupportedRootValue { kind: "null" }), + } +} + /// This function handles different JSON data types, converting each into RDF triples: /// - JSON Objects create new blank nodes and recursively process nested values. /// - JSON Arrays iterate over each element and process it as an individual value. @@ -140,84 +166,79 @@ pub fn json_to_rdf( /// - `property`: RDF predicate (property) associated with the JSON value. /// - `value`: JSON value to process. /// - `graph`: RDF graph where triples are added. -/// - `namespace`: Namespace for generating predicate URIs. +/// - `prefix`: Fully-prepared namespace prefix (already terminated with `#`, `/`, or `:`) +/// used to build predicate IRIs by direct concatenation with each JSON key. /// /// # JSON Type to RDF Conversion /// - **Object**: Creates a blank node and recursively processes key-value pairs. /// - **Array**: Iterates over elements and processes each as a separate value. /// - **String**: Converts to `xsd:string` literal. /// - **Boolean**: Converts to `xsd:boolean` literal. -/// - **Number**: Converts to `xsd:int` or `xsd:float` literal based on value type. +/// - **Number**: Converts to `xsd:integer` for whole numbers, `xsd:double` for floating-point values. fn process_value( - subject_stack: &mut VecDeque, + subject_stack: &mut Vec, property: &Option, value: Value, graph: &mut Graph, - namespace: &String, -) { - let ns = if namespace.ends_with("/") { - namespace - } else { - &([namespace, "/"].join("")) + prefix: &str, +) -> Result<(), Json2RdfError> { + let Some(last_subject) = subject_stack.last().cloned() else { + return Ok(()); + }; + let Some(prop) = property else { + return Ok(()); }; - if let Some(last_subject) = subject_stack.clone().back() { - if let Some(prop) = property { - match value { - Value::Bool(b) => { - graph.insert(TripleRef::new( - subject_stack.back().unwrap(), - NamedNodeRef::new(prop.as_str()).unwrap(), - &Literal::new_typed_literal(b.to_string(), xsd::BOOLEAN), - )); - } - Value::Number(num) => { - if num.as_i64().is_some() { - graph.insert(TripleRef::new( - subject_stack.back().unwrap(), - NamedNodeRef::new(prop.as_str()).unwrap(), - &Literal::new_typed_literal(num.to_string(), xsd::INT), - )); - } else if num.as_f64().is_some() { - graph.insert(TripleRef::new( - subject_stack.back().unwrap(), - NamedNodeRef::new(prop.as_str()).unwrap(), - &Literal::new_typed_literal(num.to_string(), xsd::FLOAT), - )); - } - } - Value::String(s) => { - graph.insert(TripleRef::new( - subject_stack.back().unwrap(), - NamedNodeRef::new(prop.as_str()).unwrap(), - &Literal::new_typed_literal(s, xsd::STRING), - )); - } - Value::Null => { - //println!("Null value"); - } - Value::Object(obj) => { - let subject = BlankNode::default(); - subject_stack.push_back(subject); - - graph.insert(TripleRef::new( - last_subject, - NamedNodeRef::new(prop.as_str()).unwrap(), - subject_stack.back().unwrap(), - )); - - for (key, val) in obj { - let nested_property: Option = Some(format!("{}{}", ns, key)); - process_value(subject_stack, &nested_property, val, graph, ns); - } - subject_stack.pop_back(); - } - Value::Array(arr) => { - for val in arr { - process_value(subject_stack, property, val, graph, ns); - } - } + let predicate = + NamedNodeRef::new(prop.as_str()).map_err(|source| Json2RdfError::InvalidIri { + iri: prop.clone(), + source, + })?; + + match value { + Value::Bool(b) => { + graph.insert(TripleRef::new( + &last_subject, + predicate, + &Literal::new_typed_literal(b.to_string(), xsd::BOOLEAN), + )); + } + Value::Number(num) => { + let datatype = if num.is_i64() || num.is_u64() { + xsd::INTEGER + } else { + xsd::DOUBLE + }; + graph.insert(TripleRef::new( + &last_subject, + predicate, + &Literal::new_typed_literal(num.to_string(), datatype), + )); + } + Value::String(s) => { + graph.insert(TripleRef::new( + &last_subject, + predicate, + &Literal::new_typed_literal(s, xsd::STRING), + )); + } + Value::Null => {} + Value::Object(obj) => { + let new_subject = BlankNode::default(); + graph.insert(TripleRef::new(&last_subject, predicate, &new_subject)); + subject_stack.push(new_subject); + + for (key, val) in obj { + let nested_property: Option = Some(format!("{}{}", prefix, key)); + process_value(subject_stack, &nested_property, val, graph, prefix)?; + } + subject_stack.pop(); + } + Value::Array(arr) => { + for val in arr { + process_value(subject_stack, property, val, graph, prefix)?; } } } + Ok(()) } diff --git a/src/main.rs b/src/main.rs index 57004e4..1f733a7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,8 +59,9 @@ enum Commands { /// Path to input JSON file(s). /// /// Provide the path to one or more JSON files that will be parsed and converted. - #[arg(short, long)] - json_files: String, + /// All files are merged into a single RDF graph. + #[arg(short, long, required = true, num_args = 1..)] + json_files: Vec, /// Path to output file. /// @@ -79,10 +80,13 @@ fn main() { namespace, json_files, output_file, - }) => match json_to_rdf(json_files, namespace, output_file) { - Ok(_) => {} - Err(e) => eprintln!("Error writing: {}", e), - }, + }) => { + let paths: Vec<&str> = json_files.iter().map(String::as_str).collect(); + if let Err(e) = json_to_rdf(&paths, namespace.as_deref(), output_file.as_deref()) { + eprintln!("json2rdf: {}", e); + std::process::exit(1); + } + } None => {} } } diff --git a/tests/empty_file.json b/tests/empty_file.json new file mode 100644 index 0000000..e69de29 diff --git a/tests/empty_object.json b/tests/empty_object.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/empty_object.json @@ -0,0 +1 @@ +{} diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 1626779..aa1e8c8 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -4,13 +4,13 @@ // - BSD 3-Clause License (https://opensource.org/licenses/BSD-3-Clause) // at your option. -use json2rdf::json_to_rdf; +use json2rdf::{json_to_rdf, Json2RdfError}; use oxrdfio::{RdfFormat, RdfParser}; use std::fs::{self, File}; #[test] fn test_graph_triple_count() { - let triple_count_string = json_to_rdf(&"tests/airplane.json".to_string(), &None, &None); + let triple_count_string = json_to_rdf(&["tests/airplane.json"], None, None); assert!(triple_count_string.is_ok()); assert_eq!(triple_count_string.unwrap().unwrap().len(), 23); @@ -18,24 +18,198 @@ fn test_graph_triple_count() { #[test] fn test_graph_write() { - let output = "out.nt".to_string(); - let _ = fs::remove_file(output.clone()); + let output = "out.nt"; - let res = json_to_rdf( - &"tests/airplane.json".to_string(), - &None, - &Some(output.clone()), - ); + let res = json_to_rdf(&["tests/airplane.json"], None, Some(output)); assert!(res.is_ok()); assert!(res.unwrap().is_none()); - let f = File::open(output.clone()).expect("unable to open output file for result verification"); + let f = File::open(output).expect("unable to open output file for result verification"); + let quads = RdfParser::from_format(RdfFormat::NTriples) + .for_reader(f) + .collect::, _>>() + .expect("failed to parse generated output file"); + + assert_eq!(quads.len(), 23); + let _ = fs::remove_file(output); +} + +#[test] +fn test_graph_write_truncates_existing() { + let output = "out_truncate.nt"; + + // Pre-populate with junk to prove truncation happens. + fs::write(output, "stale garbage\n").expect("unable to seed stale output"); + + // Two writes in a row should not accumulate; final file should hold one run's worth. + for _ in 0..2 { + let res = json_to_rdf(&["tests/airplane.json"], None, Some(output)); + assert!(res.is_ok()); + } + + let f = File::open(output).expect("unable to open output file for result verification"); let quads = RdfParser::from_format(RdfFormat::NTriples) .for_reader(f) .collect::, _>>() .expect("failed to parse generated output file"); assert_eq!(quads.len(), 23); - let _ = fs::remove_file(output.clone()); + let _ = fs::remove_file(output); +} + +#[test] +fn test_root_array() { + let graph = json_to_rdf(&["tests/root_array.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 2); +} + +#[test] +fn test_root_primitive_errors() { + let result = json_to_rdf(&["tests/root_primitive.json"], None, None); + assert!(matches!( + result, + Err(Json2RdfError::UnsupportedRootValue { kind: "number" }) + )); +} + +#[test] +fn test_ndjson_stream_isolated() { + let graph = json_to_rdf(&["tests/ndjson.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 2); +} + +#[test] +fn test_multi_file_merges_graphs() { + // root_array.json → 2 triples; ndjson.json → 2 triples; merged = 4. + let graph = json_to_rdf(&["tests/root_array.json", "tests/ndjson.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 4); +} + +#[test] +fn test_large_integers_preserve_precision() { + let graph = json_to_rdf(&["tests/large_int.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + let serialized = graph.to_string(); + assert!( + serialized.contains("\"9223372036854775807\"^^"), + "i64::MAX should round-trip as xsd:integer, got:\n{}", + serialized + ); + assert!( + serialized.contains("\"18446744073709551615\"^^"), + "u64::MAX should round-trip as xsd:integer (not xsd:double), got:\n{}", + serialized + ); +} + +#[test] +fn test_empty_file_returns_empty_graph() { + let graph = json_to_rdf(&["tests/empty_file.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 0); +} + +#[test] +fn test_empty_object_at_root_produces_no_triples() { + let graph = json_to_rdf(&["tests/empty_object.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 0); +} + +#[test] +fn test_malformed_json_returns_error() { + let result = json_to_rdf(&["tests/malformed.json"], None, None); + assert!( + matches!(result, Err(Json2RdfError::Json(_))), + "expected Json2RdfError::Json, got {:?}", + result.err() + ); +} + +#[test] +fn test_missing_file_returns_error() { + let result = json_to_rdf(&["tests/does_not_exist.json"], None, None); + assert!( + matches!(result, Err(Json2RdfError::Io(_))), + "expected Json2RdfError::Io, got {:?}", + result.err() + ); +} + +#[test] +fn test_invalid_iri_key_returns_error() { + let result = json_to_rdf(&["tests/invalid_iri_key.json"], None, None); + assert!( + matches!(result, Err(Json2RdfError::InvalidIri { .. })), + "expected Json2RdfError::InvalidIri, got {:?}", + result.err() + ); +} + +#[test] +fn test_unicode_key_in_iri_range() { + let graph = json_to_rdf(&["tests/unicode_key.json"], None, None) + .expect("conversion failed") + .expect("expected graph"); + assert_eq!(graph.len(), 1); + assert!( + graph.to_string().contains("中文"), + "expected unicode predicate in output" + ); +} + +#[test] +fn test_hash_namespace_not_mangled() { + let graph = json_to_rdf( + &["tests/airplane.json"], + Some("http://example.com/ns#"), + None, + ) + .expect("conversion failed") + .expect("expected graph"); + + let serialized = graph.to_string(); + assert!( + serialized.contains(""), + "expected predicate to use hash namespace without injected '/', got:\n{}", + serialized + ); + assert!( + !serialized.contains(""), + "expected predicate to use slash namespace without double slash, got:\n{}", + serialized + ); } diff --git a/tests/invalid_iri_key.json b/tests/invalid_iri_key.json new file mode 100644 index 0000000..78e48d7 --- /dev/null +++ b/tests/invalid_iri_key.json @@ -0,0 +1 @@ +{"bad key": 1} diff --git a/tests/large_int.json b/tests/large_int.json new file mode 100644 index 0000000..edbe91f --- /dev/null +++ b/tests/large_int.json @@ -0,0 +1 @@ +{"big": 9223372036854775807, "bigger": 18446744073709551615} diff --git a/tests/malformed.json b/tests/malformed.json new file mode 100644 index 0000000..6b7a9f4 --- /dev/null +++ b/tests/malformed.json @@ -0,0 +1 @@ +not json diff --git a/tests/ndjson.json b/tests/ndjson.json new file mode 100644 index 0000000..114733c --- /dev/null +++ b/tests/ndjson.json @@ -0,0 +1,2 @@ +{"a": 1} +{"b": 2} diff --git a/tests/root_array.json b/tests/root_array.json new file mode 100644 index 0000000..fb50bce --- /dev/null +++ b/tests/root_array.json @@ -0,0 +1 @@ +[{"a": 1}, {"b": 2}] diff --git a/tests/root_primitive.json b/tests/root_primitive.json new file mode 100644 index 0000000..d81cc07 --- /dev/null +++ b/tests/root_primitive.json @@ -0,0 +1 @@ +42 diff --git a/tests/unicode_key.json b/tests/unicode_key.json new file mode 100644 index 0000000..e8ec853 --- /dev/null +++ b/tests/unicode_key.json @@ -0,0 +1 @@ +{"中文": 42}