diff --git a/Cargo.toml b/Cargo.toml
index f7fcb04..8f03618 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,6 @@ edition = "2021"
 [dependencies]
 zarr3 = { git = "https://github.com/clbarnes/zarr3-rs.git" }
 rdf-rs = { path = "./rdf-rs" }
-rio_api = "0.8.4"
+sophia = { version = "0.7.2" }
 ndarray = { version = "0.15.6", features = [ "rayon" ] }
 bimap = "0.6.3"
\ No newline at end of file
diff --git a/resources/rdf.nt b/resources/rdf.nt
new file mode 100644
index 0000000..a7ea091
--- /dev/null
+++ b/resources/rdf.nt
@@ -0,0 +1,11 @@
+ .
+ .
+ .
+ "1912-06-23"^^ .
+ .
+ .
+ .
+ .
+ .
+ .
+ .
\ No newline at end of file
diff --git a/src/remote_hdt.rs b/src/remote_hdt.rs
index c476af1..e4e6a57 100644
--- a/src/remote_hdt.rs
+++ b/src/remote_hdt.rs
@@ -1,7 +1,10 @@
-use ndarray::{ArcArray, ArcArray1, Array2, Axis, Ix3};
+use bimap::BiHashMap;
+use ndarray::{ArcArray, ArcArray1, Array2, ArrayBase, Axis, Dim, Ix3, IxDynImpl, OwnedArcRepr};
 use rdf_rs::RdfParser;
+use sophia::term::BoxTerm;
 use std::path::PathBuf;
 use std::str::FromStr;
+use zarr3::codecs::bb::gzip_codec::GzipCodec;
 use zarr3::prelude::smallvec::smallvec;
 use zarr3::prelude::{
     create_root_group, Array, ArrayMetadataBuilder, ArrayRegion, GroupMetadata, ReadableMetadata,
@@ -10,7 +13,7 @@ use zarr3::store::filesystem::FileSystemStore;
 use zarr3::store::{NodeKey, NodeName};
 use zarr3::{ArcArrayD, CoordVec};
 
-type ArcArray3 = ArcArray<bool, Ix3>;
+pub type ArcArray3 = ArcArray<u8, Ix3>;
 
 #[derive(Default)]
 pub struct Domain {
@@ -427,8 +430,9 @@ impl<'a> RemoteHDT<'a> {
         // 4. Build the structure of the Array; as such, several parameters of it are
         // tweaked. Namely, the size of the array, the size of the chunks, the name
         // of the different dimensions and the default values
-        let arr_meta = ArrayMetadataBuilder::<bool>::new(&self.reference_system.shape_u64(domain))
+        let arr_meta = ArrayMetadataBuilder::<u8>::new(&self.reference_system.shape_u64(domain))
             .dimension_names(self.reference_system.dimension_names())?
+            .push_bb_codec(GzipCodec::default())
             .set_attribute(
                 "subjects".to_string(),
                 subjects
@@ -462,7 +466,7 @@ impl<'a> RemoteHDT<'a> {
             Err(_) => return Err(String::from("Error parsing the NodeName")),
         };
 
-        let arr = match root_group.create_array::<bool>(node_name, arr_meta) {
+        let arr = match root_group.create_array::<u8>(node_name, arr_meta) {
             Ok(array) => array,
             Err(_) => return Err(String::from("Error creating the Array")),
         };
@@ -472,9 +476,30 @@ impl<'a> RemoteHDT<'a> {
         // the provided values (second vector). What's more, an offset can be set;
         // that is, we can insert the created array with and X and Y shift. Lastly,
         // the region is written provided the aforementioned data and offset
-        let data = match ArcArrayD::from_shape_vec(self.reference_system.shape(domain).to_vec(), {
-            let mut v =
-                vec![false; domain.subjects_size * domain.predicates_size * domain.objects_size];
+        let data = self.create_array(domain, dump, subjects, predicates, objects)?;
+        let offset = smallvec![0, 0, 0];
+
+        // TODO: could this be done using rayon or a multi-threaded approach.
+        // Maybe using chunks instead of a region and having several chunks of
+        // the same size (i.e 100x100). Then we write in parallel?
+        if arr.write_region(&offset, data).is_err() {
+            return Err(String::from("Error writing to the Array"));
+        };
+
+        Ok(self)
+    }
+
+    fn create_array(
+        &self,
+        domain: &Domain,
+        dump: RdfParser,
+        subjects: BiHashMap<BoxTerm, usize>,
+        predicates: BiHashMap<BoxTerm, usize>,
+        objects: BiHashMap<BoxTerm, usize>,
+    ) -> Result<ArrayBase<OwnedArcRepr<u8>, Dim<IxDynImpl>>, String> {
+        match ArcArrayD::from_shape_vec(self.reference_system.shape(domain).to_vec(), {
+            let mut v: Vec<u8> =
+                vec![0u8; domain.subjects_size * domain.predicates_size * domain.objects_size];
             let slice = v.as_mut_slice();
             dump.graph.iter().for_each(|[subject, predicate, object]| {
                 slice[self.reference_system.index(
@@ -482,37 +507,13 @@ impl<'a> RemoteHDT<'a> {
                     predicates.get_by_left(predicate).unwrap().to_owned(),
                     objects.get_by_left(object).unwrap().to_owned(),
                     domain,
-                )] = true;
+                )] = 1u8;
             });
             slice.to_vec()
         }) {
-            Ok(data) => data,
+            Ok(data) => Ok(data),
             Err(_) => return Err(String::from("Error creating the data Array")),
-        };
-
-        println!("{}", data);
-
-        let offset = smallvec![0, 0, 0];
-
-        // TODO: could this be done using rayon or a multi-threaded approach.
-        // Maybe using chunks instead of a region and having several chunks of
-        // the same size (i.e 100x100). Then we write in parallel?
-        if arr.write_region(&offset, data).is_err() {
-            return Err(String::from("Error writing to the Array"));
-        };
-
-        println!("== Array ========================================================");
-        println!(
-            "{:?}",
-            arr.read_region(ArrayRegion::from_offset_shape(
-                &[0, 0, 0],
-                &self.reference_system.shape_u64(domain)
-            ))
-            .unwrap()
-            .unwrap()
-        );
-
-        Ok(self)
+        }
     }
 
     pub fn parse(mut self) -> Result<Self, String> {
@@ -589,6 +590,13 @@ impl<'a> RemoteHDT<'a> {
 
         Ok(self)
     }
+
+    pub fn get_array(self) -> Result<ArcArray3, String> {
+        match self.array {
+            Some(array) => Ok(array),
+            None => Err(String::from("Array is None")),
+        }
+    }
 }
 
 impl Engine for RemoteHDT<'_> {
diff --git a/tests/write_read_test.rs b/tests/write_read_test.rs
new file mode 100644
index 0000000..fc5623c
--- /dev/null
+++ b/tests/write_read_test.rs
@@ -0,0 +1,45 @@
+use std::fs::remove_dir_all;
+
+use remote_hdt::remote_hdt::{ArcArray3, RemoteHDTBuilder};
+
+#[test]
+fn write_read_test() {
+    let _ = remove_dir_all("root.zarr");
+
+    let _ = RemoteHDTBuilder::new("root.zarr")
+        .reference_system(remote_hdt::remote_hdt::ReferenceSystem::SPO)
+        .rdf_path("resources/rdf.nt")
+        .array_name("array_name")
+        .build()
+        .serialize();
+
+    let expected = ArcArray3::from_shape_vec(
+        (4, 8, 9),
+        vec![
+            1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+        ],
+    )
+    .unwrap();
+
+    let actual = RemoteHDTBuilder::new("root.zarr")
+        .reference_system(remote_hdt::remote_hdt::ReferenceSystem::SPO)
+        .array_name("array_name")
+        .build()
+        .parse()
+        .unwrap()
+        .get_array()
+        .unwrap();
+
+    assert_eq!(actual, expected);
+
+    let _ = remove_dir_all("root.zarr");
+}
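A possible follow-up usage sketch (not part of the patch): it relies only on the API exercised by write_read_test above (RemoteHDTBuilder, ReferenceSystem::SPO, parse() and the new get_array()) plus ndarray 0.15 slicing; the store path "root.zarr", the array name "array_name" and the (subject, predicate) indices are illustrative placeholders.

use ndarray::s;
use remote_hdt::remote_hdt::{ReferenceSystem, RemoteHDTBuilder};

fn main() -> Result<(), String> {
    // Read the previously serialized Zarr store back into the in-memory u8 tensor.
    let array = RemoteHDTBuilder::new("root.zarr")
        .reference_system(ReferenceSystem::SPO)
        .array_name("array_name")
        .build()
        .parse()?
        .get_array()?;

    // With the SPO reference system the axes are (subject, predicate, object), so
    // the objects related to subject 0 through predicate 1 are the non-zero
    // entries of this one-dimensional slice.
    let objects: Vec<usize> = array
        .slice(s![0, 1, ..])
        .iter()
        .enumerate()
        .filter_map(|(o, &v)| if v == 1 { Some(o) } else { None })
        .collect();
    println!("objects linked to (s = 0, p = 1): {:?}", objects);

    Ok(())
}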