From 221effd4be79dc0be0343110545d1fedea21d73a Mon Sep 17 00:00:00 2001 From: David Overton Date: Mon, 18 Mar 2024 11:46:41 +1100 Subject: [PATCH 01/14] create introspection submodule --- crates/cli/src/introspection/document.rs | 16 ++++++++++++++++ crates/cli/src/introspection/mod.rs | 4 ++++ .../validation_schema.rs} | 0 3 files changed, 20 insertions(+) create mode 100644 crates/cli/src/introspection/document.rs create mode 100644 crates/cli/src/introspection/mod.rs rename crates/cli/src/{introspection.rs => introspection/validation_schema.rs} (100%) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs new file mode 100644 index 00000000..85261749 --- /dev/null +++ b/crates/cli/src/introspection/document.rs @@ -0,0 +1,16 @@ +use configuration::{ + metadata::{Collection, ObjectField, ObjectType, Type}, + Metadata, +}; +use mongodb::bson::{Bson, Document}; +use mongodb_agent_common::interface_types::{MongoAgentError, MongoConfig}; +use mongodb_support::{BsonScalarType, BsonType}; + +pub fn schema_from_document(collection_name: &str, document: &Document) -> Metadata { + let (object_types, collection) = make_collection(collection_name, document); + Metadata { collections: vec!(collection), object_types} +} + +fn make_collection(collection_name: &str, document: &Document) -> (Vec, Collection) { + todo!() +} \ No newline at end of file diff --git a/crates/cli/src/introspection/mod.rs b/crates/cli/src/introspection/mod.rs new file mode 100644 index 00000000..8c7f56cc --- /dev/null +++ b/crates/cli/src/introspection/mod.rs @@ -0,0 +1,4 @@ +pub mod document; +pub mod validation_schema; + +pub use validation_schema::get_metadata_from_validation_schema; \ No newline at end of file diff --git a/crates/cli/src/introspection.rs b/crates/cli/src/introspection/validation_schema.rs similarity index 100% rename from crates/cli/src/introspection.rs rename to crates/cli/src/introspection/validation_schema.rs From 
1a1f0388545d0acf0ea25dbb7b948900ddca0ca3 Mon Sep 17 00:00:00 2001 From: David Overton Date: Mon, 18 Mar 2024 11:49:04 +1100 Subject: [PATCH 02/14] Rename Metadata to Schema --- crates/cli/src/introspection/document.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index 85261749..742db02a 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -1,14 +1,14 @@ use configuration::{ - metadata::{Collection, ObjectField, ObjectType, Type}, - Metadata, + schema::{Collection, ObjectField, ObjectType, Type}, + Schema, }; use mongodb::bson::{Bson, Document}; use mongodb_agent_common::interface_types::{MongoAgentError, MongoConfig}; use mongodb_support::{BsonScalarType, BsonType}; -pub fn schema_from_document(collection_name: &str, document: &Document) -> Metadata { +pub fn schema_from_document(collection_name: &str, document: &Document) -> Schema { let (object_types, collection) = make_collection(collection_name, document); - Metadata { collections: vec!(collection), object_types} + Schema { collections: vec!(collection), object_types} } fn make_collection(collection_name: &str, document: &Document) -> (Vec, Collection) { From 0b228b05b4961d2e957ba9f7bb683743ff9d5c40 Mon Sep 17 00:00:00 2001 From: David Overton Date: Mon, 18 Mar 2024 15:55:48 +1100 Subject: [PATCH 03/14] Implement schema_from_document --- crates/cli/src/introspection/document.rs | 96 ++++++++++++++++++++++-- crates/cli/src/introspection/mod.rs | 3 +- 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index 742db02a..92f27710 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -4,13 +4,99 @@ use configuration::{ }; use mongodb::bson::{Bson, Document}; use 
mongodb_agent_common::interface_types::{MongoAgentError, MongoConfig}; -use mongodb_support::{BsonScalarType, BsonType}; +use mongodb_support::{BsonScalarType, BsonScalarType::*, BsonType}; pub fn schema_from_document(collection_name: &str, document: &Document) -> Schema { - let (object_types, collection) = make_collection(collection_name, document); - Schema { collections: vec!(collection), object_types} + let (object_types, collection) = make_collection(collection_name, document); + Schema { + collections: vec![collection], + object_types, + } } fn make_collection(collection_name: &str, document: &Document) -> (Vec, Collection) { - todo!() -} \ No newline at end of file + let object_type_defs = make_object_type(collection_name, document); + let collection_info = Collection { + name: collection_name.to_string(), + description: None, + r#type: collection_name.to_string(), + }; + + (object_type_defs, collection_info) +} + +fn make_object_type(object_type_name: &str, document: &Document) -> Vec { + let (mut object_type_defs, object_fields) = { + let type_prefix = format!("{object_type_name}_"); + let (object_type_defs, object_fields): (Vec>, Vec) = document + .iter() + .map(|(field_name, field_value)| make_object_fields(&type_prefix, field_name, field_value)) + .unzip(); + (object_type_defs.concat(), object_fields) + }; + + let object_type = ObjectType { + name: object_type_name.to_string(), + description: None, + fields: object_fields, + }; + + object_type_defs.push(object_type); + object_type_defs +} + +fn make_object_fields( + type_prefix: &str, + field_name: &str, + field_value: &Bson, +) -> (Vec, ObjectField) { + let object_type_name = format!("{type_prefix}{field_name}"); + let (collected_otds, field_type) = make_field_type(&object_type_name, field_value); + + let object_field = ObjectField { + name: field_name.to_owned(), + description: None, + r#type: Type::Nullable((Box::new(field_type))), + }; + + (collected_otds, object_field) +} + +fn 
make_field_type(object_type_name: &str, field_value: &Bson) -> (Vec, Type) { + fn scalar(t: BsonScalarType) -> (Vec, Type) { + (vec![], Type::Scalar(t)) + } + match field_value { + Bson::Double(_) => scalar(Double), + Bson::String(_) => scalar(String), + Bson::Array(arr) => { + // TODO: examine all elements of the array and take the union. + let (collected_otds, element_type) = match arr.first() { + Some(elem) => make_field_type(object_type_name, elem), + None => scalar(Undefined), + }; + (collected_otds, Type::ArrayOf(Box::new(element_type))) + } + Bson::Document(document) => { + let collected_otds = make_object_type(object_type_name, document); + (collected_otds, Type::Object(object_type_name.to_owned())) + } + Bson::Boolean(_) => scalar(Bool), + Bson::Null => scalar(Null), + Bson::RegularExpression(_) => scalar(Regex), + Bson::JavaScriptCode(_) => scalar(Javascript), + Bson::JavaScriptCodeWithScope(_) => scalar(JavascriptWithScope), + Bson::Int32(_) => scalar(Int), + Bson::Int64(_) => scalar(Long), + Bson::Timestamp(_) => scalar(Timestamp), + Bson::Binary(_) => scalar(BinData), + Bson::ObjectId(_) => scalar(ObjectId), + Bson::DateTime(_) => scalar(Date), + Bson::Symbol(_) => scalar(Symbol), + Bson::Decimal128(_) => scalar(Decimal), + Bson::Undefined => scalar(Undefined), + Bson::MaxKey => scalar(MaxKey), + Bson::MinKey => scalar(MinKey), + Bson::DbPointer(_) => scalar(DbPointer), + } +} diff --git a/crates/cli/src/introspection/mod.rs b/crates/cli/src/introspection/mod.rs index 8c7f56cc..75082e51 100644 --- a/crates/cli/src/introspection/mod.rs +++ b/crates/cli/src/introspection/mod.rs @@ -1,4 +1,5 @@ pub mod document; pub mod validation_schema; -pub use validation_schema::get_metadata_from_validation_schema; \ No newline at end of file +pub use validation_schema::get_metadata_from_validation_schema; +pub use document::schema_from_document; \ No newline at end of file From a3007ed45b8d20fafa61c792e3ccf045fd411acb Mon Sep 17 00:00:00 2001 From: David Overton 
Date: Mon, 18 Mar 2024 22:32:57 +1100 Subject: [PATCH 04/14] Implement type unification --- Cargo.lock | 11 +- crates/cli/Cargo.toml | 1 + crates/cli/src/introspection/document.rs | 135 ++++++++++++++++++++++- crates/mongodb-support/Cargo.toml | 2 + crates/mongodb-support/src/align.rs | 84 ++++++++++++++ crates/mongodb-support/src/lib.rs | 1 + 6 files changed, 230 insertions(+), 4 deletions(-) create mode 100644 crates/mongodb-support/src/align.rs diff --git a/Cargo.lock b/Cargo.lock index 875e0dda..ca0e45f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1541,6 +1541,7 @@ dependencies = [ "mongodb-support", "serde", "serde_json", + "these", "thiserror", "tokio", ] @@ -1580,9 +1581,11 @@ dependencies = [ "anyhow", "dc-api-types", "enum-iterator", + "indexmap 1.9.3", "schemars", "serde", "serde_json", + "these", "thiserror", ] @@ -1771,7 +1774,7 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75e56d5c441965b6425165b7e3223cc933ca469834f4a8b4786817a1f9dc4f13" dependencies = [ - "indexmap 1.9.3", + "indexmap 2.2.5", "serde", "serde_json", ] @@ -2959,6 +2962,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "these" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7162adbff4f8c44e938e0e51f6d3d829818c2ffefd793702a3a6f6ef0551de43" + [[package]] name = "thiserror" version = "1.0.58" diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index a4564c46..19db1bbf 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -21,3 +21,4 @@ serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0.113", features = ["raw_value"] } thiserror = "1.0.57" tokio = { version = "1.36.0", features = ["full"] } +these = "2.0.0" diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs 
index 92f27710..adc9189f 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -2,9 +2,15 @@ use configuration::{ schema::{Collection, ObjectField, ObjectType, Type}, Schema, }; +use indexmap::IndexMap; use mongodb::bson::{Bson, Document}; use mongodb_agent_common::interface_types::{MongoAgentError, MongoConfig}; -use mongodb_support::{BsonScalarType, BsonScalarType::*, BsonType}; +use mongodb_support::{ + align::align_with_result, + BsonScalarType::{self, *}, + BsonType, +}; +use std::string::String; pub fn schema_from_document(collection_name: &str, document: &Document) -> Schema { let (object_types, collection) = make_collection(collection_name, document); @@ -30,7 +36,9 @@ fn make_object_type(object_type_name: &str, document: &Document) -> Vec>, Vec) = document .iter() - .map(|(field_name, field_value)| make_object_fields(&type_prefix, field_name, field_value)) + .map(|(field_name, field_value)| { + make_object_fields(&type_prefix, field_name, field_value) + }) .unzip(); (object_type_defs.concat(), object_fields) }; @@ -56,7 +64,7 @@ fn make_object_fields( let object_field = ObjectField { name: field_name.to_owned(), description: None, - r#type: Type::Nullable((Box::new(field_type))), + r#type: Type::Nullable(Box::new(field_type)), }; (collected_otds, object_field) @@ -100,3 +108,124 @@ fn make_field_type(object_type_name: &str, field_value: &Bson) -> (Vec scalar(DbPointer), } } + +pub enum TypeUnificationError { + ScalarTypeMismatch(BsonScalarType, BsonScalarType), + ObjectTypeMismatch(String, String), + TypeKindMismatch(Type, Type), +} + +fn unify_types(type_a: Type, type_b: Type) -> Result { + match (type_a, type_b) { + // If one type is undefined, the union is the other type. + // This is used as the base case when inferring array types from documents. 
+ (Type::Scalar(Undefined), type_b) => Ok(type_b), + (type_a, Type::Scalar(Undefined)) => Ok(type_a), + + // Union of any type with Null is the Nullable version of that type + (Type::Scalar(Null), type_b) => Ok(make_nullable(type_b)), + (type_a, Type::Scalar(Null)) => Ok(make_nullable(type_a)), + + (Type::Scalar(scalar_a), Type::Scalar(scalar_b)) => { + if scalar_a == scalar_b { + Ok(Type::Scalar(scalar_a)) + } else { + Err(TypeUnificationError::ScalarTypeMismatch(scalar_a, scalar_b)) + } + } + (Type::Object(object_a), Type::Object(object_b)) => { + if object_a == object_b { + Ok(Type::Object(object_a)) + } else { + Err(TypeUnificationError::ObjectTypeMismatch(object_a, object_b)) + } + } + (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { + let elem_type = unify_types(*elem_type_a, *elem_type_b)?; + Ok(Type::ArrayOf(Box::new(elem_type))) + } + (Type::Nullable(nullable_type_a), type_b) => { + let result_type = unify_types(*nullable_type_a, type_b)?; + Ok(make_nullable(result_type)) + } + (type_a, Type::Nullable(nullable_type_b)) => { + let result_type = unify_types(type_a, *nullable_type_b)?; + Ok(make_nullable(result_type)) + } + (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), + } +} + +fn make_nullable(t: Type) -> Type { + match t { + Type::Nullable(t) => Type::Nullable(t), + t => Type::Nullable(Box::new(t)), + } +} + +fn make_nullable_field(field: ObjectField) -> Result { + Ok(ObjectField { + name: field.name, + r#type: make_nullable(field.r#type), + description: field.description, + }) +} + +fn unify_object_types( + object_type_a: ObjectType, + object_type_b: ObjectType, +) -> Result { + let field_map_a: IndexMap = object_type_a + .fields + .into_iter() + .map(|o| (o.name.to_owned(), o)) + .collect(); + let field_map_b: IndexMap = object_type_b + .fields + .into_iter() + .map(|o| (o.name.to_owned(), o)) + .collect(); + + let merged_field_map = align_with_result( + field_map_a, + field_map_b, + make_nullable_field, + 
make_nullable_field, + unify_object_fields, + )?; + + Ok(ObjectType { + name: object_type_a.name, + fields: merged_field_map.into_values().collect(), + description: object_type_a.description.or(object_type_b.description), + }) +} + +fn unify_object_fields( + object_field_a: ObjectField, + object_field_b: ObjectField, +) -> Result { + Ok(ObjectField { + name: object_field_a.name, + r#type: unify_types(object_field_a.r#type, object_field_b.r#type)?, + description: object_field_a.description.or(object_field_b.description), + }) +} + +fn unify( + object_types_a: Vec, + object_types_b: Vec, +) -> Result, TypeUnificationError> { + let type_map_a: IndexMap = object_types_a + .into_iter() + .map(|t| (t.name.to_owned(), t)) + .collect(); + let type_map_b: IndexMap = object_types_b + .into_iter() + .map(|t| (t.name.to_owned(), t)) + .collect(); + + let merged_type_map = align_with_result(type_map_a, type_map_b, Ok, Ok, unify_object_types)?; + + Ok(merged_type_map.into_values().collect()) +} diff --git a/crates/mongodb-support/Cargo.toml b/crates/mongodb-support/Cargo.toml index b6893a8b..a749ac9f 100644 --- a/crates/mongodb-support/Cargo.toml +++ b/crates/mongodb-support/Cargo.toml @@ -6,9 +6,11 @@ edition = "2021" [dependencies] dc-api-types = { path = "../dc-api-types" } enum-iterator = "1.4.1" +indexmap = { version = "1", features = ["serde"] } # must match the version that ndc-client uses schemars = "^0.8.12" serde = { version = "1", features = ["derive"] } serde_json = "1" +these = "2.0.0" thiserror = "1" [dev-dependencies] diff --git a/crates/mongodb-support/src/align.rs b/crates/mongodb-support/src/align.rs new file mode 100644 index 00000000..da61f5c6 --- /dev/null +++ b/crates/mongodb-support/src/align.rs @@ -0,0 +1,84 @@ +use indexmap::IndexMap; +use std::hash::Hash; +use these::These::{self, *}; + +pub fn align(ts: IndexMap, mut us: IndexMap) -> IndexMap> +where + K: Hash + Eq, +{ + let mut result: IndexMap> = IndexMap::new(); + + for (k, t) in ts { + match 
us.swap_remove(&k) { + None => result.insert(k, This(t)), + Some(u) => result.insert(k, Both(t, u)), + }; + } + + for (k, u) in us { + result.insert(k, That(u)); + } + result +} + +pub fn align_with(ts: IndexMap, mut us: IndexMap, f: F) -> IndexMap +where + K: Hash + Eq, + F: Fn(V, V) -> V, +{ + let mut result: IndexMap = IndexMap::new(); + + for (k, t) in ts { + match us.swap_remove(&k) { + None => result.insert(k, t), + Some(u) => result.insert(k, f(t, u)), + }; + } + + for (k, u) in us { + result.insert(k, u); + } + result +} + +// pub fn align_with_result(ts: IndexMap, mut us: IndexMap, f: F) -> Result, E> +// where +// K: Hash + Eq, +// F: Fn(V, V) -> Result, +// { +// let mut result: IndexMap = IndexMap::new(); + +// for (k, t) in ts { +// match us.swap_remove(&k) { +// None => result.insert(k, t), +// Some(u) => result.insert(k, f(t, u)?), +// }; +// } + +// for (k, u) in us { +// result.insert(k, u); +// } +// Ok(result) +// } + +pub fn align_with_result(ts: IndexMap, mut us: IndexMap, ft: FT, fu: FU, ftu: FTU) -> Result, E> +where + K: Hash + Eq, + FT: Fn(T) -> Result, + FU: Fn(U) -> Result, + FTU: Fn(T, U) -> Result, +{ + let mut result: IndexMap = IndexMap::new(); + + for (k, t) in ts { + match us.swap_remove(&k) { + None => result.insert(k, ft(t)?), + Some(u) => result.insert(k, ftu(t, u)?), + }; + } + + for (k, u) in us { + result.insert(k, fu(u)?); + } + Ok(result) +} diff --git a/crates/mongodb-support/src/lib.rs b/crates/mongodb-support/src/lib.rs index ed3f1734..a2c6fc08 100644 --- a/crates/mongodb-support/src/lib.rs +++ b/crates/mongodb-support/src/lib.rs @@ -1,4 +1,5 @@ mod bson_type; pub mod error; +pub mod align; pub use self::bson_type::{BsonScalarType, BsonType}; From 698707a2e6c4606af796c0c64c8c2aaa85f4fcb5 Mon Sep 17 00:00:00 2001 From: David Overton Date: Mon, 18 Mar 2024 22:49:01 +1100 Subject: [PATCH 05/14] Unify types of all elements when inferring type of an array --- crates/cli/src/introspection/document.rs | 83 
++++++++++++++---------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index adc9189f..824bf8e6 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -12,26 +12,35 @@ use mongodb_support::{ }; use std::string::String; -pub fn schema_from_document(collection_name: &str, document: &Document) -> Schema { - let (object_types, collection) = make_collection(collection_name, document); - Schema { +pub fn schema_from_document( + collection_name: &str, + document: &Document, +) -> Result { + let (object_types, collection) = make_collection(collection_name, document)?; + Ok(Schema { collections: vec![collection], object_types, - } + }) } -fn make_collection(collection_name: &str, document: &Document) -> (Vec, Collection) { - let object_type_defs = make_object_type(collection_name, document); +fn make_collection( + collection_name: &str, + document: &Document, +) -> Result<(Vec, Collection), TypeUnificationError> { + let object_type_defs = make_object_type(collection_name, document)?; let collection_info = Collection { name: collection_name.to_string(), description: None, r#type: collection_name.to_string(), }; - (object_type_defs, collection_info) + Ok((object_type_defs, collection_info)) } -fn make_object_type(object_type_name: &str, document: &Document) -> Vec { +fn make_object_type( + object_type_name: &str, + document: &Document, +) -> Result, TypeUnificationError> { let (mut object_type_defs, object_fields) = { let type_prefix = format!("{object_type_name}_"); let (object_type_defs, object_fields): (Vec>, Vec) = document @@ -39,6 +48,8 @@ fn make_object_type(object_type_name: &str, document: &Document) -> Vec, ObjectField)>, TypeUnificationError>>()? 
+ .into_iter() .unzip(); (object_type_defs.concat(), object_fields) }; @@ -50,16 +61,16 @@ fn make_object_type(object_type_name: &str, document: &Document) -> Vec (Vec, ObjectField) { +) -> Result<(Vec, ObjectField), TypeUnificationError> { let object_type_name = format!("{type_prefix}{field_name}"); - let (collected_otds, field_type) = make_field_type(&object_type_name, field_value); + let (collected_otds, field_type) = make_field_type(&object_type_name, field_value)?; let object_field = ObjectField { name: field_name.to_owned(), @@ -67,27 +78,33 @@ fn make_object_fields( r#type: Type::Nullable(Box::new(field_type)), }; - (collected_otds, object_field) + Ok((collected_otds, object_field)) } -fn make_field_type(object_type_name: &str, field_value: &Bson) -> (Vec, Type) { - fn scalar(t: BsonScalarType) -> (Vec, Type) { - (vec![], Type::Scalar(t)) +fn make_field_type( + object_type_name: &str, + field_value: &Bson, +) -> Result<(Vec, Type), TypeUnificationError> { + fn scalar(t: BsonScalarType) -> Result<(Vec, Type), TypeUnificationError> { + Ok((vec![], Type::Scalar(t))) } match field_value { Bson::Double(_) => scalar(Double), Bson::String(_) => scalar(String), Bson::Array(arr) => { - // TODO: examine all elements of the array and take the union. - let (collected_otds, element_type) = match arr.first() { - Some(elem) => make_field_type(object_type_name, elem), - None => scalar(Undefined), - }; - (collected_otds, Type::ArrayOf(Box::new(element_type))) + // Examine all elements of the array and take the union of the resulting types. 
+ let mut collected_otds = vec![]; + let mut result_type = Type::Scalar(Undefined); + for elem in arr { + let (elem_collected_otds, elem_type) = make_field_type(object_type_name, elem)?; + collected_otds = unify_object_types(collected_otds, elem_collected_otds)?; + result_type = unify_type(result_type, elem_type)?; + } + Ok((collected_otds, Type::ArrayOf(Box::new(result_type)))) } Bson::Document(document) => { - let collected_otds = make_object_type(object_type_name, document); - (collected_otds, Type::Object(object_type_name.to_owned())) + let collected_otds = make_object_type(object_type_name, document)?; + Ok((collected_otds, Type::Object(object_type_name.to_owned()))) } Bson::Boolean(_) => scalar(Bool), Bson::Null => scalar(Null), @@ -115,7 +132,7 @@ pub enum TypeUnificationError { TypeKindMismatch(Type, Type), } -fn unify_types(type_a: Type, type_b: Type) -> Result { +fn unify_type(type_a: Type, type_b: Type) -> Result { match (type_a, type_b) { // If one type is undefined, the union is the other type. // This is used as the base case when inferring array types from documents. 
@@ -141,15 +158,15 @@ fn unify_types(type_a: Type, type_b: Type) -> Result } } (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { - let elem_type = unify_types(*elem_type_a, *elem_type_b)?; + let elem_type = unify_type(*elem_type_a, *elem_type_b)?; Ok(Type::ArrayOf(Box::new(elem_type))) } (Type::Nullable(nullable_type_a), type_b) => { - let result_type = unify_types(*nullable_type_a, type_b)?; + let result_type = unify_type(*nullable_type_a, type_b)?; Ok(make_nullable(result_type)) } (type_a, Type::Nullable(nullable_type_b)) => { - let result_type = unify_types(type_a, *nullable_type_b)?; + let result_type = unify_type(type_a, *nullable_type_b)?; Ok(make_nullable(result_type)) } (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), @@ -171,7 +188,7 @@ fn make_nullable_field(field: ObjectField) -> Result { }) } -fn unify_object_types( +fn unify_object_type( object_type_a: ObjectType, object_type_b: ObjectType, ) -> Result { @@ -191,7 +208,7 @@ fn unify_object_types( field_map_b, make_nullable_field, make_nullable_field, - unify_object_fields, + unify_object_field, )?; Ok(ObjectType { @@ -201,18 +218,18 @@ fn unify_object_types( }) } -fn unify_object_fields( +fn unify_object_field( object_field_a: ObjectField, object_field_b: ObjectField, ) -> Result { Ok(ObjectField { name: object_field_a.name, - r#type: unify_types(object_field_a.r#type, object_field_b.r#type)?, + r#type: unify_type(object_field_a.r#type, object_field_b.r#type)?, description: object_field_a.description.or(object_field_b.description), }) } -fn unify( +fn unify_object_types( object_types_a: Vec, object_types_b: Vec, ) -> Result, TypeUnificationError> { @@ -225,7 +242,7 @@ fn unify( .map(|t| (t.name.to_owned(), t)) .collect(); - let merged_type_map = align_with_result(type_map_a, type_map_b, Ok, Ok, unify_object_types)?; + let merged_type_map = align_with_result(type_map_a, type_map_b, Ok, Ok, unify_object_type)?; Ok(merged_type_map.into_values().collect()) } From 
9582635af8fbbbc4be4f868e0aecd22aee7a2204 Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 16:33:25 +1100 Subject: [PATCH 06/14] Sample documents from the database --- crates/cli/src/introspection/document.rs | 187 ++++++++++++++++++----- crates/cli/src/introspection/mod.rs | 2 +- crates/cli/src/lib.rs | 21 ++- 3 files changed, 163 insertions(+), 47 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index 824bf8e6..f7a767af 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -2,53 +2,81 @@ use configuration::{ schema::{Collection, ObjectField, ObjectType, Type}, Schema, }; +use futures_util::TryStreamExt; use indexmap::IndexMap; -use mongodb::bson::{Bson, Document}; -use mongodb_agent_common::interface_types::{MongoAgentError, MongoConfig}; +use mongodb::bson::{doc, Bson, Document}; +use mongodb_agent_common::interface_types::MongoConfig; use mongodb_support::{ align::align_with_result, BsonScalarType::{self, *}, - BsonType, }; -use std::string::String; +use std::{ + fmt::{self, Display}, + string::String, +}; +use thiserror::Error; -pub fn schema_from_document( - collection_name: &str, - document: &Document, -) -> Result { - let (object_types, collection) = make_collection(collection_name, document)?; - Ok(Schema { - collections: vec![collection], - object_types, - }) +// Sample from all collections in the database +pub async fn sample_schema_from_db( + sample_size: u32, + config: &MongoConfig, +) -> anyhow::Result { + let mut schema = Schema { + collections: vec![], + object_types: vec![], + }; + let db = config.client.database(&config.database); + let mut collections_cursor = db.list_collections(None, None).await?; + + while let Some(collection_spec) = collections_cursor.try_next().await? 
{ + let collection_name = collection_spec.name; + let collection_schema = + sample_schema_from_collection(&collection_name, sample_size, config).await?; + schema = unify_schema(schema, collection_schema)?; + } + Ok(schema) } -fn make_collection( +pub async fn sample_schema_from_collection( collection_name: &str, - document: &Document, -) -> Result<(Vec, Collection), TypeUnificationError> { - let object_type_defs = make_object_type(collection_name, document)?; + sample_size: u32, + config: &MongoConfig, +) -> anyhow::Result { + let db = config.client.database(&config.database); + let options = None; + let mut cursor = db + .collection::(collection_name) + .aggregate(vec![doc! {"$sample": { "size": sample_size }}], options) + .await?; + let mut collected_object_types = vec![]; + while let Some(document) = cursor.try_next().await? { + let object_types = make_object_type(collection_name, &document)?; + collected_object_types = unify_object_types(collected_object_types, object_types)?; + } let collection_info = Collection { name: collection_name.to_string(), description: None, r#type: collection_name.to_string(), }; - Ok((object_type_defs, collection_info)) + Ok(Schema { + collections: vec![collection_info], + object_types: collected_object_types, + }) } fn make_object_type( object_type_name: &str, document: &Document, -) -> Result, TypeUnificationError> { +) -> TypeUnificationResult> { let (mut object_type_defs, object_fields) = { let type_prefix = format!("{object_type_name}_"); let (object_type_defs, object_fields): (Vec>, Vec) = document .iter() .map(|(field_name, field_value)| { - make_object_fields(&type_prefix, field_name, field_value) + make_object_field(&type_prefix, field_name, field_value) }) - .collect::, ObjectField)>, TypeUnificationError>>()? + .collect::, ObjectField)>>>()? 
.into_iter() .unzip(); (object_type_defs.concat(), object_fields) @@ -64,18 +92,18 @@ fn make_object_type( Ok(object_type_defs) } -fn make_object_fields( +fn make_object_field( type_prefix: &str, field_name: &str, field_value: &Bson, -) -> Result<(Vec, ObjectField), TypeUnificationError> { +) -> TypeUnificationResult<(Vec, ObjectField)> { let object_type_name = format!("{type_prefix}{field_name}"); - let (collected_otds, field_type) = make_field_type(&object_type_name, field_value)?; + let (collected_otds, field_type) = make_field_type(&object_type_name, field_name, field_value)?; let object_field = ObjectField { name: field_name.to_owned(), description: None, - r#type: Type::Nullable(Box::new(field_type)), + r#type: field_type, }; Ok((collected_otds, object_field)) @@ -83,9 +111,10 @@ fn make_object_fields( fn make_field_type( object_type_name: &str, + field_name: &str, field_value: &Bson, -) -> Result<(Vec, Type), TypeUnificationError> { - fn scalar(t: BsonScalarType) -> Result<(Vec, Type), TypeUnificationError> { +) -> TypeUnificationResult<(Vec, Type)> { + fn scalar(t: BsonScalarType) -> TypeUnificationResult<(Vec, Type)> { Ok((vec![], Type::Scalar(t))) } match field_value { @@ -96,9 +125,11 @@ fn make_field_type( let mut collected_otds = vec![]; let mut result_type = Type::Scalar(Undefined); for elem in arr { - let (elem_collected_otds, elem_type) = make_field_type(object_type_name, elem)?; - collected_otds = unify_object_types(collected_otds, elem_collected_otds)?; - result_type = unify_type(result_type, elem_type)?; + let (elem_collected_otds, elem_type) = + make_field_type(object_type_name, field_name, elem)?; + collected_otds = unify_object_types(collected_otds, elem_collected_otds)?; + let context = TypeUnificationContext::new(object_type_name, field_name); + result_type = unify_type(context, result_type, elem_type)?; } Ok((collected_otds, Type::ArrayOf(Box::new(result_type)))) } @@ -126,13 +157,65 @@ fn make_field_type( } } +#[derive(Debug)] +pub struct 
TypeUnificationContext { + object_type_name: String, + field_name: String, +} + +impl TypeUnificationContext { + fn new(object_type_name: &str, field_name: &str) -> Self { + TypeUnificationContext { + object_type_name: object_type_name.to_owned(), + field_name: field_name.to_owned(), + } + } +} + +impl Display for TypeUnificationContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "object type: {}, field: {}", + self.object_type_name, self.field_name + ) + } +} + +#[derive(Debug, Error)] pub enum TypeUnificationError { - ScalarTypeMismatch(BsonScalarType, BsonScalarType), + ScalarTypeMismatch(TypeUnificationContext, BsonScalarType, BsonScalarType), ObjectTypeMismatch(String, String), TypeKindMismatch(Type, Type), } -fn unify_type(type_a: Type, type_b: Type) -> Result { +impl Display for TypeUnificationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::ScalarTypeMismatch(context, scalar_a, scalar_b) => write!( + f, + "Scalar type mismatch {} {} at {}", + scalar_a.bson_name(), + scalar_b.bson_name(), + context + ), + Self::ObjectTypeMismatch(object_a, object_b) => { + write!(f, "Object type mismatch {} {}", object_a, object_b) + } + Self::TypeKindMismatch(type_a, type_b) => { + write!(f, "Object type mismatch {:?} {:?}", type_a, type_b) + } + } + } +} + +type TypeUnificationResult = Result; + +fn unify_type( + context: TypeUnificationContext, + type_a: Type, + type_b: Type, +) -> TypeUnificationResult { match (type_a, type_b) { // If one type is undefined, the union is the other type. // This is used as the base case when inferring array types from documents. 
@@ -147,7 +230,9 @@ fn unify_type(type_a: Type, type_b: Type) -> Result if scalar_a == scalar_b { Ok(Type::Scalar(scalar_a)) } else { - Err(TypeUnificationError::ScalarTypeMismatch(scalar_a, scalar_b)) + Err(TypeUnificationError::ScalarTypeMismatch( + context, scalar_a, scalar_b, + )) } } (Type::Object(object_a), Type::Object(object_b)) => { @@ -158,15 +243,15 @@ fn unify_type(type_a: Type, type_b: Type) -> Result } } (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { - let elem_type = unify_type(*elem_type_a, *elem_type_b)?; + let elem_type = unify_type(context, *elem_type_a, *elem_type_b)?; Ok(Type::ArrayOf(Box::new(elem_type))) } (Type::Nullable(nullable_type_a), type_b) => { - let result_type = unify_type(*nullable_type_a, type_b)?; + let result_type = unify_type(context, *nullable_type_a, type_b)?; Ok(make_nullable(result_type)) } (type_a, Type::Nullable(nullable_type_b)) => { - let result_type = unify_type(type_a, *nullable_type_b)?; + let result_type = unify_type(context, type_a, *nullable_type_b)?; Ok(make_nullable(result_type)) } (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), @@ -191,7 +276,7 @@ fn make_nullable_field(field: ObjectField) -> Result { fn unify_object_type( object_type_a: ObjectType, object_type_b: ObjectType, -) -> Result { +) -> TypeUnificationResult { let field_map_a: IndexMap = object_type_a .fields .into_iter() @@ -208,7 +293,7 @@ fn unify_object_type( field_map_b, make_nullable_field, make_nullable_field, - unify_object_field, + |field_a, field_b| unify_object_field(&object_type_a.name, field_a, field_b), )?; Ok(ObjectType { @@ -219,12 +304,14 @@ fn unify_object_type( } fn unify_object_field( + object_type_name: &str, object_field_a: ObjectField, object_field_b: ObjectField, -) -> Result { +) -> TypeUnificationResult { + let context = TypeUnificationContext::new(object_type_name, &object_field_a.name); Ok(ObjectField { name: object_field_a.name, - r#type: unify_type(object_field_a.r#type, 
object_field_b.r#type)?, + r#type: unify_type(context, object_field_a.r#type, object_field_b.r#type)?, description: object_field_a.description.or(object_field_b.description), }) } @@ -232,7 +319,7 @@ fn unify_object_field( fn unify_object_types( object_types_a: Vec, object_types_b: Vec, -) -> Result, TypeUnificationError> { +) -> TypeUnificationResult> { let type_map_a: IndexMap = object_types_a .into_iter() .map(|t| (t.name.to_owned(), t)) @@ -246,3 +333,21 @@ fn unify_object_types( Ok(merged_type_map.into_values().collect()) } + +// Unify two schemas. Assumes that the schemas describe mutually exclusive sets of collections. +fn unify_schema(schema_a: Schema, schema_b: Schema) -> TypeUnificationResult { + let collections = schema_a + .collections + .into_iter() + .chain(schema_b.collections.into_iter()) + .collect(); + let object_types = schema_a + .object_types + .into_iter() + .chain(schema_b.object_types.into_iter()) + .collect(); + Ok(Schema { + collections, + object_types, + }) +} diff --git a/crates/cli/src/introspection/mod.rs b/crates/cli/src/introspection/mod.rs index 75082e51..0871e640 100644 --- a/crates/cli/src/introspection/mod.rs +++ b/crates/cli/src/introspection/mod.rs @@ -2,4 +2,4 @@ pub mod document; pub mod validation_schema; pub use validation_schema::get_metadata_from_validation_schema; -pub use document::schema_from_document; \ No newline at end of file +pub use document::sample_schema_from_db; \ No newline at end of file diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs index b37c4ee2..e06babca 100644 --- a/crates/cli/src/lib.rs +++ b/crates/cli/src/lib.rs @@ -4,16 +4,22 @@ mod introspection; use std::path::PathBuf; -use clap::Subcommand; +use clap::{Parser, Subcommand}; use configuration::Configuration; use mongodb_agent_common::interface_types::MongoConfig; +#[derive(Debug, Clone, Parser)] +pub struct UpdateArgs { + #[arg(long = "sample-size", value_name = "N")] + sample_size: Option, +} + /// The command invoked by the user. 
#[derive(Debug, Clone, Subcommand)] pub enum Command { /// Update the configuration by introspecting the database, using the configuration options. - Update, + Update(UpdateArgs), } pub struct Context { @@ -24,14 +30,19 @@ pub struct Context { /// Run a command in a given directory. pub async fn run(command: Command, context: &Context) -> anyhow::Result<()> { match command { - Command::Update => update(context).await?, + Command::Update(args) => update(context, &args).await?, }; Ok(()) } /// Update the configuration in the current directory by introspecting the database. -async fn update(context: &Context) -> anyhow::Result<()> { - let schema = introspection::get_metadata_from_validation_schema(&context.mongo_config).await?; +async fn update(context: &Context, args: &UpdateArgs) -> anyhow::Result<()> { + let schema = match args.sample_size { + None => introspection::get_metadata_from_validation_schema(&context.mongo_config).await?, + Some(sample_size) => { + introspection::sample_schema_from_db(sample_size, &context.mongo_config).await? 
+ } + }; let configuration = Configuration::from_schema(schema); configuration::write_directory(&context.path, &configuration).await?; From 0c281d1fe09ff5e4e325ff12d1a92d643f0c315c Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 17:30:35 +1100 Subject: [PATCH 07/14] Move type unification code to its own module --- crates/cli/src/introspection/document.rs | 210 +----------------- crates/cli/src/introspection/mod.rs | 1 + .../cli/src/introspection/type_unification.rs | 209 +++++++++++++++++ 3 files changed, 215 insertions(+), 205 deletions(-) create mode 100644 crates/cli/src/introspection/type_unification.rs diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index f7a767af..e9bd3811 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -3,18 +3,13 @@ use configuration::{ Schema, }; use futures_util::TryStreamExt; -use indexmap::IndexMap; use mongodb::bson::{doc, Bson, Document}; use mongodb_agent_common::interface_types::MongoConfig; -use mongodb_support::{ - align::align_with_result, - BsonScalarType::{self, *}, -}; -use std::{ - fmt::{self, Display}, - string::String, -}; -use thiserror::Error; +use mongodb_support::BsonScalarType::{self, *}; + +use crate::introspection::type_unification::{unify_type, TypeUnificationContext}; + +use super::type_unification::{unify_object_types, unify_schema, TypeUnificationResult}; // Sample from all collections in the database pub async fn sample_schema_from_db( @@ -156,198 +151,3 @@ fn make_field_type( Bson::DbPointer(_) => scalar(DbPointer), } } - -#[derive(Debug)] -pub struct TypeUnificationContext { - object_type_name: String, - field_name: String, -} - -impl TypeUnificationContext { - fn new(object_type_name: &str, field_name: &str) -> Self { - TypeUnificationContext { - object_type_name: object_type_name.to_owned(), - field_name: field_name.to_owned(), - } - } -} - -impl Display for TypeUnificationContext { - 
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "object type: {}, field: {}", - self.object_type_name, self.field_name - ) - } -} - -#[derive(Debug, Error)] -pub enum TypeUnificationError { - ScalarTypeMismatch(TypeUnificationContext, BsonScalarType, BsonScalarType), - ObjectTypeMismatch(String, String), - TypeKindMismatch(Type, Type), -} - -impl Display for TypeUnificationError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::ScalarTypeMismatch(context, scalar_a, scalar_b) => write!( - f, - "Scalar type mismatch {} {} at {}", - scalar_a.bson_name(), - scalar_b.bson_name(), - context - ), - Self::ObjectTypeMismatch(object_a, object_b) => { - write!(f, "Object type mismatch {} {}", object_a, object_b) - } - Self::TypeKindMismatch(type_a, type_b) => { - write!(f, "Object type mismatch {:?} {:?}", type_a, type_b) - } - } - } -} - -type TypeUnificationResult = Result; - -fn unify_type( - context: TypeUnificationContext, - type_a: Type, - type_b: Type, -) -> TypeUnificationResult { - match (type_a, type_b) { - // If one type is undefined, the union is the other type. - // This is used as the base case when inferring array types from documents. 
- (Type::Scalar(Undefined), type_b) => Ok(type_b), - (type_a, Type::Scalar(Undefined)) => Ok(type_a), - - // Union of any type with Null is the Nullable version of that type - (Type::Scalar(Null), type_b) => Ok(make_nullable(type_b)), - (type_a, Type::Scalar(Null)) => Ok(make_nullable(type_a)), - - (Type::Scalar(scalar_a), Type::Scalar(scalar_b)) => { - if scalar_a == scalar_b { - Ok(Type::Scalar(scalar_a)) - } else { - Err(TypeUnificationError::ScalarTypeMismatch( - context, scalar_a, scalar_b, - )) - } - } - (Type::Object(object_a), Type::Object(object_b)) => { - if object_a == object_b { - Ok(Type::Object(object_a)) - } else { - Err(TypeUnificationError::ObjectTypeMismatch(object_a, object_b)) - } - } - (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { - let elem_type = unify_type(context, *elem_type_a, *elem_type_b)?; - Ok(Type::ArrayOf(Box::new(elem_type))) - } - (Type::Nullable(nullable_type_a), type_b) => { - let result_type = unify_type(context, *nullable_type_a, type_b)?; - Ok(make_nullable(result_type)) - } - (type_a, Type::Nullable(nullable_type_b)) => { - let result_type = unify_type(context, type_a, *nullable_type_b)?; - Ok(make_nullable(result_type)) - } - (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), - } -} - -fn make_nullable(t: Type) -> Type { - match t { - Type::Nullable(t) => Type::Nullable(t), - t => Type::Nullable(Box::new(t)), - } -} - -fn make_nullable_field(field: ObjectField) -> Result { - Ok(ObjectField { - name: field.name, - r#type: make_nullable(field.r#type), - description: field.description, - }) -} - -fn unify_object_type( - object_type_a: ObjectType, - object_type_b: ObjectType, -) -> TypeUnificationResult { - let field_map_a: IndexMap = object_type_a - .fields - .into_iter() - .map(|o| (o.name.to_owned(), o)) - .collect(); - let field_map_b: IndexMap = object_type_b - .fields - .into_iter() - .map(|o| (o.name.to_owned(), o)) - .collect(); - - let merged_field_map = align_with_result( - 
field_map_a, - field_map_b, - make_nullable_field, - make_nullable_field, - |field_a, field_b| unify_object_field(&object_type_a.name, field_a, field_b), - )?; - - Ok(ObjectType { - name: object_type_a.name, - fields: merged_field_map.into_values().collect(), - description: object_type_a.description.or(object_type_b.description), - }) -} - -fn unify_object_field( - object_type_name: &str, - object_field_a: ObjectField, - object_field_b: ObjectField, -) -> TypeUnificationResult { - let context = TypeUnificationContext::new(object_type_name, &object_field_a.name); - Ok(ObjectField { - name: object_field_a.name, - r#type: unify_type(context, object_field_a.r#type, object_field_b.r#type)?, - description: object_field_a.description.or(object_field_b.description), - }) -} - -fn unify_object_types( - object_types_a: Vec, - object_types_b: Vec, -) -> TypeUnificationResult> { - let type_map_a: IndexMap = object_types_a - .into_iter() - .map(|t| (t.name.to_owned(), t)) - .collect(); - let type_map_b: IndexMap = object_types_b - .into_iter() - .map(|t| (t.name.to_owned(), t)) - .collect(); - - let merged_type_map = align_with_result(type_map_a, type_map_b, Ok, Ok, unify_object_type)?; - - Ok(merged_type_map.into_values().collect()) -} - -// Unify two schemas. Assumes that the schemas describe mutually exclusive sets of collections. 
-fn unify_schema(schema_a: Schema, schema_b: Schema) -> TypeUnificationResult { - let collections = schema_a - .collections - .into_iter() - .chain(schema_b.collections.into_iter()) - .collect(); - let object_types = schema_a - .object_types - .into_iter() - .chain(schema_b.object_types.into_iter()) - .collect(); - Ok(Schema { - collections, - object_types, - }) -} diff --git a/crates/cli/src/introspection/mod.rs b/crates/cli/src/introspection/mod.rs index 0871e640..e2af4ee5 100644 --- a/crates/cli/src/introspection/mod.rs +++ b/crates/cli/src/introspection/mod.rs @@ -1,5 +1,6 @@ pub mod document; pub mod validation_schema; +pub mod type_unification; pub use validation_schema::get_metadata_from_validation_schema; pub use document::sample_schema_from_db; \ No newline at end of file diff --git a/crates/cli/src/introspection/type_unification.rs b/crates/cli/src/introspection/type_unification.rs new file mode 100644 index 00000000..ea8c815a --- /dev/null +++ b/crates/cli/src/introspection/type_unification.rs @@ -0,0 +1,209 @@ +use configuration::{ + schema::{ObjectField, ObjectType, Type}, + Schema, +}; +use indexmap::IndexMap; +use mongodb_support::{ + align::align_with_result, + BsonScalarType::{self, *}, +}; +use std::{ + fmt::{self, Display}, + string::String, +}; +use thiserror::Error; + +#[derive(Debug)] +pub struct TypeUnificationContext { + object_type_name: String, + field_name: String, +} + +impl TypeUnificationContext { + pub fn new(object_type_name: &str, field_name: &str) -> Self { + TypeUnificationContext { + object_type_name: object_type_name.to_owned(), + field_name: field_name.to_owned(), + } + } +} + +impl Display for TypeUnificationContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "object type: {}, field: {}", + self.object_type_name, self.field_name + ) + } +} + +#[derive(Debug, Error)] +pub enum TypeUnificationError { + ScalarTypeMismatch(TypeUnificationContext, BsonScalarType, BsonScalarType), + 
ObjectTypeMismatch(String, String), + TypeKindMismatch(Type, Type), +} + +impl Display for TypeUnificationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::ScalarTypeMismatch(context, scalar_a, scalar_b) => write!( + f, + "Scalar type mismatch {} {} at {}", + scalar_a.bson_name(), + scalar_b.bson_name(), + context + ), + Self::ObjectTypeMismatch(object_a, object_b) => { + write!(f, "Object type mismatch {} {}", object_a, object_b) + } + Self::TypeKindMismatch(type_a, type_b) => { + write!(f, "Object type mismatch {:?} {:?}", type_a, type_b) + } + } + } +} + +pub type TypeUnificationResult = Result; + +pub fn unify_type( + context: TypeUnificationContext, + type_a: Type, + type_b: Type, +) -> TypeUnificationResult { + match (type_a, type_b) { + // If one type is undefined, the union is the other type. + // This is used as the base case when inferring array types from documents. + (Type::Scalar(Undefined), type_b) => Ok(type_b), + (type_a, Type::Scalar(Undefined)) => Ok(type_a), + + // Union of any type with Null is the Nullable version of that type + (Type::Scalar(Null), type_b) => Ok(make_nullable(type_b)), + (type_a, Type::Scalar(Null)) => Ok(make_nullable(type_a)), + + (Type::Scalar(scalar_a), Type::Scalar(scalar_b)) => { + if scalar_a == scalar_b { + Ok(Type::Scalar(scalar_a)) + } else { + Err(TypeUnificationError::ScalarTypeMismatch( + context, scalar_a, scalar_b, + )) + } + } + (Type::Object(object_a), Type::Object(object_b)) => { + if object_a == object_b { + Ok(Type::Object(object_a)) + } else { + Err(TypeUnificationError::ObjectTypeMismatch(object_a, object_b)) + } + } + (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { + let elem_type = unify_type(context, *elem_type_a, *elem_type_b)?; + Ok(Type::ArrayOf(Box::new(elem_type))) + } + (Type::Nullable(nullable_type_a), type_b) => { + let result_type = unify_type(context, *nullable_type_a, type_b)?; + Ok(make_nullable(result_type)) + } + (type_a, 
Type::Nullable(nullable_type_b)) => { + let result_type = unify_type(context, type_a, *nullable_type_b)?; + Ok(make_nullable(result_type)) + } + (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), + } +} + +fn make_nullable(t: Type) -> Type { + match t { + Type::Nullable(t) => Type::Nullable(t), + t => Type::Nullable(Box::new(t)), + } +} + +fn make_nullable_field(field: ObjectField) -> Result { + Ok(ObjectField { + name: field.name, + r#type: make_nullable(field.r#type), + description: field.description, + }) +} + +fn unify_object_type( + object_type_a: ObjectType, + object_type_b: ObjectType, +) -> TypeUnificationResult { + let field_map_a: IndexMap = object_type_a + .fields + .into_iter() + .map(|o| (o.name.to_owned(), o)) + .collect(); + let field_map_b: IndexMap = object_type_b + .fields + .into_iter() + .map(|o| (o.name.to_owned(), o)) + .collect(); + + let merged_field_map = align_with_result( + field_map_a, + field_map_b, + make_nullable_field, + make_nullable_field, + |field_a, field_b| unify_object_field(&object_type_a.name, field_a, field_b), + )?; + + Ok(ObjectType { + name: object_type_a.name, + fields: merged_field_map.into_values().collect(), + description: object_type_a.description.or(object_type_b.description), + }) +} + +fn unify_object_field( + object_type_name: &str, + object_field_a: ObjectField, + object_field_b: ObjectField, +) -> TypeUnificationResult { + let context = TypeUnificationContext::new(object_type_name, &object_field_a.name); + Ok(ObjectField { + name: object_field_a.name, + r#type: unify_type(context, object_field_a.r#type, object_field_b.r#type)?, + description: object_field_a.description.or(object_field_b.description), + }) +} + +pub fn unify_object_types( + object_types_a: Vec, + object_types_b: Vec, +) -> TypeUnificationResult> { + let type_map_a: IndexMap = object_types_a + .into_iter() + .map(|t| (t.name.to_owned(), t)) + .collect(); + let type_map_b: IndexMap = object_types_b + .into_iter() + 
.map(|t| (t.name.to_owned(), t)) + .collect(); + + let merged_type_map = align_with_result(type_map_a, type_map_b, Ok, Ok, unify_object_type)?; + + Ok(merged_type_map.into_values().collect()) +} + +// Unify two schemas. Assumes that the schemas describe mutually exclusive sets of collections. +pub fn unify_schema(schema_a: Schema, schema_b: Schema) -> TypeUnificationResult { + let collections = schema_a + .collections + .into_iter() + .chain(schema_b.collections.into_iter()) + .collect(); + let object_types = schema_a + .object_types + .into_iter() + .chain(schema_b.object_types.into_iter()) + .collect(); + Ok(Schema { + collections, + object_types, + }) +} From f21ea31f305505a14574152ab87773ff781e959f Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 17:32:44 +1100 Subject: [PATCH 08/14] Clean up imports and exports --- crates/cli/src/introspection/document.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index e9bd3811..1cfe52a6 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -1,3 +1,6 @@ +use super::type_unification::{ + unify_object_types, unify_schema, unify_type, TypeUnificationContext, TypeUnificationResult, +}; use configuration::{ schema::{Collection, ObjectField, ObjectType, Type}, Schema, @@ -7,10 +10,6 @@ use mongodb::bson::{doc, Bson, Document}; use mongodb_agent_common::interface_types::MongoConfig; use mongodb_support::BsonScalarType::{self, *}; -use crate::introspection::type_unification::{unify_type, TypeUnificationContext}; - -use super::type_unification::{unify_object_types, unify_schema, TypeUnificationResult}; - // Sample from all collections in the database pub async fn sample_schema_from_db( sample_size: u32, @@ -32,7 +31,7 @@ pub async fn sample_schema_from_db( Ok(schema) } -pub async fn sample_schema_from_collection( +async fn sample_schema_from_collection( 
collection_name: &str, sample_size: u32, config: &MongoConfig, From 2d86c61054c91ed8f3e6fa52b323789ffc9a2c6c Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 17:45:00 +1100 Subject: [PATCH 09/14] Make clippy happy --- .../cli/src/introspection/type_unification.rs | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/cli/src/introspection/type_unification.rs b/crates/cli/src/introspection/type_unification.rs index ea8c815a..39a06dfe 100644 --- a/crates/cli/src/introspection/type_unification.rs +++ b/crates/cli/src/introspection/type_unification.rs @@ -40,25 +40,25 @@ impl Display for TypeUnificationContext { #[derive(Debug, Error)] pub enum TypeUnificationError { - ScalarTypeMismatch(TypeUnificationContext, BsonScalarType, BsonScalarType), - ObjectTypeMismatch(String, String), - TypeKindMismatch(Type, Type), + ScalarType(TypeUnificationContext, BsonScalarType, BsonScalarType), + ObjectType(String, String), + TypeKind(Type, Type), } impl Display for TypeUnificationError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::ScalarTypeMismatch(context, scalar_a, scalar_b) => write!( + Self::ScalarType(context, scalar_a, scalar_b) => write!( f, "Scalar type mismatch {} {} at {}", scalar_a.bson_name(), scalar_b.bson_name(), context ), - Self::ObjectTypeMismatch(object_a, object_b) => { + Self::ObjectType(object_a, object_b) => { write!(f, "Object type mismatch {} {}", object_a, object_b) } - Self::TypeKindMismatch(type_a, type_b) => { + Self::TypeKind(type_a, type_b) => { write!(f, "Object type mismatch {:?} {:?}", type_a, type_b) } } @@ -86,7 +86,7 @@ pub fn unify_type( if scalar_a == scalar_b { Ok(Type::Scalar(scalar_a)) } else { - Err(TypeUnificationError::ScalarTypeMismatch( + Err(TypeUnificationError::ScalarType( context, scalar_a, scalar_b, )) } @@ -95,7 +95,7 @@ pub fn unify_type( if object_a == object_b { Ok(Type::Object(object_a)) } else { - 
Err(TypeUnificationError::ObjectTypeMismatch(object_a, object_b)) + Err(TypeUnificationError::ObjectType(object_a, object_b)) } } (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { @@ -110,7 +110,7 @@ pub fn unify_type( let result_type = unify_type(context, type_a, *nullable_type_b)?; Ok(make_nullable(result_type)) } - (type_a, type_b) => Err(TypeUnificationError::TypeKindMismatch(type_a, type_b)), + (type_a, type_b) => Err(TypeUnificationError::TypeKind(type_a, type_b)), } } @@ -195,12 +195,12 @@ pub fn unify_schema(schema_a: Schema, schema_b: Schema) -> TypeUnificationResult let collections = schema_a .collections .into_iter() - .chain(schema_b.collections.into_iter()) + .chain(schema_b.collections) .collect(); let object_types = schema_a .object_types .into_iter() - .chain(schema_b.object_types.into_iter()) + .chain(schema_b.object_types) .collect(); Ok(Schema { collections, From 277b03e28972526ad482fa32e68cbaa808a1023e Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 18:02:11 +1100 Subject: [PATCH 10/14] add some documentation --- crates/cli/src/introspection/document.rs | 5 +++- .../cli/src/introspection/type_unification.rs | 25 ++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/document.rs index 1cfe52a6..5b0e4e61 100644 --- a/crates/cli/src/introspection/document.rs +++ b/crates/cli/src/introspection/document.rs @@ -10,7 +10,10 @@ use mongodb::bson::{doc, Bson, Document}; use mongodb_agent_common::interface_types::MongoConfig; use mongodb_support::BsonScalarType::{self, *}; -// Sample from all collections in the database +/// Sample from all collections in the database and return a Schema. +/// Return an error if there are any errors accessing the database +/// or if the types derived from the sample documents for a collection +/// are not unifiable. 
pub async fn sample_schema_from_db( sample_size: u32, config: &MongoConfig, diff --git a/crates/cli/src/introspection/type_unification.rs b/crates/cli/src/introspection/type_unification.rs index 39a06dfe..c0df4949 100644 --- a/crates/cli/src/introspection/type_unification.rs +++ b/crates/cli/src/introspection/type_unification.rs @@ -1,3 +1,7 @@ +/// This module contains functions for unifying types. +/// This is useful when deriving a schema from a set of sample documents. +/// It allows the information in the schemas derived from several documents to be combined into one schema. +/// use configuration::{ schema::{ObjectField, ObjectType, Type}, Schema, @@ -67,6 +71,8 @@ impl Display for TypeUnificationError { pub type TypeUnificationResult = Result; +/// Unify two types. +/// Return an error if the types are not unifiable. pub fn unify_type( context: TypeUnificationContext, type_a: Type, @@ -82,6 +88,7 @@ pub fn unify_type( (Type::Scalar(Null), type_b) => Ok(make_nullable(type_b)), (type_a, Type::Scalar(Null)) => Ok(make_nullable(type_a)), + // Scalar types only unify if they are the same type. (Type::Scalar(scalar_a), Type::Scalar(scalar_b)) => { if scalar_a == scalar_b { Ok(Type::Scalar(scalar_a)) @@ -91,6 +98,8 @@ pub fn unify_type( )) } } + + // Object types only unify if they have the same name. (Type::Object(object_a), Type::Object(object_b)) => { if object_a == object_b { Ok(Type::Object(object_a)) @@ -98,10 +107,15 @@ pub fn unify_type( Err(TypeUnificationError::ObjectType(object_a, object_b)) } } + + // Array types unify iff their element types unify. (Type::ArrayOf(elem_type_a), Type::ArrayOf(elem_type_b)) => { let elem_type = unify_type(context, *elem_type_a, *elem_type_b)?; Ok(Type::ArrayOf(Box::new(elem_type))) } + + // A Nullable type will unify with another type iff the underlying type is unifiable. + // The resulting type will be Nullable. 
(Type::Nullable(nullable_type_a), type_b) => { let result_type = unify_type(context, *nullable_type_a, type_b)?; Ok(make_nullable(result_type)) @@ -110,6 +124,8 @@ pub fn unify_type( let result_type = unify_type(context, type_a, *nullable_type_b)?; Ok(make_nullable(result_type)) } + + // Anything else is a unification error. (type_a, type_b) => Err(TypeUnificationError::TypeKind(type_a, type_b)), } } @@ -129,6 +145,8 @@ fn make_nullable_field(field: ObjectField) -> Result { }) } +/// Unify two `ObjectType`s. +/// Any field that appears in only one of the `ObjectType`s will be made nullable. fn unify_object_type( object_type_a: ObjectType, object_type_b: ObjectType, @@ -159,6 +177,8 @@ fn unify_object_type( }) } +/// Unify the types of two `ObjectField`s. +/// If the types are not unifiable then return an error. fn unify_object_field( object_type_name: &str, object_field_a: ObjectField, @@ -172,6 +192,9 @@ fn unify_object_field( }) } +/// Unify two sets of `ObjectType`s. +/// Any `ObjectType` that appears in only one set will be unchanged in the output. +/// Any type that appears in both sets will be unified using `unify_object_type`. pub fn unify_object_types( object_types_a: Vec, object_types_b: Vec, @@ -190,7 +213,7 @@ pub fn unify_object_types( Ok(merged_type_map.into_values().collect()) } -// Unify two schemas. Assumes that the schemas describe mutually exclusive sets of collections. +/// Unify two schemas. Assumes that the schemas describe mutually exclusive sets of collections. 
pub fn unify_schema(schema_a: Schema, schema_b: Schema) -> TypeUnificationResult { let collections = schema_a .collections From eb29080fc4abdc47275532198b4d87aceb3ccfc7 Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 18:11:32 +1100 Subject: [PATCH 11/14] Remove dependency on these --- Cargo.lock | 1 - crates/mongodb-support/Cargo.toml | 1 - crates/mongodb-support/src/align.rs | 60 ----------------------------- 3 files changed, 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca0e45f1..cabb1cad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1585,7 +1585,6 @@ dependencies = [ "schemars", "serde", "serde_json", - "these", "thiserror", ] diff --git a/crates/mongodb-support/Cargo.toml b/crates/mongodb-support/Cargo.toml index a749ac9f..dbd6cb2e 100644 --- a/crates/mongodb-support/Cargo.toml +++ b/crates/mongodb-support/Cargo.toml @@ -10,7 +10,6 @@ indexmap = { version = "1", features = ["serde"] } # must match the version that schemars = "^0.8.12" serde = { version = "1", features = ["derive"] } serde_json = "1" -these = "2.0.0" thiserror = "1" [dev-dependencies] diff --git a/crates/mongodb-support/src/align.rs b/crates/mongodb-support/src/align.rs index da61f5c6..25553f0f 100644 --- a/crates/mongodb-support/src/align.rs +++ b/crates/mongodb-support/src/align.rs @@ -1,65 +1,5 @@ use indexmap::IndexMap; use std::hash::Hash; -use these::These::{self, *}; - -pub fn align(ts: IndexMap, mut us: IndexMap) -> IndexMap> -where - K: Hash + Eq, -{ - let mut result: IndexMap> = IndexMap::new(); - - for (k, t) in ts { - match us.swap_remove(&k) { - None => result.insert(k, This(t)), - Some(u) => result.insert(k, Both(t, u)), - }; - } - - for (k, u) in us { - result.insert(k, That(u)); - } - result -} - -pub fn align_with(ts: IndexMap, mut us: IndexMap, f: F) -> IndexMap -where - K: Hash + Eq, - F: Fn(V, V) -> V, -{ - let mut result: IndexMap = IndexMap::new(); - - for (k, t) in ts { - match us.swap_remove(&k) { - None => result.insert(k, t), - Some(u) => 
result.insert(k, f(t, u)), - }; - } - - for (k, u) in us { - result.insert(k, u); - } - result -} - -// pub fn align_with_result(ts: IndexMap, mut us: IndexMap, f: F) -> Result, E> -// where -// K: Hash + Eq, -// F: Fn(V, V) -> Result, -// { -// let mut result: IndexMap = IndexMap::new(); - -// for (k, t) in ts { -// match us.swap_remove(&k) { -// None => result.insert(k, t), -// Some(u) => result.insert(k, f(t, u)?), -// }; -// } - -// for (k, u) in us { -// result.insert(k, u); -// } -// Ok(result) -// } pub fn align_with_result(ts: IndexMap, mut us: IndexMap, ft: FT, fu: FU, ftu: FTU) -> Result, E> where From ff6b69291db15d4cec23976c14cf4111a6d9aa9f Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 18:17:41 +1100 Subject: [PATCH 12/14] Rename document.rs to sampling.rs --- crates/cli/src/introspection/mod.rs | 4 ++-- crates/cli/src/introspection/{document.rs => sampling.rs} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename crates/cli/src/introspection/{document.rs => sampling.rs} (100%) diff --git a/crates/cli/src/introspection/mod.rs b/crates/cli/src/introspection/mod.rs index e2af4ee5..057303c2 100644 --- a/crates/cli/src/introspection/mod.rs +++ b/crates/cli/src/introspection/mod.rs @@ -1,6 +1,6 @@ -pub mod document; +pub mod sampling; pub mod validation_schema; pub mod type_unification; pub use validation_schema::get_metadata_from_validation_schema; -pub use document::sample_schema_from_db; \ No newline at end of file +pub use sampling::sample_schema_from_db; \ No newline at end of file diff --git a/crates/cli/src/introspection/document.rs b/crates/cli/src/introspection/sampling.rs similarity index 100% rename from crates/cli/src/introspection/document.rs rename to crates/cli/src/introspection/sampling.rs From b07dc12b66f1d2eb77d07cdf76781926fbc291ca Mon Sep 17 00:00:00 2001 From: David Overton Date: Tue, 19 Mar 2024 18:23:01 +1100 Subject: [PATCH 13/14] Fix for unifying object types --- crates/cli/src/introspection/sampling.rs | 
12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/cli/src/introspection/sampling.rs b/crates/cli/src/introspection/sampling.rs index 5b0e4e61..8e86fb77 100644 --- a/crates/cli/src/introspection/sampling.rs +++ b/crates/cli/src/introspection/sampling.rs @@ -48,7 +48,11 @@ async fn sample_schema_from_collection( let mut collected_object_types = vec![]; while let Some(document) = cursor.try_next().await? { let object_types = make_object_type(collection_name, &document)?; - collected_object_types = unify_object_types(collected_object_types, object_types)?; + collected_object_types = if collected_object_types.is_empty() { + object_types + } else { + unify_object_types(collected_object_types, object_types)? + }; } let collection_info = Collection { name: collection_name.to_string(), @@ -124,7 +128,11 @@ fn make_field_type( for elem in arr { let (elem_collected_otds, elem_type) = make_field_type(object_type_name, field_name, elem)?; - collected_otds = unify_object_types(collected_otds, elem_collected_otds)?; + collected_otds = if collected_otds.is_empty() { + elem_collected_otds + } else { + unify_object_types(collected_otds, elem_collected_otds)? 
+ }; let context = TypeUnificationContext::new(object_type_name, field_name); result_type = unify_type(context, result_type, elem_type)?; } From df45726071198930fad9c26c62ea3880d8539abf Mon Sep 17 00:00:00 2001 From: David Overton Date: Wed, 20 Mar 2024 09:30:15 +1100 Subject: [PATCH 14/14] Fix error message --- crates/cli/src/introspection/type_unification.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/cli/src/introspection/type_unification.rs b/crates/cli/src/introspection/type_unification.rs index c0df4949..b435e54e 100644 --- a/crates/cli/src/introspection/type_unification.rs +++ b/crates/cli/src/introspection/type_unification.rs @@ -63,7 +63,7 @@ impl Display for TypeUnificationError { write!(f, "Object type mismatch {} {}", object_a, object_b) } Self::TypeKind(type_a, type_b) => { - write!(f, "Object type mismatch {:?} {:?}", type_a, type_b) + write!(f, "Type mismatch {:?} {:?}", type_a, type_b) } } }