From 0b9d8456bd3310bafa6affdfee1c818327cf1de3 Mon Sep 17 00:00:00 2001 From: Jesse Hallett Date: Tue, 29 Oct 2024 14:38:23 -0700 Subject: [PATCH 1/2] prune superfluous inferred object types from native query configuration --- crates/cli/src/native_query/mod.rs | 1 + .../src/native_query/pipeline_type_context.rs | 14 +++- .../src/native_query/prune_object_types.rs | 65 +++++++++++++++++++ 3 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 crates/cli/src/native_query/prune_object_types.rs diff --git a/crates/cli/src/native_query/mod.rs b/crates/cli/src/native_query/mod.rs index 6d25330..a66a55d 100644 --- a/crates/cli/src/native_query/mod.rs +++ b/crates/cli/src/native_query/mod.rs @@ -3,6 +3,7 @@ pub mod error; mod helpers; mod pipeline; mod pipeline_type_context; +mod prune_object_types; mod reference_shorthand; mod type_constraint; mod type_solver; diff --git a/crates/cli/src/native_query/pipeline_type_context.rs b/crates/cli/src/native_query/pipeline_type_context.rs index e2acf76..2d5c173 100644 --- a/crates/cli/src/native_query/pipeline_type_context.rs +++ b/crates/cli/src/native_query/pipeline_type_context.rs @@ -14,6 +14,7 @@ use ndc_models::{ArgumentName, ObjectTypeName}; use super::{ error::{Error, Result}, + prune_object_types::prune_object_types, type_constraint::{ObjectTypeConstraint, TypeConstraint, TypeVariable, Variance}, type_solver::unify, }; @@ -116,7 +117,7 @@ impl PipelineTypeContext<'_> { })?, }; - let parameter_types = self + let parameter_types: BTreeMap = self .parameter_types .into_iter() .map(|(parameter_name, type_variable)| { @@ -127,10 +128,19 @@ impl PipelineTypeContext<'_> { }) .collect(); + // Prune added object types to remove types that are not referenced by the return type or + // by parameter types, and therefore don't need to be included in the native query + // configuration. + let object_types = { + let reference_types = + std::iter::once(result_document_type).chain(parameter_types.values()); + prune_object_types(reference_types, added_object_types)? + }; + Ok(PipelineTypes { result_document_type: result_document_type_name, parameter_types, - object_types: added_object_types, + object_types, warnings: self.warnings, }) } diff --git a/crates/cli/src/native_query/prune_object_types.rs b/crates/cli/src/native_query/prune_object_types.rs new file mode 100644 index 0000000..673cfa9 --- /dev/null +++ b/crates/cli/src/native_query/prune_object_types.rs @@ -0,0 +1,65 @@ +use std::collections::{BTreeMap, HashSet}; + +use configuration::schema::{ObjectType, Type}; +use ndc_models::ObjectTypeName; + +use super::error::{Error, Result}; + +/// Filters map of object types to get only types that are referenced directly or indirectly from +/// the set of reference types. 
+pub fn prune_object_types<'a>( + reference_types: impl IntoIterator, + object_types: BTreeMap, +) -> Result> { + let mut required_type_names = HashSet::new(); + for t in reference_types { + collect_names_from_type(&object_types, &mut required_type_names, t)?; + } + let pruned_object_types = object_types + .into_iter() + .filter(|(name, _)| required_type_names.contains(name)) + .collect(); + Ok(pruned_object_types) +} + +fn collect_names_from_type( + object_types: &BTreeMap, + found_type_names: &mut HashSet, + input_type: &Type, +) -> Result<()> { + match input_type { + Type::Object(type_name) => { + let object_type_name = mk_object_type_name(type_name); + collect_names_from_object_type(object_types, found_type_names, &object_type_name)?; + found_type_names.insert(object_type_name); + } + Type::Predicate { object_type_name } => { + let object_type_name = object_type_name.clone(); + collect_names_from_object_type(object_types, found_type_names, &object_type_name)?; + found_type_names.insert(object_type_name); + } + Type::ArrayOf(t) => collect_names_from_type(object_types, found_type_names, t)?, + Type::Nullable(t) => collect_names_from_type(object_types, found_type_names, t)?, + Type::ExtendedJSON => (), + Type::Scalar(_) => (), + }; + Ok(()) +} + +fn collect_names_from_object_type( + object_types: &BTreeMap, + found_type_names: &mut HashSet, + input_type_name: &ObjectTypeName, +) -> Result<()> { + let object_type = object_types + .get(input_type_name) + .ok_or_else(|| Error::UnknownObjectType(input_type_name.to_string()))?; + for (_, field) in object_type.fields.iter() { + collect_names_from_type(object_types, found_type_names, &field.r#type)?; + } + Ok(()) +} + +fn mk_object_type_name(name: &str) -> ObjectTypeName { + name.into() +} From ce53c5f5b79211dc87d18cca680b0cf5d27ad85d Mon Sep 17 00:00:00 2001 From: Jesse Hallett Date: Fri, 1 Nov 2024 15:02:02 -0700 Subject: [PATCH 2/2] don't fail on existing object types; try to simplify object type names --- Cargo.lock | 21 +- crates/cli/Cargo.toml | 1 + crates/cli/src/native_query/helpers.rs | 32 +++ crates/cli/src/native_query/mod.rs | 2 +- crates/cli/src/native_query/pipeline/mod.rs | 2 +- .../src/native_query/pipeline_type_context.rs | 46 ++-- .../src/native_query/prune_object_types.rs | 251 +++++++++++++++++- 7 files changed, 309 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1467608..b884974 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1819,6 +1819,7 @@ dependencies = [ "pretty_assertions", "proptest", "ref-cast", + "regex", "serde", "serde_json", "test-helpers", @@ -2381,7 +2382,7 @@ dependencies = [ "rand", "rand_chacha", "rand_xorshift", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "rusty-fork", "tempfile", "unarray", @@ -2507,14 +2508,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.7", - "regex-syntax 0.8.4", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", ] [[package]] @@ -2528,13 +2529,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = 
"368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -2545,9 +2546,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 5b2c104..f57e006 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -21,6 +21,7 @@ ndc-models = { workspace = true } nom = "^7.1.3" nonempty = "^0.10.0" ref-cast = { workspace = true } +regex = "^1.11.1" serde = { workspace = true } serde_json = { workspace = true } thiserror = "1.0.57" diff --git a/crates/cli/src/native_query/helpers.rs b/crates/cli/src/native_query/helpers.rs index 5a3ee11..3a2d10c 100644 --- a/crates/cli/src/native_query/helpers.rs +++ b/crates/cli/src/native_query/helpers.rs @@ -1,5 +1,8 @@ +use std::{borrow::Cow, collections::BTreeMap}; + use configuration::Configuration; use ndc_models::{CollectionInfo, CollectionName, ObjectTypeName}; +use regex::Regex; use super::error::{Error, Result}; @@ -24,3 +27,32 @@ pub fn find_collection_object_type( let collection = find_collection(configuration, collection_name)?; Ok(collection.collection_type.clone()) } + +pub fn unique_type_name( + object_types: &BTreeMap, + added_object_types: &BTreeMap, + desired_type_name: &str, +) -> ObjectTypeName { + let (name, mut counter) = parse_counter_suffix(desired_type_name); + let mut type_name: ObjectTypeName = name.as_ref().into(); + while object_types.contains_key(&type_name) || added_object_types.contains_key(&type_name) { + counter += 1; + type_name = format!("{desired_type_name}_{counter}").into(); + } + type_name +} + +/// [unique_type_name] adds a `_n` numeric suffix where necessary. There are cases where we go +/// through multiple layers of unique names. Instead of accumulating multiple suffixes, we can +/// increment the existing suffix. If there is no suffix then the count starts at zero. 
+pub fn parse_counter_suffix(name: &str) -> (Cow<'_, str>, u32) { + let re = Regex::new(r"^(.*?)_(\d+)$").unwrap(); + let Some(captures) = re.captures(name) else { + return (Cow::Borrowed(name), 0); + }; + let prefix = captures.get(1).unwrap().as_str(); + let Some(count) = captures.get(2).and_then(|s| s.as_str().parse().ok()) else { + return (Cow::Borrowed(name), 0); + }; + (Cow::Owned(prefix.to_string()), count) +} diff --git a/crates/cli/src/native_query/mod.rs b/crates/cli/src/native_query/mod.rs index a66a55d..9c0331f 100644 --- a/crates/cli/src/native_query/mod.rs +++ b/crates/cli/src/native_query/mod.rs @@ -207,7 +207,7 @@ mod tests { pipeline.clone(), )?; - let expected_document_type_name: ObjectTypeName = "selected_title_documents_2".into(); + let expected_document_type_name: ObjectTypeName = "selected_title_documents".into(); let expected_object_types = [( expected_document_type_name.clone(), diff --git a/crates/cli/src/native_query/pipeline/mod.rs b/crates/cli/src/native_query/pipeline/mod.rs index 3aa2a42..144289b 100644 --- a/crates/cli/src/native_query/pipeline/mod.rs +++ b/crates/cli/src/native_query/pipeline/mod.rs @@ -329,7 +329,7 @@ mod tests { let config = mflix_config(); let pipeline_types = infer_pipeline_types(&config, "documents", None, &pipeline).unwrap(); let expected = [( - "documents_documents_2".into(), + "documents_documents".into(), ObjectType { fields: [ ( diff --git a/crates/cli/src/native_query/pipeline_type_context.rs b/crates/cli/src/native_query/pipeline_type_context.rs index 2d5c173..3f8e3ae 100644 --- a/crates/cli/src/native_query/pipeline_type_context.rs +++ b/crates/cli/src/native_query/pipeline_type_context.rs @@ -14,6 +14,7 @@ use ndc_models::{ArgumentName, ObjectTypeName}; use super::{ error::{Error, Result}, + helpers::unique_type_name, prune_object_types::prune_object_types, type_constraint::{ObjectTypeConstraint, TypeConstraint, TypeVariable, Variance}, type_solver::unify, @@ -107,17 +108,12 @@ impl PipelineTypeContext<'_> { e => e, })?; - let result_document_type = variable_types + let mut result_document_type = variable_types .get(&result_document_type_variable) - .expect("missing result type variable is missing"); - let result_document_type_name = match result_document_type { - Type::Object(type_name) => type_name.clone().into(), - t => Err(Error::ExpectedObject { - actual_type: t.clone(), - })?, - }; + .expect("missing result type variable is missing") + .clone(); - let parameter_types: BTreeMap = self + let mut parameter_types: BTreeMap = self .parameter_types .into_iter() .map(|(parameter_name, type_variable)| { @@ -132,9 +128,21 @@ impl PipelineTypeContext<'_> { // by parameter types, and therefore don't need to be included in the native query // configuration. let object_types = { - let reference_types = - std::iter::once(result_document_type).chain(parameter_types.values()); - prune_object_types(reference_types, added_object_types)? + let mut reference_types = std::iter::once(&mut result_document_type) + .chain(parameter_types.values_mut()) + .collect_vec(); + prune_object_types( + &mut reference_types, + &self.configuration.object_types, + added_object_types, + )? 
+ }; + + let result_document_type_name = match result_document_type { + Type::Object(type_name) => type_name.clone().into(), + t => Err(Error::ExpectedObject { + actual_type: t.clone(), + })?, }; Ok(PipelineTypes { @@ -195,15 +203,11 @@ impl PipelineTypeContext<'_> { } pub fn unique_type_name(&self, desired_type_name: &str) -> ObjectTypeName { - let mut counter = 0; - let mut type_name: ObjectTypeName = desired_type_name.into(); - while self.configuration.object_types.contains_key(&type_name) - || self.object_types.contains_key(&type_name) - { - counter += 1; - type_name = format!("{desired_type_name}_{counter}").into(); - } - type_name + unique_type_name( + &self.configuration.object_types, + &self.object_types, + desired_type_name, + ) } pub fn set_stage_doc_type(&mut self, doc_type: TypeConstraint) { diff --git a/crates/cli/src/native_query/prune_object_types.rs b/crates/cli/src/native_query/prune_object_types.rs index 673cfa9..fa819e7 100644 --- a/crates/cli/src/native_query/prune_object_types.rs +++ b/crates/cli/src/native_query/prune_object_types.rs @@ -1,45 +1,82 @@ use std::collections::{BTreeMap, HashSet}; -use configuration::schema::{ObjectType, Type}; +use configuration::schema::{ObjectField, ObjectType, Type}; +use itertools::Itertools as _; use ndc_models::ObjectTypeName; +use crate::native_query::helpers::{parse_counter_suffix, unique_type_name}; + use super::error::{Error, Result}; /// Filters map of object types to get only types that are referenced directly or indirectly from /// the set of reference types. -pub fn prune_object_types<'a>( - reference_types: impl IntoIterator, - object_types: BTreeMap, +pub fn prune_object_types( + reference_types: &mut [&mut Type], + existing_object_types: &BTreeMap, + added_object_types: BTreeMap, ) -> Result> { let mut required_type_names = HashSet::new(); - for t in reference_types { - collect_names_from_type(&object_types, &mut required_type_names, t)?; + for t in &*reference_types { + collect_names_from_type( + existing_object_types, + &added_object_types, + &mut required_type_names, + t, + )?; } - let pruned_object_types = object_types + let mut pruned_object_types = added_object_types .into_iter() .filter(|(name, _)| required_type_names.contains(name)) .collect(); + + simplify_type_names( + reference_types, + existing_object_types, + &mut pruned_object_types, + ); + Ok(pruned_object_types) } fn collect_names_from_type( - object_types: &BTreeMap, + existing_object_types: &BTreeMap, + added_object_types: &BTreeMap, found_type_names: &mut HashSet, input_type: &Type, ) -> Result<()> { match input_type { Type::Object(type_name) => { let object_type_name = mk_object_type_name(type_name); - collect_names_from_object_type(object_types, found_type_names, &object_type_name)?; + collect_names_from_object_type( + existing_object_types, + added_object_types, + found_type_names, + &object_type_name, + )?; found_type_names.insert(object_type_name); } Type::Predicate { object_type_name } => { let object_type_name = object_type_name.clone(); - collect_names_from_object_type(object_types, found_type_names, &object_type_name)?; + collect_names_from_object_type( + existing_object_types, + added_object_types, + found_type_names, + &object_type_name, + )?; found_type_names.insert(object_type_name); } - Type::ArrayOf(t) => collect_names_from_type(object_types, found_type_names, t)?, - Type::Nullable(t) => collect_names_from_type(object_types, found_type_names, t)?, + Type::ArrayOf(t) => collect_names_from_type( + existing_object_types, + 
added_object_types, + found_type_names, + t, + )?, + Type::Nullable(t) => collect_names_from_type( + existing_object_types, + added_object_types, + found_type_names, + t, + )?, Type::ExtendedJSON => (), Type::Scalar(_) => (), }; @@ -47,19 +84,207 @@ fn collect_names_from_type( } fn collect_names_from_object_type( + existing_object_types: &BTreeMap, object_types: &BTreeMap, found_type_names: &mut HashSet, input_type_name: &ObjectTypeName, ) -> Result<()> { + if existing_object_types.contains_key(input_type_name) { + return Ok(()); + } let object_type = object_types .get(input_type_name) .ok_or_else(|| Error::UnknownObjectType(input_type_name.to_string()))?; for (_, field) in object_type.fields.iter() { - collect_names_from_type(object_types, found_type_names, &field.r#type)?; + collect_names_from_type( + existing_object_types, + object_types, + found_type_names, + &field.r#type, + )?; } Ok(()) } +/// The system for generating unique object type names uses numeric suffixes. After pruning we may +/// be able to remove these suffixes. +fn simplify_type_names( + reference_types: &mut [&mut Type], + existing_object_types: &BTreeMap, + added_object_types: &mut BTreeMap, +) { + let names = added_object_types.keys().cloned().collect_vec(); + for name in names { + let (name_root, count) = parse_counter_suffix(name.as_str()); + let maybe_simplified_name = + unique_type_name(existing_object_types, added_object_types, &name_root); + let (_, new_count) = parse_counter_suffix(maybe_simplified_name.as_str()); + if new_count < count { + rename_object_type( + reference_types, + added_object_types, + &name, + &maybe_simplified_name, + ); + } + } +} + +fn rename_object_type( + reference_types: &mut [&mut Type], + object_types: &mut BTreeMap, + old_name: &ObjectTypeName, + new_name: &ObjectTypeName, +) { + for t in reference_types.iter_mut() { + **t = rename_type_helper(old_name, new_name, (*t).clone()); + } + + let renamed_object_types = object_types + .clone() + .into_iter() + .map(|(name, object_type)| { + let new_type_name = if &name == old_name { + new_name.clone() + } else { + name + }; + let new_object_type = rename_object_type_helper(old_name, new_name, object_type); + (new_type_name, new_object_type) + }) + .collect(); + *object_types = renamed_object_types; +} + +fn rename_type_helper( + old_name: &ObjectTypeName, + new_name: &ObjectTypeName, + input_type: Type, +) -> Type { + let old_name_string = old_name.to_string(); + + match input_type { + Type::Object(name) => { + if name == old_name_string { + Type::Object(new_name.to_string()) + } else { + Type::Object(name) + } + } + Type::Predicate { object_type_name } => { + if &object_type_name == old_name { + Type::Predicate { + object_type_name: new_name.clone(), + } + } else { + Type::Predicate { object_type_name } + } + } + Type::ArrayOf(t) => Type::ArrayOf(Box::new(rename_type_helper(old_name, new_name, *t))), + Type::Nullable(t) => Type::Nullable(Box::new(rename_type_helper(old_name, new_name, *t))), + t @ Type::Scalar(_) => t, + t @ Type::ExtendedJSON => t, + } +} + +fn rename_object_type_helper( + old_name: &ObjectTypeName, + new_name: &ObjectTypeName, + object_type: ObjectType, +) -> ObjectType { + let new_fields = object_type + .fields + .into_iter() + .map(|(name, field)| { + let new_field = ObjectField { + r#type: rename_type_helper(old_name, new_name, field.r#type), + description: field.description, + }; + (name, new_field) + }) + .collect(); + ObjectType { + fields: new_fields, + description: object_type.description, + } +} + fn 
mk_object_type_name(name: &str) -> ObjectTypeName { name.into() } + +#[cfg(test)] +mod tests { + use configuration::schema::{ObjectField, ObjectType, Type}; + use googletest::prelude::*; + + use super::prune_object_types; + + #[googletest::test] + fn prunes_and_simplifies_object_types() -> Result<()> { + let mut result_type = Type::Object("Documents_2".into()); + let mut reference_types = [&mut result_type]; + let existing_object_types = Default::default(); + + let added_object_types = [ + ( + "Documents_1".into(), + ObjectType { + fields: [( + "bar".into(), + ObjectField { + r#type: Type::Scalar(mongodb_support::BsonScalarType::String), + description: None, + }, + )] + .into(), + description: None, + }, + ), + ( + "Documents_2".into(), + ObjectType { + fields: [( + "foo".into(), + ObjectField { + r#type: Type::Scalar(mongodb_support::BsonScalarType::String), + description: None, + }, + )] + .into(), + description: None, + }, + ), + ] + .into(); + + let pruned = prune_object_types( + &mut reference_types, + &existing_object_types, + added_object_types, + )?; + + expect_eq!( + pruned, + [( + "Documents".into(), + ObjectType { + fields: [( + "foo".into(), + ObjectField { + r#type: Type::Scalar(mongodb_support::BsonScalarType::String), + description: None, + }, + )] + .into(), + description: None, + }, + )] + .into() + ); + + expect_eq!(result_type, Type::Object("Documents".into())); + + Ok(()) + } +}
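
Note (not part of the patches above): the name simplification in patch 2 hinges on splitting a trailing `_n` counter off of generated type names. The standalone sketch below reproduces the parse_counter_suffix logic from crates/cli/src/native_query/helpers.rs so the suffix handling can be read in isolation; the assertions are illustrative expectations derived from that regex, and the main wrapper exists only for demonstration.

    use std::borrow::Cow;
    use regex::Regex;

    /// Split a trailing `_<digits>` counter off of a generated type name.
    /// Mirrors parse_counter_suffix from the patch: if there is no numeric
    /// suffix, the name is returned unchanged with a counter of 0.
    fn parse_counter_suffix(name: &str) -> (Cow<'_, str>, u32) {
        let re = Regex::new(r"^(.*?)_(\d+)$").unwrap();
        let Some(captures) = re.captures(name) else {
            return (Cow::Borrowed(name), 0);
        };
        let prefix = captures.get(1).unwrap().as_str();
        let Some(count) = captures.get(2).and_then(|s| s.as_str().parse().ok()) else {
            return (Cow::Borrowed(name), 0);
        };
        (Cow::Owned(prefix.to_string()), count)
    }

    fn main() {
        // A generated name like "documents_documents_2" parses into its base name
        // plus the counter, so simplify_type_names can try the bare
        // "documents_documents" once conflicting sibling types have been pruned.
        assert_eq!(parse_counter_suffix("documents_documents_2").0, "documents_documents");
        assert_eq!(parse_counter_suffix("documents_documents_2").1, 2);

        // No numeric suffix: the whole name is the base and the counter starts at 0.
        assert_eq!(parse_counter_suffix("selected_title_documents").1, 0);
    }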