From 60355a0fb0414244139655f4d4b9eaa5f2189683 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 18 Aug 2025 09:17:52 -0400 Subject: [PATCH 1/2] Have T: impl FileSource implement DataSource and own FileScanConfig --- .../examples/advanced_parquet_index.rs | 22 +- .../examples/csv_json_opener.rs | 27 +- .../examples/custom_file_format.rs | 6 +- .../examples/default_column_values.rs | 19 +- .../examples/json_shredding.rs | 34 +- .../examples/parquet_embedded_index.rs | 10 +- .../examples/parquet_exec_visitor.rs | 8 +- datafusion-examples/examples/parquet_index.rs | 13 +- .../core/src/datasource/file_format/arrow.rs | 16 +- .../core/src/datasource/file_format/mod.rs | 2 +- .../core/src/datasource/listing/table.rs | 41 +- datafusion/core/src/datasource/mod.rs | 21 +- .../datasource/physical_plan/arrow_file.rs | 65 +-- .../core/src/datasource/physical_plan/avro.rs | 33 +- .../core/src/datasource/physical_plan/csv.rs | 116 +++--- .../core/src/datasource/physical_plan/json.rs | 28 +- .../src/datasource/physical_plan/parquet.rs | 52 ++- datafusion/core/src/test/mod.rs | 13 +- datafusion/core/src/test_util/parquet.rs | 26 +- datafusion/core/tests/fuzz_cases/pruning.rs | 17 +- .../core/tests/parquet/custom_reader.rs | 17 +- .../tests/parquet/external_access_plan.rs | 17 +- .../core/tests/parquet/file_statistics.rs | 14 +- datafusion/core/tests/parquet/page_pruning.rs | 16 +- .../core/tests/parquet/schema_coercion.rs | 34 +- datafusion/core/tests/parquet/utils.rs | 4 +- .../enforce_distribution.rs | 86 ++-- .../physical_optimizer/enforce_sorting.rs | 6 +- .../filter_pushdown/util.rs | 97 ++--- .../physical_optimizer/projection_pushdown.rs | 39 +- .../tests/physical_optimizer/test_utils.rs | 54 +-- .../schema_adapter_integration_tests.rs | 45 +- datafusion/datasource-avro/src/file_format.rs | 12 +- datafusion/datasource-avro/src/source.rs | 87 +--- datafusion/datasource-csv/src/file_format.rs | 18 +- datafusion/datasource-csv/src/mod.rs | 7 +- datafusion/datasource-csv/src/source.rs | 114 ++--- datafusion/datasource-json/src/file_format.rs | 10 +- datafusion/datasource-json/src/source.rs | 73 +--- .../datasource-parquet/src/file_format.rs | 16 +- datafusion/datasource-parquet/src/source.rs | 134 +++--- .../tests/apply_schema_adapter_tests.rs | 37 +- datafusion/datasource/src/display.rs | 2 +- datafusion/datasource/src/file.rs | 313 ++++++++++++-- datafusion/datasource/src/file_format.rs | 4 - datafusion/datasource/src/file_scan_config.rs | 394 +++++------------- datafusion/datasource/src/file_stream.rs | 3 +- datafusion/datasource/src/memory.rs | 4 + datafusion/datasource/src/source.rs | 30 +- datafusion/datasource/src/test_util.rs | 69 +-- .../proto/src/physical_plan/from_proto.rs | 5 +- datafusion/proto/src/physical_plan/mod.rs | 174 ++++---- .../proto/src/physical_plan/to_proto.rs | 2 +- .../tests/cases/roundtrip_physical_plan.rs | 131 +++--- .../substrait/src/physical_plan/consumer.rs | 11 +- .../substrait/src/physical_plan/producer.rs | 8 +- .../tests/cases/roundtrip_physical_plan.rs | 9 +- 57 files changed, 1216 insertions(+), 1449 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index efaee23366a1..7d1591cd6c70 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -27,6 +27,7 @@ use datafusion::catalog::Session; use datafusion::common::{ internal_datafusion_err, DFSchema, 
DataFusionError, Result, ScalarValue, }; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan; use datafusion::datasource::physical_plan::{ @@ -491,23 +492,22 @@ impl TableProvider for IndexTableProvider { CachedParquetFileReaderFactory::new(Arc::clone(&self.object_store)) .with_file(indexed_file); - let file_source = Arc::new( - ParquetSource::default() + let file_scan_config = FileScanConfigBuilder::new(object_store_url, schema) + .with_limit(limit) + .with_projection(projection.cloned()) + .with_file(partitioned_file) + .build(); + + let file_source = + ParquetSource::new(TableParquetOptions::default(), file_scan_config) // provide the predicate so the DataSourceExec can try and prune // row groups internally .with_predicate(predicate) // provide the factory to create parquet reader without re-reading metadata - .with_parquet_file_reader_factory(Arc::new(reader_factory)), - ); - let file_scan_config = - FileScanConfigBuilder::new(object_store_url, schema, file_source) - .with_limit(limit) - .with_projection(projection.cloned()) - .with_file(partitioned_file) - .build(); + .with_parquet_file_reader_factory(Arc::new(reader_factory)); // Finally, put it all together into a DataSourceExec - Ok(DataSourceExec::from_data_source(file_scan_config)) + Ok(DataSourceExec::from_data_source(file_source)) } /// Tell DataFusion to push filters down to the scan method diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 1a2c2cbff418..3fcccced9c7b 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -24,7 +24,7 @@ use datafusion::{ file_format::file_compression_type::FileCompressionType, listing::PartitionedFile, object_store::ObjectStoreUrl, - physical_plan::{CsvSource, FileSource, FileStream, JsonOpener, JsonSource}, + physical_plan::{CsvSource, FileSource, FileStream, JsonOpener}, }, error::Result, physical_plan::metrics::ExecutionPlanMetricsSet, @@ -58,24 +58,22 @@ async fn csv_opener() -> Result<()> { let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), Arc::clone(&schema), - Arc::new(CsvSource::default()), ) .with_projection(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); - let config = CsvSource::new(true, b',', b'"') + let source = CsvSource::new(true, b',', b'"', scan_config) .with_comment(Some(b'#')) .with_schema(schema) - .with_batch_size(8192) - .with_projection(&scan_config); + .with_batch_size(8192); - let opener = config.create_file_opener(object_store, &scan_config, 0); + let opener = source.create_file_opener(object_store, 0); let mut result = vec![]; let mut stream = - FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?; + FileStream::new(&source.config(), 0, opener, &ExecutionPlanMetricsSet::new())?; while let Some(batch) = stream.next().await.transpose()? 
{ result.push(batch); } @@ -121,15 +119,12 @@ async fn json_opener() -> Result<()> { Arc::new(object_store), ); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - Arc::new(JsonSource::default()), - ) - .with_projection(Some(vec![1, 0])) - .with_limit(Some(5)) - .with_file(PartitionedFile::new(path.to_string(), 10)) - .build(); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema) + .with_projection(Some(vec![1, 0])) + .with_limit(Some(5)) + .with_file(PartitionedFile::new(path.to_string(), 10)) + .build(); let mut stream = FileStream::new( &scan_config, diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index 67fe642fd46e..c931b193e2cd 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -29,7 +29,7 @@ use datafusion::{ csv::CsvFormatFactory, file_compression_type::FileCompressionType, FileFormat, FileFormatFactory, }, - physical_plan::{FileScanConfig, FileSinkConfig, FileSource}, + physical_plan::{FileScanConfig, FileSinkConfig}, MemTable, }, error::Result, @@ -127,10 +127,6 @@ impl FileFormat for TSVFileFormat { .create_writer_physical_plan(input, state, conf, order_requirements) .await } - - fn file_source(&self) -> Arc { - self.csv_file_format.file_source() - } } #[derive(Default, Debug)] diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/default_column_values.rs index 43e2d4ca0988..d73e8692efea 100644 --- a/datafusion-examples/examples/default_column_values.rs +++ b/datafusion-examples/examples/default_column_values.rs @@ -29,6 +29,7 @@ use datafusion::catalog::{Session, TableProvider}; use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion::common::DFSchema; use datafusion::common::{Result, ScalarValue}; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; use datafusion::execution::context::SessionContext; @@ -235,10 +236,6 @@ impl TableProvider for DefaultValueTableProvider { &df_schema, )?; - let parquet_source = ParquetSource::default() - .with_predicate(filter) - .with_pushdown_filters(true); - let object_store_url = ObjectStoreUrl::parse("memory://")?; let store = state.runtime_env().object_store(object_store_url)?; @@ -255,19 +252,21 @@ impl TableProvider for DefaultValueTableProvider { .map(|file| PartitionedFile::new(file.location.clone(), file.size)) .collect(); - let file_scan_config = FileScanConfigBuilder::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://")?, self.schema.clone(), - Arc::new(parquet_source), ) .with_projection(projection.cloned()) .with_limit(limit) .with_file_group(file_group) - .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)); + .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)) + .build(); + + let parquet_source = ParquetSource::new(TableParquetOptions::default(), config) + .with_predicate(filter) + .with_pushdown_filters(true); - Ok(Arc::new(DataSourceExec::new(Arc::new( - file_scan_config.build(), - )))) + Ok(DataSourceExec::from_data_source(parquet_source)) } } diff --git a/datafusion-examples/examples/json_shredding.rs b/datafusion-examples/examples/json_shredding.rs index b7acb5c7b74c..5593c3a6eac9 100644 --- 
a/datafusion-examples/examples/json_shredding.rs +++ b/datafusion-examples/examples/json_shredding.rs @@ -29,6 +29,7 @@ use datafusion::common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; use datafusion::common::{assert_contains, DFSchema, Result}; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; use datafusion::execution::context::SessionContext; @@ -243,10 +244,6 @@ impl TableProvider for ExampleTableProvider { &df_schema, )?; - let parquet_source = ParquetSource::default() - .with_predicate(filter) - .with_pushdown_filters(true); - let object_store_url = ObjectStoreUrl::parse("memory://")?; let store = state.runtime_env().object_store(object_store_url)?; @@ -264,20 +261,21 @@ impl TableProvider for ExampleTableProvider { .map(|file| PartitionedFile::new(file.location.clone(), file.size)) .collect(); - let file_scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("memory://")?, - schema, - Arc::new(parquet_source), - ) - .with_projection(projection.cloned()) - .with_limit(limit) - .with_file_group(file_group) - // if the rewriter needs a reference to the table schema you can bind self.schema() here - .with_expr_adapter(Some(Arc::new(ShreddedJsonRewriterFactory) as _)); - - Ok(Arc::new(DataSourceExec::new(Arc::new( - file_scan_config.build(), - )))) + let file_scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("memory://")?, schema) + .with_projection(projection.cloned()) + .with_limit(limit) + .with_file_group(file_group) + // if the rewriter needs a reference to the table schema you can bind self.schema() here + .with_expr_adapter(Some(Arc::new(ShreddedJsonRewriterFactory) as _)) + .build(); + + let parquet_source = + ParquetSource::new(TableParquetOptions::default(), file_scan_config) + .with_predicate(filter) + .with_pushdown_filters(true); + + Ok(DataSourceExec::from_data_source(parquet_source)) } } diff --git a/datafusion-examples/examples/parquet_embedded_index.rs b/datafusion-examples/examples/parquet_embedded_index.rs index 5191ae48b3af..2744af873899 100644 --- a/datafusion-examples/examples/parquet_embedded_index.rs +++ b/datafusion-examples/examples/parquet_embedded_index.rs @@ -117,6 +117,7 @@ use arrow_schema::{DataType, Field, Schema, SchemaRef}; use async_trait::async_trait; use datafusion::catalog::{Session, TableProvider}; use datafusion::common::{exec_err, HashMap, HashSet, Result}; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::DataSourceExec; use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; @@ -426,8 +427,8 @@ impl TableProvider for DistinctIndexTable { // Build ParquetSource to actually read the files let url = ObjectStoreUrl::parse("file://")?; - let source = Arc::new(ParquetSource::default().with_enable_page_index(true)); - let mut builder = FileScanConfigBuilder::new(url, self.schema.clone(), source); + + let mut builder = FileScanConfigBuilder::new(url, self.schema.clone()); for file in files_to_scan { let path = self.dir.join(file); let len = std::fs::metadata(&path)?.len(); @@ -438,7 +439,10 @@ impl TableProvider for DistinctIndexTable { PartitionedFile::new(path.to_str().unwrap().to_string(), len); builder = builder.with_file(partitioned_file); } - Ok(DataSourceExec::from_data_source(builder.build())) + + let source = 
ParquetSource::new(TableParquetOptions::default(), builder.build())
+                .with_enable_page_index(true);
+        Ok(DataSourceExec::from_data_source(source))
     }
 
     /// Tell DataFusion that we can handle filters on the "category" column
diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/parquet_exec_visitor.rs
index 84f92d4f450e..f7180afe8907 100644
--- a/datafusion-examples/examples/parquet_exec_visitor.rs
+++ b/datafusion-examples/examples/parquet_exec_visitor.rs
@@ -19,7 +19,7 @@ use std::sync::Arc;
 
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
-use datafusion::datasource::physical_plan::{FileGroup, ParquetSource};
+use datafusion::datasource::physical_plan::{FileGroup, FileSource, ParquetSource};
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::error::DataFusionError;
 use datafusion::execution::context::SessionContext;
@@ -98,9 +98,11 @@ impl ExecutionPlanVisitor for ParquetExecVisitor {
     fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool> {
         // If needed match on a specific `ExecutionPlan` node type
         if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if let Some((file_config, _)) =
-                data_source_exec.downcast_to_file_source::<ParquetSource>()
+            if let Some(parquet_source) =
+                data_source_exec.as_any().downcast_ref::<ParquetSource>()
             {
+                let file_config = parquet_source.config();
+
                 self.file_groups = Some(file_config.file_groups.clone());
 
                 let metrics = match data_source_exec.metrics() {
diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs
index afc3b279f4a9..d62f19f5756d 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/parquet_index.rs
@@ -27,6 +27,7 @@ use datafusion::common::pruning::PruningStatistics;
 use datafusion::common::{
     internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
 };
+use datafusion::config::TableParquetOptions;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::memory::DataSourceExec;
 use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
@@ -243,9 +244,8 @@ impl TableProvider for IndexTableProvider {
         let files = self.index.get_files(predicate.clone())?;
 
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let source = Arc::new(ParquetSource::default().with_predicate(predicate));
         let mut file_scan_config_builder =
-            FileScanConfigBuilder::new(object_store_url, self.schema(), source)
+            FileScanConfigBuilder::new(object_store_url, self.schema())
                 .with_projection(projection.cloned())
                 .with_limit(limit);
 
@@ -258,9 +258,12 @@ impl TableProvider for IndexTableProvider {
                 PartitionedFile::new(canonical_path.display().to_string(), file_size),
             );
         }
-        Ok(DataSourceExec::from_data_source(
-            file_scan_config_builder.build(),
-        ))
+
+        let config = file_scan_config_builder.build();
+        let source = ParquetSource::new(TableParquetOptions::default(), config)
+            .with_predicate(predicate);
+
+        Ok(DataSourceExec::from_data_source(source))
     }
 
     /// Tell DataFusion to push filters down to the scan method
diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs
index 5ce70e32843d..b42d94a738b2 100644
--- a/datafusion/core/src/datasource/file_format/arrow.rs
+++ b/datafusion/core/src/datasource/file_format/arrow.rs
@@ -48,8 +48,7 @@ use datafusion_common::{
 };
 use datafusion_common_runtime::{JoinSet, SpawnedTask};
 use datafusion_datasource::display::FileGroupDisplay;
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_datasource::sink::{DataSink, DataSinkExec};
 use datafusion_datasource::write::ObjectWriterBuilder;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
@@ -178,12 +177,9 @@ impl FileFormat for ArrowFormat {
         _state: &dyn Session,
         conf: FileScanConfig,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let source = Arc::new(ArrowSource::default());
-        let config = FileScanConfigBuilder::from(conf)
-            .with_source(source)
-            .build();
-
-        Ok(DataSourceExec::from_data_source(config))
+        Ok(DataSourceExec::from_data_source(ArrowSource::new(
+            conf.clone(),
+        )))
     }
 
     async fn create_writer_physical_plan(
@@ -201,10 +197,6 @@ impl FileFormat for ArrowFormat {
 
         Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
     }
-
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(ArrowSource::default())
-    }
 }
 
 /// Implements [`FileSink`] for writing to arrow_ipc files
diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index e165707c2eb0..3fa86d632591 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -86,7 +86,6 @@ pub(crate) mod test_util {
         FileScanConfigBuilder::new(
             ObjectStoreUrl::local_filesystem(),
             file_schema,
-            format.file_source(),
         )
         .with_file_groups(file_groups)
         .with_statistics(statistics)
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index d289a1d07129..d17a72cc7a6b 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -36,7 +36,6 @@ use datafusion_common::{
 };
 use datafusion_datasource::{
     compute_all_files_statistics,
-    file::FileSource,
     file_groups::FileGroup,
     file_scan_config::{FileScanConfig, FileScanConfigBuilder},
     schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory},
@@ -1112,19 +1111,6 @@ impl ListingTable {
         }
     }
 
-    /// Creates a file source and applies schema adapter factory if available
-    fn create_file_source_with_schema_adapter(&self) -> Result<Arc<dyn FileSource>> {
-        let mut source = self.options.format.file_source();
-        // Apply schema adapter to source if available
-        //
-        // The source will use this SchemaAdapter to adapt data batches as they flow up the plan.
-        // Note: ListingTable also creates a SchemaAdapter in `scan()` but that is only used to adapt collected statistics.
- if let Some(factory) = &self.schema_adapter_factory { - source = source.with_schema_adapter_factory(Arc::clone(factory))?; - } - Ok(source) - } - /// If file_sort_order is specified, creates the appropriate physical expressions fn try_create_output_ordering(&self) -> Result> { create_ordering(&self.table_schema, &self.options.file_sort_order) @@ -1233,18 +1219,8 @@ impl TableProvider for ListingTable { return Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))); }; - let file_source = self.create_file_source_with_schema_adapter()?; - - // create the execution plan - self.options - .format - .create_physical_plan( - state, - FileScanConfigBuilder::new( - object_store_url, - Arc::clone(&self.file_schema), - file_source, - ) + let mut config = + FileScanConfigBuilder::new(object_store_url, Arc::clone(&self.file_schema)) .with_file_groups(partitioned_file_lists) .with_constraints(self.constraints.clone()) .with_statistics(statistics) @@ -1252,9 +1228,16 @@ impl TableProvider for ListingTable { .with_limit(limit) .with_output_ordering(output_ordering) .with_table_partition_cols(table_partition_cols) - .with_expr_adapter(self.expr_adapter_factory.clone()) - .build(), - ) + .with_expr_adapter(self.expr_adapter_factory.clone()); + + if let Some(factory) = &self.schema_adapter_factory { + config = config.with_schema_adapter(Some(Arc::clone(factory))); + } + + // create the execution plan + self.options + .format + .create_physical_plan(state, config.build()) .await } diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 94d651ddadd5..df2623fba4e6 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -58,7 +58,9 @@ mod tests { datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; - use datafusion_common::{record_batch, test_util::batches_to_sort_string}; + use datafusion_common::{ + config::TableParquetOptions, record_batch, test_util::batches_to_sort_string, + }; use datafusion_datasource::{ file::FileSource, file_scan_config::FileScanConfigBuilder, @@ -123,18 +125,17 @@ mod tests { let f2 = Field::new("extra_column", DataType::Utf8, true); let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()])); - let source = ParquetSource::default() + + let base_conf = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema) + .with_file(partitioned_file) + .build(); + + let source = ParquetSource::new(TableParquetOptions::default(), base_conf) .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})) .unwrap(); - let base_conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - source, - ) - .with_file(partitioned_file) - .build(); - let parquet_exec = DataSourceExec::from_data_source(base_conf); + let parquet_exec = Arc::new(DataSourceExec::new(source.as_data_source())); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index d0af96329b5f..c3320460c8c7 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -29,8 +29,8 @@ use arrow_ipc::reader::FileDecoder; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::source::DataSource; use datafusion_datasource::PartitionedFile; -use 
datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 use futures::StreamExt;
 use itertools::Itertools;
@@ -38,11 +38,16 @@ use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore};
 
 /// Arrow configuration struct that is given to DataSourceExec
 /// Does not hold anything special, since [`FileScanConfig`] is sufficient for arrow
-#[derive(Clone, Default)]
+#[derive(Debug, Clone)]
 pub struct ArrowSource {
-    metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    config: FileScanConfig,
+}
+
+impl ArrowSource {
+    /// Create a new ArrowSource with the given configuration
+    pub fn new(config: FileScanConfig) -> Self {
+        Self { config }
+    }
 }
 
 impl From<ArrowSource> for Arc<dyn FileSource> {
@@ -52,15 +57,25 @@ impl FileSource for ArrowSource {
+    fn config(&self) -> &FileScanConfig {
+        &self.config
+    }
+
+    fn with_config(&self, config: FileScanConfig) -> Arc<dyn FileSource> {
+        let mut this = self.clone();
+        this.config = config;
+
+        Arc::new(this)
+    }
+
     fn create_file_opener(
         &self,
         object_store: Arc<dyn ObjectStore>,
-        base_config: &FileScanConfig,
         _partition: usize,
     ) -> Arc<dyn FileOpener> {
         Arc::new(ArrowOpener {
             object_store,
-            projection: base_config.file_column_projection_indices(),
+            projection: self.config.file_column_projection_indices(),
         })
     }
 
@@ -75,25 +90,11 @@ impl FileSource for ArrowSource {
     fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
         Arc::new(Self { ..self.clone() })
     }
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
-        Arc::new(conf)
-    }
-
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        &self.metrics
-    }
+    fn with_projected_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
+        let mut this = self.clone();
+        this.config.projected_statistics = statistics;
 
-    fn statistics(&self) -> Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        Ok(statistics
-            .clone()
-            .expect("projected_statistics must be set"))
+        Arc::new(this)
     }
 
     fn file_type(&self) -> &str {
@@ -104,14 +105,18 @@
         &self,
         schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
     ) -> Result<Arc<dyn FileSource>> {
-        Ok(Arc::new(Self {
-            schema_adapter_factory: Some(schema_adapter_factory),
-            ..self.clone()
-        }))
+        let mut this = self.clone();
+        this.config.schema_adapter_factory = Some(schema_adapter_factory);
+
+        Ok(Arc::new(this))
     }
 
     fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
-        self.schema_adapter_factory.clone()
+        self.config.schema_adapter_factory.clone()
+    }
+
+    fn as_data_source(&self) -> Arc<dyn DataSource> {
+        Arc::new(self.clone())
     }
 }
 
diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs
index 8a00af959ccc..19bd15f38e22 100644
--- a/datafusion/core/src/datasource/physical_plan/avro.rs
+++ b/datafusion/core/src/datasource/physical_plan/avro.rs
@@ -81,17 +81,14 @@ mod tests {
             .infer_schema(&state, &store, std::slice::from_ref(&meta))
             .await?;
 
-        let source = Arc::new(AvroSource::new());
-        let conf = FileScanConfigBuilder::new(
-            ObjectStoreUrl::local_filesystem(),
-            file_schema,
-            source,
-        )
-        .with_file(meta.into())
-        .with_projection(Some(vec![0, 1, 2]))
-        .build();
-
-        let source_exec = DataSourceExec::from_data_source(conf);
+        let conf =
+            FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema)
+                .with_file(meta.into())
+                .with_projection(Some(vec![0, 1, 2]))
+                .build();
+
+        let source = AvroSource::new(conf);
+        let source_exec =
DataSourceExec::from_data_source(source); assert_eq!( source_exec .properties() @@ -157,13 +154,14 @@ mod tests { // Include the missing column in the projection let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file(meta.into()) .with_projection(projection) .build(); - let source_exec = DataSourceExec::from_data_source(conf); + let source = AvroSource::new(conf); + + let source_exec = DataSourceExec::from_data_source(source); assert_eq!( source_exec .properties() @@ -227,8 +225,7 @@ mod tests { partitioned_file.partition_values = vec![ScalarValue::from("2021-10-26")]; let projection = Some(vec![0, 1, file_schema.fields().len(), 2]); - let source = Arc::new(AvroSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. .with_projection(projection) @@ -236,7 +233,9 @@ mod tests { .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); - let source_exec = DataSourceExec::from_data_source(conf); + let source = AvroSource::new(conf); + + let source_exec = DataSourceExec::from_data_source(source); assert_eq!( source_exec diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index e33761a0abb3..92ee81ca6979 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -110,19 +110,18 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_file_compression_type(file_compression_type) - .with_newlines_in_values(false) - .with_projection(Some(vec![0, 2, 4])) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_file_compression_type(file_compression_type) + .with_newlines_in_values(false) + .with_projection(Some(vec![0, 2, 4])) + .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = DataSourceExec::from_data_source(config); + + let source = CsvSource::new(true, b',', b'"', config); + + let csv = DataSourceExec::from_data_source(source); assert_eq!(3, csv.schema().fields().len()); @@ -175,18 +174,16 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_projection(Some(vec![4, 0, 2])) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_projection(Some(vec![4, 0, 2])) + .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = DataSourceExec::from_data_source(config); + + let source = CsvSource::new(true, b',', b'"', config); + let csv = DataSourceExec::from_data_source(source); assert_eq!(3, csv.schema().fields().len()); let mut 
stream = csv.execute(0, task_ctx)?; @@ -240,18 +237,16 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = DataSourceExec::from_data_source(config); + + let source = CsvSource::new(true, b',', b'"', config); + let csv = DataSourceExec::from_data_source(source); assert_eq!(13, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -303,18 +298,15 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(14, config.file_schema.fields().len()); - let csv = DataSourceExec::from_data_source(config); + let source = CsvSource::new(true, b',', b'"', config); + let csv = DataSourceExec::from_data_source(source); assert_eq!(14, csv.schema().fields().len()); // errors due to https://github.com/apache/datafusion/issues/4918 @@ -358,15 +350,11 @@ mod tests { tmp_dir.path(), )?; - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let mut config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .build(); + let mut config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); // Add partition columns config.table_partition_cols = @@ -381,7 +369,9 @@ mod tests { // partitions are resolved during scan anyway assert_eq!(13, config.file_schema.fields().len()); - let csv = DataSourceExec::from_data_source(config); + + let source = CsvSource::new(true, b',', b'"', config); + let csv = DataSourceExec::from_data_source(source); assert_eq!(2, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -464,16 +454,14 @@ mod tests { ) .unwrap(); - let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config( - file_schema, - file_groups, - source, - )) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .build(); - let csv = DataSourceExec::from_data_source(config); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_schema, file_groups)) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); + + let source = CsvSource::new(true, b',', b'"', config); + let csv = DataSourceExec::from_data_source(source); let it = csv.execute(0, 
task_ctx).unwrap(); let batches: Vec<_> = it.try_collect().await.unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 0d45711c76fb..700d5b5f0b0c 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -176,13 +176,15 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = DataSourceExec::from_data_source(conf); + + let source = JsonSource::new(conf); + + let exec = DataSourceExec::from_data_source(source); // TODO: this is not where schema inference should be tested @@ -251,13 +253,14 @@ mod tests { let file_schema = Arc::new(builder.finish()); let missing_field_idx = file_schema.fields.len() - 1; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = DataSourceExec::from_data_source(conf); + + let source = JsonSource::new(conf); + let exec = DataSourceExec::from_data_source(source); let mut it = exec.execute(0, task_ctx)?; let batch = it.next().await.unwrap()?; @@ -294,13 +297,14 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file_groups(file_groups) .with_projection(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = DataSourceExec::from_data_source(conf); + + let source = JsonSource::new(conf); + let exec = DataSourceExec::from_data_source(source); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 2); @@ -342,13 +346,13 @@ mod tests { let (object_store_url, file_groups, file_schema) = prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; - let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file_groups(file_groups) .with_projection(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = DataSourceExec::from_data_source(conf); + let source = JsonSource::new(conf); + let exec = DataSourceExec::from_data_source(source); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 3); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 55db0d854204..4e367ea91b63 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -160,7 +160,13 @@ mod tests { .as_ref() .map(|p| logical2physical(p, 
&table_schema)); - let mut source = ParquetSource::default(); + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + table_schema.clone(), + ) + .build(); + + let mut source = ParquetSource::new(TableParquetOptions::default(), config); if let Some(predicate) = predicate { source = source.with_predicate(predicate); } @@ -197,12 +203,14 @@ mod tests { let base_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, - source, ) .with_file_group(file_group) .with_projection(self.projection.clone()) .build(); - DataSourceExec::from_data_source(base_config) + + let s = source.with_config(base_config.clone()); + + Arc::new(DataSourceExec::new(s.as_data_source())) } /// run the test, returning the `RoundTripResult` @@ -1547,12 +1555,13 @@ mod tests { let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, - Arc::new(ParquetSource::default()), ) .with_file_groups(file_groups) .build(); - let parquet_exec = DataSourceExec::from_data_source(config); + let source = ParquetSource::new(TableParquetOptions::default(), config); + + let parquet_exec = DataSourceExec::from_data_source(source); assert_eq!( parquet_exec .properties() @@ -1649,8 +1658,7 @@ mod tests { ), ]); - let source = Arc::new(ParquetSource::default()); - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) + let config = FileScanConfigBuilder::new(object_store_url, schema.clone()) .with_file(partitioned_file) // file has 10 cols so index 12 should be month and 13 should be day .with_projection(Some(vec![0, 1, 2, 12, 13])) @@ -1668,7 +1676,8 @@ mod tests { ]) .build(); - let parquet_exec = DataSourceExec::from_data_source(config); + let source = ParquetSource::new(TableParquetOptions::default(), config); + let parquet_exec = DataSourceExec::from_data_source(source); let partition_count = parquet_exec .data_source() .output_partitioning() @@ -1725,15 +1734,13 @@ mod tests { }; let file_schema = Arc::new(Schema::empty()); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), - ) - .with_file(partitioned_file) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema) + .with_file(partitioned_file) + .build(); - let parquet_exec = DataSourceExec::from_data_source(config); + let source = ParquetSource::new(TableParquetOptions::default(), config); + let parquet_exec = DataSourceExec::from_data_source(source); let mut results = parquet_exec.execute(0, state.task_ctx())?; let batch = results.next().await.unwrap(); @@ -2249,12 +2256,7 @@ mod tests { let size_hint_calls = reader_factory.metadata_size_hint_calls.clone(); - let source = Arc::new( - ParquetSource::default() - .with_parquet_file_reader_factory(reader_factory) - .with_metadata_size_hint(456), - ); - let config = FileScanConfigBuilder::new(store_url, schema, source) + let config = FileScanConfigBuilder::new(store_url, schema) .with_file( PartitionedFile { object_meta: ObjectMeta { @@ -2288,7 +2290,11 @@ mod tests { }) .build(); - let exec = DataSourceExec::from_data_source(config); + let source = ParquetSource::new(TableParquetOptions::default(), config) + .with_parquet_file_reader_factory(reader_factory) + .with_metadata_size_hint(456); + + let exec = DataSourceExec::from_data_source(source); let res = collect(exec, ctx.task_ctx()).await.unwrap(); assert_eq!(res.len(), 2); diff --git a/datafusion/core/src/test/mod.rs 
b/datafusion/core/src/test/mod.rs index 68f83e7f1f11..f4f8e767d873 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -92,12 +92,13 @@ pub fn scan_partitioned_csv( FileCompressionType::UNCOMPRESSED, work_dir, )?; - let source = Arc::new(CsvSource::new(true, b'"', b'"')); - let config = - FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); - Ok(DataSourceExec::from_data_source(config)) + let config = FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups)) + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); + + Ok(DataSourceExec::from_data_source(CsvSource::new( + true, b'"', b'"', config, + ))) } /// Returns file groups [`Vec`] for scanning `partitions` of `filename` diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index eb4c61c02524..ab27aa16843e 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,7 +37,6 @@ use crate::physical_plan::metrics::MetricsSet; use crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; @@ -156,11 +155,9 @@ impl TestParquetFile { maybe_filter: Option, ) -> Result> { let parquet_options = ctx.copied_table_options().parquet; - let source = Arc::new(ParquetSource::new(parquet_options.clone())); - let scan_config_builder = FileScanConfigBuilder::new( + let config = FileScanConfigBuilder::new( self.object_store_url.clone(), Arc::clone(&self.schema), - source, ) .with_file(PartitionedFile { object_meta: self.object_meta.clone(), @@ -169,7 +166,10 @@ impl TestParquetFile { statistics: None, extensions: None, metadata_size_hint: None, - }); + }) + .build(); + + let source = ParquetSource::new(parquet_options.clone(), config); let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?; @@ -182,19 +182,13 @@ impl TestParquetFile { let physical_filter_expr = create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?; - let source = Arc::new( - ParquetSource::new(parquet_options) - .with_predicate(Arc::clone(&physical_filter_expr)), - ) - .with_schema(Arc::clone(&self.schema)); - let config = scan_config_builder.with_source(source).build(); - let parquet_exec = DataSourceExec::from_data_source(config); + let source = source.with_predicate(Arc::clone(&physical_filter_expr)); + let parquet_exec = DataSourceExec::from_data_source(source); let exec = Arc::new(FilterExec::try_new(physical_filter_expr, parquet_exec)?); Ok(exec) } else { - let config = scan_config_builder.build(); - Ok(DataSourceExec::from_data_source(config)) + Ok(DataSourceExec::from_data_source(source)) } } @@ -205,7 +199,9 @@ impl TestParquetFile { pub fn parquet_metrics(plan: &Arc) -> Option { if let Some(data_source_exec) = plan.as_any().downcast_ref::() { if data_source_exec - .downcast_to_file_source::() + .data_source() + .as_any() + .downcast_ref::() .is_some() { return data_source_exec.metrics(); diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index c6e30c0722fc..60032a07cc06 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -20,6 +20,7 @@ use std::sync::{Arc, 
LazyLock}; use arrow::array::{Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use bytes::{BufMut, Bytes, BytesMut}; +use datafusion::config::TableParquetOptions; use datafusion::{ datasource::{listing::PartitionedFile, physical_plan::ParquetSource}, prelude::*, @@ -275,15 +276,9 @@ async fn execute_with_predicate( schema: Arc, ctx: &SessionContext, ) -> Vec { - let parquet_source = if prune_stats { - ParquetSource::default().with_predicate(predicate.clone()) - } else { - ParquetSource::default() - }; let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://").unwrap(), schema.clone(), - Arc::new(parquet_source), ) .with_file_group( files @@ -294,7 +289,15 @@ async fn execute_with_predicate( .collect(), ) .build(); - let exec = DataSourceExec::from_data_source(config); + + let parquet_source = if prune_stats { + ParquetSource::new(TableParquetOptions::default(), config) + .with_predicate(predicate.clone()) + } else { + ParquetSource::new(TableParquetOptions::default(), config) + }; + + let exec = DataSourceExec::from_data_source(parquet_source); let exec = Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc; diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index f7e48fa9cb91..5d06a07cf283 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -23,6 +23,7 @@ use std::time::SystemTime; use arrow::array::{ArrayRef, Int64Array, Int8Array, StringArray}; use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; +use datafusion::config::TableParquetOptions; use datafusion::datasource::file_format::parquet::fetch_parquet_metadata; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -80,23 +81,21 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { }) .collect(); - let source = Arc::new( - ParquetSource::default() - // prepare the scan - .with_parquet_file_reader_factory(Arc::new( - InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)), - )), - ); let base_config = FileScanConfigBuilder::new( // just any url that doesn't point to in memory object store ObjectStoreUrl::local_filesystem(), file_schema, - source, ) .with_file_group(file_group) .build(); - let parquet_exec = DataSourceExec::from_data_source(base_config); + let source = ParquetSource::new(TableParquetOptions::default(), base_config) + // prepare the scan + .with_parquet_file_reader_factory(Arc::new(InMemoryParquetFileReaderFactory( + Arc::clone(&in_memory_object_store), + ))); + + let parquet_exec = DataSourceExec::from_data_source(source); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index a5397c5a397c..b828d8f0d089 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -26,6 +26,7 @@ use crate::parquet::{create_data_batch, Scenario}; use arrow::datatypes::SchemaRef; use arrow::util::pretty::pretty_format_batches; use datafusion::common::Result; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::prelude::SessionContext; @@ -343,18 +344,22 @@ impl TestFull { // Create a 
DataSourceExec to read the file let object_store_url = ObjectStoreUrl::local_filesystem(); // add the predicate, if requested + let config = FileScanConfigBuilder::new(object_store_url, schema.clone()) + .with_file(partitioned_file) + .build(); + let source = if let Some(predicate) = predicate { let df_schema = DFSchema::try_from(schema.clone())?; let predicate = ctx.create_physical_expr(predicate, &df_schema)?; - Arc::new(ParquetSource::default().with_predicate(predicate)) + Arc::new( + ParquetSource::new(TableParquetOptions::default(), config) + .with_predicate(predicate), + ) } else { - Arc::new(ParquetSource::default()) + Arc::new(ParquetSource::new(TableParquetOptions::default(), config)) }; - let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file) - .build(); - let plan: Arc = DataSourceExec::from_data_source(config); + let plan: Arc = Arc::new(DataSourceExec::new(source)); // run the DataSourceExec and collect the results let results = diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 64ee92eda254..79fe52c29a9f 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -29,6 +29,8 @@ use datafusion::execution::session_state::SessionStateBuilder; use datafusion::prelude::SessionContext; use datafusion_common::stats::Precision; use datafusion_common::DFSchema; +use datafusion_datasource::file::FileSource; +use datafusion_datasource_parquet::source::ParquetSource; use datafusion_execution::cache::cache_manager::CacheManagerConfig; use datafusion_execution::cache::cache_unit::{ DefaultFileStatisticsCache, DefaultListFilesCache, @@ -200,11 +202,11 @@ async fn list_files_with_session_level_cache() { let data_source = data_source_exec.data_source(); let parquet1 = data_source .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(get_list_file_cache_size(&state1), 1); - let fg = &parquet1.file_groups; + let fg = &parquet1.config().file_groups; assert_eq!(fg.len(), 1); assert_eq!(fg.first().unwrap().len(), 1); @@ -216,11 +218,11 @@ async fn list_files_with_session_level_cache() { let data_source = data_source_exec.data_source(); let parquet2 = data_source .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(get_list_file_cache_size(&state2), 1); - let fg2 = &parquet2.file_groups; + let fg2 = &parquet2.config().file_groups; assert_eq!(fg2.len(), 1); assert_eq!(fg2.first().unwrap().len(), 1); @@ -232,11 +234,11 @@ async fn list_files_with_session_level_cache() { let data_source = data_source_exec.data_source(); let parquet3 = data_source .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(get_list_file_cache_size(&state1), 1); - let fg = &parquet3.file_groups; + let fg = &parquet3.config().file_groups; assert_eq!(fg.len(), 1); assert_eq!(fg.first().unwrap().len(), 1); // List same file no increase diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 27bee10234b5..9ea33c3a9e76 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -21,6 +21,7 @@ use crate::parquet::Unit::Page; use crate::parquet::{ContextWithParquet, Scenario}; use arrow::array::RecordBatch; +use datafusion::config::TableParquetOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; use 
datafusion::datasource::listing::PartitionedFile; @@ -80,17 +81,16 @@ async fn get_parquet_exec( let execution_props = ExecutionProps::new(); let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap(); - let source = Arc::new( - ParquetSource::default() - .with_predicate(predicate) - .with_enable_page_index(true) - .with_pushdown_filters(pushdown_filters), - ); - let base_config = FileScanConfigBuilder::new(object_store_url, schema, source) + let base_config = FileScanConfigBuilder::new(object_store_url, schema) .with_file(partitioned_file) .build(); - DataSourceExec::new(Arc::new(base_config)) + let source = ParquetSource::new(TableParquetOptions::default(), base_config) + .with_predicate(predicate) + .with_enable_page_index(true) + .with_pushdown_filters(pushdown_filters); + + DataSourceExec::new(Arc::new(source)) } async fn get_filter_results( diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 59cbf4b0872e..d891e4de2908 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -22,6 +22,7 @@ use arrow::array::{ StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::config::TableParquetOptions; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::physical_plan::collect; use datafusion::prelude::SessionContext; @@ -62,16 +63,13 @@ async fn multi_parquet_coercion() { Field::new("c2", DataType::Int32, true), Field::new("c3", DataType::Float64, true), ])); - let source = Arc::new(ParquetSource::default()); - let conf = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - source, - ) - .with_file_group(file_group) - .build(); + let conf = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema) + .with_file_group(file_group) + .build(); - let parquet_exec = DataSourceExec::from_data_source(conf); + let source = ParquetSource::new(TableParquetOptions::default(), conf); + let parquet_exec = DataSourceExec::from_data_source(source); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -120,16 +118,14 @@ async fn multi_parquet_coercion_projection() { Field::new("c2", DataType::Int32, true), Field::new("c3", DataType::Float64, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - Arc::new(ParquetSource::default()), - ) - .with_file_group(file_group) - .with_projection(Some(vec![1, 0, 2])) - .build(); - - let parquet_exec = DataSourceExec::from_data_source(config); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema) + .with_file_group(file_group) + .with_projection(Some(vec![1, 0, 2])) + .build(); + + let source = ParquetSource::new(TableParquetOptions::default(), config); + let parquet_exec = DataSourceExec::from_data_source(source); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/tests/parquet/utils.rs b/datafusion/core/tests/parquet/utils.rs index 24b6cadc148f..e9569ad45bc8 100644 --- a/datafusion/core/tests/parquet/utils.rs +++ b/datafusion/core/tests/parquet/utils.rs @@ -49,7 +49,9 @@ impl ExecutionPlanVisitor for MetricsFinder { fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { if let Some(data_source_exec) = plan.as_any().downcast_ref::() { if data_source_exec - .downcast_to_file_source::() + .data_source() + .as_any() + .downcast_ref::() 
.is_some() { self.metrics = data_source_exec.metrics(); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index fd847763124a..ab0c498cf66f 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -29,7 +29,7 @@ use crate::physical_optimizer::test_utils::{ use arrow::array::{RecordBatch, UInt64Array, UInt8Array}; use arrow::compute::SortOptions; use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use datafusion::config::ConfigOptions; +use datafusion::config::{ConfigOptions, TableParquetOptions}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -184,19 +184,18 @@ fn parquet_exec_multiple() -> Arc { fn parquet_exec_multiple_sorted( output_ordering: Vec, ) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::default()), - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), - FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), - ]) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema()) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), + FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), + ]) + .with_output_ordering(output_ordering) + .build(); + + let source = ParquetSource::new(TableParquetOptions::default(), config); - DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } fn csv_exec() -> Arc { @@ -204,16 +203,15 @@ fn csv_exec() -> Arc { } fn csv_exec_with_sort(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema()) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_output_ordering(output_ordering) + .build(); - DataSourceExec::from_data_source(config) + let source = CsvSource::new(false, b',', b'"', config); + + DataSourceExec::from_data_source(source) } fn csv_exec_multiple() -> Arc { @@ -222,19 +220,18 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file_groups(vec![ - FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), - FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), - ]) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema()) + .with_file_groups(vec![ + FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]), + FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]), + ]) + .with_output_ordering(output_ordering) + .build(); + + let source = CsvSource::new(false, b',', b'"', config); - 
DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } fn projection_exec_with_alias( @@ -2534,19 +2531,20 @@ fn parallelization_compressed_csv() -> Result<()> { &expected_partitioned[..] }; + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + schema(), + ) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_file_compression_type(compression_type) + .build(); + + let source = CsvSource::new(false, b',', b'"', config); let plan = aggregate_exec_with_alias( - DataSourceExec::from_data_source( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_file_compression_type(compression_type) - .build(), - ), + DataSourceExec::from_data_source(source), vec![("a".to_string(), "a".to_string())], ); + let test_config = TestConfig::default() .with_query_execution_partitions(2) .with_prefer_repartition_file_scans(10); diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index d10459ce86ae..dbc58fbdc540 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -80,7 +80,6 @@ fn csv_exec_sorted( let mut builder = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema.clone(), - Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)); if let Some(ordering) = LexOrdering::new(sort_exprs) { @@ -88,7 +87,10 @@ fn csv_exec_sorted( } let config = builder.build(); - DataSourceExec::from_data_source(config) + + let source = CsvSource::new(false, 0, 0, config); + + DataSourceExec::from_data_source(source) } /// Runs the sort enforcement optimizer and asserts the plan diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index acb2b808ef8f..3c7053f8f574 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -19,12 +19,13 @@ use arrow::datatypes::SchemaRef; use arrow::error::ArrowError; use arrow::{array::RecordBatch, compute::concat_batches}; use datafusion::{datasource::object_store::ObjectStoreUrl, physical_plan::PhysicalExpr}; -use datafusion_common::{config::ConfigOptions, internal_err, Result, Statistics}; +use datafusion_common::{config::ConfigOptions, internal_err, Result}; +use datafusion_datasource::source::DataSource; use datafusion_datasource::{ file::FileSource, file_meta::FileMeta, file_scan_config::FileScanConfig, file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture, file_stream::FileOpener, schema_adapter::DefaultSchemaAdapterFactory, - schema_adapter::SchemaAdapterFactory, source::DataSourceExec, PartitionedFile, + source::DataSourceExec, PartitionedFile, }; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_optimizer::PhysicalOptimizerRule; @@ -36,7 +37,6 @@ use datafusion_physical_plan::{ ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPropagation, }, - metrics::ExecutionPlanMetricsSet, DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, }; use futures::stream::BoxStream; @@ -103,42 +103,41 @@ impl FileOpener for TestOpener { } /// A placeholder data source that accepts 
filter pushdown -#[derive(Clone, Default)] +#[derive(Debug, Clone)] pub struct TestSource { support: bool, predicate: Option>, - statistics: Option, - batch_size: Option, batches: Vec, - schema: Option, - metrics: ExecutionPlanMetricsSet, - projection: Option>, - schema_adapter_factory: Option>, + + config: FileScanConfig, } impl TestSource { - fn new(support: bool, batches: Vec) -> Self { + fn new(support: bool, batches: Vec, config: FileScanConfig) -> Self { Self { support, - metrics: ExecutionPlanMetricsSet::new(), batches, - ..Default::default() + predicate: Default::default(), + config, } } } impl FileSource for TestSource { + fn config(&self) -> &FileScanConfig { + &self.config + } + fn create_file_opener( &self, _object_store: Arc, - _base_config: &FileScanConfig, _partition: usize, ) -> Arc { Arc::new(TestOpener { batches: self.batches.clone(), - batch_size: self.batch_size, - schema: self.schema.clone(), - projection: self.projection.clone(), + batch_size: self.config.batch_size, + schema: Some(self.config.file_schema.clone()), + projection: self.config.projection.clone(), }) } @@ -146,46 +145,6 @@ impl FileSource for TestSource { todo!("should not be called") } - fn with_batch_size(&self, batch_size: usize) -> Arc { - Arc::new(TestSource { - batch_size: Some(batch_size), - ..self.clone() - }) - } - - fn with_schema(&self, schema: SchemaRef) -> Arc { - Arc::new(TestSource { - schema: Some(schema), - ..self.clone() - }) - } - - fn with_projection(&self, config: &FileScanConfig) -> Arc { - Arc::new(TestSource { - projection: config.projection.clone(), - ..self.clone() - }) - } - - fn with_statistics(&self, statistics: Statistics) -> Arc { - Arc::new(TestSource { - statistics: Some(statistics), - ..self.clone() - }) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - - fn statistics(&self) -> Result { - Ok(self - .statistics - .as_ref() - .expect("statistics not set") - .clone()) - } - fn file_type(&self) -> &str { "test" } @@ -239,18 +198,15 @@ impl FileSource for TestSource { } } - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; + + Arc::new(this) } - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + Arc::new(self.clone()) } } @@ -281,15 +237,16 @@ impl TestScanBuilder { } pub fn build(self) -> Arc { - let source = Arc::new(TestSource::new(self.support, self.batches)); let base_config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test://").unwrap(), Arc::clone(&self.schema), - source, ) .with_file(PartitionedFile::new("test.parquet", 123)) .build(); - DataSourceExec::from_data_source(base_config) + + let source = TestSource::new(self.support, self.batches, base_config); + + DataSourceExec::from_data_source(source) } } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 149c50557c3a..76321321aae3 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -376,16 +376,15 @@ fn create_simple_csv_exec() -> Arc { Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - 
ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![0, 1, 2, 3, 4])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection(Some(vec![0, 1, 2, 3, 4])) + .build(); + + let source = CsvSource::new(false, 0, 0, config); - DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } fn create_projecting_csv_exec() -> Arc { @@ -395,16 +394,15 @@ fn create_projecting_csv_exec() -> Arc { Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), ])); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(CsvSource::new(false, 0, 0)), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![3, 2, 1])) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_projection(Some(vec![3, 2, 1])) + .build(); + + let source = CsvSource::new(false, 0, 0, config); - DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } fn create_projecting_memory_exec() -> Arc { @@ -1572,14 +1570,15 @@ fn partitioned_data_source() -> Arc { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), file_schema.clone(), - Arc::new(CsvSource::default()), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) .with_projection(Some(vec![0, 1, 2])) .build(); - DataSourceExec::from_data_source(config) + let source = CsvSource::new(false, 0, 0, config); + + DataSourceExec::from_data_source(source) } #[test] diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 7f7926060edc..54b84f5d5bba 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -25,6 +25,7 @@ use arrow::array::Int32Array; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::ParquetSource; @@ -34,6 +35,7 @@ use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; use datafusion_common::{ColumnStatistics, JoinType, NullEquality, Result, Statistics}; +use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -71,15 +73,14 @@ use datafusion_physical_plan::{ /// Create a non sorted parquet exec pub fn parquet_exec(schema: SchemaRef) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(ParquetSource::default()), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .build(); + let config = + 
FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .build(); + + let source = ParquetSource::new(TableParquetOptions::default(), config); - DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } /// Create a single parquet file that is sorted @@ -87,16 +88,15 @@ pub(crate) fn parquet_exec_with_sort( schema: SchemaRef, output_ordering: Vec, ) -> Arc { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema, - Arc::new(ParquetSource::default()), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_output_ordering(output_ordering) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_output_ordering(output_ordering) + .build(); + + let source = ParquetSource::new(TableParquetOptions::default(), config); - DataSourceExec::from_data_source(config) + DataSourceExec::from_data_source(source) } fn int64_stats() -> ColumnStatistics { @@ -125,20 +125,20 @@ pub(crate) fn parquet_exec_with_stats(file_size: u64) -> Arc { statistics.num_rows = Precision::Inexact(10000); statistics.column_statistics = column_stats(); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::new(Default::default())), - ) - .with_file(PartitionedFile::new("x".to_string(), file_size)) - .with_statistics(statistics) - .build(); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema()) + .with_file(PartitionedFile::new("x".to_string(), file_size)) + .with_statistics(statistics) + .build(); + + let source = ParquetSource::new(Default::default(), config); assert_eq!( - config.file_source.statistics().unwrap().num_rows, + source.projected_statistics().num_rows, Precision::Inexact(10000) ); - DataSourceExec::from_data_source(config) + + DataSourceExec::from_data_source(source) } pub fn schema() -> SchemaRef { diff --git a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs index c3c92a9028d6..60a2073c06d6 100644 --- a/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs +++ b/datafusion/core/tests/schema_adapter/schema_adapter_integration_tests.rs @@ -157,6 +157,8 @@ impl SchemaMapper for UppercaseSchemaMapper { #[cfg(feature = "parquet")] #[tokio::test] async fn test_parquet_integration_with_schema_adapter() -> Result<()> { + use datafusion::config::TableParquetOptions; + // Create test data let batch = RecordBatch::try_new( Arc::new(Schema::new(vec![ @@ -182,22 +184,23 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> { let ctx = SessionContext::new(); ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - // Create a table schema with uppercase column names let table_schema = Arc::new(Schema::new(vec![ Field::new("ID", DataType::Int32, false), Field::new("NAME", DataType::Utf8, true), ])); - let config = FileScanConfigBuilder::new(store_url, table_schema.clone(), file_source) + let config = FileScanConfigBuilder::new(store_url, table_schema.clone()) .with_file(PartitionedFile::new(path, file_size)) .build(); + // 
Create a ParquetSource with the adapter factory + let file_source = ParquetSource::new(TableParquetOptions::default(), config) + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))? + .as_data_source(); + // Create a data source executor - let exec = DataSourceExec::from_data_source(config); + let exec = Arc::new(DataSourceExec::new(file_source)); // Collect results let task_ctx = ctx.task_ctx(); @@ -220,6 +223,8 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> { async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( ) -> Result<()> { // Create test data + + use datafusion::config::TableParquetOptions; let batch = RecordBatch::try_new( Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -244,16 +249,16 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( let ctx = SessionContext::new(); ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); - // Create a ParquetSource with the adapter factory - let file_source = ParquetSource::default() - .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; - - let config = FileScanConfigBuilder::new(store_url, batch.schema(), file_source) + let config = FileScanConfigBuilder::new(store_url, batch.schema()) .with_file(PartitionedFile::new(path, file_size)) .build(); + // Create a ParquetSource with the adapter factory + let file_source = ParquetSource::new(TableParquetOptions::default(), config) + .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}))?; + // Create a data source executor - let exec = DataSourceExec::from_data_source(config); + let exec = Arc::new(DataSourceExec::new(file_source.as_data_source())); // Collect results let task_ctx = ctx.task_ctx(); @@ -273,6 +278,11 @@ async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter( #[tokio::test] async fn test_multi_source_schema_adapter_reuse() -> Result<()> { + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + Arc::new(Schema::empty()), + ); + // This test verifies that the same schema adapter factory can be reused // across different file source types. This is important for ensuring that: // 1. 
The schema adapter factory interface works uniformly across all source types @@ -284,7 +294,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test ArrowSource { - let source = ArrowSource::default(); + let source = ArrowSource::new(config.clone().build()); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -304,7 +314,10 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test ParquetSource #[cfg(feature = "parquet")] { - let source = ParquetSource::default(); + use datafusion::config::TableParquetOptions; + + let source = + ParquetSource::new(TableParquetOptions::default(), config.clone().build()); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -323,7 +336,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test CsvSource { - let source = CsvSource::default(); + let source = CsvSource::new(false, 0, 0, config.clone().build()); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) @@ -342,7 +355,7 @@ async fn test_multi_source_schema_adapter_reuse() -> Result<()> { // Test JsonSource { - let source = JsonSource::default(); + let source = JsonSource::new(config.clone().build()); let source_with_adapter = source .clone() .with_schema_adapter_factory(factory.clone()) diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 60c361b42e77..3c4ad8cdd4d8 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -32,10 +32,9 @@ use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::GetExt; use datafusion_common::DEFAULT_AVRO_EXTENSION; use datafusion_common::{Result, Statistics}; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::{FileFormat, FileFormatFactory}; -use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::ExecutionPlan; use datafusion_session::Session; @@ -154,13 +153,6 @@ impl FileFormat for AvroFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let config = FileScanConfigBuilder::from(conf) - .with_source(self.file_source()) - .build(); - Ok(DataSourceExec::from_data_source(config)) - } - - fn file_source(&self) -> Arc { - Arc::new(AvroSource::new()) + Ok(DataSourceExec::from_data_source(AvroSource::new(conf))) } } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 948049f5a747..4a93c6b35aa6 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -22,50 +22,54 @@ use std::sync::Arc; use crate::avro_to_arrow::Reader as AvroReader; -use arrow::datatypes::SchemaRef; use datafusion_common::error::Result; -use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; -use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::source::DataSource; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; /// AvroSource holds 
the extra configuration that is necessary for opening avro files -#[derive(Clone, Default)] +#[derive(Debug, Clone)] pub struct AvroSource { - schema: Option, - batch_size: Option, - projection: Option>, - metrics: ExecutionPlanMetricsSet, - projected_statistics: Option, - schema_adapter_factory: Option>, + config: FileScanConfig, } impl AvroSource { /// Initialize an AvroSource with default values - pub fn new() -> Self { - Self::default() + pub fn new(config: FileScanConfig) -> Self { + Self { config } } fn open(&self, reader: R) -> Result> { AvroReader::try_new( reader, - Arc::clone(self.schema.as_ref().expect("Schema must set before open")), - self.batch_size.expect("Batch size must set before open"), - self.projection.clone(), + self.config.file_schema.clone(), + self.config + .batch_size + .expect("Batch size must set before open"), + self.config.projected_file_column_names(), ) } } impl FileSource for AvroSource { + fn config(&self) -> &FileScanConfig { + &self.config + } + + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; + + Arc::new(this) + } + fn create_file_opener( &self, object_store: Arc, - _base_config: &FileScanConfig, _partition: usize, ) -> Arc { Arc::new(private::AvroOpener { @@ -78,40 +82,6 @@ impl FileSource for AvroSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = Some(batch_size); - Arc::new(conf) - } - - fn with_schema(&self, schema: SchemaRef) -> Arc { - let mut conf = self.clone(); - conf.schema = Some(schema); - Arc::new(conf) - } - fn with_statistics(&self, statistics: Statistics) -> Arc { - let mut conf = self.clone(); - conf.projected_statistics = Some(statistics); - Arc::new(conf) - } - - fn with_projection(&self, config: &FileScanConfig) -> Arc { - let mut conf = self.clone(); - conf.projection = config.projected_file_column_names(); - Arc::new(conf) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - - fn statistics(&self) -> Result { - let statistics = &self.projected_statistics; - Ok(statistics - .clone() - .expect("projected_statistics must be set")) - } - fn file_type(&self) -> &str { "avro" } @@ -121,23 +91,12 @@ impl FileSource for AvroSource { _target_partitions: usize, _repartition_file_min_size: usize, _output_ordering: Option, - _config: &FileScanConfig, ) -> Result> { Ok(None) } - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) - } - - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + Arc::new(self.clone()) } } diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index 4eeb431584ba..199c4baacd19 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -422,20 +422,20 @@ impl FileFormat for CsvFormat { .newlines_in_values .unwrap_or_else(|| state.config_options().catalog.newlines_in_values); - let conf_builder = FileScanConfigBuilder::from(conf) + let conf = FileScanConfigBuilder::from(conf) .with_file_compression_type(self.options.compression.into()) - .with_newlines_in_values(newlines_in_values); + .with_newlines_in_values(newlines_in_values) + .build(); let source = Arc::new( - CsvSource::new(has_header, self.options.delimiter, self.options.quote) + 
CsvSource::new(has_header, self.options.delimiter, self.options.quote, conf) .with_escape(self.options.escape) .with_terminator(self.options.terminator) .with_comment(self.options.comment), - ); - - let config = conf_builder.with_source(source).build(); + ) + .as_data_source(); - Ok(DataSourceExec::from_data_source(config)) + Ok(Arc::new(DataSourceExec::new(source)) as Arc) } async fn create_writer_physical_plan( @@ -474,10 +474,6 @@ impl FileFormat for CsvFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - - fn file_source(&self) -> Arc { - Arc::new(CsvSource::default()) - } } impl CsvFormat { diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs index 90538d0808b1..c942af4a0e15 100644 --- a/datafusion/datasource-csv/src/mod.rs +++ b/datafusion/datasource-csv/src/mod.rs @@ -22,12 +22,10 @@ pub mod file_format; pub mod source; -use std::sync::Arc; - use arrow::datatypes::SchemaRef; use datafusion_datasource::file_groups::FileGroup; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::{file::FileSource, file_scan_config::FileScanConfig}; use datafusion_execution::object_store::ObjectStoreUrl; pub use file_format::*; @@ -35,9 +33,8 @@ pub use file_format::*; pub fn partitioned_csv_config( schema: SchemaRef, file_groups: Vec, - file_source: Arc, ) -> FileScanConfig { - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema) .with_file_groups(file_groups) .build() } diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 6c994af940d1..894a5a318900 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -17,7 +17,6 @@ //! 
Execution plan for reading CSV files -use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use std::any::Any; use std::fmt; use std::io::{Read, Seek, SeekFrom}; @@ -28,19 +27,18 @@ use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer}; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_meta::FileMeta; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; +use datafusion_datasource::source::DataSource; use datafusion_datasource::{ as_file_source, calculate_range, FileRange, ListingTableUrl, PartitionedFile, RangeCalculation, }; use arrow::csv; -use arrow::datatypes::SchemaRef; -use datafusion_common::{DataFusionError, Result, Statistics}; +use datafusion_common::{DataFusionError, Result}; use datafusion_common_runtime::JoinSet; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::TaskContext; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::{ DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, }; @@ -80,30 +78,34 @@ use tokio::io::AsyncWriteExt; /// .build(); /// let exec = (DataSourceExec::from_data_source(config)); /// ``` -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct CsvSource { - batch_size: Option, - file_schema: Option, - file_projection: Option>, pub(crate) has_header: bool, delimiter: u8, quote: u8, terminator: Option, escape: Option, comment: Option, - metrics: ExecutionPlanMetricsSet, - projected_statistics: Option, - schema_adapter_factory: Option>, + + config: FileScanConfig, } impl CsvSource { /// Returns a [`CsvSource`] - pub fn new(has_header: bool, delimiter: u8, quote: u8) -> Self { + pub fn new( + has_header: bool, + delimiter: u8, + quote: u8, + config: FileScanConfig, + ) -> Self { Self { has_header, delimiter, quote, - ..Self::default() + terminator: Default::default(), + escape: Default::default(), + comment: Default::default(), + config, } } @@ -164,23 +166,20 @@ impl CsvSource { } fn builder(&self) -> csv::ReaderBuilder { - let mut builder = csv::ReaderBuilder::new(Arc::clone( - self.file_schema - .as_ref() - .expect("Schema must be set before initializing builder"), - )) - .with_delimiter(self.delimiter) - .with_batch_size( - self.batch_size - .expect("Batch size must be set before initializing builder"), - ) - .with_header(self.has_header) - .with_quote(self.quote); + let mut builder = csv::ReaderBuilder::new(self.config.file_schema.clone()) + .with_delimiter(self.delimiter) + .with_batch_size( + self.config + .batch_size + .expect("Batch size must be set before initializing builder"), + ) + .with_header(self.has_header) + .with_quote(self.quote); if let Some(terminator) = self.terminator { builder = builder.with_terminator(terminator); } - if let Some(proj) = &self.file_projection { - builder = builder.with_projection(proj.clone()); + if let Some(proj) = self.config.file_column_projection_indices() { + builder = builder.with_projection(proj); } if let Some(escape) = self.escape { builder = builder.with_escape(escape) @@ -222,15 +221,25 @@ impl From for Arc { } impl FileSource for CsvSource { + fn config(&self) -> &FileScanConfig { + &self.config + } + + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; + + Arc::new(this) + } + fn create_file_opener( &self, object_store: Arc, - base_config: &FileScanConfig, _partition: usize, ) -> Arc { 
Arc::new(CsvOpener { config: Arc::new(self.clone()), - file_compression_type: base_config.file_compression_type, + file_compression_type: self.config.file_compression_type, object_store, }) } @@ -239,39 +248,6 @@ impl FileSource for CsvSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = Some(batch_size); - Arc::new(conf) - } - - fn with_schema(&self, schema: SchemaRef) -> Arc { - let mut conf = self.clone(); - conf.file_schema = Some(schema); - Arc::new(conf) - } - - fn with_statistics(&self, statistics: Statistics) -> Arc { - let mut conf = self.clone(); - conf.projected_statistics = Some(statistics); - Arc::new(conf) - } - - fn with_projection(&self, config: &FileScanConfig) -> Arc { - let mut conf = self.clone(); - conf.file_projection = config.file_column_projection_indices(); - Arc::new(conf) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - fn statistics(&self) -> Result { - let statistics = &self.projected_statistics; - Ok(statistics - .clone() - .expect("projected_statistics must be set")) - } fn file_type(&self) -> &str { "csv" } @@ -284,18 +260,8 @@ impl FileSource for CsvSource { } } - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) - } - - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + Arc::new(self.clone()) } } diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index 51f4bd7e963e..08506f6d18e0 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -39,7 +39,6 @@ use datafusion_common::{ use datafusion_common_runtime::SpawnedTask; use datafusion_datasource::decoder::Decoder; use datafusion_datasource::display::FileGroupDisplay; -use datafusion_datasource::file::FileSource; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::{ FileFormat, FileFormatFactory, DEFAULT_SCHEMA_INFER_MAX_RECORD, @@ -253,14 +252,13 @@ impl FileFormat for JsonFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let source = Arc::new(JsonSource::new()); let conf = FileScanConfigBuilder::from(conf) .with_file_compression_type(FileCompressionType::from( self.options.compression, )) - .with_source(source) .build(); - Ok(DataSourceExec::from_data_source(conf)) + + Ok(DataSourceExec::from_data_source(JsonSource::new(conf))) } async fn create_writer_physical_plan( @@ -280,10 +278,6 @@ impl FileFormat for JsonFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - - fn file_source(&self) -> Arc { - Arc::new(JsonSource::default()) - } } impl Default for JsonSerializer { diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index d318928e5c6b..ba1004c6dd9d 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -30,7 +30,7 @@ use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer}; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_meta::FileMeta; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; -use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::source::DataSource; 
use datafusion_datasource::{ as_file_source, calculate_range, ListingTableUrl, PartitionedFile, RangeCalculation, }; @@ -38,11 +38,9 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; -use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_execution::TaskContext; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use futures::{StreamExt, TryStreamExt}; use object_store::buffered::BufWriter; @@ -75,18 +73,15 @@ impl JsonOpener { } /// JsonSource holds the extra configuration that is necessary for [`JsonOpener`] -#[derive(Clone, Default)] +#[derive(Debug, Clone)] pub struct JsonSource { - batch_size: Option, - metrics: ExecutionPlanMetricsSet, - projected_statistics: Option, - schema_adapter_factory: Option>, + config: FileScanConfig, } impl JsonSource { /// Initialize a JsonSource with default values - pub fn new() -> Self { - Self::default() + pub fn new(config: FileScanConfig) -> Self { + Self { config } } } @@ -97,18 +92,29 @@ impl From for Arc { } impl FileSource for JsonSource { + fn config(&self) -> &FileScanConfig { + &self.config + } + + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; + + Arc::new(this) + } + fn create_file_opener( &self, object_store: Arc, - base_config: &FileScanConfig, _partition: usize, ) -> Arc { Arc::new(JsonOpener { batch_size: self + .config .batch_size .expect("Batch size must set before creating opener"), - projected_schema: base_config.projected_file_schema(), - file_compression_type: base_config.file_compression_type, + projected_schema: self.config.projected_file_schema(), + file_compression_type: self.config.file_compression_type, object_store, }) } @@ -117,52 +123,17 @@ impl FileSource for JsonSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = Some(batch_size); - Arc::new(conf) - } - + // why is this a no op? 
fn with_schema(&self, _schema: SchemaRef) -> Arc { Arc::new(Self { ..self.clone() }) } - fn with_statistics(&self, statistics: Statistics) -> Arc { - let mut conf = self.clone(); - conf.projected_statistics = Some(statistics); - Arc::new(conf) - } - - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - - fn statistics(&self) -> Result { - let statistics = &self.projected_statistics; - Ok(statistics - .clone() - .expect("projected_statistics must be set to call")) - } fn file_type(&self) -> &str { "json" } - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) - } - - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + Arc::new(self.clone()) } } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 56718534a558..a2f5a99c316a 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -52,8 +52,7 @@ use datafusion_common::{ use datafusion_common::{HashMap, Statistics}; use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_datasource::display::FileGroupDisplay; -use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -483,7 +482,7 @@ impl FileFormat for ParquetFormat { metadata_size_hint = Some(metadata); } - let mut source = ParquetSource::new(self.options.clone()); + let mut source = ParquetSource::new(self.options.clone(), conf.clone()); // Use the CachedParquetFileReaderFactory let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); @@ -501,12 +500,9 @@ impl FileFormat for ParquetFormat { source = self.set_source_encryption_factory(source, state)?; // Apply schema adapter factory before building the new config - let file_source = source.apply_schema_adapter(&conf)?; + let file_source = source.apply_schema_adapter()?.as_data_source(); - let conf = FileScanConfigBuilder::from(conf) - .with_source(file_source) - .build(); - Ok(DataSourceExec::from_data_source(conf)) + Ok(Arc::new(DataSourceExec::new(file_source)) as Arc) } async fn create_writer_physical_plan( @@ -524,10 +520,6 @@ impl FileFormat for ParquetFormat { Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _) } - - fn file_source(&self) -> Arc { - Arc::new(ParquetSource::default()) - } } #[cfg(feature = "parquet_encryption")] diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index caec7db0ce0b..559b21a05885 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -35,11 +35,12 @@ use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; -use arrow::datatypes::{SchemaRef, TimeUnit}; +use arrow::datatypes::TimeUnit; use datafusion_common::config::TableParquetOptions; use datafusion_common::{DataFusionError, Statistics}; use 
datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::source::DataSource; use datafusion_physical_expr::conjunction; use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -103,7 +104,7 @@ use object_store::ObjectStore; /// # let object_store_url = ObjectStoreUrl::local_filesystem(); /// # let predicate = lit(true); /// let source = Arc::new( -/// ParquetSource::default() +/// ParquetSource::new(TableParquetOptions::default()) /// .with_predicate(predicate) /// ); /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB @@ -181,7 +182,8 @@ use object_store::ObjectStore; /// // Split a single DataSourceExec into multiple DataSourceExecs, one for each file /// let exec = parquet_exec(); /// let data_source = exec.data_source(); -/// let base_config = data_source.as_any().downcast_ref::().unwrap(); +/// let parquet_source = data_source.as_any().downcast_ref::().unwrap(); +/// let base_config = parquet_source.config(); /// let existing_file_groups = &base_config.file_groups; /// let new_execs = existing_file_groups /// .iter() @@ -191,7 +193,9 @@ use object_store::ObjectStore; /// .with_file_groups(vec![file_group.clone()]) /// .build(); /// -/// (DataSourceExec::from_data_source(new_config)) +/// let parquet_source = parquet_source.clone().with_config(new_config); +/// +/// (Arc::new(DataSourceExec::new(parquet_source))) /// }) /// .collect::>(); /// ``` @@ -230,7 +234,7 @@ use object_store::ObjectStore; /// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234) /// .with_extensions(Arc::new(access_plan)); /// // create a FileScanConfig to scan this file -/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) +/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::new(TableParquetOptions::default()))) /// .with_file(partitioned_file).build(); /// // this parquet DataSourceExec will not even try to read row groups 2 and 4. Additional /// // pruning based on predicates may also happen @@ -265,39 +269,39 @@ use object_store::ObjectStore; /// [`RecordBatch`]: arrow::record_batch::RecordBatch /// [`SchemaAdapter`]: datafusion_datasource::schema_adapter::SchemaAdapter /// [`ParquetMetadata`]: parquet::file::metadata::ParquetMetaData -#[derive(Clone, Default, Debug)] +#[derive(Debug, Clone)] pub struct ParquetSource { /// Options for reading Parquet files pub(crate) table_parquet_options: TableParquetOptions, - /// Optional metrics - pub(crate) metrics: ExecutionPlanMetricsSet, - /// The schema of the file. - /// In particular, this is the schema of the table without partition columns, - /// *not* the physical schema of the file. 
- pub(crate) file_schema: Option, /// Optional predicate for row filtering during parquet scan pub(crate) predicate: Option>, /// Optional user defined parquet file reader factory pub(crate) parquet_file_reader_factory: Option>, - /// Optional user defined schema adapter - pub(crate) schema_adapter_factory: Option>, - /// Batch size configuration - pub(crate) batch_size: Option, + /// Optional hint for the size of the parquet metadata pub(crate) metadata_size_hint: Option, - pub(crate) projected_statistics: Option, #[cfg(feature = "parquet_encryption")] pub(crate) encryption_factory: Option>, + + pub(crate) config: FileScanConfig, } impl ParquetSource { /// Create a new ParquetSource to read the data specified in the file scan /// configuration with the provided `TableParquetOptions`. /// if default values are going to be used, use `ParguetConfig::default()` instead - pub fn new(table_parquet_options: TableParquetOptions) -> Self { + pub fn new( + table_parquet_options: TableParquetOptions, + config: FileScanConfig, + ) -> Self { Self { table_parquet_options, - ..Self::default() + predicate: Default::default(), + parquet_file_reader_factory: Default::default(), + metadata_size_hint: Default::default(), + #[cfg(feature = "parquet_encryption")] + encryption_factory: Default::default(), + config, } } @@ -313,7 +317,7 @@ impl ParquetSource { } fn with_metrics(mut self, metrics: ExecutionPlanMetricsSet) -> Self { - self.metrics = metrics; + self.config.metrics = metrics; self } @@ -432,14 +436,11 @@ impl ParquetSource { /// * `conf` - FileScanConfig that may contain a schema adapter factory /// # Returns /// The converted FileSource with schema adapter factory applied if provided - pub fn apply_schema_adapter( - self, - conf: &FileScanConfig, - ) -> datafusion_common::Result> { - let file_source: Arc = self.into(); + pub fn apply_schema_adapter(self) -> datafusion_common::Result> { + let file_source: Arc = self.clone().into(); // If the FileScanConfig.file_source() has a schema adapter factory, apply it - if let Some(factory) = conf.file_source().schema_adapter_factory() { + if let Some(factory) = self.config.schema_adapter_factory { file_source.with_schema_adapter_factory( Arc::::clone(&factory), ) @@ -488,19 +489,31 @@ impl From for Arc { } impl FileSource for ParquetSource { + fn config(&self) -> &FileScanConfig { + &self.config + } + + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; + + Arc::new(this) + } + fn create_file_opener( &self, object_store: Arc, - base_config: &FileScanConfig, partition: usize, ) -> Arc { + let base_config = &self.config; + let projection = base_config .file_column_projection_indices() .unwrap_or_else(|| (0..base_config.file_schema.fields().len()).collect()); let (expr_adapter_factory, schema_adapter_factory) = match ( base_config.expr_adapter_factory.as_ref(), - self.schema_adapter_factory.as_ref(), + base_config.schema_adapter_factory.as_ref(), ) { (Some(expr_adapter_factory), Some(schema_adapter_factory)) => { // Use both the schema adapter factory and the expr adapter factory. 
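A quick sketch of the construction order after this change, adapted from the updated tests elsewhere in this patch (`schema()` stands in for any `SchemaRef` available to the caller; the URL and file path are placeholder values): the `FileScanConfig` is built first, the `ParquetSource` owns it, and the source itself is handed to `DataSourceExec`.

    use datafusion::config::TableParquetOptions;
    use datafusion::datasource::listing::PartitionedFile;
    use datafusion::datasource::object_store::ObjectStoreUrl;
    use datafusion::datasource::physical_plan::ParquetSource;
    use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
    use datafusion_datasource::source::DataSourceExec;

    // Build the scan configuration first...
    let config =
        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), schema())
            .with_file(PartitionedFile::new("x".to_string(), 100))
            .build();
    // ...then let the source own it, and wrap the source (not the config) in the exec.
    let source = ParquetSource::new(TableParquetOptions::default(), config);
    let _exec = DataSourceExec::from_data_source(source);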
@@ -560,6 +573,7 @@ impl FileSource for ParquetSource { partition_index: partition, projection: Arc::from(projection), batch_size: self + .config .batch_size .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, @@ -567,7 +581,7 @@ impl FileSource for ParquetSource { logical_file_schema: Arc::clone(&base_config.file_schema), partition_fields: base_config.table_partition_cols.clone(), metadata_size_hint: self.metadata_size_hint, - metrics: self.metrics().clone(), + metrics: self.config.metrics.clone(), parquet_file_reader_factory, pushdown_filters: self.pushdown_filters(), reorder_filters: self.reorder_filters(), @@ -587,47 +601,18 @@ impl FileSource for ParquetSource { self } - fn with_batch_size(&self, batch_size: usize) -> Arc { - let mut conf = self.clone(); - conf.batch_size = Some(batch_size); - Arc::new(conf) - } - - fn with_schema(&self, schema: SchemaRef) -> Arc { - Arc::new(Self { - file_schema: Some(schema), - ..self.clone() - }) - } + fn projected_statistics(&self) -> Statistics { + let statistics = self.config().projected_stats(); - fn with_statistics(&self, statistics: Statistics) -> Arc { - let mut conf = self.clone(); - conf.projected_statistics = Some(statistics); - Arc::new(conf) - } - - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics - } - - fn statistics(&self) -> datafusion_common::Result { - let statistics = &self.projected_statistics; - let statistics = statistics - .clone() - .expect("projected_statistics must be set"); // When filters are pushed down, we have no way of knowing the exact statistics. // Note that pruning predicate is also a kind of filter pushdown. // (bloom filters use `pruning_predicate` too). // Because filter pushdown may happen dynamically as long as there is a predicate // if we have *any* predicate applied, we can't guarantee the statistics are exact. if self.predicate().is_some() { - Ok(statistics.to_inexact()) + statistics.to_inexact() } else { - Ok(statistics) + statistics } } @@ -653,8 +638,8 @@ impl FileSource for ParquetSource { // the actual predicates are built in reference to the physical schema of // each file, which we do not have at this point and hence cannot use. // Instead we use the logical schema of the file (the table schema without partition columns). - if let (Some(file_schema), Some(predicate)) = - (&self.file_schema, &self.predicate) + if let (file_schema, Some(predicate)) = + (&self.config.file_schema, &self.predicate) { let predicate_creation_errors = Count::new(); if let (Some(pruning_predicate), _) = build_pruning_predicates( @@ -692,11 +677,8 @@ impl FileSource for ParquetSource { filters: Vec>, config: &ConfigOptions, ) -> datafusion_common::Result>> { - let Some(file_schema) = self.file_schema.clone() else { - return Ok(FilterPushdownPropagation::with_parent_pushdown_result( - vec![PushedDown::No; filters.len()], - )); - }; + let file_schema = self.config.file_schema.clone(); + // Determine if based on configs we should push filters down. // If either the table / scan itself or the config has pushdown enabled, // we will push down the filters. 
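The statistics behavior kept by the hunk above can be summarized with a small standalone sketch (illustrative only; `Precision` here is a stand-in for `datafusion_common::stats::Precision`): once any predicate may be pushed into the scan, a previously exact row count can only be reported as an inexact bound.

    #[derive(Debug, PartialEq)]
    enum Precision {
        Exact(usize),
        Inexact(usize),
    }

    fn projected_num_rows(exact_rows: usize, has_predicate: bool) -> Precision {
        if has_predicate {
            // A pushed-down filter may drop rows at scan time, so the count is only a bound.
            Precision::Inexact(exact_rows)
        } else {
            Precision::Exact(exact_rows)
        }
    }

    fn main() {
        assert_eq!(projected_num_rows(10_000, false), Precision::Exact(10_000));
        assert_eq!(projected_num_rows(10_000, true), Precision::Inexact(10_000));
    }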
@@ -759,17 +741,7 @@ impl FileSource for ParquetSource { .with_updated_node(source)) } - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> datafusion_common::Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) - } - - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + Arc::new(self.clone()) } } diff --git a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs index e9288a5f80f6..75ea1050d5d1 100644 --- a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs +++ b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs @@ -20,7 +20,9 @@ mod parquet_adapter_tests { datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; - use datafusion_common::{ColumnStatistics, DataFusionError, Result}; + use datafusion_common::{ + config::TableParquetOptions, ColumnStatistics, DataFusionError, Result, + }; use datafusion_datasource::{ file::FileSource, file_scan_config::FileScanConfigBuilder, @@ -133,25 +135,26 @@ mod parquet_adapter_tests { Field::new("name", DataType::Utf8, true), ])); - // Create a parquet source - let source = ParquetSource::default(); + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + schema.clone(), + ) + .build(); // Create a file scan config with source that has a schema adapter factory let factory = Arc::new(PrefixAdapterFactory { prefix: "test_".to_string(), }); - let file_source = source.clone().with_schema_adapter_factory(factory).unwrap(); + // Create a parquet source + let source = ParquetSource::new(TableParquetOptions::default(), config) + .with_schema_adapter_factory(factory) + .unwrap(); - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .build(); + let source = source.as_any().downcast_ref::().unwrap(); // Apply schema adapter to a new source - let result_source = source.apply_schema_adapter(&config).unwrap(); + let result_source = source.clone().apply_schema_adapter().unwrap(); // Verify the adapter was applied assert!(result_source.schema_adapter_factory().is_some()); @@ -183,22 +186,18 @@ mod parquet_adapter_tests { Field::new("name", DataType::Utf8, true), ])); - // Create a parquet source - let source = ParquetSource::default(); - - // Convert to Arc - let file_source: Arc = Arc::new(source.clone()); - // Create a file scan config without a schema adapter factory let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), schema.clone(), - file_source, ) .build(); + // Create a parquet source + let source = ParquetSource::new(TableParquetOptions::default(), config); + // Apply schema adapter function - should pass through the source unchanged - let result_source = source.apply_schema_adapter(&config).unwrap(); + let result_source = source.apply_schema_adapter().unwrap(); // Verify no adapter was applied assert!(result_source.schema_adapter_factory().is_none()); diff --git a/datafusion/datasource/src/display.rs b/datafusion/datasource/src/display.rs index c9e979535963..92765bf0fa08 100644 --- a/datafusion/datasource/src/display.rs +++ b/datafusion/datasource/src/display.rs @@ -27,7 +27,7 @@ use std::fmt::{Debug, Formatter, Result as FmtResult}; /// {NUM_GROUPS groups: [[file1, file2,...], [fileN, fileM, ...], ...]} /// ``` #[derive(Debug)] -pub(crate) struct 
FileGroupsDisplay<'a>(pub(crate) &'a [FileGroup]); +pub struct FileGroupsDisplay<'a>(pub(crate) &'a [FileGroup]); impl DisplayAs for FileGroupsDisplay<'_> { fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 29fa38a8ee36..b0d4b7494c0c 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -22,17 +22,31 @@ use std::fmt; use std::fmt::Formatter; use std::sync::Arc; +use crate::display::FileGroupsDisplay; use crate::file_groups::FileGroupPartitioner; -use crate::file_scan_config::FileScanConfig; -use crate::file_stream::FileOpener; +use crate::file_scan_config::{ + get_projected_output_ordering, FileScanConfig, FileScanConfigBuilder, +}; +use crate::file_stream::{FileOpener, FileStream}; use crate::schema_adapter::SchemaAdapterFactory; +use crate::source::{DataSource, DataSourceExec}; use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; -use datafusion_common::{not_impl_err, Result, Statistics}; -use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; +use datafusion_common::{Result, Statistics}; +use datafusion_execution::{SendableRecordBatchStream, TaskContext}; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::{ + EquivalenceProperties, LexOrdering, Partitioning, PhysicalExpr, +}; +use datafusion_physical_plan::coop::cooperative; +use datafusion_physical_plan::display::{display_orderings, ProjectSchemaDisplay}; +use datafusion_physical_plan::execution_plan::SchedulingType; use datafusion_physical_plan::filter_pushdown::{FilterPushdownPropagation, PushedDown}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; -use datafusion_physical_plan::DisplayFormatType; +use datafusion_physical_plan::projection::{ + all_alias_free_columns, new_projections_for_columns, ProjectionExec, +}; +use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use object_store::ObjectStore; @@ -51,35 +65,92 @@ pub fn as_file_source(source: T) -> Arc /// * [`ParquetSource`](https://docs.rs/datafusion/latest/datafusion/datasource/physical_plan/struct.ParquetSource.html) /// /// [`DataSource`]: crate::source::DataSource -pub trait FileSource: Send + Sync { +pub trait FileSource: fmt::Debug + Send + Sync { /// Creates a `dyn FileOpener` based on given parameters fn create_file_opener( &self, object_store: Arc, - base_config: &FileScanConfig, partition: usize, ) -> Arc; /// Any fn as_any(&self) -> &dyn Any; + + fn as_data_source(&self) -> Arc; + + fn with_config(&self, config: FileScanConfig) -> Arc; + /// Initialize new type with batch size configuration - fn with_batch_size(&self, batch_size: usize) -> Arc; + fn with_batch_size(&self, batch_size: usize) -> Arc { + let conf = FileScanConfigBuilder::from(self.config().to_owned()) + .with_batch_size(Some(batch_size)) + .build(); + + self.with_config(conf) + } + /// Initialize new instance with a new schema - fn with_schema(&self, schema: SchemaRef) -> Arc; - /// Initialize new instance with projection information - fn with_projection(&self, config: &FileScanConfig) -> Arc; + fn with_schema(&self, schema: SchemaRef) -> Arc { + let conf = FileScanConfigBuilder::from(self.config().to_owned()) + .with_file_schema(schema) + .build(); + + self.with_config(conf) + } + /// Initialize new instance with projected statistics - fn with_statistics(&self, statistics: Statistics) -> Arc; + fn with_projected_statistics( + &self, + projected_statistics: 
Statistics, + ) -> Arc { + let conf = FileScanConfigBuilder::from(self.config().to_owned()) + .with_statistics(projected_statistics) + .build(); + + self.with_config(conf) + } + + /// Set optional schema adapter factory. + /// + /// [`SchemaAdapterFactory`] allows user to specify how fields from the + /// file get mapped to that of the table schema. If you implement this + /// method, you should also implement [`schema_adapter_factory`]. + /// + /// + /// [`schema_adapter_factory`]: Self::schema_adapter_factory + fn with_schema_adapter_factory( + &self, + factory: Arc, + ) -> Result> { + let conf = FileScanConfigBuilder::from(self.config().to_owned()) + .with_schema_adapter(Some(factory)) + .build(); + + Ok(self.with_config(conf)) + } + /// Return execution plan metrics - fn metrics(&self) -> &ExecutionPlanMetricsSet; + fn metrics(&self) -> &ExecutionPlanMetricsSet { + &self.config().metrics + } + /// Return projected statistics - fn statistics(&self) -> Result; + fn projected_statistics(&self) -> Statistics { + self.config().projected_stats() + } + /// String representation of file source such as "csv", "json", "parquet" fn file_type(&self) -> &str; + /// Format FileType specific information fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result { Ok(()) } + fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result { + write!(f, ", file_type={}", self.file_type())?; + self.fmt_extra(t, f) + } + /// If supported by the [`FileSource`], redistribute files across partitions /// according to their size. Allows custom file formats to implement their /// own repartitioning logic. @@ -91,9 +162,10 @@ pub trait FileSource: Send + Sync { target_partitions: usize, repartition_file_min_size: usize, output_ordering: Option, - config: &FileScanConfig, ) -> Result> { - if config.file_compression_type.is_compressed() || config.new_lines_in_values { + if self.config().file_compression_type.is_compressed() + || self.config().new_lines_in_values + { return Ok(None); } @@ -101,10 +173,10 @@ pub trait FileSource: Send + Sync { .with_target_partitions(target_partitions) .with_repartition_file_min_size(repartition_file_min_size) .with_preserve_order_within_groups(output_ordering.is_some()) - .repartition_file_groups(&config.file_groups); + .repartition_file_groups(&self.config().file_groups); if let Some(repartitioned_file_groups) = repartitioned_file_groups_option { - let mut source = config.clone(); + let mut source = self.config().clone(); source.file_groups = repartitioned_file_groups; return Ok(Some(source)); } @@ -125,29 +197,194 @@ pub trait FileSource: Send + Sync { )) } - /// Set optional schema adapter factory. - /// - /// [`SchemaAdapterFactory`] allows user to specify how fields from the - /// file get mapped to that of the table schema. If you implement this - /// method, you should also implement [`schema_adapter_factory`]. - /// - /// The default implementation returns a not implemented error. - /// - /// [`schema_adapter_factory`]: Self::schema_adapter_factory - fn with_schema_adapter_factory( - &self, - _factory: Arc, - ) -> Result> { - not_impl_err!( - "FileSource {} does not support schema adapter factory", - self.file_type() - ) - } - /// Returns the current schema adapter factory if set /// /// Default implementation returns `None`. 
fn schema_adapter_factory(&self) -> Option> { - None + self.config().schema_adapter_factory.clone() + } + + fn config(&self) -> &FileScanConfig; +} + +impl DataSource for T { + fn open( + &self, + partition: usize, + context: Arc, + ) -> Result { + let object_store = context + .runtime_env() + .object_store(&self.config().object_store_url)?; + + let batch_size = self + .config() + .batch_size + .unwrap_or_else(|| context.session_config().batch_size()); + + let source = self.with_batch_size(batch_size); + + let opener = source.create_file_opener(object_store, partition); + + let stream = + FileStream::new(source.config(), partition, opener, source.metrics())?; + Ok(Box::pin(cooperative(stream))) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let schema = self.config().projected_schema(); + let orderings = get_projected_output_ordering(self.config(), &schema); + + write!(f, "file_groups=")?; + FileGroupsDisplay(&self.config().file_groups).fmt_as(t, f)?; + + if !schema.fields().is_empty() { + write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; + } + + if let Some(limit) = self.config().limit { + write!(f, ", limit={limit}")?; + } + + display_orderings(f, &orderings)?; + + if !self.config().constraints.is_empty() { + write!(f, ", {}", self.config().constraints)?; + } + + self.fmt_file_source(t, f) + } + DisplayFormatType::TreeRender => { + writeln!(f, "format={}", self.file_type())?; + self.fmt_extra(t, f)?; + let num_files = self + .config() + .file_groups + .iter() + .map(|fg| fg.len()) + .sum::(); + writeln!(f, "files={num_files}")?; + Ok(()) + } + } + } + + /// If supported by the underlying [`FileSource`], redistribute files across partitions according to their size. + fn repartitioned( + &self, + target_partitions: usize, + repartition_file_min_size: usize, + output_ordering: Option, + ) -> Result>> { + let source = self.repartitioned( + target_partitions, + repartition_file_min_size, + output_ordering, + )?; + + Ok(source.map(|s| self.with_config(s).as_data_source())) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.config().file_groups.len()) + } + + fn eq_properties(&self) -> EquivalenceProperties { + let (schema, constraints, _, orderings) = self.config().project(); + EquivalenceProperties::new_with_orderings(schema, orderings) + .with_constraints(constraints) + } + + fn scheduling_type(&self) -> SchedulingType { + SchedulingType::Cooperative + } + + fn statistics(&self) -> Result { + Ok(self.projected_statistics()) + } + + fn with_fetch(&self, limit: Option) -> Option> { + let config = FileScanConfigBuilder::from(self.config().to_owned()) + .with_limit(limit) + .build(); + + Some(self.with_config(config).as_data_source()) + } + + fn fetch(&self) -> Option { + self.config().limit + } + + fn metrics(&self) -> ExecutionPlanMetricsSet { + self.metrics().clone() + } + + fn try_swapping_with_projection( + &self, + projection: &ProjectionExec, + ) -> Result>> { + // This process can be moved into CsvExec, but it would be an overlap of their responsibility. 
+ + // Must be all column references, with no table partition columns (which can not be projected) + let partitioned_columns_in_proj = projection.expr().iter().any(|(expr, _)| { + expr.as_any() + .downcast_ref::() + .map(|expr| expr.index() >= self.config().file_schema.fields().len()) + .unwrap_or(false) + }); + + // If there is any non-column or alias-carrier expression, Projection should not be removed. + let no_aliases = all_alias_free_columns(projection.expr()); + + Ok((no_aliases && !partitioned_columns_in_proj).then(|| { + let new_projections = new_projections_for_columns( + projection, + &self.config().projection.clone().unwrap_or_else(|| { + (0..self.config().file_schema.fields().len()).collect() + }), + ); + + let conf = FileScanConfigBuilder::from(self.config().to_owned()) + // Assign projected statistics to source + .with_projection(Some(new_projections)) + .build(); + + let this = self.with_config(conf).as_data_source(); + + Arc::new(DataSourceExec::new(this)) as Arc + })) + } + + fn try_pushdown_filters( + &self, + filters: Vec>, + config: &ConfigOptions, + ) -> Result>> { + let result = self.try_pushdown_filters(filters, config)?; + match result.updated_node { + Some(new_file_source) => Ok(FilterPushdownPropagation { + filters: result.filters, + updated_node: Some(new_file_source.as_data_source()), + }), + None => { + // If the file source does not support filter pushdown, return the original config + Ok(FilterPushdownPropagation { + filters: result.filters, + updated_node: None, + }) + } + } + } + + fn as_file_source(&self) -> Option> { + // just trigger a clone here + let this = self.with_config(self.config().to_owned()); + Some(this) } } diff --git a/datafusion/datasource/src/file_format.rs b/datafusion/datasource/src/file_format.rs index 23f68636c156..8988837907dc 100644 --- a/datafusion/datasource/src/file_format.rs +++ b/datafusion/datasource/src/file_format.rs @@ -23,7 +23,6 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; -use crate::file::FileSource; use crate::file_compression_type::FileCompressionType; use crate::file_scan_config::FileScanConfig; use crate::file_sink_config::FileSinkConfig; @@ -109,9 +108,6 @@ pub trait FileFormat: Send + Sync + fmt::Debug { ) -> Result> { not_impl_err!("Writer not implemented for this format") } - - /// Return the related FileSource such as `CsvSource`, `JsonSource`, etc. - fn file_source(&self) -> Arc; } /// Factory for creating [`FileFormat`] instances based on session and command level options diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 7088f811bbce..38f6d885709e 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -19,7 +19,7 @@ //! file sources. 
use std::{ - any::Any, borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter, + borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter, fmt::Result as FmtResult, marker::PhantomData, sync::Arc, }; @@ -27,13 +27,8 @@ use crate::file_groups::FileGroup; #[allow(unused_imports)] use crate::schema_adapter::SchemaAdapterFactory; use crate::{ - display::FileGroupsDisplay, - file::FileSource, - file_compression_type::FileCompressionType, - file_stream::FileStream, - source::{DataSource, DataSourceExec}, - statistics::MinMaxStatistics, - PartitionedFile, + display::FileGroupsDisplay, file_compression_type::FileCompressionType, + statistics::MinMaxStatistics, PartitionedFile, }; use arrow::datatypes::FieldRef; use arrow::{ @@ -44,29 +39,20 @@ use arrow::{ buffer::Buffer, datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type}, }; -use datafusion_common::config::ConfigOptions; use datafusion_common::{ exec_err, ColumnStatistics, Constraints, DataFusionError, Result, ScalarValue, Statistics, }; -use datafusion_execution::{ - object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, -}; +use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; -use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion_physical_plan::filter_pushdown::FilterPushdownPropagation; use datafusion_physical_plan::{ display::{display_orderings, ProjectSchemaDisplay}, metrics::ExecutionPlanMetricsSet, - projection::{all_alias_free_columns, new_projections_for_columns, ProjectionExec}, - DisplayAs, DisplayFormatType, ExecutionPlan, + DisplayAs, DisplayFormatType, }; -use datafusion_physical_plan::coop::cooperative; -use datafusion_physical_plan::execution_plan::SchedulingType; use log::{debug, warn}; /// The base configurations for a [`DataSourceExec`], the a physical plan for @@ -92,6 +78,10 @@ use log::{debug, warn}; /// # use datafusion_physical_plan::ExecutionPlan; /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; /// # use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +/// # use datafusion_datasource::source::DataSource; +/// # use datafusion_execution::TaskContext; +/// # use datafusion_execution::SendableRecordBatchStream; +/// # use datafusion_physical_plan::DisplayFormatType; /// # let file_schema = Arc::new(Schema::new(vec![ /// # Field::new("c1", DataType::Int32, false), /// # Field::new("c2", DataType::Int32, false), @@ -99,31 +89,33 @@ use log::{debug, warn}; /// # Field::new("c4", DataType::Int32, false), /// # ])); /// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate -/// #[derive(Clone)] +/// #[derive(Debug, Clone)] /// # struct ParquetSource { /// # projected_statistics: Option, -/// # schema_adapter_factory: Option> +/// # schema_adapter_factory: Option>, +/// # config: FileScanConfig, /// # }; /// # impl FileSource for ParquetSource { -/// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Arc { unimplemented!() } +/// # fn create_file_opener(&self, _: Arc, _: usize) -> Arc { unimplemented!() } /// # fn as_any(&self) -> &dyn Any { self } /// # fn with_batch_size(&self, _: usize) -> Arc { unimplemented!() } /// # fn with_schema(&self, _: SchemaRef) -> Arc { Arc::new(self.clone()) as Arc } 
-/// # fn with_projection(&self, _: &FileScanConfig) -> Arc { unimplemented!() } -/// # fn with_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) } +/// # fn with_projected_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone(), config: self.config.clone()} ) } /// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() } -/// # fn statistics(&self) -> Result { Ok(self.projected_statistics.clone().expect("projected_statistics should be set")) } +/// # fn projected_statistics(&self) -> Statistics { self.projected_statistics.clone().expect("projected_statistics should be set") } /// # fn file_type(&self) -> &str { "parquet" } -/// # fn with_schema_adapter_factory(&self, factory: Arc) -> Result> { Ok(Arc::new(Self {projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: Some(factory)} )) } +/// # fn with_schema_adapter_factory(&self, factory: Arc) -> Result> { Ok(Arc::new(Self {projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: Some(factory), config: self.config.clone()} )) } /// # fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } +/// # fn as_data_source(&self) -> Arc { Arc::new(self.clone()) } +/// # fn with_config(&self, config: FileScanConfig) -> Arc { Arc::new(Self { projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: self.schema_adapter_factory.clone(), config }) } +/// # fn config(&self) -> &FileScanConfig { &self.config } /// # } /// # impl ParquetSource { -/// # fn new() -> Self { Self {projected_statistics: None, schema_adapter_factory: None} } +/// # fn new(config: FileScanConfig) -> Self { Self {projected_statistics: None, schema_adapter_factory: None, config} } /// # } /// // create FileScan config for reading parquet files from file:// /// let object_store_url = ObjectStoreUrl::local_filesystem(); -/// let file_source = Arc::new(ParquetSource::new()); -/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) +/// let config = FileScanConfigBuilder::new(object_store_url, file_schema) /// .with_limit(Some(1000)) // read only the first 1000 records /// .with_projection(Some(vec![2, 3])) // project columns 2 and 3 /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group @@ -134,8 +126,9 @@ use log::{debug, warn}; /// PartitionedFile::new("file2.parquet", 56), /// PartitionedFile::new("file3.parquet", 78), /// ])).build(); +/// let file_source = ParquetSource::new(config); /// // create an execution plan from the config -/// let plan: Arc = DataSourceExec::from_data_source(config); +/// let plan: Arc = DataSourceExec::from_data_source(file_source); /// ``` #[derive(Clone)] pub struct FileScanConfig { @@ -184,14 +177,18 @@ pub struct FileScanConfig { pub file_compression_type: FileCompressionType, /// Are new lines in values supported for CSVOptions pub new_lines_in_values: bool, - /// File source such as `ParquetSource`, `CsvSource`, `JsonSource`, etc. - pub file_source: Arc, /// Batch size while creating new batches /// Defaults to [`datafusion_common::config::ExecutionOptions`] batch_size. pub batch_size: Option, /// Expression adapter used to adapt filters and projections that are pushed down into the scan /// from the logical schema to the physical schema of the file. 
pub expr_adapter_factory: Option>, + + pub metrics: ExecutionPlanMetricsSet, + pub schema_adapter_factory: Option>, + + /// + pub projected_statistics: Statistics, } /// A builder for [`FileScanConfig`]'s. @@ -221,7 +218,6 @@ pub struct FileScanConfig { /// let config = FileScanConfigBuilder::new( /// ObjectStoreUrl::local_filesystem(), /// schema, -/// file_source, /// ) /// // Set a limit of 1000 rows /// .with_limit(Some(1000)) @@ -257,19 +253,20 @@ pub struct FileScanConfigBuilder { /// /// This probably would be better named `table_schema` file_schema: SchemaRef, - file_source: Arc, limit: Option, projection: Option>, table_partition_cols: Vec, constraints: Option, file_groups: Vec, - statistics: Option, + statistics: Statistics, output_ordering: Vec, file_compression_type: Option, new_lines_in_values: Option, batch_size: Option, + metrics: ExecutionPlanMetricsSet, expr_adapter_factory: Option>, + schema_adapter_factory: Option>, } impl FileScanConfigBuilder { @@ -278,18 +275,14 @@ impl FileScanConfigBuilder { /// # Parameters: /// * `object_store_url`: See [`FileScanConfig::object_store_url`] /// * `file_schema`: See [`FileScanConfig::file_schema`] - /// * `file_source`: See [`FileScanConfig::file_source`] - pub fn new( - object_store_url: ObjectStoreUrl, - file_schema: SchemaRef, - file_source: Arc, - ) -> Self { + pub fn new(object_store_url: ObjectStoreUrl, file_schema: SchemaRef) -> Self { + let statistics = Statistics::new_unknown(&file_schema); + Self { object_store_url, file_schema, - file_source, file_groups: vec![], - statistics: None, + statistics, output_ordering: vec![], file_compression_type: None, new_lines_in_values: None, @@ -299,9 +292,18 @@ impl FileScanConfigBuilder { constraints: None, batch_size: None, expr_adapter_factory: None, + metrics: Default::default(), + schema_adapter_factory: None, } } + pub fn with_file_schema(mut self, file_schema: SchemaRef) -> Self { + self.statistics = Statistics::new_unknown(&file_schema); + self.file_schema = file_schema; + + self + } + /// Set the maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. pub fn with_limit(mut self, limit: Option) -> Self { @@ -309,15 +311,6 @@ impl FileScanConfigBuilder { self } - /// Set the file source for scanning files. - /// - /// This method allows you to change the file source implementation (e.g. ParquetSource, CsvSource, etc.) - /// after the builder has been created. - pub fn with_source(mut self, file_source: Arc) -> Self { - self.file_source = file_source; - self - } - /// Set the columns on which to project the data. Indexes that are higher than the /// number of columns of `file_schema` refer to `table_partition_cols`. pub fn with_projection(mut self, projection: Option>) -> Self { @@ -343,7 +336,7 @@ impl FileScanConfigBuilder { /// Set the estimated overall statistics of the files, taking `filters` into account. /// Defaults to [`Statistics::new_unknown`]. pub fn with_statistics(mut self, statistics: Statistics) -> Self { - self.statistics = Some(statistics); + self.statistics = statistics; self } @@ -421,6 +414,19 @@ impl FileScanConfigBuilder { self } + pub fn with_schema_adapter( + mut self, + schema_adapter: Option>, + ) -> Self { + self.schema_adapter_factory = schema_adapter; + self + } + + pub fn with_metrics(mut self, metrics: ExecutionPlanMetricsSet) -> Self { + self.metrics = metrics; + self + } + /// Build the final [`FileScanConfig`] with all the configured settings. 
/// /// This method takes ownership of the builder and returns the constructed `FileScanConfig`. @@ -429,7 +435,6 @@ impl FileScanConfigBuilder { let Self { object_store_url, file_schema, - file_source, limit, projection, table_partition_cols, @@ -440,16 +445,13 @@ impl FileScanConfigBuilder { file_compression_type, new_lines_in_values, batch_size, - expr_adapter_factory: expr_adapter, + expr_adapter_factory, + schema_adapter_factory, + metrics, } = self; let constraints = constraints.unwrap_or_default(); - let statistics = - statistics.unwrap_or_else(|| Statistics::new_unknown(&file_schema)); - let file_source = file_source - .with_statistics(statistics.clone()) - .with_schema(Arc::clone(&file_schema)); let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); @@ -457,7 +459,6 @@ impl FileScanConfigBuilder { FileScanConfig { object_store_url, file_schema, - file_source, limit, projection, table_partition_cols, @@ -467,7 +468,10 @@ impl FileScanConfigBuilder { file_compression_type, new_lines_in_values, batch_size, - expr_adapter_factory: expr_adapter, + expr_adapter_factory, + projected_statistics: statistics, + schema_adapter_factory, + metrics, } } } @@ -477,9 +481,8 @@ impl From for FileScanConfigBuilder { Self { object_store_url: config.object_store_url, file_schema: config.file_schema, - file_source: Arc::::clone(&config.file_source), file_groups: config.file_groups, - statistics: config.file_source.statistics().ok(), + statistics: config.projected_statistics, output_ordering: config.output_ordering, file_compression_type: Some(config.file_compression_type), new_lines_in_values: Some(config.new_lines_in_values), @@ -489,181 +492,8 @@ impl From for FileScanConfigBuilder { constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, - } - } -} - -impl DataSource for FileScanConfig { - fn open( - &self, - partition: usize, - context: Arc, - ) -> Result { - let object_store = context.runtime_env().object_store(&self.object_store_url)?; - let batch_size = self - .batch_size - .unwrap_or_else(|| context.session_config().batch_size()); - - let source = self - .file_source - .with_batch_size(batch_size) - .with_projection(self); - - let opener = source.create_file_opener(object_store, self, partition); - - let stream = FileStream::new(self, partition, opener, source.metrics())?; - Ok(Box::pin(cooperative(stream))) - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { - let schema = self.projected_schema(); - let orderings = get_projected_output_ordering(self, &schema); - - write!(f, "file_groups=")?; - FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?; - - if !schema.fields().is_empty() { - write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; - } - - if let Some(limit) = self.limit { - write!(f, ", limit={limit}")?; - } - - display_orderings(f, &orderings)?; - - if !self.constraints.is_empty() { - write!(f, ", {}", self.constraints)?; - } - - self.fmt_file_source(t, f) - } - DisplayFormatType::TreeRender => { - writeln!(f, "format={}", self.file_source.file_type())?; - self.file_source.fmt_extra(t, f)?; - let num_files = self.file_groups.iter().map(|fg| fg.len()).sum::(); - writeln!(f, "files={num_files}")?; - Ok(()) - } - } - } - - /// If supported by the underlying 
[`FileSource`], redistribute files across partitions according to their size. - fn repartitioned( - &self, - target_partitions: usize, - repartition_file_min_size: usize, - output_ordering: Option, - ) -> Result>> { - let source = self.file_source.repartitioned( - target_partitions, - repartition_file_min_size, - output_ordering, - self, - )?; - - Ok(source.map(|s| Arc::new(s) as _)) - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) - } - - fn eq_properties(&self) -> EquivalenceProperties { - let (schema, constraints, _, orderings) = self.project(); - EquivalenceProperties::new_with_orderings(schema, orderings) - .with_constraints(constraints) - } - - fn scheduling_type(&self) -> SchedulingType { - SchedulingType::Cooperative - } - - fn statistics(&self) -> Result { - Ok(self.projected_stats()) - } - - fn with_fetch(&self, limit: Option) -> Option> { - let source = FileScanConfigBuilder::from(self.clone()) - .with_limit(limit) - .build(); - Some(Arc::new(source)) - } - - fn fetch(&self) -> Option { - self.limit - } - - fn metrics(&self) -> ExecutionPlanMetricsSet { - self.file_source.metrics().clone() - } - - fn try_swapping_with_projection( - &self, - projection: &ProjectionExec, - ) -> Result>> { - // This process can be moved into CsvExec, but it would be an overlap of their responsibility. - - // Must be all column references, with no table partition columns (which can not be projected) - let partitioned_columns_in_proj = projection.expr().iter().any(|(expr, _)| { - expr.as_any() - .downcast_ref::() - .map(|expr| expr.index() >= self.file_schema.fields().len()) - .unwrap_or(false) - }); - - // If there is any non-column or alias-carrier expression, Projection should not be removed. 
- let no_aliases = all_alias_free_columns(projection.expr()); - - Ok((no_aliases && !partitioned_columns_in_proj).then(|| { - let file_scan = self.clone(); - let source = Arc::clone(&file_scan.file_source); - let new_projections = new_projections_for_columns( - projection, - &file_scan - .projection - .clone() - .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect()), - ); - DataSourceExec::from_data_source( - FileScanConfigBuilder::from(file_scan) - // Assign projected statistics to source - .with_projection(Some(new_projections)) - .with_source(source) - .build(), - ) as _ - })) - } - - fn try_pushdown_filters( - &self, - filters: Vec>, - config: &ConfigOptions, - ) -> Result>> { - let result = self.file_source.try_pushdown_filters(filters, config)?; - match result.updated_node { - Some(new_file_source) => { - let file_scan_config = FileScanConfigBuilder::from(self.clone()) - .with_source(new_file_source) - .build(); - Ok(FilterPushdownPropagation { - filters: result.filters, - updated_node: Some(Arc::new(file_scan_config) as _), - }) - } - None => { - // If the file source does not support filter pushdown, return the original config - Ok(FilterPushdownPropagation { - filters: result.filters, - updated_node: None, - }) - } + metrics: config.metrics, + schema_adapter_factory: config.schema_adapter_factory, } } } @@ -679,14 +509,12 @@ impl FileScanConfig { } pub fn projected_stats(&self) -> Statistics { - let statistics = self.file_source.statistics().unwrap(); - let table_cols_stats = self .projection_indices() .into_iter() .map(|idx| { if idx < self.file_schema.fields().len() { - statistics.column_statistics[idx].clone() + self.projected_statistics.column_statistics[idx].clone() } else { // TODO provide accurate stat for partition column (#1186) ColumnStatistics::new_unknown() @@ -695,9 +523,9 @@ impl FileScanConfig { .collect(); Statistics { - num_rows: statistics.num_rows, + num_rows: self.projected_statistics.num_rows, // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - total_byte_size: statistics.total_byte_size, + total_byte_size: self.projected_statistics.total_byte_size, column_statistics: table_cols_stats, } } @@ -746,7 +574,7 @@ impl FileScanConfig { return ( Arc::clone(&self.file_schema), self.constraints.clone(), - self.file_source.statistics().unwrap().clone(), + self.projected_statistics.clone(), self.output_ordering.clone(), ); } @@ -958,16 +786,11 @@ impl FileScanConfig { .collect()) } - /// Write the data_type based on file_source - fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - write!(f, ", file_type={}", self.file_source.file_type())?; - self.file_source.fmt_extra(t, f) - } - - /// Returns the file_source - pub fn file_source(&self) -> &Arc { - &self.file_source - } + // /// Write the data_type based on file_source + // fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { + // write!(f, ", file_type={}", self.file_source.file_type())?; + // self.file_source.fmt_extra(t, f) + // } } impl Debug for FileScanConfig { @@ -975,11 +798,8 @@ impl Debug for FileScanConfig { write!(f, "FileScanConfig {{")?; write!(f, "object_store_url={:?}, ", self.object_store_url)?; - write!( - f, - "statistics={:?}, ", - self.file_source.statistics().unwrap() - )?; + write!(f, "metrics={:?}, ", self.metrics)?; + write!(f, "statistics={:?}, ", self.projected_statistics)?; DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?; write!(f, "}}") @@ -1319,7 +1139,7 @@ fn create_output_array( /// 
/// DataSourceExec ///``` -fn get_projected_output_ordering( +pub fn get_projected_output_ordering( base_config: &FileScanConfig, projected_schema: &SchemaRef, ) -> Vec { @@ -1407,6 +1227,8 @@ pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue { #[cfg(test)] mod tests { use super::*; + use crate::file::FileSource as _; + use crate::source::DataSource; use crate::{ generate_test_files, test_util::MockSource, tests::aggr_test_schema, verify_sort_integrity, @@ -1569,8 +1391,10 @@ mod tests { to_partition_cols(partition_cols.clone()), ); - let source_statistics = conf.file_source.statistics().unwrap(); - let conf_stats = conf.statistics().unwrap(); + let file_source = MockSource::new(conf); + + let source_statistics = file_source.projected_statistics(); + let conf_stats = file_source.statistics().unwrap(); // projection should be reflected in the file source statistics assert_eq!(conf_stats.num_rows, Precision::Inexact(3)); @@ -1582,7 +1406,7 @@ mod tests { assert_eq!(source_statistics, statistics); assert_eq!(source_statistics.column_statistics.len(), 3); - let proj_schema = conf.projected_schema(); + let proj_schema = file_source.config().projected_schema(); // created a projector for that projected schema let mut proj = PartitionColumnProjector::new( proj_schema, @@ -2077,7 +1901,6 @@ mod tests { FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), file_schema, - Arc::new(MockSource::default()), ) .with_projection(projection) .with_statistics(statistics) @@ -2120,13 +1943,11 @@ mod tests { fn test_file_scan_config_builder() { let file_schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); // Create a builder with required parameters let builder = FileScanConfigBuilder::new( object_store_url.clone(), Arc::clone(&file_schema), - Arc::clone(&file_source), ); // Build with various configurations @@ -2176,16 +1997,16 @@ mod tests { fn test_file_scan_config_builder_defaults() { let file_schema = aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); // Create a builder with only required parameters and build without any additional configurations let config = FileScanConfigBuilder::new( object_store_url.clone(), Arc::clone(&file_schema), - Arc::clone(&file_source), ) .build(); + let file_source = MockSource::new(config.clone()); + // Verify default values assert_eq!(config.object_store_url, object_store_url); assert_eq!(config.file_schema, file_schema); @@ -2203,23 +2024,18 @@ mod tests { // Verify statistics are set to unknown assert_eq!( - config.file_source.statistics().unwrap().num_rows, + file_source.projected_statistics().num_rows, Precision::Absent ); assert_eq!( - config.file_source.statistics().unwrap().total_byte_size, + file_source.projected_statistics().total_byte_size, Precision::Absent ); assert_eq!( - config - .file_source - .statistics() - .unwrap() - .column_statistics - .len(), + file_source.projected_statistics().column_statistics.len(), file_schema.fields().len() ); - for stat in config.file_source.statistics().unwrap().column_statistics { + for stat in file_source.projected_statistics().column_statistics { assert_eq!(stat.distinct_count, Precision::Absent); assert_eq!(stat.min_value, Precision::Absent); assert_eq!(stat.max_value, Precision::Absent); @@ -2231,7 +2047,6 @@ mod tests { fn test_file_scan_config_builder_new_from() { let schema = 
aggr_test_schema(); let object_store_url = ObjectStoreUrl::parse("test:///").unwrap(); - let file_source: Arc = Arc::new(MockSource::default()); let partition_cols = vec![Field::new( "date", wrap_partition_type_in_dict(DataType::Utf8), @@ -2240,18 +2055,15 @@ mod tests { let file = PartitionedFile::new("test_file.parquet", 100); // Create a config with non-default values - let original_config = FileScanConfigBuilder::new( - object_store_url.clone(), - Arc::clone(&schema), - Arc::clone(&file_source), - ) - .with_projection(Some(vec![0, 2])) - .with_limit(Some(10)) - .with_table_partition_cols(partition_cols.clone()) - .with_file(file.clone()) - .with_constraints(Constraints::default()) - .with_newlines_in_values(true) - .build(); + let original_config = + FileScanConfigBuilder::new(object_store_url.clone(), Arc::clone(&schema)) + .with_projection(Some(vec![0, 2])) + .with_limit(Some(10)) + .with_table_partition_cols(partition_cols.clone()) + .with_file(file.clone()) + .with_constraints(Constraints::default()) + .with_newlines_in_values(true) + .build(); // Create a new builder from the config let new_builder = FileScanConfigBuilder::from(original_config); diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs index 868b980b6476..cecfd5980b98 100644 --- a/datafusion/datasource/src/file_stream.rs +++ b/datafusion/datasource/src/file_stream.rs @@ -536,7 +536,6 @@ mod tests { use crate::file_meta::FileMeta; use crate::file_stream::{FileOpenFuture, FileOpener, FileStream, OnError}; - use crate::test_util::MockSource; use arrow::array::RecordBatch; use arrow::datatypes::Schema; @@ -664,11 +663,11 @@ mod tests { let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), file_schema, - Arc::new(MockSource::default()), ) .with_file_group(file_group) .with_limit(self.limit) .build(); + let metrics_set = ExecutionPlanMetricsSet::new(); let file_stream = FileStream::new(&config, 0, Arc::new(self.opener), &metrics_set) diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index 673c1b9dd45d..89c831168474 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -234,6 +234,10 @@ impl DataSource for MemorySourceConfig { }) .transpose() } + + fn as_file_source(&self) -> Option> { + None + } } impl MemorySourceConfig { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 153d03b3ab49..937babfc8769 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -35,7 +35,7 @@ use datafusion_physical_plan::{ }; use itertools::Itertools; -use crate::file_scan_config::FileScanConfig; +use crate::file::FileSource; use datafusion_common::config::ConfigOptions; use datafusion_common::{Constraints, Result, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -128,6 +128,9 @@ pub trait DataSource: Send + Sync + Debug { context: Arc, ) -> Result; fn as_any(&self) -> &dyn Any; + + fn as_file_source(&self) -> Option>; + /// Format this source for display in explain plans fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result; @@ -291,10 +294,9 @@ impl ExecutionPlan for DataSourceExec { fn partition_statistics(&self, partition: Option) -> Result { if let Some(partition) = partition { let mut statistics = Statistics::new_unknown(&self.schema()); - if let Some(file_config) = - self.data_source.as_any().downcast_ref::() - { - if let Some(file_group) = 
file_config.file_groups.get(partition) { + if let Some(file_source) = self.data_source.as_file_source() { + if let Some(file_group) = file_source.config().file_groups.get(partition) + { if let Some(stat) = file_group.file_statistics(None) { statistics = stat.clone(); } @@ -460,24 +462,6 @@ impl DataSourceExec { ) .with_scheduling_type(data_source.scheduling_type()) } - - /// Downcast the `DataSourceExec`'s `data_source` to a specific file source - /// - /// Returns `None` if - /// 1. the datasource is not scanning files (`FileScanConfig`) - /// 2. The [`FileScanConfig::file_source`] is not of type `T` - pub fn downcast_to_file_source(&self) -> Option<(&FileScanConfig, &T)> { - self.data_source() - .as_any() - .downcast_ref::() - .and_then(|file_scan_conf| { - file_scan_conf - .file_source() - .as_any() - .downcast_ref::() - .map(|source| (file_scan_conf, source)) - }) - } } /// Create a new `DataSourceExec` from a `DataSource` diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs index e4a5114aa073..29ef4ddb0a8e 100644 --- a/datafusion/datasource/src/test_util.rs +++ b/datafusion/datasource/src/test_util.rs @@ -17,30 +17,32 @@ use crate::{ file::FileSource, file_scan_config::FileScanConfig, file_stream::FileOpener, - schema_adapter::SchemaAdapterFactory, + source::DataSource, }; use std::sync::Arc; -use arrow::datatypes::{Schema, SchemaRef}; -use datafusion_common::{Result, Statistics}; +use arrow::datatypes::Schema; +use datafusion_common::Result; use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; /// Minimal [`crate::file::FileSource`] implementation for use in tests. -#[derive(Clone, Default)] +#[derive(Debug, Clone)] pub(crate) struct MockSource { - metrics: ExecutionPlanMetricsSet, - projected_statistics: Option, - schema_adapter_factory: Option>, + config: FileScanConfig, +} + +impl MockSource { + pub fn new(config: FileScanConfig) -> Self { + Self { config } + } } impl FileSource for MockSource { fn create_file_opener( &self, _object_store: Arc, - _base_config: &FileScanConfig, _partition: usize, ) -> Arc { unimplemented!() @@ -50,52 +52,23 @@ impl FileSource for MockSource { self } - fn with_batch_size(&self, _batch_size: usize) -> Arc { - Arc::new(Self { ..self.clone() }) - } - - fn with_schema(&self, _schema: SchemaRef) -> Arc { - Arc::new(Self { ..self.clone() }) - } - - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) - } - - fn with_statistics(&self, statistics: Statistics) -> Arc { - let mut source = self.clone(); - source.projected_statistics = Some(statistics); - Arc::new(source) - } - - fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics + fn file_type(&self) -> &str { + "mock" } - fn statistics(&self) -> Result { - Ok(self - .projected_statistics - .as_ref() - .expect("projected_statistics must be set") - .clone()) + fn config(&self) -> &FileScanConfig { + &self.config } - fn file_type(&self) -> &str { - "mock" - } + fn with_config(&self, config: FileScanConfig) -> Arc { + let mut this = self.clone(); + this.config = config; - fn with_schema_adapter_factory( - &self, - schema_adapter_factory: Arc, - ) -> Result> { - Ok(Arc::new(Self { - schema_adapter_factory: Some(schema_adapter_factory), - ..self.clone() - })) + Arc::new(this) } - fn schema_adapter_factory(&self) -> Option> { - self.schema_adapter_factory.clone() + fn as_data_source(&self) -> Arc { + 
Arc::new(self.clone()) } } diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 39ad52d46a80..d20e51d1565c 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -34,7 +34,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::listing::{FileRange, ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ - FileGroup, FileScanConfig, FileScanConfigBuilder, FileSinkConfig, FileSource, + FileGroup, FileScanConfig, FileScanConfigBuilder, FileSinkConfig, }; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::WindowFunctionDefinition; @@ -490,7 +490,6 @@ pub fn parse_protobuf_file_scan_config( proto: &protobuf::FileScanExecConf, ctx: &SessionContext, codec: &dyn PhysicalExtensionCodec, - file_source: Arc, ) -> Result { let schema: Arc = parse_protobuf_file_scan_schema(proto)?; let projection = proto @@ -543,7 +542,7 @@ pub fn parse_protobuf_file_scan_config( output_ordering.extend(LexOrdering::new(sort_exprs)); } - let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) + let config = FileScanConfigBuilder::new(object_store_url, file_schema) .with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index fb86e380557f..a2c6bc338dc3 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -50,7 +50,7 @@ use datafusion::datasource::physical_plan::AvroSource; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::physical_plan::{ - CsvSource, FileScanConfig, FileScanConfigBuilder, JsonSource, + CsvSource, FileScanConfigBuilder, FileSource, JsonSource, }; use datafusion::datasource::sink::DataSinkExec; use datafusion::datasource::source::DataSourceExec; @@ -646,26 +646,25 @@ impl protobuf::PhysicalPlanNode { None }; - let source = Arc::new( - CsvSource::new( - scan.has_header, - str_to_byte(&scan.delimiter, "delimiter")?, - 0, - ) - .with_escape(escape) - .with_comment(comment), - ); - let conf = FileScanConfigBuilder::from(parse_protobuf_file_scan_config( scan.base_conf.as_ref().unwrap(), ctx, extension_codec, - source, )?) 
.with_newlines_in_values(scan.newlines_in_values) .with_file_compression_type(FileCompressionType::UNCOMPRESSED) .build(); - Ok(DataSourceExec::from_data_source(conf)) + + Ok(DataSourceExec::from_data_source( + CsvSource::new( + scan.has_header, + str_to_byte(&scan.delimiter, "delimiter")?, + 0, + conf, + ) + .with_escape(escape) + .with_comment(comment), + )) } fn try_into_json_scan_physical_plan( @@ -679,9 +678,9 @@ impl protobuf::PhysicalPlanNode { scan.base_conf.as_ref().unwrap(), ctx, extension_codec, - Arc::new(JsonSource::new()), )?; - Ok(DataSourceExec::from_data_source(scan_conf)) + + Ok(DataSourceExec::from_data_source(JsonSource::new(scan_conf))) } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] @@ -728,18 +727,17 @@ impl protobuf::PhysicalPlanNode { if let Some(table_options) = scan.parquet_options.as_ref() { options = table_options.try_into()?; } - let mut source = ParquetSource::new(options); + + let base_config = + parse_protobuf_file_scan_config(base_conf, ctx, extension_codec)?; + + let mut source = ParquetSource::new(options, base_config); if let Some(predicate) = predicate { source = source.with_predicate(predicate); } - let base_config = parse_protobuf_file_scan_config( - base_conf, - ctx, - extension_codec, - Arc::new(source), - )?; - Ok(DataSourceExec::from_data_source(base_config)) + + Ok(DataSourceExec::from_data_source(source)) } #[cfg(not(feature = "parquet"))] panic!("Unable to process a Parquet PhysicalPlan when `parquet` feature is not enabled") @@ -759,8 +757,9 @@ impl protobuf::PhysicalPlanNode { scan.base_conf.as_ref().unwrap(), ctx, extension_codec, - Arc::new(AvroSource::new()), )?; + + let source = AvroSource::new(conf); Ok(DataSourceExec::from_data_source(conf)) } #[cfg(not(feature = "avro"))] @@ -2315,67 +2314,55 @@ impl protobuf::PhysicalPlanNode { extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { let data_source = data_source_exec.data_source(); - if let Some(maybe_csv) = data_source.as_any().downcast_ref::() { - let source = maybe_csv.file_source(); - if let Some(csv_config) = source.as_any().downcast_ref::() { - return Ok(Some(protobuf::PhysicalPlanNode { - physical_plan_type: Some(PhysicalPlanType::CsvScan( - protobuf::CsvScanExecNode { - base_conf: Some(serialize_file_scan_config( - maybe_csv, - extension_codec, - )?), - has_header: csv_config.has_header(), - delimiter: byte_to_string( - csv_config.delimiter(), - "delimiter", - )?, - quote: byte_to_string(csv_config.quote(), "quote")?, - optional_escape: if let Some(escape) = csv_config.escape() { - Some( - protobuf::csv_scan_exec_node::OptionalEscape::Escape( - byte_to_string(escape, "escape")?, - ), - ) - } else { - None - }, - optional_comment: if let Some(comment) = csv_config.comment() - { - Some(protobuf::csv_scan_exec_node::OptionalComment::Comment( - byte_to_string(comment, "comment")?, - )) - } else { - None - }, - newlines_in_values: maybe_csv.newlines_in_values(), + + if let Some(csv_source) = data_source.as_any().downcast_ref::() { + return Ok(Some(protobuf::PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::CsvScan( + protobuf::CsvScanExecNode { + base_conf: Some(serialize_file_scan_config( + csv_source.config(), + extension_codec, + )?), + has_header: csv_source.has_header(), + delimiter: byte_to_string(csv_source.delimiter(), "delimiter")?, + quote: byte_to_string(csv_source.quote(), "quote")?, + optional_escape: if let Some(escape) = csv_source.escape() { + Some(protobuf::csv_scan_exec_node::OptionalEscape::Escape( + byte_to_string(escape, 
"escape")?, + )) + } else { + None }, - )), - })); - } + optional_comment: if let Some(comment) = csv_source.comment() { + Some(protobuf::csv_scan_exec_node::OptionalComment::Comment( + byte_to_string(comment, "comment")?, + )) + } else { + None + }, + newlines_in_values: csv_source.config().newlines_in_values(), + }, + )), + })); } - if let Some(scan_conf) = data_source.as_any().downcast_ref::() { - let source = scan_conf.file_source(); - if let Some(_json_source) = source.as_any().downcast_ref::() { - return Ok(Some(protobuf::PhysicalPlanNode { - physical_plan_type: Some(PhysicalPlanType::JsonScan( - protobuf::JsonScanExecNode { - base_conf: Some(serialize_file_scan_config( - scan_conf, - extension_codec, - )?), - }, - )), - })); - } + if let Some(json_source) = data_source.as_any().downcast_ref::() { + return Ok(Some(protobuf::PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::JsonScan( + protobuf::JsonScanExecNode { + base_conf: Some(serialize_file_scan_config( + json_source.config(), + extension_codec, + )?), + }, + )), + })); } #[cfg(feature = "parquet")] - if let Some((maybe_parquet, conf)) = - data_source_exec.downcast_to_file_source::() + if let Some(parquet_source) = data_source.as_any().downcast_ref::() { - let predicate = conf + let predicate = parquet_source .predicate() .map(|pred| serialize_physical_expr(pred, extension_codec)) .transpose()?; @@ -2383,31 +2370,30 @@ impl protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ParquetScan( protobuf::ParquetScanExecNode { base_conf: Some(serialize_file_scan_config( - maybe_parquet, + parquet_source.config(), extension_codec, )?), predicate, - parquet_options: Some(conf.table_parquet_options().try_into()?), + parquet_options: Some( + parquet_source.table_parquet_options().try_into()?, + ), }, )), })); } #[cfg(feature = "avro")] - if let Some(maybe_avro) = data_source.as_any().downcast_ref::() { - let source = maybe_avro.file_source(); - if source.as_any().downcast_ref::().is_some() { - return Ok(Some(protobuf::PhysicalPlanNode { - physical_plan_type: Some(PhysicalPlanType::AvroScan( - protobuf::AvroScanExecNode { - base_conf: Some(serialize_file_scan_config( - maybe_avro, - extension_codec, - )?), - }, - )), - })); - } + if let Some(avro_source) = data_source.as_any().downcast_ref::() { + return Ok(Some(protobuf::PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::AvroScan( + protobuf::AvroScanExecNode { + base_conf: Some(serialize_file_scan_config( + avro_source.config(), + extension_codec, + )?), + }, + )), + })); } Ok(None) diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 64960e39f75d..ae7e04b31cff 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -523,7 +523,7 @@ pub fn serialize_file_scan_config( Ok(protobuf::FileScanExecConf { file_groups, - statistics: Some((&conf.file_source.statistics().unwrap()).into()), + statistics: Some((&conf.projected_statistics).into()), limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }), projection: conf .projection diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 1547b7087d5e..f0a6e5e2b3e9 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -871,27 +871,24 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { let mut options = 
TableParquetOptions::new(); options.global.pushdown_filters = true; - let file_source = Arc::new(ParquetSource::new(options).with_predicate(predicate)); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + .with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( + vec![Field::new("col", DataType::Utf8, false)], + ))), + }) + .build(); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ - Field::new("col", DataType::Utf8, false), - ]))), - }) - .build(); - - roundtrip_test(DataSourceExec::from_data_source(scan_config)) + let file_source = ParquetSource::new(options, scan_config).with_predicate(predicate); + + roundtrip_test(DataSourceExec::from_data_source(file_source)) } #[tokio::test] @@ -902,23 +899,21 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { vec![wrap_partition_value_in_dict(ScalarValue::Int64(Some(0)))]; let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); - let file_source = Arc::new(ParquetSource::default()); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema, - file_source, - ) - .with_projection(Some(vec![0, 1])) - .with_file_group(FileGroup::new(vec![file_group])) - .with_table_partition_cols(vec![Field::new( - "part".to_string(), - wrap_partition_type_in_dict(DataType::Int16), - false, - )]) - .with_newlines_in_values(false) - .build(); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema) + .with_projection(Some(vec![0, 1])) + .with_file_group(FileGroup::new(vec![file_group])) + .with_table_partition_cols(vec![Field::new( + "part".to_string(), + wrap_partition_type_in_dict(DataType::Int16), + false, + )]) + .with_newlines_in_values(false) + .build(); - roundtrip_test(DataSourceExec::from_data_source(scan_config)) + let file_source = ParquetSource::new(TableParquetOptions::default(), scan_config); + + roundtrip_test(DataSourceExec::from_data_source(file_source)) } #[test] @@ -930,26 +925,23 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { inner: Arc::new(Column::new("col", 1)), }); - let file_source = - Arc::new(ParquetSource::default().with_predicate(custom_predicate_expr)); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + .with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( + vec![Field::new("col", DataType::Utf8, false)], + ))), + }) + .build(); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - 
"/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ - Field::new("col", DataType::Utf8, false), - ]))), - }) - .build(); + let file_source = ParquetSource::new(TableParquetOptions::default(), scan_config) + .with_predicate(custom_predicate_expr); #[derive(Debug, Clone, Eq)] struct CustomPredicateExpr { @@ -1060,7 +1052,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { } } - let exec_plan = DataSourceExec::from_data_source(scan_config); + let exec_plan = DataSourceExec::from_data_source(file_source); let ctx = SessionContext::new(); roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?; @@ -1788,24 +1780,23 @@ async fn roundtrip_projection_source() -> Result<()> { let statistics = Statistics::new_unknown(&schema); - let file_source = ParquetSource::default().with_statistics(statistics.clone()); - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - schema.clone(), - file_source, - ) - .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )])]) - .with_statistics(statistics) - .with_projection(Some(vec![0, 1, 2])) - .build(); + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema.clone()) + .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )])]) + .with_statistics(statistics.clone()) + .with_projection(Some(vec![0, 1, 2])) + .build(); + + let file_source = ParquetSource::new(TableParquetOptions::default(), scan_config) + .with_projected_statistics(statistics); let filter = Arc::new( FilterExec::try_new( Arc::new(BinaryExpr::new(col("c", &schema)?, Operator::Eq, lit(1))), - DataSourceExec::from_data_source(scan_config), + Arc::new(DataSourceExec::new(file_source.as_data_source())), )? 
.with_projection(Some(vec![0, 1]))?, ); diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 4990054ac7fc..74a701c153a7 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::{not_impl_err, substrait_err}; +use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ @@ -53,7 +54,6 @@ pub async fn from_substrait_rel( ) -> Result> { let mut base_config_builder; - let source = Arc::new(ParquetSource::default()); match &rel.rel_type { Some(RelType::Read(read)) => { if read.filter.is_some() || read.best_effort_filter.is_some() { @@ -83,7 +83,6 @@ pub async fn from_substrait_rel( base_config_builder = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), Arc::new(Schema::new(fields)), - source, ); } Err(e) => return Err(e), @@ -156,10 +155,10 @@ pub async fn from_substrait_rel( } } - Ok( - DataSourceExec::from_data_source(base_config_builder.build()) - as Arc, - ) + Ok(DataSourceExec::from_data_source(ParquetSource::new( + TableParquetOptions::default(), + base_config_builder.build(), + )) as Arc) } _ => not_impl_err!( "Only LocalFile reads are supported when parsing physical" diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index cb725a7277fd..566f57162211 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -27,7 +27,7 @@ use datafusion::datasource::source::DataSourceExec; use datafusion::error::{DataFusionError, Result}; use datafusion::physical_plan::{displayable, ExecutionPlan}; -use datafusion::datasource::physical_plan::ParquetSource; +use datafusion::datasource::physical_plan::{FileSource, ParquetSource}; use substrait::proto::expression::mask_expression::{StructItem, StructSelect}; use substrait::proto::expression::MaskExpression; use substrait::proto::r#type::{ @@ -52,9 +52,11 @@ pub fn to_substrait_rel( ), ) -> Result> { if let Some(data_source_exec) = plan.as_any().downcast_ref::() { - if let Some((file_config, _)) = - data_source_exec.downcast_to_file_source::() + if let Some(parquet_source) = + data_source_exec.as_any().downcast_ref::() { + let file_config = parquet_source.config(); + let mut substrait_files = vec![]; for (partition_index, files) in file_config.file_groups.iter().enumerate() { for file in files.iter() { diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index 64599465f96f..79aa1f08dbea 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::sync::Arc; use datafusion::arrow::datatypes::Schema; +use datafusion::config::TableParquetOptions; use datafusion::dataframe::DataFrame; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -35,12 +36,9 @@ use substrait::proto::extensions; #[tokio::test] async fn parquet_exec() -> Result<()> { - let source = Arc::new(ParquetSource::default()); - let scan_config = FileScanConfigBuilder::new( 
ObjectStoreUrl::local_filesystem(), Arc::new(Schema::empty()), - source, ) .with_file_groups(vec![ FileGroup::new(vec![PartitionedFile::new( @@ -53,8 +51,9 @@ async fn parquet_exec() -> Result<()> { )]), ]) .build(); - let parquet_exec: Arc = - DataSourceExec::from_data_source(scan_config); + + let source = ParquetSource::new(TableParquetOptions::default(), scan_config); + let parquet_exec: Arc = DataSourceExec::from_data_source(source); let mut extension_info: ( Vec, From 93b92636a96f2da978058ef9336b0b73f686cd9f Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Tue, 19 Aug 2025 15:15:52 -0400 Subject: [PATCH 2/2] Fix clippy --- datafusion/datasource/src/file_scan_config.rs | 5 ++--- datafusion/proto/src/physical_plan/mod.rs | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 38f6d885709e..3e43ae9dd971 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -183,11 +183,10 @@ pub struct FileScanConfig { /// Expression adapter used to adapt filters and projections that are pushed down into the scan /// from the logical schema to the physical schema of the file. pub expr_adapter_factory: Option>, - + /// Optional metrics pub metrics: ExecutionPlanMetricsSet, + /// Optional user defined schema adapter pub schema_adapter_factory: Option>, - - /// pub projected_statistics: Statistics, } diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index a2c6bc338dc3..12483b68cea1 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -760,7 +760,7 @@ impl protobuf::PhysicalPlanNode { )?; let source = AvroSource::new(conf); - Ok(DataSourceExec::from_data_source(conf)) + Ok(DataSourceExec::from_data_source(source)) } #[cfg(not(feature = "avro"))] panic!("Unable to process a Avro PhysicalPlan when `avro` feature is not enabled")
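
Note for reviewers: the net effect of these two commits on call sites is easiest to see in one place. Below is a minimal sketch of the construction pattern the patch converges on, mirroring the updated roundtrip and substrait tests above; the schema, file path, and helper function name are illustrative only and are not part of the patch.

    use std::sync::Arc;
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::config::TableParquetOptions;
    use datafusion::datasource::listing::PartitionedFile;
    use datafusion::datasource::object_store::ObjectStoreUrl;
    use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
    use datafusion::datasource::source::DataSourceExec;
    use datafusion::physical_plan::ExecutionPlan;

    // Illustrative helper, not part of the patch.
    fn example_parquet_scan() -> Arc<dyn ExecutionPlan> {
        let schema = Arc::new(Schema::new(vec![Field::new(
            "c1",
            DataType::Int32,
            false,
        )]));

        // 1. Describe *what* to scan: files, projection, limit, statistics.
        //    After this patch the builder no longer takes a file source argument.
        let scan_config =
            FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema)
                .with_file(PartitionedFile::new("data.parquet".to_string(), 1024))
                .with_projection(Some(vec![0]))
                .build();

        // 2. Describe *how* to scan: the format-specific FileSource now owns the config.
        let source = ParquetSource::new(TableParquetOptions::default(), scan_config);

        // 3. Any FileSource is a DataSource via the blanket impl added in file.rs,
        //    so the source goes straight into DataSourceExec.
        DataSourceExec::from_data_source(source)
    }

The design consequence worth keeping in mind while reviewing: the old ownership relationship is inverted. FileScanConfig no longer holds an Arc of the file source; each source owns its FileScanConfig, and per-source state (projected statistics, metrics, the schema adapter factory) now lives in the config and is reached through FileSource::config(). As a result, the proto and substrait code downcast DataSourceExec's data source directly to the concrete source type (CsvSource, JsonSource, ParquetSource, AvroSource) instead of going through FileScanConfig::file_source().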