-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Push down preferred sorts into TableScan logical plan node
#17337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
fe3b000
971b6b3
7040011
aacfd3d
8a7f787
f74b172
0ac540c
4b64b18
09f3d72
28bd754
6630c84
d22ea39
aa47936
ad8afb6
622a042
5a1b3cf
f8aada1
1dc5c01
c2d9751
5bcc9ab
cbfb904
8864374
c75f5cf
37fb14d
0bb576c
5747070
e591080
840b695
5662ca5
9eff254
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,8 +41,9 @@ pub use plan::{ | |
| projection_schema, Aggregate, Analyze, ColumnUnnestList, DescribeTable, Distinct, | ||
| DistinctOn, EmptyRelation, Explain, ExplainOption, Extension, FetchType, Filter, | ||
| Join, JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, | ||
| Projection, RecursiveQuery, Repartition, SkipType, Sort, StringifiedPlan, Subquery, | ||
| SubqueryAlias, TableScan, ToStringifiedPlan, Union, Unnest, Values, Window, | ||
| Projection, RecursiveQuery, Repartition, ScanOrdering, SkipType, Sort, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addition of the |
||
| StringifiedPlan, Subquery, SubqueryAlias, TableScan, ToStringifiedPlan, Union, | ||
| Unnest, Values, Window, | ||
| }; | ||
| pub use statement::{ | ||
| Deallocate, Execute, Prepare, SetVariable, Statement, TransactionAccessMode, | ||
|
|
@@ -54,3 +55,5 @@ pub use datafusion_common::format::ExplainFormat; | |
| pub use display::display_schema; | ||
|
|
||
| pub use extension::{UserDefinedLogicalNode, UserDefinedLogicalNodeCore}; | ||
|
|
||
| pub use tree_node::LogicalPlanContext; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1757,6 +1757,7 @@ impl LogicalPlan { | |
| ref projection, | ||
| ref filters, | ||
| ref fetch, | ||
| ref ordering, | ||
| .. | ||
| }) => { | ||
| let projected_fields = match projection { | ||
|
|
@@ -1824,6 +1825,20 @@ impl LogicalPlan { | |
| write!(f, ", fetch={n}")?; | ||
| } | ||
|
|
||
| if let Some(ordering) = ordering { | ||
| if let Some(preferred_ordering) = &ordering.preferred_ordering { | ||
| write!( | ||
| f, | ||
| " preferred_ordering=[{}]", | ||
| preferred_ordering | ||
| .iter() | ||
| .map(|e| e.to_string()) | ||
| .collect::<Vec<_>>() | ||
| .join(", ") | ||
| )?; | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
| LogicalPlan::Projection(Projection { ref expr, .. }) => { | ||
|
|
@@ -2593,6 +2608,68 @@ impl PartialOrd for Window { | |
| } | ||
| } | ||
|
|
||
| /// Communicates the desired ordering of the output of a scan operation. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Most of this text is about the preferred_ordering field, so maybe it would be best moved closer there. I can imagine potentially adding other fields like |
||
| /// | ||
| /// Preferred orderings can potentially help DataFusion optimize queries, even in cases | ||
| /// when the output does not completely follow that order. This is information passed | ||
| /// to the scan about what might help. | ||
| /// | ||
| /// For example, a query with `ORDER BY time DESC LIMIT 10`, DataFusion's dynamic | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about also linking to the blog that explains this in more detail: https://datafusion.apache.org/blog/2025/09/10/dynamic-filters/ |
||
| /// predicates and TopK operator will work better if the data is roughly ordered by descending | ||
| /// time (more recent data first). | ||
| /// | ||
| /// Implementers of [`TableProvider`] should use this information to optimize the order in which data is output from the scan. | ||
| /// | ||
| /// It is a hint and not a requirement: | ||
| /// - If this information is completely ignored, e.g. data is scanned randomly, the query will still be correct because a sort will be applied to the data. | ||
| /// - Partially ordered data will also be re-sorted but this may result in optimizations like early stopping, additional data pruning, reduced memory usage during the sort, etc. | ||
| /// - If the scan produces exactly the requested ordering, and sets it's properties to reflect this, upstream sorts may be optimized away. | ||
| /// | ||
| /// Actually removing unnecessary sorts is done at the physical plan level: logical operators like a join may or may not preserve ordering | ||
| /// depending on what physical operator is chosen (e.g. HashJoin vs. SortMergeJoin). | ||
| /// If you as a [`TableProvider`] implementer would like to eliminiate unnecessary sorts you should make sure the [`ExecutionPlan`] | ||
| /// you produce reflects the ordering in it's properties. | ||
| /// | ||
| /// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html | ||
| /// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html | ||
| #[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Default)] | ||
| pub struct ScanOrdering { | ||
| /// Optional preferred ordering for the scan that matches the output order of upstream query nodes. | ||
| /// It is optional / best effort for the scan to produce this ordering. | ||
| /// If the scan produces this exact ordering and sets it's properties to reflect this upstream sorts may be optimized away. | ||
| /// Otherwise the sorts may remain in place but partial ordering may be exploited e.g. to do early stopping or reduce complexity of the sort. | ||
| /// Thus it is recommended for the scan to also do a best effort to produce partially sorted data if possible. | ||
| pub preferred_ordering: Option<Vec<SortExpr>>, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It comes down to what does a I would personally recommend making this non |
||
| } | ||
|
|
||
| impl ScanOrdering { | ||
| /// Attach a preferred ordering to the scan ordering. | ||
| /// See [`ScanOrdering`] for details on how this is used. | ||
| pub fn with_preferred_ordering(mut self, preferred_ordering: Vec<SortExpr>) -> Self { | ||
| self.preferred_ordering = Some(preferred_ordering); | ||
| self | ||
| } | ||
| } | ||
|
|
||
| impl Debug for ScanOrdering { | ||
| fn fmt(&self, f: &mut Formatter) -> fmt::Result { | ||
| let ordering_display = self | ||
| .preferred_ordering | ||
| .as_ref() | ||
| .map(|ordering| { | ||
| ordering | ||
| .iter() | ||
| .map(|e| e.to_string()) | ||
| .collect::<Vec<String>>() | ||
| .join(", ") | ||
| }) | ||
| .unwrap_or_else(|| "None".to_string()); | ||
| f.debug_struct("ScanOrdering") | ||
| .field("preferred_ordering", &ordering_display) | ||
| .finish_non_exhaustive() | ||
| } | ||
| } | ||
|
|
||
| /// Produces rows from a table provider by reference or from the context | ||
| #[derive(Clone)] | ||
| pub struct TableScan { | ||
|
|
@@ -2608,6 +2685,8 @@ pub struct TableScan { | |
| pub filters: Vec<Expr>, | ||
| /// Optional number of rows to read | ||
| pub fetch: Option<usize>, | ||
| /// Ordering for the scan | ||
| pub ordering: Option<ScanOrdering>, | ||
|
Comment on lines
+2688
to
+2689
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this just be |
||
| } | ||
|
|
||
| impl Debug for TableScan { | ||
|
|
@@ -2619,6 +2698,7 @@ impl Debug for TableScan { | |
| .field("projected_schema", &self.projected_schema) | ||
| .field("filters", &self.filters) | ||
| .field("fetch", &self.fetch) | ||
| .field("ordering", &self.ordering) | ||
| .finish_non_exhaustive() | ||
| } | ||
| } | ||
|
|
@@ -2630,6 +2710,7 @@ impl PartialEq for TableScan { | |
| && self.projected_schema == other.projected_schema | ||
| && self.filters == other.filters | ||
| && self.fetch == other.fetch | ||
| && self.ordering == other.ordering | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -2649,18 +2730,22 @@ impl PartialOrd for TableScan { | |
| pub filters: &'a Vec<Expr>, | ||
| /// Optional number of rows to read | ||
| pub fetch: &'a Option<usize>, | ||
| /// Ordering information passed from the query to the scan. | ||
| pub ordering: &'a Option<ScanOrdering>, | ||
| } | ||
| let comparable_self = ComparableTableScan { | ||
| table_name: &self.table_name, | ||
| projection: &self.projection, | ||
| filters: &self.filters, | ||
| fetch: &self.fetch, | ||
| ordering: &self.ordering, | ||
| }; | ||
| let comparable_other = ComparableTableScan { | ||
| table_name: &other.table_name, | ||
| projection: &other.projection, | ||
| filters: &other.filters, | ||
| fetch: &other.fetch, | ||
| ordering: &other.ordering, | ||
| }; | ||
| comparable_self | ||
| .partial_cmp(&comparable_other) | ||
|
|
@@ -2676,6 +2761,7 @@ impl Hash for TableScan { | |
| self.projected_schema.hash(state); | ||
| self.filters.hash(state); | ||
| self.fetch.hash(state); | ||
| self.ordering.hash(state); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -2729,8 +2815,16 @@ impl TableScan { | |
| projected_schema, | ||
| filters, | ||
| fetch, | ||
| ordering: None, | ||
| }) | ||
| } | ||
|
|
||
| /// Sets the ordering information for the scan. | ||
| /// See [`ScanOrdering`] for details on how this is used. | ||
| pub fn with_ordering(mut self, ordering: ScanOrdering) -> Self { | ||
| self.ordering = Some(ordering); | ||
| self | ||
| } | ||
| } | ||
|
|
||
| // Repartition the plan based on a partitioning scheme. | ||
|
|
@@ -4823,6 +4917,7 @@ mod tests { | |
| projected_schema: Arc::clone(&schema), | ||
| filters: vec![], | ||
| fetch: None, | ||
| ordering: None, | ||
| })); | ||
| let col = schema.field_names()[0].clone(); | ||
|
|
||
|
|
@@ -4853,6 +4948,7 @@ mod tests { | |
| projected_schema: Arc::clone(&unique_schema), | ||
| filters: vec![], | ||
| fetch: None, | ||
| ordering: None, | ||
| })); | ||
| let col = schema.field_names()[0].clone(); | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.