Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
199 commits
Select commit Hold shift + click to select a range
9707a8a
bump version and generate changelog
andygrove Nov 5, 2024
88f58bf
bump version and generate changelog
andygrove Nov 5, 2024
2d5364e
Downgrade tonic
matthewmturner Dec 23, 2024
2c35f17
[bug]: Fix wrong order by removal from plan (#13497)
akurmustafa Nov 24, 2024
608ee58
Correct return type for initcap scalar function with utf8view (#13909…
alamb Dec 28, 2024
3cc3fca
Update CHANGELOG
alamb Dec 28, 2024
5383d30
enforce_distribution: fix for limits getting lost
Max-Meldrum Dec 30, 2024
13f6aca
set default-features=false for datafusion in proto crate
Max-Meldrum Jan 7, 2025
d357c7a
Adding node_id patch to our fork
emgeee Sep 11, 2024
cbd3dbc
Changes to make streaming work
ameyc May 2, 2024
deecef1
only output node_id in display if it exists
Max-Meldrum Dec 11, 2024
57bf8d6
include projection in FilterExec::with_node_id
Max-Meldrum Jan 7, 2025
c431f0f
add missing with_fetch calls to with_node_id method
Max-Meldrum Jan 7, 2025
fa581d0
rework SortExec::with_node_id to not drop preserve_partitioning
Max-Meldrum Jan 8, 2025
555ef6b
set schema_force_view_types to false in ParquetOptions
Max-Meldrum Jan 9, 2025
0e3c9e0
Revert "enforce_distribution: fix for limits getting lost"
suremarc Jan 14, 2025
a4153bf
update sqllogictests after disabling view types
suremarc Jan 14, 2025
8ae4a95
fix fetch missed in EnforceDistribution
xudong963 Jan 15, 2025
1ae2702
fix enforcesorting missing fetch
xudong963 Jan 17, 2025
38f39f5
fix more fetch missing in enforcesorting
xudong963 Jan 17, 2025
f7740af
fix: fetch is missed in the EnforceSorting (#14192)
xudong963 Jan 20, 2025
22473d9
fix remaining test issues regarding with_node_id
Max-Meldrum Jan 23, 2025
f0f6e81
use new_utf8 instead of new_utf8view in page_pruning test as we have …
Max-Meldrum Jan 23, 2025
f3e7004
Expose more components from sqllogictest (#14249)
xudong963 Jan 23, 2025
c976a89
Extract useful methods from sqllogictest bin (#14267)
xudong963 Jan 25, 2025
ffff7a1
expose df sqllogictest error
xudong963 Jan 27, 2025
63bad11
update sqllogictest
xudong963 Jan 27, 2025
e3ea7d1
chore: Upgrade to `arrow`/`parquet` `54.1.0` and fix clippy/ci (#144…
alamb Feb 3, 2025
8f10fdf
Fix join type coercion (#14387) (#14454)
alamb Feb 3, 2025
755b26a
Support `Utf8View` to `numeric` coercion (#14377) (#14455)
alamb Feb 3, 2025
9d287bd
Update REGEXP_MATCH scalar function to support Utf8View (#14449) (#14…
alamb Feb 3, 2025
6146600
Fix regression list Type Coercion List with inner type struct which h…
alamb Feb 3, 2025
26058ac
Update changelog (#14460)
alamb Feb 3, 2025
6e1e0d1
fix datafusion-cli
xudong963 Feb 6, 2025
af26638
missing fetch after removing SPM
xudong963 Feb 10, 2025
d290676
Merge remote-tracking branch 'upstream/branch-44' into branch-44-toni…
xudong963 Feb 10, 2025
d518b51
update cargo toml
xudong963 Feb 10, 2025
e5431f1
make new_group_values public
xudong963 Feb 10, 2025
c103d08
cherry-pick upstream/14569
xudong963 Feb 12, 2025
e9fb062
fix EnforceDistribution
xudong963 Feb 24, 2025
51d0dea
Merge remote-tracking branch 'upstream/branch-45' into branch-44-toni…
xudong963 Feb 24, 2025
ee7b658
Merge remote-tracking branch 'upstream/branch-45'(with our fixes)
xudong963 Feb 24, 2025
3766da9
downgrade tonic
xudong963 Feb 24, 2025
2b5cec2
cherry-pick upstream/14569
xudong963 Feb 24, 2025
08b3ce0
public more parquet components
xudong963 Feb 28, 2025
8b3cd7b
Do not swap with projection when file is partitioned (#14956) (#14964)
alamb Mar 2, 2025
76d833a
Improve documentation for `DataSourceExec`, `FileScanConfig`, `DataSo…
alamb Mar 2, 2025
b494e97
Deprecate `Expr::Wildcard` (#14959) (#14976)
xudong963 Mar 3, 2025
65c8560
[branch-46] Update changelog for backports to 46.0.0 (#14977)
xudong963 Mar 3, 2025
ec4862f
Add note about upgrade guide into the release notes (#14979)
alamb Mar 3, 2025
d5ca830
Fix verification script and extended tests due to `rustup` changes (#…
alamb Mar 4, 2025
1c92803
upgrade tonic
xudong963 Mar 13, 2025
112e9eb
Update ring to v0.17.13 (#15063) (#15228)
alamb Mar 14, 2025
0877c99
Fix broken `serde` feature (#15124) (#15227)
alamb Mar 14, 2025
048a125
[branch-46] Fix wasm32 build on version 46 (#15229)
alamb Mar 14, 2025
68f2903
Update version to 46.0.1, add CHANGELOG (#15243)
xudong963 Mar 15, 2025
b8699d9
Merge remote-tracking branch 'upstream/branch-46' into branch-46-stream
xudong963 Mar 20, 2025
2e5b5e2
fix with_node_id and clippy
xudong963 Mar 20, 2025
3be582f
Fix invalid schema for unions in ViewTables (#15135)
Friede80 Mar 16, 2025
a28f2cd
Fix enforce_distribution and enforce_sorting missing fetch
xudong963 Apr 14, 2025
e443304
Final release note touchups (#15740)
alamb Apr 16, 2025
d0b0211
Merge remote-tracking branch 'upstream/branch-47' into branch-47-stream
xudong963 Apr 21, 2025
fe4a4ca
Upgrade DF47
xudong963 Apr 21, 2025
dfb339d
Fix: fetch is missing in plan_with_order_breaking_variants method
xudong963 Apr 23, 2025
656092e
Add fast path for optimize_projection (#15746)
xudong963 Apr 18, 2025
d2b8c15
Improve `simplify_expressions` rule (#15735)
xudong963 Apr 19, 2025
2d1062f
Speed up `optimize_projection` (#15787)
xudong963 Apr 23, 2025
738816d
Support inferring new predicates to push down
xudong963 Apr 24, 2025
d029200
Fix: `build_predicate_expression` method doesn't process `false` expr…
xudong963 May 12, 2025
378ce3b
Revert use file schema in parquet pruning (#16086)
adriangb May 21, 2025
c76c1f0
fix: [branch-48] Revert "Improve performance of constant aggregate wi…
andygrove Jun 6, 2025
b5dfdbe
feat: add metadata to literal expressions (#16170) (#16315)
andygrove Jun 7, 2025
33a32d4
[branch-48] Update CHANGELOG for latest 48.0.0 release (#16314)
alamb Jun 7, 2025
a13a6fe
Simplify filter predicates
xudong963 Jun 10, 2025
88c42dc
Merge remote-tracking branch 'upstream/branch-48' into branch-48-stream
xudong963 Jun 24, 2025
e5e5c48
Upgrade DF48
xudong963 Jun 24, 2025
6851d8e
Add the missing equivalence info for filter pushdown
liamzwbao Jul 4, 2025
054d193
48.0.1
xudong963 Jul 12, 2025
1ded6ef
[branch-49] Update version to `49.0.0`, add changelog (#16822)
alamb Jul 19, 2025
273d37a
chore: use `equals_datatype` for `BinaryExpr` (#16813) (#16847)
comphead Jul 22, 2025
afb9099
[branch-49] Final Changelog Tweaks (#16852)
alamb Jul 22, 2025
45dd3f9
Merge remote-tracking branch 'upstream/branch-49' into branch-49
xudong963 Aug 4, 2025
e4dd102
branch 49
xudong963 Aug 4, 2025
9cfb9cd
remove warning from every file open (#16968) (#17059)
mbutrovich Aug 6, 2025
f6ec4c3
#16994 Ensure CooperativeExec#maintains_input_order returns a Vec of …
pepijnve Aug 7, 2025
c7fbb3f
Add ExecutionPlan::reset_state (#17028) (#17096)
adriangb Aug 8, 2025
ee28aa7
[branch-49] Backport #17129 to branch 49 (#17143)
AdamGS Aug 12, 2025
52e4ef8
Pass the input schema to stats_projection for ProjectionExpr (#17123)…
alamb Aug 13, 2025
f05b128
[branch-49] fix: string_agg not respecting ORDER BY (#17058)
nuno-faria Aug 14, 2025
d1a6e9a
[branch-49] Update version to 49.0.1 and add changelog (#17175)
alamb Aug 14, 2025
374fcec
cherry-pick inlist fix (#17254)
haohuaijin Aug 20, 2025
930608a
fix check license header
xudong963 Aug 21, 2025
66ae588
fix cargo check: cargo check --profile ci --workspace --all-targets -…
xudong963 Aug 21, 2025
292641c
fix cargo example
xudong963 Aug 21, 2025
a6068c2
FFI_RecordBatchStream was causing a memory leak (#17190) (#17270)
timsaucer Aug 21, 2025
0d04475
fix: align `array_has` null buffer for scalar (#17272) (#17274)
comphead Aug 21, 2025
f43df3f
[branch-49] Prepare `49.0.2` version and changelog (#17277)
alamb Aug 21, 2025
25058de
fix cargo check --profile ci --no-default-features -p datafusion-proto
xudong963 Aug 22, 2025
c46f7a9
fix cargo doc
xudong963 Aug 22, 2025
deaf2e2
fix ut:custom_sources_cases::statistics::sql_limit(with_node_id of Co…
xudong963 Aug 22, 2025
f1b1bd8
fix ut: test_no_pushdown_through_aggregates & test_plan_with_order_pr…
xudong963 Aug 22, 2025
7dd5e6e
fix format
xudong963 Aug 22, 2025
2eca4c0
fix roundtrip_test
xudong963 Aug 22, 2025
8baa05d
schema_force_view_types to true
xudong963 Aug 25, 2025
9b2fbbb
use utfview8
xudong963 Aug 25, 2025
63c2ebc
schema_force_view_types to false(try true after df49)
xudong963 Aug 25, 2025
ed718c0
fix page_index_filter_one_col and remove an example of proto
xudong963 Aug 25, 2025
0bb16fa
fix configs.md
xudong963 Aug 25, 2025
09ff8f7
fix clippy
xudong963 Aug 25, 2025
1545f2d
update configs.md
xudong963 Aug 25, 2025
ca5b0fb
fix flaky test limit.test
xudong963 Aug 25, 2025
d8c3e03
Simplify predicates in `PushDownFilter` optimizer rule (#16362)
xudong963 Jun 25, 2025
2099882
Fix intermittent SQL logic test failure in limit.slt by adding ORDER …
kosiew Jun 6, 2025
ff8418c
fix limit.rs
xudong963 Aug 25, 2025
2c7836a
fix tpch q19
xudong963 Aug 25, 2025
9191f39
public GroupValues & new_group_values
xudong963 Aug 25, 2025
d358db4
fix clippy
xudong963 Aug 25, 2025
6e71350
Merge pull request #8 from polygon-io/branch-48-stream-fix
xudong963 Aug 26, 2025
c6b8211
Merge remote-tracking branch 'upstream/branch-49' into branch_49_fix
zhuqi-lucas Sep 3, 2025
5a99099
Merge remote-tracking branch 'origin/branch-48-stream' into branch_49…
zhuqi-lucas Sep 3, 2025
cefa63a
fix fetch with new order lex
zhuqi-lucas Sep 3, 2025
1f47d46
fix fetch add back with new lex order
zhuqi-lucas Sep 3, 2025
95aadb9
fix clippy
zhuqi-lucas Sep 3, 2025
70a3c94
fix clippy
zhuqi-lucas Sep 3, 2025
a93e81e
add order needed
zhuqi-lucas Sep 3, 2025
6a3d4f8
fix
zhuqi-lucas Sep 3, 2025
91e2904
fix auth check and port upstream fix: https://github.com/apache/dataf…
zhuqi-lucas Sep 3, 2025
b571c3b
Support csv truncte for datafusion
zhuqi-lucas Sep 4, 2025
1a2f8dc
Addressed in latest PR
zhuqi-lucas Sep 4, 2025
be0276d
Merge pull request #9 from polygon-io/branch_49_fix
xudong963 Sep 4, 2025
63c54ea
add generated field to proto
zhuqi-lucas Sep 4, 2025
b7f9828
generate proto
zhuqi-lucas Sep 4, 2025
d0b757b
add proto message and generated.
zhuqi-lucas Sep 4, 2025
5b9219d
fix
zhuqi-lucas Sep 4, 2025
5aa43e5
fix clippy
zhuqi-lucas Sep 4, 2025
5918ef8
Merge pull request #10 from polygon-io/support_csv_truncate
zhuqi-lucas Sep 5, 2025
cae4095
X-1035 Part-2: support csv scan to read truncted rows
zhuqi-lucas Sep 5, 2025
86c8754
fix CI
zhuqi-lucas Sep 5, 2025
253e49c
add csvfmt with
zhuqi-lucas Sep 5, 2025
194d952
Merge pull request #11 from polygon-io/support_csv_truncate_for_read
xudong963 Sep 5, 2025
ca5d44b
Merge remote-tracking branch 'origin/branch-48-stream' into branch-49
zhuqi-lucas Sep 7, 2025
faca92d
Merge pull request #12 from polygon-io/branch-49-support-csv-truncate
zhuqi-lucas Sep 8, 2025
9e7141f
fix: Implement AggregateUDFImpl::reverse_expr for StringAgg (#17165) …
alamb Sep 8, 2025
10343c1
Revert #17295 (Support from-first SQL syntax) (#17520) (#17544)
alamb Sep 13, 2025
a0fc642
Support csv truncated rows in datafusion (#17465)
zhuqi-lucas Sep 9, 2025
ca8cd34
Merge remote-tracking branch 'origin/branch-49' into branch-50
zhuqi-lucas Sep 16, 2025
e3c2493
Merge branch 'branch-50' into branch-50-upgrade
zhuqi-lucas Sep 16, 2025
8588da4
fix clippy
zhuqi-lucas Sep 16, 2025
238d58b
fix test and fmt
zhuqi-lucas Sep 16, 2025
891202a
fix: ignore non-existent columns when adding filter equivalence info …
rkrishn7 Sep 16, 2025
e16c24f
fix proto test
zhuqi-lucas Sep 17, 2025
acd9ddf
remove unused file
zhuqi-lucas Sep 17, 2025
ba0e3a0
Merge pull request #13 from polygon-io/branch-50-upgrade
zhuqi-lucas Sep 17, 2025
1a31b79
fix: Ensure the CachedParquetFileReader respects the metadata prefetc…
shehabgamin Sep 17, 2025
674e6ac
Merge remote-tracking branch 'upstream/branch-50' into branch-50-upgrade
zhuqi-lucas Sep 21, 2025
5b191f8
fix: Partial AggregateMode will generate duplicate field names which …
zhuqi-lucas Sep 22, 2025
4840c8a
fix: Partial AggregateMode will generate duplicate field names which …
zhuqi-lucas Sep 22, 2025
14da942
feat: expose `udafs` and `udwfs` methods on `FunctionRegistry` (#1765…
milenkovicm Sep 22, 2025
e78eafe
Merge remote-tracking branch 'upstream/branch-50' into branch-50-upgrade
zhuqi-lucas Sep 23, 2025
5a85b7d
Merge pull request #14 from polygon-io/branch-50-upgrade
zhuqi-lucas Sep 23, 2025
f7f6a2a
change physical plan details loglevel from debug to trace for potenti…
zhuqi-lucas Sep 26, 2025
d198e90
fix comments.
zhuqi-lucas Sep 26, 2025
a6cf5de
debug to displayable
zhuqi-lucas Sep 26, 2025
172371b
clippy
zhuqi-lucas Sep 26, 2025
7b53d3e
Merge pull request #15 from polygon-io/branch-50-upgrade
zhuqi-lucas Sep 27, 2025
d799780
Impl `gather_filters_for_pushdown` for `CoalescePartitionsExec` (#18046)
xudong963 Oct 14, 2025
ab03bab
Add independent configs for topk/join dynamic filter (#18090)
xudong963 Oct 16, 2025
9fdb7b3
CoalescePartitionsExec fetch is not consistent with one partition and…
zhuqi-lucas Oct 23, 2025
50e9973
fix DynamicFilterPhysicalExpr test
zhuqi-lucas Oct 24, 2025
f520827
fix: Remove parquet encryption feature from root deps (#17700)
Vyquos Sep 23, 2025
6420a01
Continue to fix the datafusion will default to use encryption
zhuqi-lucas Nov 4, 2025
375d2e6
Make CI green
xudong963 Nov 14, 2025
f37ac20
Make CI green
alamb Oct 18, 2025
19bbdff
Support row group limit pruning
xudong963 Oct 31, 2025
ff301c8
Add restriction for enabling limit pruning (#21)
xudong963 Nov 19, 2025
667716f
Merge remote-tracking branch 'origin/branch-51' into jacob/branch-51-…
jcsherin Nov 23, 2025
7a11979
fix: resolve conflict by picking upstream
jcsherin Nov 23, 2025
942bab8
fix: required methods in `FunctionRegistry`
jcsherin Nov 23, 2025
08cdacc
fix: uses `ScalarValue::try_cmp`
jcsherin Nov 23, 2025
d00b42a
fix: keep `with_node_id`
jcsherin Nov 23, 2025
46565c4
fix: remove duplicate definition
jcsherin Nov 23, 2025
c441576
fix: `AnalyzeExec::new` now takes 5 arguments instead of 4
jcsherin Nov 23, 2025
2298984
fix: use `expr()` method
jcsherin Nov 23, 2025
e5237bf
fix: allow deprecated `UnionExec::new`
jcsherin Nov 23, 2025
8e99a42
fix: `UnnestExec::new` returns a `Result` type
jcsherin Nov 23, 2025
a57ddc5
fix: keep upstream `FileScanConfig` changes
jcsherin Nov 23, 2025
f1322fe
fix: parquet metrics + row_group_filter + reader + source
jcsherin Nov 23, 2025
b341374
fix: proto + regen
jcsherin Nov 23, 2025
ab8723c
fix: upstream reorg of `ListingTable` into `datafusion-catalog-listin…
jcsherin Nov 23, 2025
9ae854f
fix: keep comment
jcsherin Nov 23, 2025
4167f9f
fix: formatting
jcsherin Nov 23, 2025
53f0696
fix: test compilation errors
jcsherin Nov 23, 2025
99ddba7
fix: drop ignore of `RUSTSEC-2025-0111`
jcsherin Nov 23, 2025
2c240e5
fix: filter pushdown test
jcsherin Nov 23, 2025
fe01720
fix: sqllogictests
jcsherin Nov 23, 2025
218cf37
fix: annotate `node_id` in roundtripped physical plan
jcsherin Nov 23, 2025
f1c0694
fix: remove duplicate merge_group
jcsherin Nov 23, 2025
319cb70
refactor: remove unused values from physical plan module
jcsherin Nov 24, 2025
df445a2
feat: make `DefaultSchemaAdapter` public
jcsherin Nov 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/audit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
security_audit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install cargo-audit
uses: taiki-e/install-action@f535147c22906d77695e11cb199e764aa610a4fc # v2.62.46
with:
Expand Down
14 changes: 14 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,21 @@ jobs:
runs-on: ubuntu-latest
container:
image: amd64/rust
volumes:
- /usr/local:/host/usr/local
steps:
- name: Remove unnecessary preinstalled software
run: |
echo "Disk space before cleanup:"
df -h
# remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
rm -rf /__t/* || true
# remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
rm -rf /host/usr/local/.ghcup || true
# remove Android library: about 7.8GB (host /usr/local/lib/android)
rm -rf /host/usr/local/lib/android || true
echo "Disk space after cleanup:"
df -h
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
submodules: true
Expand Down
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ config_namespace! {

/// (reading) If true, parquet reader will read columns of `Utf8/LargeUtf8` with `Utf8View`,
/// and `Binary/LargeBinary` with `BinaryView`.
pub schema_force_view_types: bool, default = true
pub schema_force_view_types: bool, default = false

/// (reading) If true, parquet reader will read columns of
/// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.
Expand Down
4 changes: 2 additions & 2 deletions datafusion/core/src/datasource/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -582,11 +582,11 @@ mod tests {
assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
assert_eq!(
string_truncation_stats.max_value,
Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
Precision::Inexact(Utf8(Some("b".repeat(63) + "c")))
);
assert_eq!(
string_truncation_stats.min_value,
Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
Precision::Inexact(Utf8(Some("a".repeat(64))))
);

Ok(())
Expand Down
10 changes: 8 additions & 2 deletions datafusion/core/src/execution/session_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ use datafusion_physical_expr::create_physical_expr;
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use datafusion_physical_optimizer::optimizer::PhysicalOptimizer;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_plan::node_id::{
annotate_node_id_for_execution_plan, NodeIdAnnotator,
};
use datafusion_physical_plan::ExecutionPlan;
use datafusion_session::Session;
#[cfg(feature = "sql")]
Expand Down Expand Up @@ -668,9 +671,12 @@ impl SessionState {
logical_plan: &LogicalPlan,
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
let logical_plan = self.optimize(logical_plan)?;
self.query_planner
let physical_plan = self
.query_planner
.create_physical_plan(&logical_plan, self)
.await
.await?;
let mut id_annotator = NodeIdAnnotator::new();
annotate_node_id_for_execution_plan(&physical_plan, &mut id_annotator)
}

/// Create a [`PhysicalExpr`] from an [`Expr`] after applying type
Expand Down
56 changes: 51 additions & 5 deletions datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,18 @@ impl TestOutput {
.map(|(_pruned, matched)| matched)
}

/*
/// The number of row_groups fully matched by statistics
fn row_groups_fully_matched_statistics(&self) -> Option<usize> {
self.metric_value("row_groups_fully_matched_statistics")
}

/// The number of row groups pruned by limit pruning
fn limit_pruned_row_groups(&self) -> Option<usize> {
self.metric_value("limit_pruned_row_groups")
}
*/

/// The number of row_groups pruned by statistics
fn row_groups_pruned_statistics(&self) -> Option<usize> {
self.pruning_metric("row_groups_pruned_statistics")
Expand Down Expand Up @@ -232,20 +244,43 @@ impl TestOutput {
/// and the appropriate scenario
impl ContextWithParquet {
async fn new(scenario: Scenario, unit: Unit) -> Self {
Self::with_config(scenario, unit, SessionConfig::new()).await
Self::with_config(scenario, unit, SessionConfig::new(), None, None).await
}

// Set custom schema and batches for the test
/*
pub async fn with_custom_data(
scenario: Scenario,
unit: Unit,
schema: Arc<Schema>,
batches: Vec<RecordBatch>,
) -> Self {
Self::with_config(
scenario,
unit,
SessionConfig::new(),
Some(schema),
Some(batches),
)
.await
}
*/

async fn with_config(
scenario: Scenario,
unit: Unit,
mut config: SessionConfig,
custom_schema: Option<Arc<Schema>>,
custom_batches: Option<Vec<RecordBatch>>,
) -> Self {
// Use a single partition for deterministic results no matter how many CPUs the host has
config = config.with_target_partitions(1);
let file = match unit {
Unit::RowGroup(row_per_group) => {
config = config.with_parquet_bloom_filter_pruning(true);
make_test_file_rg(scenario, row_per_group).await
config.options_mut().execution.parquet.pushdown_filters = true;
make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches)
.await
}
Unit::Page(row_per_page) => {
config = config.with_parquet_page_index_pruning(true);
Expand Down Expand Up @@ -1071,7 +1106,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
}

/// Create a test parquet file with various data types
async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
async fn make_test_file_rg(
scenario: Scenario,
row_per_group: usize,
custom_schema: Option<Arc<Schema>>,
custom_batches: Option<Vec<RecordBatch>>,
) -> NamedTempFile {
let mut output_file = tempfile::Builder::new()
.prefix("parquet_pruning")
.suffix(".parquet")
Expand All @@ -1084,8 +1124,14 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem
.set_statistics_enabled(EnabledStatistics::Page)
.build();

let batches = create_data_batch(scenario);
let schema = batches[0].schema();
let (batches, schema) =
if let (Some(schema), Some(batches)) = (custom_schema, custom_batches) {
(batches, schema)
} else {
let batches = create_data_batch(scenario);
let schema = batches[0].schema();
(batches, schema)
};

let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();

Expand Down
4 changes: 3 additions & 1 deletion datafusion/core/tests/parquet/page_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,9 @@ async fn page_index_filter_one_col() {

// 5. create filter `date_string_col == "01/01/09"`;
// Note this test doesn't apply type coercion so the literal must match the actual view type
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
// xudong: use new_utf8 because schema_force_view_types now defaults to false.
// qi: when schema_force_view_types is set back to true, change this back to new_utf8view.
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8("01/01/09")));
let batches = get_filter_results(&state, filter.clone(), false).await;
assert_eq!(batches[0].num_rows(), 14);

Expand Down
Loading