@@ -116,13 +116,12 @@ pub use writer::plan_to_parquet;
 ///
 /// Supports the following optimizations:
 ///
-/// * Concurrent reads: Can read from one or more files in parallel as multiple
+/// * Concurrent reads: reads from one or more files in parallel as multiple
 /// partitions, including concurrently reading multiple row groups from a single
 /// file.
 ///
-/// * Predicate push down: skips row groups and pages based on
-/// min/max/null_counts in the row group metadata, the page index and bloom
-/// filters.
+/// * Predicate push down: skips row groups, pages, and rows based on metadata
+/// and late materialization. See "Predicate Pushdown" below.
 ///
 /// * Projection pushdown: reads and decodes only the columns required.
 ///
@@ -132,9 +131,8 @@ pub use writer::plan_to_parquet;
 /// coalesce I/O operations, etc. See [`ParquetFileReaderFactory`] for more
 /// details.
 ///
-/// * Schema adapters: read parquet files with different schemas into a unified
-/// table schema. This can be used to implement "schema evolution". See
-/// [`SchemaAdapterFactory`] for more details.
+/// * Schema evolution: read parquet files with different schemas into a unified
+/// table schema. See [`SchemaAdapterFactory`] for more details.
 ///
 /// * metadata_size_hint: controls the number of bytes read from the end of the
 /// file in the initial I/O when the default [`ParquetFileReaderFactory`]. If a
@@ -144,6 +142,29 @@ pub use writer::plan_to_parquet;
 /// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages
 /// based on external information. See "Implementing External Indexes" below
 ///
+/// # Predicate Pushdown
+///
+/// `ParquetExec` uses the provided [`PhysicalExpr`] predicate as a filter to
+/// skip reading unnecessary data and improve query performance using several techniques:
+///
+/// * Row group pruning: skips entire row groups based on min/max statistics
+/// found in [`ParquetMetaData`] and any Bloom filters that are present.
+///
+/// * Page pruning: skips individual pages within a ColumnChunk using the
+/// [Parquet PageIndex], if present.
+///
+/// * Row filtering: skips rows within a page using a form of late
+/// materialization. When possible, predicates are applied by the parquet
+/// decoder *during* decode (see [`ArrowPredicate`] and [`RowFilter`] for more
+/// details). This is only enabled if `ParquetScanOptions::pushdown_filters`
+/// is set to true.
+///
+/// Note: If the predicate cannot be used to accelerate the scan, it is ignored
+/// (no error is raised on predicate evaluation errors).
+///
+/// [`ArrowPredicate`]: parquet::arrow::arrow_reader::ArrowPredicate
+/// [`RowFilter`]: parquet::arrow::arrow_reader::RowFilter
+/// [Parquet PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+///
 /// # Implementing External Indexes
 ///
 /// It is possible to restrict the row groups and selections within those row
@@ -199,10 +220,11 @@ pub use writer::plan_to_parquet;
 /// applying predicates to metadata. The plan and projections are used to
 /// determine what pages must be read.
 ///
-/// * Step 4: The stream begins reading data, fetching the required pages
-/// and incrementally decoding them.
+/// * Step 4: The stream begins reading data, fetching the required parquet
+/// pages, incrementally decoding them, and applying any row filters (see
+/// [`Self::with_pushdown_filters`]).
 ///
-/// * Step 5: As each [`RecordBatch]` is read, it may be adapted by a
+/// * Step 5: As each [`RecordBatch`] is read, it may be adapted by a
 /// [`SchemaAdapter`] to match the table schema. By default missing columns are
 /// filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
 ///
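+/// A sketch of driving the resulting stream (the `exec` and `task_ctx` values
+/// are assumed to exist; error handling is elided):
+///
+/// ```ignore
+/// use futures::StreamExt;
+/// // Execute partition 0; the stream yields Result<RecordBatch>
+/// let mut stream = exec.execute(0, task_ctx)?;
+/// while let Some(batch) = stream.next().await {
+///     let batch = batch?;
+///     // process the batch ...
+/// }
+/// ```
+///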
@@ -268,13 +290,10 @@ impl ParquetExecBuilder {
         }
     }
 
-    /// Set the predicate for the scan.
-    ///
-    /// The ParquetExec uses this predicate to filter row groups and data pages
-    /// using the Parquet statistics and bloom filters.
+    /// Set the filter predicate when reading.
     ///
-    /// If the predicate can not be used to prune the scan, it is ignored (no
-    /// error is raised).
+    /// See the "Predicate Pushdown" section of the [`ParquetExec`] documentation
+    /// for more details.
     pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
         self.predicate = Some(predicate);
         self
@@ -291,7 +310,7 @@ impl ParquetExecBuilder {
         self
     }
 
-    /// Set the table parquet options that control how the ParquetExec reads.
+    /// Set the options for controlling how the ParquetExec reads parquet files.
     ///
     /// See also [`Self::new_with_options`]
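+    /// For example, a sketch that enables filter pushdown via the options
+    /// (this assumes `TableParquetOptions` exposes a `global.pushdown_filters`
+    /// field and that `config: FileScanConfig` exists; verify against the
+    /// current API):
+    ///
+    /// ```ignore
+    /// let mut options = TableParquetOptions::default();
+    /// options.global.pushdown_filters = true; // enable row filtering
+    /// let exec = ParquetExec::builder(config)
+    ///     .with_table_parquet_options(options)
+    ///     .build();
+    /// ```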
     pub fn with_table_parquet_options(
@@ -480,11 +499,8 @@ impl ParquetExec {
         self
     }
 
-    /// If true, any filter [`Expr`]s on the scan will converted to a
-    /// [`RowFilter`](parquet::arrow::arrow_reader::RowFilter) in the
-    /// `ParquetRecordBatchStream`. These filters are applied by the
-    /// parquet decoder to skip unecessairly decoding other columns
-    /// which would not pass the predicate. Defaults to false
+    /// If true, the predicate will be used during the parquet scan.
+    /// Defaults to false.
     ///
     /// [`Expr`]: datafusion_expr::Expr
     pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self {