1717
1818//! [`ParquetOpener`] for opening Parquet files
1919
20+ use crate :: datasource:: file_format:: transform_schema_to_view;
2021use crate :: datasource:: physical_plan:: parquet:: page_filter:: PagePruningPredicate ;
2122use crate :: datasource:: physical_plan:: parquet:: row_group_filter:: RowGroupAccessPlanFilter ;
2223use crate :: datasource:: physical_plan:: parquet:: {
@@ -33,7 +34,7 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
3334use datafusion_physical_plan:: metrics:: ExecutionPlanMetricsSet ;
3435use futures:: { StreamExt , TryStreamExt } ;
3536use log:: debug;
36- use parquet:: arrow:: arrow_reader:: ArrowReaderOptions ;
37+ use parquet:: arrow:: arrow_reader:: { ArrowReaderMetadata , ArrowReaderOptions } ;
3738use parquet:: arrow:: async_reader:: AsyncFileReader ;
3839use parquet:: arrow:: { ParquetRecordBatchStreamBuilder , ProjectionMask } ;
3940use std:: sync:: Arc ;
@@ -56,6 +57,7 @@ pub(super) struct ParquetOpener {
5657 pub enable_page_index : bool ,
5758 pub enable_bloom_filter : bool ,
5859 pub schema_adapter_factory : Arc < dyn SchemaAdapterFactory > ,
60+ pub schema_force_string_view : bool ,
5961}
6062
6163impl FileOpener for ParquetOpener {
@@ -66,7 +68,7 @@ impl FileOpener for ParquetOpener {
6668 let file_metrics =
6769 ParquetFileMetrics :: new ( self . partition_index , & file_name, & self . metrics ) ;
6870
69- let reader: Box < dyn AsyncFileReader > =
71+ let mut reader: Box < dyn AsyncFileReader > =
7072 self . parquet_file_reader_factory . create_reader (
7173 self . partition_index ,
7274 file_meta,
@@ -90,14 +92,27 @@ impl FileOpener for ParquetOpener {
9092 ) ;
9193 let enable_bloom_filter = self . enable_bloom_filter ;
9294 let limit = self . limit ;
95+ let schema_force_string_view = self . schema_force_string_view ;
9396
9497 Ok ( Box :: pin ( async move {
98+ let options = ArrowReaderOptions :: new ( ) . with_page_index ( enable_page_index) ;
99+
100+ let metadata =
101+ ArrowReaderMetadata :: load_async ( & mut reader, options. clone ( ) ) . await ?;
102+ let mut schema = metadata. schema ( ) . clone ( ) ;
103+
104+ if schema_force_string_view {
105+ schema = Arc :: new ( transform_schema_to_view ( & schema) ) ;
106+ }
107+
95108 let options = ArrowReaderOptions :: new ( )
96109 . with_page_index ( enable_page_index)
97- . with_schema ( table_schema. clone ( ) ) ;
110+ . with_schema ( schema. clone ( ) ) ;
111+ let metadata =
112+ ArrowReaderMetadata :: try_new ( metadata. metadata ( ) . clone ( ) , options) ?;
113+
98114 let mut builder =
99- ParquetRecordBatchStreamBuilder :: new_with_options ( reader, options)
100- . await ?;
115+ ParquetRecordBatchStreamBuilder :: new_with_metadata ( reader, metadata) ;
101116
102117 let file_schema = builder. schema ( ) . clone ( ) ;
103118
0 commit comments