1818//! [`DFParquetMetadata`] for fetching Parquet file metadata, statistics
1919//! and schema information.
2020
21- use crate :: {
22- ObjectStoreFetch , apply_file_schema_type_coercions, coerce_int96_to_resolution,
23- } ;
21+ use crate :: { apply_file_schema_type_coercions, coerce_int96_to_resolution} ;
2422use arrow:: array:: { Array , ArrayRef , BooleanArray } ;
2523use arrow:: compute:: and;
2624use arrow:: compute:: kernels:: cmp:: eq;
@@ -41,10 +39,11 @@ use datafusion_physical_plan::Accumulator;
4139use log:: debug;
4240use object_store:: path:: Path ;
4341use object_store:: { ObjectMeta , ObjectStore } ;
42+ use parquet:: DecodeResult ;
4443use parquet:: arrow:: arrow_reader:: statistics:: StatisticsConverter ;
4544use parquet:: arrow:: { parquet_column, parquet_to_arrow_schema} ;
4645use parquet:: file:: metadata:: {
47- PageIndexPolicy , ParquetMetaData , ParquetMetaDataReader , RowGroupMetaData ,
46+ PageIndexPolicy , ParquetMetaData , ParquetMetaDataPushDecoder , RowGroupMetaData ,
4847 SortingColumn ,
4948} ;
5049use parquet:: schema:: types:: SchemaDescriptor ;
@@ -119,25 +118,14 @@ impl<'a> DFParquetMetadata<'a> {
119118
120119 /// Fetch parquet metadata from the remote object store
121120 pub async fn fetch_metadata ( & self ) -> Result < Arc < ParquetMetaData > > {
122- let Self {
123- store,
124- object_meta,
125- metadata_size_hint,
126- decryption_properties,
127- file_metadata_cache,
128- coerce_int96 : _,
129- } = self ;
130-
131- let fetch = ObjectStoreFetch :: new ( * store, object_meta) ;
132-
133121 // implementation to fetch parquet metadata
134122 let cache_metadata =
135- !cfg ! ( feature = "parquet_encryption" ) || decryption_properties. is_none ( ) ;
123+ !cfg ! ( feature = "parquet_encryption" ) || self . decryption_properties . is_none ( ) ;
136124
137125 if cache_metadata
138- && let Some ( file_metadata_cache) = file_metadata_cache. as_ref ( )
139- && let Some ( cached) = file_metadata_cache. get ( & object_meta. location )
140- && cached. is_valid_for ( object_meta)
126+ && let Some ( file_metadata_cache) = self . file_metadata_cache . as_ref ( )
127+ && let Some ( cached) = file_metadata_cache. get ( & self . object_meta . location )
128+ && cached. is_valid_for ( self . object_meta )
141129 && let Some ( cached_parquet) = cached
142130 . file_metadata
143131 . as_any ( )
@@ -146,32 +134,69 @@ impl<'a> DFParquetMetadata<'a> {
146134 return Ok ( Arc :: clone ( cached_parquet. parquet_metadata ( ) ) ) ;
147135 }
148136
149- let mut reader =
150- ParquetMetaDataReader :: new ( ) . with_prefetch_hint ( * metadata_size_hint) ;
137+ let file_size = self . object_meta . size ;
138+ let mut decoder = ParquetMetaDataPushDecoder :: try_new ( file_size)
139+ . map_err ( DataFusionError :: from) ?;
151140
152141 #[ cfg( feature = "parquet_encryption" ) ]
153- if let Some ( decryption_properties) = decryption_properties {
154- reader = reader
155- . with_decryption_properties ( Some ( Arc :: clone ( decryption_properties) ) ) ;
142+ if let Some ( decryption_properties) = & self . decryption_properties {
143+ decoder = decoder
144+ . with_file_decryption_properties ( Some ( Arc :: clone ( decryption_properties) ) ) ;
156145 }
157146
158- if cache_metadata && file_metadata_cache. is_some ( ) {
147+ if cache_metadata && self . file_metadata_cache . is_some ( ) {
159148 // Need to retrieve the entire metadata for the caching to be effective.
160- reader = reader. with_page_index_policy ( PageIndexPolicy :: Optional ) ;
149+ decoder = decoder. with_page_index_policy ( PageIndexPolicy :: Optional ) ;
150+ } else {
151+ decoder = decoder. with_page_index_policy ( PageIndexPolicy :: Skip ) ;
161152 }
162153
163- let metadata = Arc :: new (
164- reader
165- . load_and_finish ( fetch, object_meta. size )
154+ // If we have a size hint, prefetch that many bytes from the end of the file
155+ if let Some ( hint) = self . metadata_size_hint {
156+ let prefetch_start = file_size. saturating_sub ( hint as u64 ) ;
157+ let prefetch_range = prefetch_start..file_size;
158+ let data = self
159+ . store
160+ . get_ranges (
161+ & self . object_meta . location ,
162+ std:: slice:: from_ref ( & prefetch_range) ,
163+ )
166164 . await
167- . map_err ( DataFusionError :: from) ?,
168- ) ;
165+ . map_err ( DataFusionError :: from) ?;
166+ decoder
167+ . push_ranges ( vec ! [ prefetch_range] , data)
168+ . map_err ( DataFusionError :: from) ?;
169+ }
170+
171+ let metadata = loop {
172+ match decoder. try_decode ( ) . map_err ( DataFusionError :: from) ? {
173+ DecodeResult :: Data ( metadata) => break metadata,
174+ DecodeResult :: NeedsData ( ranges) => {
175+ let buffers = self
176+ . store
177+ . get_ranges ( & self . object_meta . location , & ranges)
178+ . await
179+ . map_err ( DataFusionError :: from) ?;
180+ decoder
181+ . push_ranges ( ranges, buffers)
182+ . map_err ( DataFusionError :: from) ?;
183+ }
184+ DecodeResult :: Finished => {
185+ return Err ( DataFusionError :: Internal (
186+ "ParquetMetaDataPushDecoder finished without producing metadata"
187+ . to_string ( ) ,
188+ ) ) ;
189+ }
190+ }
191+ } ;
192+
193+ let metadata = Arc :: new ( metadata) ;
169194
170- if cache_metadata && let Some ( file_metadata_cache) = file_metadata_cache {
195+ if cache_metadata && let Some ( file_metadata_cache) = & self . file_metadata_cache {
171196 file_metadata_cache. put (
172- & object_meta. location ,
197+ & self . object_meta . location ,
173198 CachedFileMetadataEntry :: new (
174- ( * object_meta) . clone ( ) ,
199+ self . object_meta . clone ( ) ,
175200 Arc :: new ( CachedParquetMetaData :: new ( Arc :: clone ( & metadata) ) ) ,
176201 ) ,
177202 ) ;
0 commit comments