Class foreign_storage::ParquetDataWrapper¶
-
class
ParquetDataWrapper
: public foreign_storage::AbstractFileStorageDataWrapper¶
Public Functions
-
ParquetDataWrapper
()¶
-
ParquetDataWrapper
(const int db_id, const ForeignTable *foreign_table)¶
-
ParquetDataWrapper
(const int db_id, const ForeignTable *foreign_table, const UserMapping *user_mapping, const bool do_metadata_stats_validation = true)¶
-
void
populateChunkMetadata
(ChunkMetadataVector &chunk_metadata_vector)¶ Populates given chunk metadata vector with metadata for all chunks in related foreign table.
- Parameters
chunk_metadata_vector
: - vector that will be populated with chunk metadata
-
void
populateChunkBuffers
(const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer)¶ Populates given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.
- Parameters
required_buffers
: - chunk buffers that must always be populated
optional_buffers
: - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats)
delete_buffer
: - chunk buffer for fragment’s delete column; if non-null, the data wrapper is expected to mark deleted rows in the buffer and continue processing
-
std::string
getSerializedDataWrapper
() const¶ Serialize internal state of wrapper into file at given path if implemented
-
void
restoreDataWrapperInternals
(const std::string &file_path, const ChunkMetadataVector &chunk_metadata)¶ Restore internal state of data wrapper
- Parameters
file_path
: - location of file created by serializeMetadata
chunk_metadata
: - vector of chunk metadata recovered from disk
-
bool
isRestored
() const¶
-
ParallelismLevel
getCachedParallelismLevel
() const¶ Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
-
ParallelismLevel
getNonCachedParallelismLevel
() const¶ Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
-
void
createRenderGroupAnalyzers
()¶ Create RenderGroupAnalyzers for poly columns.
Private Functions
-
std::list<const ColumnDescriptor *>
getColumnsToInitialize
(const Interval<ColumnType> &column_interval)¶
-
void
initializeChunkBuffers
(const int fragment_index, const Interval<ColumnType> &column_interval, const ChunkToBufferMap &required_buffers, const bool reserve_buffers_and_set_stats = false)¶
-
void
fetchChunkMetadata
()¶
-
void
loadBuffersUsingLazyParquetChunkLoader
(const int logical_column_id, const int fragment_id, const ChunkToBufferMap &required_buffers, AbstractBuffer *delete_buffer)¶
-
std::set<std::string>
getProcessedFilePaths
()¶
-
std::vector<std::string>
getAllFilePaths
()¶
-
bool
moveToNextFragment
(size_t new_rows_count) const¶
-
void
finalizeFragmentMap
()¶
-
void
addNewFragment
(int row_group, const std::string &file_path)¶
-
bool
isNewFile
(const std::string &file_path) const¶
-
void
addNewFile
(const std::string &file_path)¶
-
void
resetParquetMetadata
()¶
-
void
metadataScanFiles
(const std::vector<std::string> &file_paths)¶
Private Members
-
const bool
do_metadata_stats_validation_
¶
-
std::map<int, std::vector<RowGroupInterval>>
fragment_to_row_group_interval_map_
¶
-
std::map<ChunkKey, std::shared_ptr<ChunkMetadata>>
chunk_metadata_map_
¶
-
const int
db_id_
¶
-
const ForeignTable *
foreign_table_
¶
-
int
last_fragment_index_
¶
-
size_t
last_fragment_row_count_
¶
-
size_t
total_row_count_
¶
-
int
last_row_group_
¶
-
bool
is_restored_
¶
-
std::unique_ptr<ForeignTableSchema>
schema_
¶
-
std::shared_ptr<arrow::fs::FileSystem>
file_system_
¶
-
std::unique_ptr<FileReaderMap>
file_reader_cache_
¶
-
std::mutex
delete_buffer_mutex_
¶
-
RenderGroupAnalyzerMap
render_group_analyzer_map_
¶
-