Class foreign_storage::ParquetDataWrapper

class ParquetDataWrapper : public foreign_storage::AbstractFileStorageDataWrapper

Public Functions

ParquetDataWrapper()
ParquetDataWrapper(const int db_id, const ForeignTable *foreign_table)
ParquetDataWrapper(const int db_id, const ForeignTable *foreign_table, const UserMapping *user_mapping, const bool do_metadata_stats_validation = true)
void populateChunkMetadata(ChunkMetadataVector &chunk_metadata_vector)

Populates given chunk metadata vector with metadata for all chunks in related foreign table.

Parameters
  • chunk_metadata_vector: - vector that will be populated with chunk metadata

void populateChunkBuffers(const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer)

Populates given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.

Parameters
  • required_buffers: - chunk buffers that must always be populated

  • optional_buffers: - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats)

  • delete_buffer: - chunk buffer for the fragment’s delete column; if non-null, the data wrapper is expected to mark deleted rows in the buffer and continue processing

std::string getSerializedDataWrapper() const

Serialize internal state of wrapper, if implemented, and return the serialized state as a string. (This signature takes no file path.)

void restoreDataWrapperInternals(const std::string &file_path, const ChunkMetadataVector &chunk_metadata)

Restore internal state of the data wrapper.

Parameters
  • file_path: - location of file created by serializeMetadata

  • chunk_metadata: - vector of chunk metadata recovered from disk

bool isRestored() const
ParallelismLevel getCachedParallelismLevel() const

Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.

ParallelismLevel getNonCachedParallelismLevel() const

Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.

void createRenderGroupAnalyzers()

Create RenderGroupAnalyzers for poly columns.

Private Functions

std::list<const ColumnDescriptor *> getColumnsToInitialize(const Interval<ColumnType> &column_interval)
void initializeChunkBuffers(const int fragment_index, const Interval<ColumnType> &column_interval, const ChunkToBufferMap &required_buffers, const bool reserve_buffers_and_set_stats = false)
void fetchChunkMetadata()
void loadBuffersUsingLazyParquetChunkLoader(const int logical_column_id, const int fragment_id, const ChunkToBufferMap &required_buffers, AbstractBuffer *delete_buffer)
std::set<std::string> getProcessedFilePaths()
std::vector<std::string> getAllFilePaths()
bool moveToNextFragment(size_t new_rows_count) const
void finalizeFragmentMap()
void addNewFragment(int row_group, const std::string &file_path)
bool isNewFile(const std::string &file_path) const
void addNewFile(const std::string &file_path)
void resetParquetMetadata()
void metadataScanFiles(const std::vector<std::string> &file_paths)

Private Members

const bool do_metadata_stats_validation_
std::map<int, std::vector<RowGroupInterval>> fragment_to_row_group_interval_map_
std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> chunk_metadata_map_
const int db_id_
const ForeignTable *foreign_table_
int last_fragment_index_
size_t last_fragment_row_count_
size_t total_row_count_
int last_row_group_
bool is_restored_
std::unique_ptr<ForeignTableSchema> schema_
std::shared_ptr<arrow::fs::FileSystem> file_system_
std::unique_ptr<FileReaderMap> file_reader_cache_
std::mutex delete_buffer_mutex_
RenderGroupAnalyzerMap render_group_analyzer_map_