Class StringDictionary

class StringDictionary

Public Functions

StringDictionary(const DictRef &dict_ref, const std::string &folder, const bool isTemp, const bool recover, const bool materializeHashes = false, size_t initial_capacity = 256)
StringDictionary(const LeafHostInfo &host, const DictRef dict_ref)
~StringDictionary()
int32_t getDbId() const
int32_t getDictId() const
void eachStringSerially(int64_t const generation, StringCallback &serial_callback) const
std::function<int32_t(std::string const&)> makeLambdaStringToId() const
int32_t getOrAdd(const std::string &str)
template<class T, class String>
size_t getBulk(const std::vector<String> &string_vec, T *encoded_vec) const
template<class T, class String>
size_t getBulk(const std::vector<String> &string_vec, T *encoded_vec, const int64_t generation) const
template<class T, class String>
void getOrAddBulk(const std::vector<String> &string_vec, T *encoded_vec)
template<class T, class String>
void getOrAddBulkParallel(const std::vector<String> &string_vec, T *encoded_vec)
template<class String>
void getOrAddBulkArray(const std::vector<std::vector<String>> &string_array_vec, std::vector<std::vector<int32_t>> &ids_array_vec)
template<class String>
int32_t getIdOfString(const String &str) const
std::string getString(int32_t string_id) const
std::pair<char *, size_t> getStringBytes(int32_t string_id) const
size_t storageEntryCount() const
std::vector<int32_t> getLike(const std::string &pattern, const bool icase, const bool is_simple, const char escape, const size_t generation) const
std::vector<int32_t> getCompare(const std::string &pattern, const std::string &comp_operator, const size_t generation)
std::vector<int32_t> getRegexpLike(const std::string &pattern, const char escape, const size_t generation) const
std::vector<std::string> copyStrings() const
std::vector<std::string_view> getStringViews() const
std::vector<std::string_view> getStringViews(const size_t generation) const
std::vector<int32_t> buildDictionaryTranslationMap(const std::shared_ptr<StringDictionary> dest_dict, StringLookupCallback const &dest_transient_lookup_callback) const
size_t buildDictionaryTranslationMap(const StringDictionary *dest_dict, int32_t *translated_ids, const int64_t source_generation, const int64_t dest_generation, const bool dest_has_transients, StringLookupCallback const &dest_transient_lookup_callback) const
bool checkpoint()
bool isClient() const
void update_leaf(const LeafHostInfo &host_info)

Public Static Functions

void populate_string_ids(std::vector<int32_t> &dest_ids, StringDictionary *dest_dict, const std::vector<int32_t> &source_ids, const StringDictionary *source_dict, const std::vector<std::string const *> &transient_string_vec = {})

Populates provided dest_ids vector with string ids corresponding to given source strings.

Given a vector of source string ids and the corresponding source dictionary, this method populates a vector of destination string ids by either returning the string id of matching strings in the destination dictionary or creating new entries in the dictionary. Source string ids can also be transient if they were created by a function (e.g., the LOWER/UPPER functions). A vector of pointers to the transient string values (transient_string_vec), which serves as the map from transient string ids to string values, is provided in order to handle this use case.

Parameters
  • dest_ids: vector of destination string ids to be populated

  • dest_dict: destination dictionary

  • source_ids: vector of source string ids for which destination ids are needed

  • source_dict: source dictionary

  • transient_string_vec: ordered vector of string value pointers

void populate_string_array_ids(std::vector<std::vector<int32_t>> &dest_array_ids, StringDictionary *dest_dict, const std::vector<std::vector<int32_t>> &source_array_ids, const StringDictionary *source_dict)

Public Static Attributes

constexpr int32_t INVALID_STR_ID = -1
constexpr size_t MAX_STRLEN = (1 << 15) - 1
constexpr size_t MAX_STRCOUNT = (1U << 31) - 1

Private Functions

void processDictionaryFutures(std::vector<std::future<std::vector<std::pair<string_dict_hash_t, unsigned int>>>> &dictionary_futures)
size_t getNumStringsFromStorage(const size_t storage_slots) const

Method to retrieve the number of strings in storage via a binary search for the first canary.

Return

number of strings in storage

Parameters
  • storage_slots: number of storage entries we should search to find the minimum canary

bool fillRateIsHigh(const size_t num_strings) const
void increaseHashTableCapacity()
template<class String>
void increaseHashTableCapacityFromStorageAndMemory(const size_t str_count, const size_t storage_high_water_mark, const std::vector<String> &input_strings, const std::vector<size_t> &string_memory_ids, const std::vector<string_dict_hash_t> &input_strings_hashes)
int32_t getOrAddImpl(const std::string_view &str)
template<class String>
void hashStrings(const std::vector<String> &string_vec, std::vector<string_dict_hash_t> &hashes) const

Method to hash a vector of strings in parallel.

Parameters
  • string_vec: input vector of strings to be hashed

  • hashes: space for the output - should be pre-sized to match string_vec size

int32_t getUnlocked(const std::string_view sv) const
std::string getStringUnlocked(int32_t string_id) const
std::string getStringChecked(const int string_id) const
std::pair<char *, size_t> getStringBytesChecked(const int string_id) const
template<class String>
uint32_t computeBucket(const string_dict_hash_t hash, const String &input_string, const std::vector<int32_t> &string_id_string_dict_hash_table) const
template<class String>
uint32_t computeBucketFromStorageAndMemory(const string_dict_hash_t input_string_hash, const String &input_string, const std::vector<int32_t> &string_id_string_dict_hash_table, const size_t storage_high_water_mark, const std::vector<String> &input_strings, const std::vector<size_t> &string_memory_ids) const
uint32_t computeUniqueBucketWithHash(const string_dict_hash_t hash, const std::vector<int32_t> &string_id_string_dict_hash_table)
void checkAndConditionallyIncreasePayloadCapacity(const size_t write_length)
void checkAndConditionallyIncreaseOffsetCapacity(const size_t write_length)
template<class String>
void appendToStorage(const String str)
template<class String>
void appendToStorageBulk(const std::vector<String> &input_strings, const std::vector<size_t> &string_memory_ids, const size_t sum_new_strings_lengths)
StringDictionary::PayloadString getStringFromStorage(const int string_id) const
std::string_view getStringFromStorageFast(const int string_id) const
void addPayloadCapacity(const size_t min_capacity_requested = 0)
void addOffsetCapacity(const size_t min_capacity_requested = 0)
size_t addStorageCapacity(int fd, const size_t min_capacity_requested = 0)
void *addMemoryCapacity(void *addr, size_t &mem_size, const size_t min_capacity_requested = 0)
void invalidateInvertedIndex()
std::vector<int32_t> getEquals(std::string pattern, std::string comp_operator, size_t generation)
void buildSortedCache()
void insertInSortedCache(std::string str, int32_t str_id)
void sortCache(std::vector<int32_t> &cache)
void mergeSortedCache(std::vector<int32_t> &temp_sorted_cache)
compare_cache_value_t *binary_search_cache(const std::string &pattern) const

Private Members

const DictRef dict_ref_
const std::string folder_
size_t str_count_
size_t collisions_
std::vector<int32_t> string_id_string_dict_hash_table_
std::vector<string_dict_hash_t> hash_cache_
std::vector<int32_t> sorted_cache
bool isTemp_
bool materialize_hashes_
std::string offsets_path_
int payload_fd_
int offset_fd_
StringIdxEntry *offset_map_
char *payload_map_
size_t offset_file_size_
size_t payload_file_size_
size_t payload_file_off_
mapd_shared_mutex rw_mutex_
std::map<std::tuple<std::string, bool, bool, char>, std::vector<int32_t>> like_cache_
std::map<std::pair<std::string, char>, std::vector<int32_t>> regex_cache_
std::map<std::string, int32_t> equal_cache_
DictionaryCache<std::string, compare_cache_value_t> compare_cache_
std::shared_ptr<std::vector<std::string>> strings_cache_
std::unique_ptr<StringDictionaryClient> client_
std::unique_ptr<StringDictionaryClient> client_no_timeout_
char *CANARY_BUFFER = {nullptr}
size_t canary_buffer_size = 0

Friends

friend StringDictionary::StringLocalCallback
struct compare_cache_value_t

Public Members

int32_t index
int32_t diff
struct PayloadString

Public Members

char *c_str_ptr
size_t size
bool canary
class StringCallback

Subclassed by anonymous_namespace{StringDictionary.cpp}::MapMaker, StringLocalCallback, StringNetworkCallback

Public Functions

virtual ~StringCallback()
virtual void operator()(std::string const&, int32_t const string_id) = 0
virtual void operator()(std::string_view const, int32_t const string_id) = 0
struct StringIdxEntry

Public Members

uint64_t off
uint64_t size