diff options
Diffstat (limited to 'bloom.h')
-rw-r--r-- | bloom.h | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/bloom.h b/bloom.h new file mode 100644 index 0000000000..92ab2100d3 --- /dev/null +++ b/bloom.h @@ -0,0 +1,170 @@ +#ifndef BLOOM_H +#define BLOOM_H + +struct commit; +struct repository; +struct commit_graph; + +struct bloom_filter_settings { + /* + * The version of the hashing technique being used. + * The newest version is 2, which is + * the seeded murmur3 hashing technique implemented + * in bloom.c. Bloom filters of version 1 were created + * with prior versions of Git, which had a bug in the + * implementation of the hash function. + */ + uint32_t hash_version; + + /* + * The number of times a path is hashed, i.e. the + * number of bit positions that cumulatively + * determine whether a path is present in the + * Bloom filter. + */ + uint32_t num_hashes; + + /* + * The minimum number of bits per entry in the Bloom + * filter. If the filter contains 'n' entries, then + * filter size is the minimum number of 8-bit words + * that contain n*b bits. + */ + uint32_t bits_per_entry; + + /* + * The maximum number of changed paths per commit + * before declaring a Bloom filter to be too-large. + * + * Not written to the commit-graph file. + */ + uint32_t max_changed_paths; +}; + +#define DEFAULT_BLOOM_MAX_CHANGES 512 +#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES } +#define BITS_PER_WORD 8 +#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t) + +/* + * A bloom_filter struct represents a data segment to + * use when testing hash values. The 'len' member + * dictates how many entries are stored in + * 'data'. + */ +struct bloom_filter { + unsigned char *data; + size_t len; + int version; + + void *to_free; +}; + +/* + * A bloom_key represents the k hash values for a + * given string. These can be precomputed and + * stored in a bloom_key for re-use when testing + * against a bloom_filter. The number of hashes is + * given by the Bloom filter settings and is the same + * for all Bloom filters and keys interacting with + * the loaded version of the commit graph file and + * the Bloom data chunks. + */ +struct bloom_key { + uint32_t *hashes; +}; + +/* + * A bloom_keyvec is a vector of bloom_keys, which + * can be used to store multiple keys for a single + * pathspec item. + */ +struct bloom_keyvec { + size_t count; + struct bloom_key key[FLEX_ARRAY]; +}; + +int load_bloom_filter_from_graph(struct commit_graph *g, + struct bloom_filter *filter, + uint32_t graph_pos); + +void bloom_key_fill(struct bloom_key *key, const char *data, size_t len, + const struct bloom_filter_settings *settings); +void bloom_key_clear(struct bloom_key *key); + +/* + * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the + * given path. + * + * This function splits the input path by '/' and generates a bloom key for each + * prefix, in reverse order of specificity. For example, given the input + * "a/b/c", it will generate bloom keys for: + * - "a/b/c" + * - "a/b" + * - "a" + * + * The resulting keys are stored in a newly allocated bloom_keyvec. + */ +struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len, + const struct bloom_filter_settings *settings); +void bloom_keyvec_free(struct bloom_keyvec *vec); + +void add_key_to_filter(const struct bloom_key *key, + struct bloom_filter *filter, + const struct bloom_filter_settings *settings); + +void init_bloom_filters(void); +void deinit_bloom_filters(void); + +enum bloom_filter_computed { + BLOOM_NOT_COMPUTED = (1 << 0), + BLOOM_COMPUTED = (1 << 1), + BLOOM_TRUNC_LARGE = (1 << 2), + BLOOM_TRUNC_EMPTY = (1 << 3), + BLOOM_UPGRADED = (1 << 4), +}; + +struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, + struct commit *c, + int compute_if_not_present, + const struct bloom_filter_settings *settings, + enum bloom_filter_computed *computed); + +/* + * Find the Bloom filter associated with the given commit "c". + * + * If any of the following are true + * + * - the repository does not have a commit-graph, or + * - the repository disables reading from the commit-graph, or + * - the given commit does not have a Bloom filter computed, or + * - there is a Bloom filter for commit "c", but it cannot be read + * because the filter uses an incompatible version of murmur3 + * + * , then `get_bloom_filter()` will return NULL. Otherwise, the corresponding + * Bloom filter will be returned. + * + * For callers who wish to inspect Bloom filters with incompatible hash + * versions, use get_or_compute_bloom_filter(). + */ +struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c); + +int bloom_filter_contains(const struct bloom_filter *filter, + const struct bloom_key *key, + const struct bloom_filter_settings *settings); + +/* + * bloom_filter_contains_vec - Check if all keys in a key vector are in the + * Bloom filter. + * + * Returns 1 if **all** keys in the vector are present in the filter, + * 0 if **any** key is not present. + */ +int bloom_filter_contains_vec(const struct bloom_filter *filter, + const struct bloom_keyvec *v, + const struct bloom_filter_settings *settings); + +uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len, + int version); + +#endif |