summaryrefslogtreecommitdiff
path: root/bloom.h
diff options
context:
space:
mode:
Diffstat (limited to 'bloom.h')
-rw-r--r--bloom.h170
1 files changed, 170 insertions, 0 deletions
diff --git a/bloom.h b/bloom.h
new file mode 100644
index 0000000000..92ab2100d3
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,170 @@
+#ifndef BLOOM_H
+#define BLOOM_H
+
+struct commit;
+struct repository;
+struct commit_graph;
+
+struct bloom_filter_settings {
+ /*
+ * The version of the hashing technique being used.
+ * The newest version is 2, which is
+ * the seeded murmur3 hashing technique implemented
+ * in bloom.c. Bloom filters of version 1 were created
+ * with prior versions of Git, which had a bug in the
+ * implementation of the hash function.
+ */
+ uint32_t hash_version;
+
+ /*
+ * The number of times a path is hashed, i.e. the
+ * number of bit positions that cumulatively
+ * determine whether a path is present in the
+ * Bloom filter.
+ */
+ uint32_t num_hashes;
+
+ /*
+ * The minimum number of bits per entry in the Bloom
+ * filter. If the filter contains 'n' entries, then
+ * filter size is the minimum number of 8-bit words
+ * that contain n*b bits.
+ */
+ uint32_t bits_per_entry;
+
+ /*
+ * The maximum number of changed paths per commit
+ * before declaring a Bloom filter to be too-large.
+ *
+ * Not written to the commit-graph file.
+ */
+ uint32_t max_changed_paths;
+};
+
+#define DEFAULT_BLOOM_MAX_CHANGES 512
+#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }
+#define BITS_PER_WORD 8
+#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
+
+/*
+ * A bloom_filter struct represents a data segment to
+ * use when testing hash values. The 'len' member
+ * dictates how many entries are stored in
+ * 'data'.
+ */
+struct bloom_filter {
+ unsigned char *data;
+ size_t len;
+ int version;
+
+ void *to_free;
+};
+
+/*
+ * A bloom_key represents the k hash values for a
+ * given string. These can be precomputed and
+ * stored in a bloom_key for re-use when testing
+ * against a bloom_filter. The number of hashes is
+ * given by the Bloom filter settings and is the same
+ * for all Bloom filters and keys interacting with
+ * the loaded version of the commit graph file and
+ * the Bloom data chunks.
+ */
+struct bloom_key {
+ uint32_t *hashes;
+};
+
+/*
+ * A bloom_keyvec is a vector of bloom_keys, which
+ * can be used to store multiple keys for a single
+ * pathspec item.
+ */
+struct bloom_keyvec {
+ size_t count;
+ struct bloom_key key[FLEX_ARRAY];
+};
+
+int load_bloom_filter_from_graph(struct commit_graph *g,
+ struct bloom_filter *filter,
+ uint32_t graph_pos);
+
+void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
+ const struct bloom_filter_settings *settings);
+void bloom_key_clear(struct bloom_key *key);
+
+/*
+ * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the
+ * given path.
+ *
+ * This function splits the input path by '/' and generates a bloom key for each
+ * prefix, in reverse order of specificity. For example, given the input
+ * "a/b/c", it will generate bloom keys for:
+ * - "a/b/c"
+ * - "a/b"
+ * - "a"
+ *
+ * The resulting keys are stored in a newly allocated bloom_keyvec.
+ */
+struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
+ const struct bloom_filter_settings *settings);
+void bloom_keyvec_free(struct bloom_keyvec *vec);
+
+void add_key_to_filter(const struct bloom_key *key,
+ struct bloom_filter *filter,
+ const struct bloom_filter_settings *settings);
+
+void init_bloom_filters(void);
+void deinit_bloom_filters(void);
+
+enum bloom_filter_computed {
+ BLOOM_NOT_COMPUTED = (1 << 0),
+ BLOOM_COMPUTED = (1 << 1),
+ BLOOM_TRUNC_LARGE = (1 << 2),
+ BLOOM_TRUNC_EMPTY = (1 << 3),
+ BLOOM_UPGRADED = (1 << 4),
+};
+
+struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
+ struct commit *c,
+ int compute_if_not_present,
+ const struct bloom_filter_settings *settings,
+ enum bloom_filter_computed *computed);
+
+/*
+ * Find the Bloom filter associated with the given commit "c".
+ *
+ * If any of the following are true
+ *
+ * - the repository does not have a commit-graph, or
+ * - the repository disables reading from the commit-graph, or
+ * - the given commit does not have a Bloom filter computed, or
+ * - there is a Bloom filter for commit "c", but it cannot be read
+ * because the filter uses an incompatible version of murmur3
+ *
+ * , then `get_bloom_filter()` will return NULL. Otherwise, the corresponding
+ * Bloom filter will be returned.
+ *
+ * For callers who wish to inspect Bloom filters with incompatible hash
+ * versions, use get_or_compute_bloom_filter().
+ */
+struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c);
+
+int bloom_filter_contains(const struct bloom_filter *filter,
+ const struct bloom_key *key,
+ const struct bloom_filter_settings *settings);
+
+/*
+ * bloom_filter_contains_vec - Check if all keys in a key vector are in the
+ * Bloom filter.
+ *
+ * Returns 1 if **all** keys in the vector are present in the filter,
+ * 0 if **any** key is not present.
+ */
+int bloom_filter_contains_vec(const struct bloom_filter *filter,
+ const struct bloom_keyvec *v,
+ const struct bloom_filter_settings *settings);
+
+uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
+ int version);
+
+#endif