summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2025-07-23 15:45:15 -0700
committerJunio C Hamano <gitster@pobox.com>2025-07-23 15:45:15 -0700
commitf22d4ac4fd50b55c88142dfd15a361680cf3fb40 (patch)
tree92a3be8ed47f75e15d2bda0932b6e4ca45d6d064
parent0e8243a355a69035dac269528b49dc8c9bc81f8a (diff)
parent2a6ce090f27016d68ee6952809d98fe88ce53522 (diff)
Merge branch 'ly/changed-paths-traversal'
Lift the limitation to use changed-path filter in "git log" so that it can be used for a pathspec with multiple literal paths. * ly/changed-paths-traversal: bloom: optimize multiple pathspec items in revision revision: make helper for pathspec to bloom keyvec bloom: replace struct bloom_key * with struct bloom_keyvec bloom: rename function operates on bloom_key bloom: add test helper to return murmur3 hash
-rw-r--r--blame.c2
-rw-r--r--bloom.c84
-rw-r--r--bloom.h54
-rw-r--r--line-log.c5
-rw-r--r--revision.c122
-rw-r--r--revision.h6
-rw-r--r--t/helper/test-bloom.c8
-rwxr-xr-xt/t4216-log-bloom.sh23
8 files changed, 204 insertions, 100 deletions
diff --git a/blame.c b/blame.c
index dce5c8d855..f1c0670144 100644
--- a/blame.c
+++ b/blame.c
@@ -1311,7 +1311,7 @@ static void add_bloom_key(struct blame_bloom_data *bd,
}
bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key));
- fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings);
+ bloom_key_fill(bd->keys[bd->nr], path, strlen(path), bd->settings);
bd->nr++;
}
diff --git a/bloom.c b/bloom.c
index 0c8d2cebf9..b86015f6d1 100644
--- a/bloom.c
+++ b/bloom.c
@@ -107,7 +107,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g,
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
-uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
+static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
@@ -221,9 +221,7 @@ static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
return seed;
}
-void fill_bloom_key(const char *data,
- size_t len,
- struct bloom_key *key,
+void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
const struct bloom_filter_settings *settings)
{
int i;
@@ -243,7 +241,7 @@ void fill_bloom_key(const char *data,
key->hashes[i] = hash0 + i * hash1;
}
-void clear_bloom_key(struct bloom_key *key)
+void bloom_key_clear(struct bloom_key *key)
{
FREE_AND_NULL(key->hashes);
}
@@ -280,6 +278,55 @@ void deinit_bloom_filters(void)
deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}
+struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
+ const struct bloom_filter_settings *settings)
+{
+ struct bloom_keyvec *vec;
+ const char *p;
+ size_t sz;
+ size_t nr = 1;
+
+ p = path;
+ while (*p) {
+ /*
+ * At this point, the path is normalized to use Unix-style
+ * path separators. This is required due to how the
+ * changed-path Bloom filters store the paths.
+ */
+ if (*p == '/')
+ nr++;
+ p++;
+ }
+
+ sz = sizeof(struct bloom_keyvec);
+ sz += nr * sizeof(struct bloom_key);
+ vec = (struct bloom_keyvec *)xcalloc(1, sz);
+ if (!vec)
+ return NULL;
+ vec->count = nr;
+
+ bloom_key_fill(&vec->key[0], path, len, settings);
+ nr = 1;
+ p = path + len - 1;
+ while (p > path) {
+ if (*p == '/') {
+ bloom_key_fill(&vec->key[nr++], path, p - path, settings);
+ }
+ p--;
+ }
+ assert(nr == vec->count);
+ return vec;
+}
+
+void bloom_keyvec_free(struct bloom_keyvec *vec)
+{
+ if (!vec)
+ return;
+ for (size_t nr = 0; nr < vec->count; nr++)
+ bloom_key_clear(&vec->key[nr]);
+ free(vec);
+}
+
static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
const struct hashmap_entry *eptr,
const struct hashmap_entry *entry_or_key,
@@ -500,9 +547,9 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
struct bloom_key key;
- fill_bloom_key(e->path, strlen(e->path), &key, settings);
+ bloom_key_fill(&key, e->path, strlen(e->path), settings);
add_key_to_filter(&key, filter, settings);
- clear_bloom_key(&key);
+ bloom_key_clear(&key);
}
cleanup:
@@ -540,3 +587,26 @@ int bloom_filter_contains(const struct bloom_filter *filter,
return 1;
}
+
+int bloom_filter_contains_vec(const struct bloom_filter *filter,
+ const struct bloom_keyvec *vec,
+ const struct bloom_filter_settings *settings)
+{
+ int ret = 1;
+
+ for (size_t nr = 0; ret > 0 && nr < vec->count; nr++)
+ ret = bloom_filter_contains(filter, &vec->key[nr], settings);
+
+ return ret;
+}
+
+uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
+ int version)
+{
+ assert(version == 1 || version == 2);
+
+ if (version == 2)
+ return murmur3_seeded_v2(seed, data, len);
+ else
+ return murmur3_seeded_v1(seed, data, len);
+}
diff --git a/bloom.h b/bloom.h
index 6e46489a20..92ab2100d3 100644
--- a/bloom.h
+++ b/bloom.h
@@ -74,24 +74,40 @@ struct bloom_key {
uint32_t *hashes;
};
+/*
+ * A bloom_keyvec is a vector of bloom_keys, which
+ * can be used to store multiple keys for a single
+ * pathspec item.
+ */
+struct bloom_keyvec {
+ size_t count;
+ struct bloom_key key[FLEX_ARRAY];
+};
+
int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos);
+void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
+ const struct bloom_filter_settings *settings);
+void bloom_key_clear(struct bloom_key *key);
+
/*
- * Calculate the murmur3 32-bit hash value for the given data
- * using the given seed.
- * Produces a uniformly distributed hash value.
- * Not considered to be cryptographically secure.
- * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the
+ * given path.
+ *
+ * This function splits the input path by '/' and generates a bloom key for each
+ * prefix, in reverse order of specificity. For example, given the input
+ * "a/b/c", it will generate bloom keys for:
+ * - "a/b/c"
+ * - "a/b"
+ * - "a"
+ *
+ * The resulting keys are stored in a newly allocated bloom_keyvec.
*/
-uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);
-
-void fill_bloom_key(const char *data,
- size_t len,
- struct bloom_key *key,
- const struct bloom_filter_settings *settings);
-void clear_bloom_key(struct bloom_key *key);
+struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
+ const struct bloom_filter_settings *settings);
+void bloom_keyvec_free(struct bloom_keyvec *vec);
void add_key_to_filter(const struct bloom_key *key,
struct bloom_filter *filter,
@@ -137,4 +153,18 @@ int bloom_filter_contains(const struct bloom_filter *filter,
const struct bloom_key *key,
const struct bloom_filter_settings *settings);
+/*
+ * bloom_filter_contains_vec - Check if all keys in a key vector are in the
+ * Bloom filter.
+ *
+ * Returns 1 if **all** keys in the vector are present in the filter,
+ * 0 if **any** key is not present.
+ */
+int bloom_filter_contains_vec(const struct bloom_filter *filter,
+ const struct bloom_keyvec *v,
+ const struct bloom_filter_settings *settings);
+
+uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
+ int version);
+
#endif
diff --git a/line-log.c b/line-log.c
index 628e3fe3ae..07f2154e84 100644
--- a/line-log.c
+++ b/line-log.c
@@ -1172,12 +1172,13 @@ static int bloom_filter_check(struct rev_info *rev,
return 0;
while (!result && range) {
- fill_bloom_key(range->path, strlen(range->path), &key, rev->bloom_filter_settings);
+ bloom_key_fill(&key, range->path, strlen(range->path),
+ rev->bloom_filter_settings);
if (bloom_filter_contains(filter, &key, rev->bloom_filter_settings))
result = 1;
- clear_bloom_key(&key);
+ bloom_key_clear(&key);
range = range->next;
}
diff --git a/revision.c b/revision.c
index 0ca6a297a6..212ca0de27 100644
--- a/revision.c
+++ b/revision.c
@@ -675,24 +675,48 @@ static int forbid_bloom_filters(struct pathspec *spec)
{
if (spec->has_wildcard)
return 1;
- if (spec->nr > 1)
- return 1;
if (spec->magic & ~PATHSPEC_LITERAL)
return 1;
- if (spec->nr && (spec->items[0].magic & ~PATHSPEC_LITERAL))
- return 1;
+ for (size_t nr = 0; nr < spec->nr; nr++)
+ if (spec->items[nr].magic & ~PATHSPEC_LITERAL)
+ return 1;
return 0;
}
-static void prepare_to_use_bloom_filter(struct rev_info *revs)
+static void release_revisions_bloom_keyvecs(struct rev_info *revs);
+
+static int convert_pathspec_to_bloom_keyvec(struct bloom_keyvec **out,
+ const struct pathspec_item *pi,
+ const struct bloom_filter_settings *settings)
{
- struct pathspec_item *pi;
char *path_alloc = NULL;
- const char *path, *p;
+ const char *path;
size_t len;
- int path_component_nr = 1;
+ int res = 0;
+
+ /* remove single trailing slash from path, if needed */
+ if (pi->len > 0 && pi->match[pi->len - 1] == '/') {
+ path_alloc = xmemdupz(pi->match, pi->len - 1);
+ path = path_alloc;
+ } else
+ path = pi->match;
+ len = strlen(path);
+ if (!len) {
+ res = -1;
+ goto cleanup;
+ }
+
+ *out = bloom_keyvec_new(path, len, settings);
+
+cleanup:
+ free(path_alloc);
+ return res;
+}
+
+static void prepare_to_use_bloom_filter(struct rev_info *revs)
+{
if (!revs->commits)
return;
@@ -708,48 +732,14 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
if (!revs->pruning.pathspec.nr)
return;
- pi = &revs->pruning.pathspec.items[0];
-
- /* remove single trailing slash from path, if needed */
- if (pi->len > 0 && pi->match[pi->len - 1] == '/') {
- path_alloc = xmemdupz(pi->match, pi->len - 1);
- path = path_alloc;
- } else
- path = pi->match;
-
- len = strlen(path);
- if (!len) {
- revs->bloom_filter_settings = NULL;
- free(path_alloc);
- return;
- }
-
- p = path;
- while (*p) {
- /*
- * At this point, the path is normalized to use Unix-style
- * path separators. This is required due to how the
- * changed-path Bloom filters store the paths.
- */
- if (*p == '/')
- path_component_nr++;
- p++;
- }
+ revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr;
+ CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr);
- revs->bloom_keys_nr = path_component_nr;
- ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
-
- fill_bloom_key(path, len, &revs->bloom_keys[0],
- revs->bloom_filter_settings);
- path_component_nr = 1;
-
- p = path + len - 1;
- while (p > path) {
- if (*p == '/')
- fill_bloom_key(path, p - path,
- &revs->bloom_keys[path_component_nr++],
- revs->bloom_filter_settings);
- p--;
+ for (int i = 0; i < revs->pruning.pathspec.nr; i++) {
+ if (convert_pathspec_to_bloom_keyvec(&revs->bloom_keyvecs[i],
+ &revs->pruning.pathspec.items[i],
+ revs->bloom_filter_settings))
+ goto fail;
}
if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
@@ -757,14 +747,18 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
bloom_filter_atexit_registered = 1;
}
- free(path_alloc);
+ return;
+
+fail:
+ revs->bloom_filter_settings = NULL;
+ release_revisions_bloom_keyvecs(revs);
}
static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
struct commit *commit)
{
struct bloom_filter *filter;
- int result = 1, j;
+ int result = 0;
if (!revs->repo->objects->commit_graph)
return -1;
@@ -779,10 +773,10 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
return -1;
}
- for (j = 0; result && j < revs->bloom_keys_nr; j++) {
- result = bloom_filter_contains(filter,
- &revs->bloom_keys[j],
- revs->bloom_filter_settings);
+ for (size_t nr = 0; !result && nr < revs->bloom_keyvecs_nr; nr++) {
+ result = bloom_filter_contains_vec(filter,
+ revs->bloom_keyvecs[nr],
+ revs->bloom_filter_settings);
}
if (result)
@@ -823,7 +817,7 @@ static int rev_compare_tree(struct rev_info *revs,
return REV_TREE_SAME;
}
- if (revs->bloom_keys_nr && !nth_parent) {
+ if (revs->bloom_keyvecs_nr && !nth_parent) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
if (bloom_ret == 0)
@@ -850,7 +844,7 @@ static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit,
if (!t1)
return 0;
- if (!nth_parent && revs->bloom_keys_nr) {
+ if (!nth_parent && revs->bloom_keyvecs_nr) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
if (!bloom_ret)
return 1;
@@ -3202,6 +3196,14 @@ static void release_revisions_mailmap(struct string_list *mailmap)
static void release_revisions_topo_walk_info(struct topo_walk_info *info);
+static void release_revisions_bloom_keyvecs(struct rev_info *revs)
+{
+ for (size_t nr = 0; nr < revs->bloom_keyvecs_nr; nr++)
+ bloom_keyvec_free(revs->bloom_keyvecs[nr]);
+ FREE_AND_NULL(revs->bloom_keyvecs);
+ revs->bloom_keyvecs_nr = 0;
+}
+
static void free_void_commit_list(void *list)
{
free_commit_list(list);
@@ -3230,11 +3232,7 @@ void release_revisions(struct rev_info *revs)
clear_decoration(&revs->treesame, free);
line_log_free(revs);
oidset_clear(&revs->missing_commits);
-
- for (int i = 0; i < revs->bloom_keys_nr; i++)
- clear_bloom_key(&revs->bloom_keys[i]);
- FREE_AND_NULL(revs->bloom_keys);
- revs->bloom_keys_nr = 0;
+ release_revisions_bloom_keyvecs(revs);
}
static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child)
diff --git a/revision.h b/revision.h
index 6d369cdad6..ac843f58d0 100644
--- a/revision.h
+++ b/revision.h
@@ -62,7 +62,7 @@ struct repository;
struct rev_info;
struct string_list;
struct saved_parents;
-struct bloom_key;
+struct bloom_keyvec;
struct bloom_filter_settings;
struct option;
struct parse_opt_ctx_t;
@@ -360,8 +360,8 @@ struct rev_info {
/* Commit graph bloom filter fields */
/* The bloom filter key(s) for the pathspec */
- struct bloom_key *bloom_keys;
- int bloom_keys_nr;
+ struct bloom_keyvec **bloom_keyvecs;
+ int bloom_keyvecs_nr;
/*
* The bloom filter settings used to generate the key.
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 9aa2c5a592..3283544bd3 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -12,13 +12,13 @@ static struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
struct bloom_key key;
- fill_bloom_key(data, strlen(data), &key, &settings);
+ bloom_key_fill(&key, data, strlen(data), &settings);
printf("Hashes:");
for (size_t i = 0; i < settings.num_hashes; i++)
printf("0x%08x|", key.hashes[i]);
printf("\n");
add_key_to_filter(&key, filter, &settings);
- clear_bloom_key(&key);
+ bloom_key_clear(&key);
}
static void print_bloom_filter(struct bloom_filter *filter) {
@@ -61,13 +61,13 @@ int cmd__bloom(int argc, const char **argv)
uint32_t hashed;
if (argc < 3)
usage(bloom_usage);
- hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2]));
+ hashed = test_bloom_murmur3_seeded(0, argv[2], strlen(argv[2]), 2);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}
if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
uint32_t hashed;
- hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7);
+ hashed = test_bloom_murmur3_seeded(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7, 2);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 8910d53cac..639868ac56 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -66,8 +66,9 @@ sane_unset GIT_TRACE2_CONFIG_PARAMS
setup () {
rm -f "$TRASH_DIRECTORY/trace.perf" &&
- git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom &&
- GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
+ eval git -c core.commitGraph=false log --pretty="format:%s" "$1" >log_wo_bloom &&
+ eval "GIT_TRACE2_PERF=\"$TRASH_DIRECTORY/trace.perf\"" \
+ git -c core.commitGraph=true log --pretty="format:%s" "$1" >log_w_bloom
}
test_bloom_filters_used () {
@@ -138,10 +139,6 @@ test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
test_bloom_filters_not_used "--walk-reflogs -- A"
'
-test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
- test_bloom_filters_not_used "-- file4 A/file1"
-'
-
test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' '
test_bloom_filters_not_used "-- ."
'
@@ -151,9 +148,17 @@ test_expect_success 'git log with wildcard that resolves to a single path uses B
test_bloom_filters_used "-- *renamed"
'
-test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' '
- test_bloom_filters_not_used "-- *" &&
- test_bloom_filters_not_used "-- file*"
+test_expect_success 'git log with multiple literal paths uses Bloom filter' '
+ test_bloom_filters_used "-- file4 A/file1" &&
+ test_bloom_filters_used "-- *" &&
+ test_bloom_filters_used "-- file*"
+'
+
+test_expect_success 'git log with path contains a wildcard does not use Bloom filter' '
+ test_bloom_filters_not_used "-- file\*" &&
+ test_bloom_filters_not_used "-- A/\* file4" &&
+ test_bloom_filters_not_used "-- file4 A/\*" &&
+ test_bloom_filters_not_used "-- * A/\*"
'
test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '