diff options
author | Junio C Hamano <gitster@pobox.com> | 2025-07-23 15:45:15 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2025-07-23 15:45:15 -0700 |
commit | f22d4ac4fd50b55c88142dfd15a361680cf3fb40 (patch) | |
tree | 92a3be8ed47f75e15d2bda0932b6e4ca45d6d064 | |
parent | 0e8243a355a69035dac269528b49dc8c9bc81f8a (diff) | |
parent | 2a6ce090f27016d68ee6952809d98fe88ce53522 (diff) |
Merge branch 'ly/changed-paths-traversal'
Lift the limitation to use changed-path filter in "git log" so that
it can be used for a pathspec with multiple literal paths.
* ly/changed-paths-traversal:
bloom: optimize multiple pathspec items in revision
revision: make helper for pathspec to bloom keyvec
bloom: replace struct bloom_key * with struct bloom_keyvec
bloom: rename function operates on bloom_key
bloom: add test helper to return murmur3 hash
-rw-r--r-- | blame.c | 2 | ||||
-rw-r--r-- | bloom.c | 84 | ||||
-rw-r--r-- | bloom.h | 54 | ||||
-rw-r--r-- | line-log.c | 5 | ||||
-rw-r--r-- | revision.c | 122 | ||||
-rw-r--r-- | revision.h | 6 | ||||
-rw-r--r-- | t/helper/test-bloom.c | 8 | ||||
-rwxr-xr-x | t/t4216-log-bloom.sh | 23 |
8 files changed, 204 insertions, 100 deletions
@@ -1311,7 +1311,7 @@ static void add_bloom_key(struct blame_bloom_data *bd, } bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key)); - fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings); + bloom_key_fill(bd->keys[bd->nr], path, strlen(path), bd->settings); bd->nr++; } @@ -107,7 +107,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g, * Not considered to be cryptographically secure. * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm */ -uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len) +static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len) { const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -221,9 +221,7 @@ static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len) return seed; } -void fill_bloom_key(const char *data, - size_t len, - struct bloom_key *key, +void bloom_key_fill(struct bloom_key *key, const char *data, size_t len, const struct bloom_filter_settings *settings) { int i; @@ -243,7 +241,7 @@ void fill_bloom_key(const char *data, key->hashes[i] = hash0 + i * hash1; } -void clear_bloom_key(struct bloom_key *key) +void bloom_key_clear(struct bloom_key *key) { FREE_AND_NULL(key->hashes); } @@ -280,6 +278,55 @@ void deinit_bloom_filters(void) deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter); } +struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len, + const struct bloom_filter_settings *settings) +{ + struct bloom_keyvec *vec; + const char *p; + size_t sz; + size_t nr = 1; + + p = path; + while (*p) { + /* + * At this point, the path is normalized to use Unix-style + * path separators. This is required due to how the + * changed-path Bloom filters store the paths. + */ + if (*p == '/') + nr++; + p++; + } + + sz = sizeof(struct bloom_keyvec); + sz += nr * sizeof(struct bloom_key); + vec = (struct bloom_keyvec *)xcalloc(1, sz); + if (!vec) + return NULL; + vec->count = nr; + + bloom_key_fill(&vec->key[0], path, len, settings); + nr = 1; + p = path + len - 1; + while (p > path) { + if (*p == '/') { + bloom_key_fill(&vec->key[nr++], path, p - path, settings); + } + p--; + } + assert(nr == vec->count); + return vec; +} + +void bloom_keyvec_free(struct bloom_keyvec *vec) +{ + if (!vec) + return; + for (size_t nr = 0; nr < vec->count; nr++) + bloom_key_clear(&vec->key[nr]); + free(vec); +} + static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED, const struct hashmap_entry *eptr, const struct hashmap_entry *entry_or_key, @@ -500,9 +547,9 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, hashmap_for_each_entry(&pathmap, &iter, e, entry) { struct bloom_key key; - fill_bloom_key(e->path, strlen(e->path), &key, settings); + bloom_key_fill(&key, e->path, strlen(e->path), settings); add_key_to_filter(&key, filter, settings); - clear_bloom_key(&key); + bloom_key_clear(&key); } cleanup: @@ -540,3 +587,26 @@ int bloom_filter_contains(const struct bloom_filter *filter, return 1; } + +int bloom_filter_contains_vec(const struct bloom_filter *filter, + const struct bloom_keyvec *vec, + const struct bloom_filter_settings *settings) +{ + int ret = 1; + + for (size_t nr = 0; ret > 0 && nr < vec->count; nr++) + ret = bloom_filter_contains(filter, &vec->key[nr], settings); + + return ret; +} + +uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len, + int version) +{ + assert(version == 1 || version == 2); + + if (version == 2) + return murmur3_seeded_v2(seed, data, len); + else + return murmur3_seeded_v1(seed, data, len); +} @@ -74,24 +74,40 @@ struct bloom_key { uint32_t *hashes; }; +/* + * A bloom_keyvec is a vector of bloom_keys, which + * can be used to store multiple keys for a single + * pathspec item. + */ +struct bloom_keyvec { + size_t count; + struct bloom_key key[FLEX_ARRAY]; +}; + int load_bloom_filter_from_graph(struct commit_graph *g, struct bloom_filter *filter, uint32_t graph_pos); +void bloom_key_fill(struct bloom_key *key, const char *data, size_t len, + const struct bloom_filter_settings *settings); +void bloom_key_clear(struct bloom_key *key); + /* - * Calculate the murmur3 32-bit hash value for the given data - * using the given seed. - * Produces a uniformly distributed hash value. - * Not considered to be cryptographically secure. - * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm + * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the + * given path. + * + * This function splits the input path by '/' and generates a bloom key for each + * prefix, in reverse order of specificity. For example, given the input + * "a/b/c", it will generate bloom keys for: + * - "a/b/c" + * - "a/b" + * - "a" + * + * The resulting keys are stored in a newly allocated bloom_keyvec. */ -uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len); - -void fill_bloom_key(const char *data, - size_t len, - struct bloom_key *key, - const struct bloom_filter_settings *settings); -void clear_bloom_key(struct bloom_key *key); +struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len, + const struct bloom_filter_settings *settings); +void bloom_keyvec_free(struct bloom_keyvec *vec); void add_key_to_filter(const struct bloom_key *key, struct bloom_filter *filter, @@ -137,4 +153,18 @@ int bloom_filter_contains(const struct bloom_filter *filter, const struct bloom_key *key, const struct bloom_filter_settings *settings); +/* + * bloom_filter_contains_vec - Check if all keys in a key vector are in the + * Bloom filter. + * + * Returns 1 if **all** keys in the vector are present in the filter, + * 0 if **any** key is not present. + */ +int bloom_filter_contains_vec(const struct bloom_filter *filter, + const struct bloom_keyvec *v, + const struct bloom_filter_settings *settings); + +uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len, + int version); + #endif diff --git a/line-log.c b/line-log.c index 628e3fe3ae..07f2154e84 100644 --- a/line-log.c +++ b/line-log.c @@ -1172,12 +1172,13 @@ static int bloom_filter_check(struct rev_info *rev, return 0; while (!result && range) { - fill_bloom_key(range->path, strlen(range->path), &key, rev->bloom_filter_settings); + bloom_key_fill(&key, range->path, strlen(range->path), + rev->bloom_filter_settings); if (bloom_filter_contains(filter, &key, rev->bloom_filter_settings)) result = 1; - clear_bloom_key(&key); + bloom_key_clear(&key); range = range->next; } diff --git a/revision.c b/revision.c index 0ca6a297a6..212ca0de27 100644 --- a/revision.c +++ b/revision.c @@ -675,24 +675,48 @@ static int forbid_bloom_filters(struct pathspec *spec) { if (spec->has_wildcard) return 1; - if (spec->nr > 1) - return 1; if (spec->magic & ~PATHSPEC_LITERAL) return 1; - if (spec->nr && (spec->items[0].magic & ~PATHSPEC_LITERAL)) - return 1; + for (size_t nr = 0; nr < spec->nr; nr++) + if (spec->items[nr].magic & ~PATHSPEC_LITERAL) + return 1; return 0; } -static void prepare_to_use_bloom_filter(struct rev_info *revs) +static void release_revisions_bloom_keyvecs(struct rev_info *revs); + +static int convert_pathspec_to_bloom_keyvec(struct bloom_keyvec **out, + const struct pathspec_item *pi, + const struct bloom_filter_settings *settings) { - struct pathspec_item *pi; char *path_alloc = NULL; - const char *path, *p; + const char *path; size_t len; - int path_component_nr = 1; + int res = 0; + + /* remove single trailing slash from path, if needed */ + if (pi->len > 0 && pi->match[pi->len - 1] == '/') { + path_alloc = xmemdupz(pi->match, pi->len - 1); + path = path_alloc; + } else + path = pi->match; + len = strlen(path); + if (!len) { + res = -1; + goto cleanup; + } + + *out = bloom_keyvec_new(path, len, settings); + +cleanup: + free(path_alloc); + return res; +} + +static void prepare_to_use_bloom_filter(struct rev_info *revs) +{ if (!revs->commits) return; @@ -708,48 +732,14 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) if (!revs->pruning.pathspec.nr) return; - pi = &revs->pruning.pathspec.items[0]; - - /* remove single trailing slash from path, if needed */ - if (pi->len > 0 && pi->match[pi->len - 1] == '/') { - path_alloc = xmemdupz(pi->match, pi->len - 1); - path = path_alloc; - } else - path = pi->match; - - len = strlen(path); - if (!len) { - revs->bloom_filter_settings = NULL; - free(path_alloc); - return; - } - - p = path; - while (*p) { - /* - * At this point, the path is normalized to use Unix-style - * path separators. This is required due to how the - * changed-path Bloom filters store the paths. - */ - if (*p == '/') - path_component_nr++; - p++; - } + revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr; + CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr); - revs->bloom_keys_nr = path_component_nr; - ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr); - - fill_bloom_key(path, len, &revs->bloom_keys[0], - revs->bloom_filter_settings); - path_component_nr = 1; - - p = path + len - 1; - while (p > path) { - if (*p == '/') - fill_bloom_key(path, p - path, - &revs->bloom_keys[path_component_nr++], - revs->bloom_filter_settings); - p--; + for (int i = 0; i < revs->pruning.pathspec.nr; i++) { + if (convert_pathspec_to_bloom_keyvec(&revs->bloom_keyvecs[i], + &revs->pruning.pathspec.items[i], + revs->bloom_filter_settings)) + goto fail; } if (trace2_is_enabled() && !bloom_filter_atexit_registered) { @@ -757,14 +747,18 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) bloom_filter_atexit_registered = 1; } - free(path_alloc); + return; + +fail: + revs->bloom_filter_settings = NULL; + release_revisions_bloom_keyvecs(revs); } static int check_maybe_different_in_bloom_filter(struct rev_info *revs, struct commit *commit) { struct bloom_filter *filter; - int result = 1, j; + int result = 0; if (!revs->repo->objects->commit_graph) return -1; @@ -779,10 +773,10 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, return -1; } - for (j = 0; result && j < revs->bloom_keys_nr; j++) { - result = bloom_filter_contains(filter, - &revs->bloom_keys[j], - revs->bloom_filter_settings); + for (size_t nr = 0; !result && nr < revs->bloom_keyvecs_nr; nr++) { + result = bloom_filter_contains_vec(filter, + revs->bloom_keyvecs[nr], + revs->bloom_filter_settings); } if (result) @@ -823,7 +817,7 @@ static int rev_compare_tree(struct rev_info *revs, return REV_TREE_SAME; } - if (revs->bloom_keys_nr && !nth_parent) { + if (revs->bloom_keyvecs_nr && !nth_parent) { bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); if (bloom_ret == 0) @@ -850,7 +844,7 @@ static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit, if (!t1) return 0; - if (!nth_parent && revs->bloom_keys_nr) { + if (!nth_parent && revs->bloom_keyvecs_nr) { bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); if (!bloom_ret) return 1; @@ -3202,6 +3196,14 @@ static void release_revisions_mailmap(struct string_list *mailmap) static void release_revisions_topo_walk_info(struct topo_walk_info *info); +static void release_revisions_bloom_keyvecs(struct rev_info *revs) +{ + for (size_t nr = 0; nr < revs->bloom_keyvecs_nr; nr++) + bloom_keyvec_free(revs->bloom_keyvecs[nr]); + FREE_AND_NULL(revs->bloom_keyvecs); + revs->bloom_keyvecs_nr = 0; +} + static void free_void_commit_list(void *list) { free_commit_list(list); @@ -3230,11 +3232,7 @@ void release_revisions(struct rev_info *revs) clear_decoration(&revs->treesame, free); line_log_free(revs); oidset_clear(&revs->missing_commits); - - for (int i = 0; i < revs->bloom_keys_nr; i++) - clear_bloom_key(&revs->bloom_keys[i]); - FREE_AND_NULL(revs->bloom_keys); - revs->bloom_keys_nr = 0; + release_revisions_bloom_keyvecs(revs); } static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child) diff --git a/revision.h b/revision.h index 6d369cdad6..ac843f58d0 100644 --- a/revision.h +++ b/revision.h @@ -62,7 +62,7 @@ struct repository; struct rev_info; struct string_list; struct saved_parents; -struct bloom_key; +struct bloom_keyvec; struct bloom_filter_settings; struct option; struct parse_opt_ctx_t; @@ -360,8 +360,8 @@ struct rev_info { /* Commit graph bloom filter fields */ /* The bloom filter key(s) for the pathspec */ - struct bloom_key *bloom_keys; - int bloom_keys_nr; + struct bloom_keyvec **bloom_keyvecs; + int bloom_keyvecs_nr; /* * The bloom filter settings used to generate the key. diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index 9aa2c5a592..3283544bd3 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -12,13 +12,13 @@ static struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; static void add_string_to_filter(const char *data, struct bloom_filter *filter) { struct bloom_key key; - fill_bloom_key(data, strlen(data), &key, &settings); + bloom_key_fill(&key, data, strlen(data), &settings); printf("Hashes:"); for (size_t i = 0; i < settings.num_hashes; i++) printf("0x%08x|", key.hashes[i]); printf("\n"); add_key_to_filter(&key, filter, &settings); - clear_bloom_key(&key); + bloom_key_clear(&key); } static void print_bloom_filter(struct bloom_filter *filter) { @@ -61,13 +61,13 @@ int cmd__bloom(int argc, const char **argv) uint32_t hashed; if (argc < 3) usage(bloom_usage); - hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2])); + hashed = test_bloom_murmur3_seeded(0, argv[2], strlen(argv[2]), 2); printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); } if (!strcmp(argv[1], "get_murmur3_seven_highbit")) { uint32_t hashed; - hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7); + hashed = test_bloom_murmur3_seeded(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7, 2); printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); } diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index 8910d53cac..639868ac56 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -66,8 +66,9 @@ sane_unset GIT_TRACE2_CONFIG_PARAMS setup () { rm -f "$TRASH_DIRECTORY/trace.perf" && - git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom && - GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom + eval git -c core.commitGraph=false log --pretty="format:%s" "$1" >log_wo_bloom && + eval "GIT_TRACE2_PERF=\"$TRASH_DIRECTORY/trace.perf\"" \ + git -c core.commitGraph=true log --pretty="format:%s" "$1" >log_w_bloom } test_bloom_filters_used () { @@ -138,10 +139,6 @@ test_expect_success 'git log with --walk-reflogs does not use Bloom filters' ' test_bloom_filters_not_used "--walk-reflogs -- A" ' -test_expect_success 'git log -- multiple path specs does not use Bloom filters' ' - test_bloom_filters_not_used "-- file4 A/file1" -' - test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' ' test_bloom_filters_not_used "-- ." ' @@ -151,9 +148,17 @@ test_expect_success 'git log with wildcard that resolves to a single path uses B test_bloom_filters_used "-- *renamed" ' -test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' ' - test_bloom_filters_not_used "-- *" && - test_bloom_filters_not_used "-- file*" +test_expect_success 'git log with multiple literal paths uses Bloom filter' ' + test_bloom_filters_used "-- file4 A/file1" && + test_bloom_filters_used "-- *" && + test_bloom_filters_used "-- file*" +' + +test_expect_success 'git log with path contains a wildcard does not use Bloom filter' ' + test_bloom_filters_not_used "-- file\*" && + test_bloom_filters_not_used "-- A/\* file4" && + test_bloom_filters_not_used "-- file4 A/\*" && + test_bloom_filters_not_used "-- * A/\*" ' test_expect_success 'setup - add commit-graph to the chain without Bloom filters' ' |