diff options
| -rw-r--r-- | Documentation/git-pack-objects.txt | 32 | ||||
| -rw-r--r-- | builtin/pack-objects.c | 52 | ||||
| -rwxr-xr-x | t/t5300-pack-object.sh | 31 |
3 files changed, 109 insertions, 6 deletions
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index e32404c6aa..7f69ae4855 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -15,7 +15,8 @@ SYNOPSIS [--revs [--unpacked | --all]] [--keep-pack=<pack-name>] [--cruft] [--cruft-expiration=<time>] [--stdout [--filter=<filter-spec>] | <base-name>] - [--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list> + [--shallow] [--keep-true-parents] [--[no-]sparse] + [--name-hash-version=<n>] < <object-list> DESCRIPTION @@ -345,6 +346,35 @@ raise an error. Restrict delta matches based on "islands". See DELTA ISLANDS below. +--name-hash-version=<n>:: + While performing delta compression, Git groups objects that may be + similar based on heuristics using the path to that object. While + grouping objects by an exact path match is good for paths with + many versions, there are benefits for finding delta pairs across + different full paths. Git collects objects by type and then by a + "name hash" of the path and then by size, hoping to group objects + that will compress well together. ++ +The default name hash version is `1`, which prioritizes hash locality by +considering the final bytes of the path as providing the maximum magnitude +to the hash function. This version excels at distinguishing short paths +and finding renames across directories. However, the hash function depends +primarily on the final 16 bytes of the path. If there are many paths in +the repo that have the same final 16 bytes and differ only by parent +directory, then this name-hash may lead to too many collisions and cause +poor results. At the moment, this version is required when writing +reachability bitmap files with `--write-bitmap-index`. ++ +The name hash version `2` has similar locality features as version `1`, +except it considers each path component separately and overlays the hashes +with a shift. This still prioritizes the final bytes of the path, but also +"salts" the lower bits of the hash using the parent directory names. This +method allows for some of the locality benefits of version `1` while +breaking most of the collisions from a similarly-named file appearing in +many different directories. At the moment, this version is not allowed +when writing reachability bitmap files with `--write-bitmap-index` and it +will be automatically changed to version `1`. + DELTA ISLANDS ------------- diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 0800714267..d0c717e9d0 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -266,6 +266,35 @@ struct configured_exclusion { static struct oidmap configured_exclusions; static struct oidset excluded_by_config; +static int name_hash_version = 1; + +/** + * Check whether the name_hash_version chosen by user input is appropriate, + * and also validate whether it is compatible with other features. + */ +static void validate_name_hash_version(void) +{ + if (name_hash_version < 1 || name_hash_version > 2) + die(_("invalid --name-hash-version option: %d"), name_hash_version); + if (write_bitmap_index && name_hash_version != 1) { + warning(_("currently, --write-bitmap-index requires --name-hash-version=1")); + name_hash_version = 1; + } +} + +static inline uint32_t pack_name_hash_fn(const char *name) +{ + switch (name_hash_version) { + case 1: + return pack_name_hash(name); + + case 2: + return pack_name_hash_v2((const unsigned char *)name); + + default: + BUG("invalid name-hash version: %d", name_hash_version); + } +} /* * stats @@ -1698,7 +1727,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type, return 0; } - create_object_entry(oid, type, pack_name_hash(name), + create_object_entry(oid, type, pack_name_hash_fn(name), exclude, name && no_try_delta(name), found_pack, found_offset); return 1; @@ -1912,7 +1941,7 @@ static void add_preferred_base_object(const char *name) { struct pbase_tree *it; size_t cmplen; - unsigned hash = pack_name_hash(name); + unsigned hash = pack_name_hash_fn(name); if (!num_preferred_base || check_pbase_path(hash)) return; @@ -3422,7 +3451,7 @@ static void show_object_pack_hint(struct object *object, const char *name, * here using a now in order to perhaps improve the delta selection * process. */ - oe->hash = pack_name_hash(name); + oe->hash = pack_name_hash_fn(name); oe->no_try_delta = name && no_try_delta(name); stdin_packs_hints_nr++; @@ -3572,7 +3601,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type entry = packlist_find(&to_pack, oid); if (entry) { if (name) { - entry->hash = pack_name_hash(name); + entry->hash = pack_name_hash_fn(name); entry->no_try_delta = no_try_delta(name); } } else { @@ -3595,7 +3624,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type return; } - entry = create_object_entry(oid, type, pack_name_hash(name), + entry = create_object_entry(oid, type, pack_name_hash_fn(name), 0, name && no_try_delta(name), pack, offset); } @@ -4069,6 +4098,15 @@ static int get_object_list_from_bitmap(struct rev_info *revs) if (!(bitmap_git = prepare_bitmap_walk(revs, 0))) return -1; + /* + * For now, force the name-hash version to be 1 since that + * is the version implied by the bitmap format. Later, the + * format can include this version explicitly in its format, + * allowing readers to know the version that was used during + * the bitmap write. + */ + name_hash_version = 1; + if (pack_options_allow_reuse()) reuse_partial_packfile_from_bitmap(bitmap_git, &reuse_packfiles, @@ -4429,6 +4467,8 @@ int cmd_pack_objects(int argc, OPT_STRING_LIST(0, "uri-protocol", &uri_protocols, N_("protocol"), N_("exclude any configured uploadpack.blobpackfileuri with this protocol")), + OPT_INTEGER(0, "name-hash-version", &name_hash_version, + N_("use the specified name-hash function to group similar objects")), OPT_END(), }; @@ -4576,6 +4616,8 @@ int cmd_pack_objects(int argc, if (pack_to_stdout || !rev_list_all) write_bitmap_index = 0; + validate_name_hash_version(); + if (use_delta_islands) strvec_push(&rp, "--topo-order"); diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index 3b9dae331a..4270eabe8b 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -674,4 +674,35 @@ do ' done +test_expect_success 'valid and invalid --name-hash-versions' ' + # Valid values are hard to verify other than "do not fail". + # Performance tests will be more valuable to validate these versions. + for value in 1 2 + do + git pack-objects base --all --name-hash-version=$value || return 1 + done && + + # Invalid values have clear post-conditions. + for value in -1 0 3 + do + test_must_fail git pack-objects base --all --name-hash-version=$value 2>err && + test_grep "invalid --name-hash-version option" err || return 1 + done +' + +# The following test is not necessarily a permanent choice, but since we do not +# have a "name hash version" bit in the .bitmap file format, we cannot write the +# hash values into the .bitmap file without risking breakage later. +# +# TODO: Make these compatible in the future and replace this test with the +# expected behavior when both are specified. +test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompatible' ' + git pack-objects base --all --name-hash-version=2 --write-bitmap-index 2>err && + test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err && + + # --stdout option silently removes --write-bitmap-index + git pack-objects --stdout --all --name-hash-version=2 --write-bitmap-index >out 2>err && + ! test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err +' + test_done |
