diff options
| author | John Naylor <john.naylor@postgresql.org> | 2024-02-06 13:09:03 +0700 | 
|---|---|---|
| committer | John Naylor <john.naylor@postgresql.org> | 2024-02-06 14:49:06 +0700 | 
| commit | b83033c3cff556d520281aaec399e47b4f11edbe (patch) | |
| tree | 125f6ad360c22f34f480299c6d9c7ea15b4ee8f5 /src | |
| parent | 9ed3ee5001b6a2d4cb0166eb8f12a457f30aaca4 (diff) | |
Further cosmetic review of hashfn_unstable.h
In follow-up to e97b672c8,
* Flesh out comments explaining the incremental interface
* Clarify detection of zero bytes when hashing aligned C strings
The latter was suggested and reviewed by Jeff Davis
Discussion: https://postgr.es/m/48e8f8bbe0be9c789f98776c7438244ab7a7cc63.camel%40j-davis.com
Diffstat (limited to 'src')
| -rw-r--r-- | src/include/common/hashfn_unstable.h | 76 | 
1 files changed, 49 insertions, 27 deletions
| diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h index 63ebe295c5f..af80e65fef8 100644 --- a/src/include/common/hashfn_unstable.h +++ b/src/include/common/hashfn_unstable.h @@ -53,24 +53,40 @@   * fasthash as implemented here has two interfaces:   *   * 1) Standalone functions, e.g. fasthash32() for a single value with a - * known length. + * known length. These return the same hash code as the original, at + * least on little-endian machines.   *   * 2) Incremental interface. This can used for incorporating multiple - * inputs. The standalone functions use this internally, so see fasthash64() - * for an an example of how this works. - * - * The incremental interface is especially useful if any of the inputs - * are NUL-terminated C strings, since the length is not needed ahead - * of time. This avoids needing to call strlen(). This case is optimized - * in fasthash_accum_cstring() : + * inputs. First, initialize the hash state (here with a zero seed):   *   * fasthash_state hs;   * fasthash_init(&hs, 0); - * len = fasthash_accum_cstring(&hs, *str); + * + * If the inputs are of types that can be trivially cast to uint64, it's + * sufficient to do: + * + * hs.accum = value1; + * fasthash_combine(&hs); + * hs.accum = value2; + * fasthash_combine(&hs);   * ... - * return fasthash_final32(&hs, len);   * - * The length is computed on-the-fly. Experimentation has found that + * For longer or variable-length input, fasthash_accum() is a more + * flexible, but more verbose method. The standalone functions use this + * internally, so see fasthash64() for an an example of this. + * + * After all inputs have been mixed in, finalize the hash: + * + * hashcode = fasthash_final32(&hs, 0); + * + * The incremental interface allows an optimization for NUL-terminated + * C strings: + * + * len = fasthash_accum_cstring(&hs, str); + * hashcode = fasthash_final32(&hs, len); + * + * By handling the terminator on-the-fly, we can avoid needing a strlen() + * call to tell us how many bytes to hash. Experimentation has found that   * SMHasher fails unless we incorporate the length, so it is passed to   * the finalizer as a tweak.   */ @@ -204,26 +220,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)  {  	const char *const start = str;  	int			remainder; -	uint64		zero_bytes_le; +	uint64		zero_byte_low;  	Assert(PointerIsAligned(start, uint64)); + +	/* +	 * For every chunk of input, check for zero bytes before mixing into the +	 * hash. The chunk with zeros must contain the NUL terminator. We arrange +	 * so that zero_byte_low tells us not only that a zero exists, but also +	 * where it is, so we can hash the remainder of the string. +	 * +	 * The haszero64 calculation will set bits corresponding to the lowest +	 * byte where a zero exists, so that suffices for little-endian machines. +	 * For big-endian machines, we would need bits set for the highest zero +	 * byte in the chunk, since the trailing junk past the terminator could +	 * contain additional zeros. haszero64 does not give us that, so we +	 * byteswap the chunk first. +	 */  	for (;;)  	{  		uint64		chunk = *(uint64 *) str; -		/* -		 * With little-endian representation, we can use this calculation, -		 * which sets bits in the first byte in the result word that -		 * corresponds to a zero byte in the original word. The rest of the -		 * bytes are indeterminate, so cannot be used on big-endian machines -		 * without either swapping or a bytewise check. -		 */  #ifdef WORDS_BIGENDIAN -		zero_bytes_le = haszero64(pg_bswap64(chunk)); +		zero_byte_low = haszero64(pg_bswap64(chunk));  #else -		zero_bytes_le = haszero64(chunk); +		zero_byte_low = haszero64(chunk);  #endif -		if (zero_bytes_le) +		if (zero_byte_low)  			break;  		hs->accum = chunk; @@ -232,12 +255,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)  	}  	/* -	 * For the last word, only use bytes up to the NUL for the hash. Bytes -	 * with set bits will be 0x80, so calculate the first occurrence of a zero -	 * byte within the input word by counting the number of trailing (because -	 * little-endian) zeros and dividing the result by 8. +	 * The byte corresponding to the NUL will be 0x80, so the rightmost bit +	 * position will be in the range 7, 15, ..., 63. Turn this into byte +	 * position by dividing by 8.  	 */ -	remainder = pg_rightmost_one_pos64(zero_bytes_le) / BITS_PER_BYTE; +	remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;  	fasthash_accum(hs, str, remainder);  	str += remainder; | 
