Diffstat (limited to 'reftable')
-rw-r--r--  reftable/LICENSE                    31
-rw-r--r--  reftable/basics.c                  267
-rw-r--r--  reftable/basics.h                  290
-rw-r--r--  reftable/block.c                   655
-rw-r--r--  reftable/block.h                   115
-rw-r--r--  reftable/blocksource.c             179
-rw-r--r--  reftable/blocksource.h              46
-rw-r--r--  reftable/constants.h                18
-rw-r--r--  reftable/error.c                    46
-rw-r--r--  reftable/iter.c                    302
-rw-r--r--  reftable/iter.h                     89
-rw-r--r--  reftable/merged.c                  316
-rw-r--r--  reftable/merged.h                   34
-rw-r--r--  reftable/pq.c                       95
-rw-r--r--  reftable/pq.h                       40
-rw-r--r--  reftable/record.c                 1322
-rw-r--r--  reftable/record.h                  164
-rw-r--r--  reftable/reftable-basics.h          39
-rw-r--r--  reftable/reftable-block.h           75
-rw-r--r--  reftable/reftable-blocksource.h     53
-rw-r--r--  reftable/reftable-constants.h       18
-rw-r--r--  reftable/reftable-error.h           70
-rw-r--r--  reftable/reftable-iterator.h        60
-rw-r--r--  reftable/reftable-merged.h          61
-rw-r--r--  reftable/reftable-record.h         110
-rw-r--r--  reftable/reftable-stack.h          155
-rw-r--r--  reftable/reftable-table.h          115
-rw-r--r--  reftable/reftable-writer.h         189
-rw-r--r--  reftable/stack.c                  1841
-rw-r--r--  reftable/stack.h                    41
-rw-r--r--  reftable/system.c                  133
-rw-r--r--  reftable/system.h                  109
-rw-r--r--  reftable/table.c                   779
-rw-r--r--  reftable/table.h                    29
-rw-r--r--  reftable/tree.c                     74
-rw-r--r--  reftable/tree.h                     45
-rw-r--r--  reftable/writer.c                  884
-rw-r--r--  reftable/writer.h                   53
38 files changed, 8942 insertions, 0 deletions
diff --git a/reftable/LICENSE b/reftable/LICENSE
new file mode 100644
index 0000000000..402e0f9356
--- /dev/null
+++ b/reftable/LICENSE
@@ -0,0 +1,31 @@
+BSD License
+
+Copyright (c) 2020, Google LLC
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of Google LLC nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/reftable/basics.c b/reftable/basics.c
new file mode 100644
index 0000000000..9988ebd635
--- /dev/null
+++ b/reftable/basics.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#define REFTABLE_ALLOW_BANNED_ALLOCATORS
+#include "basics.h"
+#include "reftable-basics.h"
+#include "reftable-error.h"
+
+static void *(*reftable_malloc_ptr)(size_t sz);
+static void *(*reftable_realloc_ptr)(void *, size_t);
+static void (*reftable_free_ptr)(void *);
+
+void *reftable_malloc(size_t sz)
+{
+ if (!sz)
+ return NULL;
+ if (reftable_malloc_ptr)
+ return (*reftable_malloc_ptr)(sz);
+ return malloc(sz);
+}
+
+void *reftable_realloc(void *p, size_t sz)
+{
+ if (!sz) {
+ reftable_free(p);
+ return NULL;
+ }
+
+ if (reftable_realloc_ptr)
+ return (*reftable_realloc_ptr)(p, sz);
+ return realloc(p, sz);
+}
+
+void reftable_free(void *p)
+{
+ if (reftable_free_ptr)
+ reftable_free_ptr(p);
+ else
+ free(p);
+}
+
+void *reftable_calloc(size_t nelem, size_t elsize)
+{
+ void *p;
+
+ if (nelem && elsize > SIZE_MAX / nelem)
+ return NULL;
+
+ p = reftable_malloc(nelem * elsize);
+ if (!p)
+ return NULL;
+
+ memset(p, 0, nelem * elsize);
+ return p;
+}
+
+char *reftable_strdup(const char *str)
+{
+ size_t len = strlen(str);
+ char *result = reftable_malloc(len + 1);
+ if (!result)
+ return NULL;
+ memcpy(result, str, len + 1);
+ return result;
+}
+
+void reftable_set_alloc(void *(*malloc)(size_t),
+ void *(*realloc)(void *, size_t), void (*free)(void *))
+{
+ reftable_malloc_ptr = malloc;
+ reftable_realloc_ptr = realloc;
+ reftable_free_ptr = free;
+}
+
+void reftable_buf_init(struct reftable_buf *buf)
+{
+ struct reftable_buf empty = REFTABLE_BUF_INIT;
+ *buf = empty;
+}
+
+void reftable_buf_release(struct reftable_buf *buf)
+{
+ reftable_free(buf->buf);
+ reftable_buf_init(buf);
+}
+
+void reftable_buf_reset(struct reftable_buf *buf)
+{
+ if (buf->alloc) {
+ buf->len = 0;
+ buf->buf[0] = '\0';
+ }
+}
+
+int reftable_buf_setlen(struct reftable_buf *buf, size_t len)
+{
+ if (len > buf->len)
+ return -1;
+ if (len == buf->len)
+ return 0;
+ buf->buf[len] = '\0';
+ buf->len = len;
+ return 0;
+}
+
+int reftable_buf_cmp(const struct reftable_buf *a, const struct reftable_buf *b)
+{
+ size_t len = a->len < b->len ? a->len : b->len;
+ if (len) {
+ int cmp = memcmp(a->buf, b->buf, len);
+ if (cmp)
+ return cmp;
+ }
+ return a->len < b->len ? -1 : a->len != b->len;
+}
+
+int reftable_buf_add(struct reftable_buf *buf, const void *data, size_t len)
+{
+ size_t newlen = buf->len + len;
+
+ if (newlen + 1 > buf->alloc) {
+ if (REFTABLE_ALLOC_GROW(buf->buf, newlen + 1, buf->alloc))
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ }
+
+ memcpy(buf->buf + buf->len, data, len);
+ buf->buf[newlen] = '\0';
+ buf->len = newlen;
+
+ return 0;
+}
+
+int reftable_buf_addstr(struct reftable_buf *buf, const char *s)
+{
+ return reftable_buf_add(buf, s, strlen(s));
+}
+
+char *reftable_buf_detach(struct reftable_buf *buf)
+{
+ char *result = buf->buf;
+ reftable_buf_init(buf);
+ return result;
+}
+
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
+{
+ size_t lo = 0;
+ size_t hi = sz;
+
+ /* Invariants:
+ *
+ * (hi == sz) || f(hi) == true
+ * (lo == 0 && f(0) == true) || f(lo) == false
+ */
+ while (hi - lo > 1) {
+ size_t mid = lo + (hi - lo) / 2;
+ int ret = f(mid, args);
+ if (ret < 0)
+ return sz;
+
+ if (ret > 0)
+ hi = mid;
+ else
+ lo = mid;
+ }
+
+ if (lo)
+ return hi;
+
+ return f(0, args) ? 0 : 1;
+}
+
+void free_names(char **a)
+{
+ char **p;
+ if (!a) {
+ return;
+ }
+ for (p = a; *p; p++) {
+ reftable_free(*p);
+ }
+ reftable_free(a);
+}
+
+size_t names_length(const char **names)
+{
+ const char **p = names;
+ while (*p)
+ p++;
+ return p - names;
+}
+
+char **parse_names(char *buf, int size)
+{
+ char **names = NULL;
+ size_t names_cap = 0;
+ size_t names_len = 0;
+ char *p = buf;
+ char *end = buf + size;
+
+ while (p < end) {
+ char *next = strchr(p, '\n');
+ if (next && next < end) {
+ *next = 0;
+ } else {
+ next = end;
+ }
+ if (p < next) {
+ if (REFTABLE_ALLOC_GROW(names, names_len + 1,
+ names_cap))
+ goto err;
+
+ names[names_len] = reftable_strdup(p);
+ if (!names[names_len++])
+ goto err;
+ }
+ p = next + 1;
+ }
+
+ if (REFTABLE_ALLOC_GROW(names, names_len + 1, names_cap))
+ goto err;
+ names[names_len] = NULL;
+
+ return names;
+
+err:
+ for (size_t i = 0; i < names_len; i++)
+ reftable_free(names[i]);
+ reftable_free(names);
+ return NULL;
+}
+
+int names_equal(const char **a, const char **b)
+{
+ size_t i = 0;
+ for (; a[i] && b[i]; i++)
+ if (strcmp(a[i], b[i]))
+ return 0;
+ return a[i] == b[i];
+}
+
+size_t common_prefix_size(struct reftable_buf *a, struct reftable_buf *b)
+{
+ size_t p = 0;
+ for (; p < a->len && p < b->len; p++)
+ if (a->buf[p] != b->buf[p])
+ break;
+ return p;
+}
+
+uint32_t hash_size(enum reftable_hash id)
+{
+ if (!id)
+ return REFTABLE_HASH_SIZE_SHA1;
+ switch (id) {
+ case REFTABLE_HASH_SHA1:
+ return REFTABLE_HASH_SIZE_SHA1;
+ case REFTABLE_HASH_SHA256:
+ return REFTABLE_HASH_SIZE_SHA256;
+ }
+ abort();
+}
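
A quick illustration of the binsearch() contract above (a hypothetical caller,
not part of the patch): the callback acts as an ascending predicate, and
binsearch() returns the smallest index for which it yields a positive value.

	struct first_geq_args {
		const int *values;
		int needle;
	};

	/* 0 while values[k] < needle, 1 from the first match onwards. */
	static int first_geq(size_t k, void *arg)
	{
		struct first_geq_args *args = arg;
		return args->values[k] >= args->needle;
	}

	/* With values = {1, 3, 5, 7} and needle = 4, binsearch(4, &first_geq,
	 * &args) returns 2: index 2 holds the first value satisfying the
	 * predicate. */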
diff --git a/reftable/basics.h b/reftable/basics.h
new file mode 100644
index 0000000000..7d22f96261
--- /dev/null
+++ b/reftable/basics.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef BASICS_H
+#define BASICS_H
+
+/*
+ * miscellaneous utilities that are not provided by Git.
+ */
+
+#include "system.h"
+#include "reftable-basics.h"
+
+#ifdef __GNUC__
+#define REFTABLE_UNUSED __attribute__((__unused__))
+#else
+#define REFTABLE_UNUSED
+#endif
+
+/*
+ * Initialize the buffer such that it is ready for use. This is equivalent to
+ * using REFTABLE_BUF_INIT for stack-allocated variables.
+ */
+void reftable_buf_init(struct reftable_buf *buf);
+
+/*
+ * Release memory associated with the buffer. The buffer is reinitialized such
+ * that it can be reused for subsequent operations.
+ */
+void reftable_buf_release(struct reftable_buf *buf);
+
+/*
+ * Reset the buffer such that it is effectively empty, without releasing the
+ * memory that this structure holds on to. This is equivalent to calling
+ * `reftable_buf_setlen(buf, 0)`.
+ */
+void reftable_buf_reset(struct reftable_buf *buf);
+
+/*
+ * Trim the buffer to a shorter length by updating the `len` member and writing
+ * a NUL byte to `buf[len]`. Returns 0 on success, -1 when `len` points outside
+ * of the array.
+ */
+int reftable_buf_setlen(struct reftable_buf *buf, size_t len);
+
+/*
+ * Lexicographically compare the two buffers. Returns 0 when both buffers have
+ * the same contents, -1 when `a` is lexicographically smaller than `b`, and 1
+ * otherwise.
+ */
+int reftable_buf_cmp(const struct reftable_buf *a, const struct reftable_buf *b);
+
+/*
+ * Append `len` bytes from `data` to the buffer. This function works with
+ * arbitrary byte sequences, including ones that contain embedded NUL
+ * characters. As such, we use `void *` as input type. Returns 0 on success,
+ * REFTABLE_OUT_OF_MEMORY_ERROR on allocation failure.
+ */
+int reftable_buf_add(struct reftable_buf *buf, const void *data, size_t len);
+
+/* Equivalent to `reftable_buf_add(buf, s, strlen(s))`. */
+int reftable_buf_addstr(struct reftable_buf *buf, const char *s);
+
+/*
+ * Detach the buffer from the structure such that the underlying memory is now
+ * owned by the caller. The buffer is reinitialized such that it can be reused
+ * for subsequent operations.
+ */
+char *reftable_buf_detach(struct reftable_buf *buf);
+
+/* Bigendian en/decoding of integers */
+
+static inline void reftable_put_be16(void *out, uint16_t i)
+{
+ unsigned char *p = out;
+ p[0] = (uint8_t)((i >> 8) & 0xff);
+ p[1] = (uint8_t)((i >> 0) & 0xff);
+}
+
+static inline void reftable_put_be24(void *out, uint32_t i)
+{
+ unsigned char *p = out;
+ p[0] = (uint8_t)((i >> 16) & 0xff);
+ p[1] = (uint8_t)((i >> 8) & 0xff);
+ p[2] = (uint8_t)((i >> 0) & 0xff);
+}
+
+static inline void reftable_put_be32(void *out, uint32_t i)
+{
+ unsigned char *p = out;
+ p[0] = (uint8_t)((i >> 24) & 0xff);
+ p[1] = (uint8_t)((i >> 16) & 0xff);
+ p[2] = (uint8_t)((i >> 8) & 0xff);
+ p[3] = (uint8_t)((i >> 0) & 0xff);
+}
+
+static inline void reftable_put_be64(void *out, uint64_t i)
+{
+ unsigned char *p = out;
+ p[0] = (uint8_t)((i >> 56) & 0xff);
+ p[1] = (uint8_t)((i >> 48) & 0xff);
+ p[2] = (uint8_t)((i >> 40) & 0xff);
+ p[3] = (uint8_t)((i >> 32) & 0xff);
+ p[4] = (uint8_t)((i >> 24) & 0xff);
+ p[5] = (uint8_t)((i >> 16) & 0xff);
+ p[6] = (uint8_t)((i >> 8) & 0xff);
+ p[7] = (uint8_t)((i >> 0) & 0xff);
+}
+
+static inline uint16_t reftable_get_be16(const void *in)
+{
+ const unsigned char *p = in;
+ return (uint16_t)(p[0]) << 8 |
+ (uint16_t)(p[1]) << 0;
+}
+
+static inline uint32_t reftable_get_be24(const void *in)
+{
+ const unsigned char *p = in;
+ return (uint32_t)(p[0]) << 16 |
+ (uint32_t)(p[1]) << 8 |
+ (uint32_t)(p[2]) << 0;
+}
+
+static inline uint32_t reftable_get_be32(const void *in)
+{
+ const unsigned char *p = in;
+ return (uint32_t)(p[0]) << 24 |
+ (uint32_t)(p[1]) << 16 |
+ (uint32_t)(p[2]) << 8 |
+ (uint32_t)(p[3]) << 0;
+}
+
+static inline uint64_t reftable_get_be64(const void *in)
+{
+ const unsigned char *p = in;
+ return (uint64_t)(p[0]) << 56 |
+ (uint64_t)(p[1]) << 48 |
+ (uint64_t)(p[2]) << 40 |
+ (uint64_t)(p[3]) << 32 |
+ (uint64_t)(p[4]) << 24 |
+ (uint64_t)(p[5]) << 16 |
+ (uint64_t)(p[6]) << 8 |
+ (uint64_t)(p[7]) << 0;
+}
+
+/*
+ * find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is
+ * ascending. Return sz if `f(i) == 0` for all indices. The search is aborted
+ * and `sz` is returned in case `f(i) < 0`.
+ *
+ * Contrary to bsearch(3), this returns something useful if the argument is not
+ * found.
+ */
+size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
+
+/*
+ * Frees a NULL terminated array of malloced strings. The array itself is also
+ * freed.
+ */
+void free_names(char **a);
+
+/*
+ * Parse a newline separated list of names. `size` is the length of the buffer,
+ * without terminating '\0'. Empty names are discarded. Returns a `NULL`
+ * pointer when allocations fail.
+ */
+char **parse_names(char *buf, int size);
+
+/* compares two NULL-terminated arrays of strings. */
+int names_equal(const char **a, const char **b);
+
+/* returns the array size of a NULL-terminated array of strings. */
+size_t names_length(const char **names);
+
+/* Allocation routines; they invoke the functions set through
+ * reftable_set_alloc() */
+void *reftable_malloc(size_t sz);
+void *reftable_realloc(void *p, size_t sz);
+void reftable_free(void *p);
+void *reftable_calloc(size_t nelem, size_t elsize);
+char *reftable_strdup(const char *str);
+
+static inline int reftable_alloc_size(size_t nelem, size_t elsize, size_t *out)
+{
+ if (nelem && elsize > SIZE_MAX / nelem)
+ return -1;
+ *out = nelem * elsize;
+ return 0;
+}
+
+#define REFTABLE_ALLOC_ARRAY(x, alloc) do { \
+ size_t alloc_size; \
+ if (reftable_alloc_size(sizeof(*(x)), (alloc), &alloc_size) < 0) { \
+ errno = ENOMEM; \
+ (x) = NULL; \
+ } else { \
+ (x) = reftable_malloc(alloc_size); \
+ } \
+ } while (0)
+#define REFTABLE_CALLOC_ARRAY(x, alloc) (x) = reftable_calloc((alloc), sizeof(*(x)))
+#define REFTABLE_REALLOC_ARRAY(x, alloc) do { \
+ size_t alloc_size; \
+ if (reftable_alloc_size(sizeof(*(x)), (alloc), &alloc_size) < 0) { \
+ errno = ENOMEM; \
+ (x) = NULL; \
+ } else { \
+ (x) = reftable_realloc((x), alloc_size); \
+ } \
+ } while (0)
+
+static inline void *reftable_alloc_grow(void *p, size_t nelem, size_t elsize,
+ size_t *allocp)
+{
+ void *new_p;
+ size_t alloc = *allocp * 2 + 1, alloc_bytes;
+ if (alloc < nelem)
+ alloc = nelem;
+ if (reftable_alloc_size(elsize, alloc, &alloc_bytes) < 0) {
+ errno = ENOMEM;
+ return p;
+ }
+ new_p = reftable_realloc(p, alloc_bytes);
+ if (!new_p)
+ return p;
+ *allocp = alloc;
+ return new_p;
+}
+
+#define REFTABLE_ALLOC_GROW(x, nr, alloc) ( \
+ (nr) > (alloc) && ( \
+ (x) = reftable_alloc_grow((x), (nr), sizeof(*(x)), &(alloc)), \
+ (nr) > (alloc) \
+ ) \
+)
+
+#define REFTABLE_ALLOC_GROW_OR_NULL(x, nr, alloc) do { \
+ size_t reftable_alloc_grow_or_null_alloc = alloc; \
+ if (REFTABLE_ALLOC_GROW((x), (nr), reftable_alloc_grow_or_null_alloc)) { \
+ REFTABLE_FREE_AND_NULL(x); \
+ alloc = 0; \
+ } else { \
+ alloc = reftable_alloc_grow_or_null_alloc; \
+ } \
+} while (0)
+
+#define REFTABLE_FREE_AND_NULL(p) do { reftable_free(p); (p) = NULL; } while (0)
+
+#ifndef REFTABLE_ALLOW_BANNED_ALLOCATORS
+# define REFTABLE_BANNED(func) use_reftable_##func##_instead
+# undef malloc
+# define malloc(sz) REFTABLE_BANNED(malloc)
+# undef realloc
+# define realloc(ptr, sz) REFTABLE_BANNED(realloc)
+# undef free
+# define free(ptr) REFTABLE_BANNED(free)
+# undef calloc
+# define calloc(nelem, elsize) REFTABLE_BANNED(calloc)
+# undef strdup
+# define strdup(str) REFTABLE_BANNED(strdup)
+#endif
+
+#define REFTABLE_SWAP(a, b) do { \
+ void *_swap_a_ptr = &(a); \
+ void *_swap_b_ptr = &(b); \
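+ /* Mismatched operand sizes make the array size negative: a compile-time error. */ \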
+ unsigned char _swap_buffer[sizeof(a) - 2 * sizeof(a) * (sizeof(a) != sizeof(b))]; \
+ memcpy(_swap_buffer, _swap_a_ptr, sizeof(a)); \
+ memcpy(_swap_a_ptr, _swap_b_ptr, sizeof(a)); \
+ memcpy(_swap_b_ptr, _swap_buffer, sizeof(a)); \
+} while (0)
+
+/* Find the longest shared prefix size of `a` and `b` */
+size_t common_prefix_size(struct reftable_buf *a, struct reftable_buf *b);
+
+uint32_t hash_size(enum reftable_hash id);
+
+/*
+ * Format IDs that identify the hash function used by a reftable. Note that
+ * these constants end up on disk and thus mustn't change. The format IDs are
+ * "sha1" and "s256" in big endian, respectively.
+ */
+#define REFTABLE_FORMAT_ID_SHA1 ((uint32_t) 0x73686131)
+#define REFTABLE_FORMAT_ID_SHA256 ((uint32_t) 0x73323536)
+
+#endif
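
A round-trip sketch for the fixed-width big-endian helpers declared above
(illustrative only): the 24-bit variant is what the format uses for restart
offsets and block lengths.

	uint8_t buf[3];
	reftable_put_be24(buf, 0xABCDEF);
	/* buf now holds 0xAB 0xCD 0xEF, most significant byte first. */
	assert(reftable_get_be24(buf) == 0xABCDEF);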
diff --git a/reftable/block.c b/reftable/block.c
new file mode 100644
index 0000000000..920b3f4486
--- /dev/null
+++ b/reftable/block.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "block.h"
+
+#include "blocksource.h"
+#include "constants.h"
+#include "iter.h"
+#include "record.h"
+#include "reftable-error.h"
+#include "system.h"
+
+size_t header_size(int version)
+{
+ switch (version) {
+ case 1:
+ return 24;
+ case 2:
+ return 28;
+ }
+ abort();
+}
+
+size_t footer_size(int version)
+{
+ switch (version) {
+ case 1:
+ return 68;
+ case 2:
+ return 72;
+ }
+ abort();
+}
+
+static int block_writer_register_restart(struct block_writer *w, int n,
+ int is_restart, struct reftable_buf *key)
+{
+ uint32_t rlen;
+ int err;
+
+ rlen = w->restart_len;
+ if (rlen >= MAX_RESTARTS)
+ is_restart = 0;
+
+ if (is_restart)
+ rlen++;
+ if (2 + 3 * rlen + n > w->block_size - w->next)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ if (is_restart) {
+ REFTABLE_ALLOC_GROW_OR_NULL(w->restarts, w->restart_len + 1,
+ w->restart_cap);
+ if (!w->restarts)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ w->restarts[w->restart_len++] = w->next;
+ }
+
+ w->next += n;
+
+ reftable_buf_reset(&w->last_key);
+ err = reftable_buf_add(&w->last_key, key->buf, key->len);
+ if (err < 0)
+ return err;
+
+ w->entries++;
+ return 0;
+}
+
+int block_writer_init(struct block_writer *bw, uint8_t typ, uint8_t *block,
+ uint32_t block_size, uint32_t header_off, uint32_t hash_size)
+{
+ bw->block = block;
+ bw->hash_size = hash_size;
+ bw->block_size = block_size;
+ bw->header_off = header_off;
+ bw->block[header_off] = typ;
+ bw->next = header_off + 4;
+ bw->restart_interval = 16;
+ bw->entries = 0;
+ bw->restart_len = 0;
+ bw->last_key.len = 0;
+ if (!bw->zstream) {
+ REFTABLE_CALLOC_ARRAY(bw->zstream, 1);
+ if (!bw->zstream)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ deflateInit(bw->zstream, 9);
+ }
+
+ return 0;
+}
+
+uint8_t block_writer_type(struct block_writer *bw)
+{
+ return bw->block[bw->header_off];
+}
+
+/*
+ * Adds the reftable_record to the block. Returns 0 on success and
+ * appropriate error codes on failure.
+ */
+int block_writer_add(struct block_writer *w, struct reftable_record *rec)
+{
+ struct reftable_buf empty = REFTABLE_BUF_INIT;
+ struct reftable_buf last =
+ w->entries % w->restart_interval == 0 ? empty : w->last_key;
+ struct string_view out = {
+ .buf = w->block + w->next,
+ .len = w->block_size - w->next,
+ };
+ struct string_view start = out;
+ int is_restart = 0;
+ int n = 0;
+ int err;
+
+ err = reftable_record_key(rec, &w->scratch);
+ if (err < 0)
+ goto done;
+
+ if (!w->scratch.len) {
+ err = REFTABLE_API_ERROR;
+ goto done;
+ }
+
+ n = reftable_encode_key(&is_restart, out, last, w->scratch,
+ reftable_record_val_type(rec));
+ if (n < 0) {
+ err = n;
+ goto done;
+ }
+ string_view_consume(&out, n);
+
+ n = reftable_record_encode(rec, out, w->hash_size);
+ if (n < 0) {
+ err = n;
+ goto done;
+ }
+ string_view_consume(&out, n);
+
+ err = block_writer_register_restart(w, start.len - out.len, is_restart,
+ &w->scratch);
+done:
+ return err;
+}
+
+int block_writer_finish(struct block_writer *w)
+{
+ for (uint32_t i = 0; i < w->restart_len; i++) {
+ reftable_put_be24(w->block + w->next, w->restarts[i]);
+ w->next += 3;
+ }
+
+ reftable_put_be16(w->block + w->next, w->restart_len);
+ w->next += 2;
+ reftable_put_be24(w->block + 1 + w->header_off, w->next);
+
+ /*
+ * Log records are stored zlib-compressed. Note that the compression
+ * also spans over the restart points we have just written.
+ */
+ if (block_writer_type(w) == REFTABLE_BLOCK_TYPE_LOG) {
+ int block_header_skip = 4 + w->header_off;
+ uLongf src_len = w->next - block_header_skip, compressed_len;
+ int ret;
+
+ ret = deflateReset(w->zstream);
+ if (ret != Z_OK)
+ return REFTABLE_ZLIB_ERROR;
+
+ /*
+ * Precompute the upper bound of how many bytes the compressed
+ * data may end up with. Combined with `Z_FINISH`, `deflate()`
+ * is guaranteed to return `Z_STREAM_END`.
+ */
+ compressed_len = deflateBound(w->zstream, src_len);
+ REFTABLE_ALLOC_GROW_OR_NULL(w->compressed, compressed_len,
+ w->compressed_cap);
+ if (!w->compressed) {
+ ret = REFTABLE_OUT_OF_MEMORY_ERROR;
+ return ret;
+ }
+
+ w->zstream->next_out = w->compressed;
+ w->zstream->avail_out = compressed_len;
+ w->zstream->next_in = w->block + block_header_skip;
+ w->zstream->avail_in = src_len;
+
+ /*
+ * We want to perform all compression in a single step, which
+ * is why we can pass Z_FINISH here. As we have precomputed the
+ * deflated buffer's size via `deflateBound()` this function is
+ * guaranteed to succeed according to the zlib documentation.
+ */
+ ret = deflate(w->zstream, Z_FINISH);
+ if (ret != Z_STREAM_END)
+ return REFTABLE_ZLIB_ERROR;
+
+ /*
+ * Overwrite the uncompressed data we have already written and
+ * adjust the `next` pointer to point right after the
+ * compressed data.
+ */
+ memcpy(w->block + block_header_skip, w->compressed,
+ w->zstream->total_out);
+ w->next = w->zstream->total_out + block_header_skip;
+ }
+
+ return w->next;
+}
+
+static int read_block(struct reftable_block_source *source,
+ struct reftable_block_data *dest, uint64_t off,
+ uint32_t sz)
+{
+ size_t size = block_source_size(source);
+ block_source_release_data(dest);
+ if (off >= size)
+ return 0;
+ if (off + sz > size)
+ sz = size - off;
+ return block_source_read_data(source, dest, off, sz);
+}
+
+int reftable_block_init(struct reftable_block *block,
+ struct reftable_block_source *source,
+ uint32_t offset, uint32_t header_size,
+ uint32_t table_block_size, uint32_t hash_size,
+ uint8_t want_type)
+{
+ uint32_t guess_block_size = table_block_size ?
+ table_block_size : DEFAULT_BLOCK_SIZE;
+ uint32_t full_block_size = table_block_size;
+ uint16_t restart_count;
+ uint32_t restart_off;
+ uint32_t block_size;
+ uint8_t block_type;
+ int err;
+
+ err = read_block(source, &block->block_data, offset, guess_block_size);
+ if (err < 0)
+ goto done;
+
+ block_type = block->block_data.data[header_size];
+ if (!reftable_is_block_type(block_type)) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ if (want_type != REFTABLE_BLOCK_TYPE_ANY && block_type != want_type) {
+ err = 1;
+ goto done;
+ }
+
+ block_size = reftable_get_be24(block->block_data.data + header_size + 1);
+ if (block_size > guess_block_size) {
+ err = read_block(source, &block->block_data, offset, block_size);
+ if (err < 0)
+ goto done;
+ }
+
+ if (block_type == REFTABLE_BLOCK_TYPE_LOG) {
+ uint32_t block_header_skip = 4 + header_size;
+ uLong dst_len = block_size - block_header_skip;
+ uLong src_len = block->block_data.len - block_header_skip;
+
+ /* Log blocks specify the *uncompressed* size in their header. */
+ REFTABLE_ALLOC_GROW_OR_NULL(block->uncompressed_data, block_size,
+ block->uncompressed_cap);
+ if (!block->uncompressed_data) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ /* Copy over the block header verbatim. It's not compressed. */
+ memcpy(block->uncompressed_data, block->block_data.data, block_header_skip);
+
+ if (!block->zstream) {
+ REFTABLE_CALLOC_ARRAY(block->zstream, 1);
+ if (!block->zstream) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ err = inflateInit(block->zstream);
+ } else {
+ err = inflateReset(block->zstream);
+ }
+ if (err != Z_OK) {
+ err = REFTABLE_ZLIB_ERROR;
+ goto done;
+ }
+
+ block->zstream->next_in = block->block_data.data + block_header_skip;
+ block->zstream->avail_in = src_len;
+ block->zstream->next_out = block->uncompressed_data + block_header_skip;
+ block->zstream->avail_out = dst_len;
+
+ /*
+ * We know both input as well as output size, and we know that
+ * the sizes should never be bigger than `uInt_MAX` because
+ * blocks can at most be 16MB large. We can thus use `Z_FINISH`
+ * here to instruct zlib to inflate the data in one go, which
+ * is more efficient than using `Z_NO_FLUSH`.
+ */
+ err = inflate(block->zstream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ err = REFTABLE_ZLIB_ERROR;
+ goto done;
+ }
+ err = 0;
+
+ if (block->zstream->total_out + block_header_skip != block_size) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ /* We're done with the input data. */
+ block_source_release_data(&block->block_data);
+ block->block_data.data = block->uncompressed_data;
+ block->block_data.len = block_size;
+ full_block_size = src_len + block_header_skip - block->zstream->avail_in;
+ } else if (full_block_size == 0) {
+ full_block_size = block_size;
+ } else if (block_size < full_block_size && block_size < block->block_data.len &&
+ block->block_data.data[block_size] != 0) {
+ /* If the block is smaller than the full block size, it is
+ padded (data followed by '\0') or the next block is
+ unaligned. */
+ full_block_size = block_size;
+ }
+
+ restart_count = reftable_get_be16(block->block_data.data + block_size - 2);
+ restart_off = block_size - 2 - 3 * restart_count;
+
+ block->block_type = block_type;
+ block->hash_size = hash_size;
+ block->restart_off = restart_off;
+ block->full_block_size = full_block_size;
+ block->header_off = header_size;
+ block->restart_count = restart_count;
+
+ err = 0;
+
+done:
+ if (err < 0)
+ reftable_block_release(block);
+ return err;
+}
+
+void reftable_block_release(struct reftable_block *block)
+{
+ inflateEnd(block->zstream);
+ reftable_free(block->zstream);
+ reftable_free(block->uncompressed_data);
+ block_source_release_data(&block->block_data);
+ memset(block, 0, sizeof(*block));
+}
+
+uint8_t reftable_block_type(const struct reftable_block *b)
+{
+ return b->block_data.data[b->header_off];
+}
+
+int reftable_block_first_key(const struct reftable_block *block, struct reftable_buf *key)
+{
+ int off = block->header_off + 4, n;
+ struct string_view in = {
+ .buf = block->block_data.data + off,
+ .len = block->restart_off - off,
+ };
+ uint8_t extra = 0;
+
+ reftable_buf_reset(key);
+
+ n = reftable_decode_key(key, &extra, in);
+ if (n < 0)
+ return n;
+ if (!key->len)
+ return REFTABLE_FORMAT_ERROR;
+
+ return 0;
+}
+
+static uint32_t block_restart_offset(const struct reftable_block *b, size_t idx)
+{
+ return reftable_get_be24(b->block_data.data + b->restart_off + 3 * idx);
+}
+
+void block_iter_init(struct block_iter *it, const struct reftable_block *block)
+{
+ it->block = block;
+ block_iter_seek_start(it);
+}
+
+void block_iter_seek_start(struct block_iter *it)
+{
+ reftable_buf_reset(&it->last_key);
+ it->next_off = it->block->header_off + 4;
+}
+
+struct restart_needle_less_args {
+ int error;
+ struct reftable_buf needle;
+ const struct reftable_block *block;
+};
+
+static int restart_needle_less(size_t idx, void *_args)
+{
+ struct restart_needle_less_args *args = _args;
+ uint32_t off = block_restart_offset(args->block, idx);
+ struct string_view in = {
+ .buf = args->block->block_data.data + off,
+ .len = args->block->restart_off - off,
+ };
+ uint64_t prefix_len, suffix_len;
+ uint8_t extra;
+ int n;
+
+ /*
+ * Records at restart points are stored without prefix compression, so
+ * there is no need to fully decode the record key here. This removes
+ * the need for allocating memory.
+ */
+ n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
+ if (n < 0 || prefix_len) {
+ args->error = 1;
+ return -1;
+ }
+
+ string_view_consume(&in, n);
+ if (suffix_len > in.len) {
+ args->error = 1;
+ return -1;
+ }
+
+ n = memcmp(args->needle.buf, in.buf,
+ args->needle.len < suffix_len ? args->needle.len : suffix_len);
+ if (n)
+ return n < 0;
+ return args->needle.len < suffix_len;
+}
+
+int block_iter_next(struct block_iter *it, struct reftable_record *rec)
+{
+ struct string_view in = {
+ .buf = (unsigned char *) it->block->block_data.data + it->next_off,
+ .len = it->block->restart_off - it->next_off,
+ };
+ struct string_view start = in;
+ uint8_t extra = 0;
+ int n = 0;
+
+ if (it->next_off >= it->block->restart_off)
+ return 1;
+
+ n = reftable_decode_key(&it->last_key, &extra, in);
+ if (n < 0)
+ return -1;
+ if (!it->last_key.len)
+ return REFTABLE_FORMAT_ERROR;
+
+ string_view_consume(&in, n);
+ n = reftable_record_decode(rec, it->last_key, extra, in, it->block->hash_size,
+ &it->scratch);
+ if (n < 0)
+ return -1;
+ string_view_consume(&in, n);
+
+ it->next_off += start.len - in.len;
+ return 0;
+}
+
+void block_iter_reset(struct block_iter *it)
+{
+ reftable_buf_reset(&it->last_key);
+ it->next_off = 0;
+ it->block = NULL;
+}
+
+void block_iter_close(struct block_iter *it)
+{
+ reftable_buf_release(&it->last_key);
+ reftable_buf_release(&it->scratch);
+}
+
+int block_iter_seek_key(struct block_iter *it, struct reftable_buf *want)
+{
+ struct restart_needle_less_args args = {
+ .needle = *want,
+ .block = it->block,
+ };
+ struct reftable_record rec;
+ int err = 0;
+ size_t i;
+
+ /*
+ * Perform a binary search over the block's restart points, which
+ * avoids doing a linear scan over the whole block. This way, we
+ * identify the section of the block that should contain our key.
+ *
+ * Note that we explicitly search for the first restart point _greater_
+ * than the sought-after record, not _greater or equal_ to it. In case
+ * the sought-after record is located directly at the restart point we
+ * would otherwise start doing the linear search at the preceding
+ * restart point. While that works alright, we would end up scanning
+ * too many records.
+ */
+ i = binsearch(it->block->restart_count, &restart_needle_less, &args);
+ if (args.error) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ /*
+ * Now there are multiple cases:
+ *
+ * - `i == 0`: The wanted record is smaller than the record found at
+ * the first restart point. As the first restart point is the first
+ * record in the block, our wanted record cannot be located in this
+ * block at all. We still need to position the iterator so that the
+ * next call to `block_iter_next()` will yield an end-of-iterator
+ * signal.
+ *
+ * - `i == restart_count`: The wanted record was not found at any of
+ * the restart points. As there is no restart point at the end of
+ * the section, the record may thus be contained in the last section.
+ *
+ * - `i > 0`: The wanted record must be contained in the section
+ * before the found restart point. We thus do a linear search
+ * starting from the preceding restart point.
+ */
+ if (i > 0)
+ it->next_off = block_restart_offset(it->block, i - 1);
+ else
+ it->next_off = it->block->header_off + 4;
+
+ err = reftable_record_init(&rec, reftable_block_type(it->block));
+ if (err < 0)
+ goto done;
+
+ /*
+ * We're looking for the last entry less than the wanted key so that
+ * the next call to `block_iter_next()` would yield the wanted
+ * record. We thus don't want to position our iterator at the sought
+ * after record, but one before. To do so, we have to go one entry too
+ * far and then back up.
+ */
+ while (1) {
+ size_t prev_off = it->next_off;
+
+ err = block_iter_next(it, &rec);
+ if (err < 0)
+ goto done;
+ if (err > 0) {
+ it->next_off = prev_off;
+ err = 0;
+ goto done;
+ }
+
+ err = reftable_record_key(&rec, &it->last_key);
+ if (err < 0)
+ goto done;
+
+ /*
+ * Check whether the current key is greater or equal to the
+ * sought-after key. In case it is greater we know that the
+ * record does not exist in the block and can thus abort early.
+ * In case it is equal to the sought-after key we have found
+ * the desired record.
+ *
+ * Note that we store the next record's key record directly in
+ * `last_key` without restoring the key of the preceding record
+ * in case we need to go one record back. This is safe to do as
+ * `block_iter_next()` would return the ref whose key is equal
+ * to `last_key` now, and naturally all keys share a prefix
+ * with themselves.
+ */
+ if (reftable_buf_cmp(&it->last_key, want) >= 0) {
+ it->next_off = prev_off;
+ goto done;
+ }
+ }
+
+done:
+ reftable_record_release(&rec);
+ return err;
+}
+
+static int block_iter_seek_void(void *it, struct reftable_record *want)
+{
+ struct reftable_buf buf = REFTABLE_BUF_INIT;
+ struct block_iter *bi = it;
+ int err;
+
+ if (bi->block->block_type != want->type)
+ return REFTABLE_API_ERROR;
+
+ err = reftable_record_key(want, &buf);
+ if (err < 0)
+ goto out;
+
+ err = block_iter_seek_key(it, &buf);
+ if (err < 0)
+ goto out;
+
+ err = 0;
+
+out:
+ reftable_buf_release(&buf);
+ return err;
+}
+
+static int block_iter_next_void(void *it, struct reftable_record *rec)
+{
+ return block_iter_next(it, rec);
+}
+
+static void block_iter_close_void(void *it)
+{
+ block_iter_close(it);
+}
+
+static struct reftable_iterator_vtable block_iter_vtable = {
+ .seek = &block_iter_seek_void,
+ .next = &block_iter_next_void,
+ .close = &block_iter_close_void,
+};
+
+int reftable_block_init_iterator(const struct reftable_block *b,
+ struct reftable_iterator *it)
+{
+ struct block_iter *bi;
+
+ REFTABLE_CALLOC_ARRAY(bi, 1);
+ if (!bi)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ block_iter_init(bi, b);
+
+ assert(!it->ops);
+ it->iter_arg = bi;
+ it->ops = &block_iter_vtable;
+
+ return 0;
+}
+
+void block_writer_release(struct block_writer *bw)
+{
+ deflateEnd(bw->zstream);
+ REFTABLE_FREE_AND_NULL(bw->zstream);
+ REFTABLE_FREE_AND_NULL(bw->restarts);
+ REFTABLE_FREE_AND_NULL(bw->compressed);
+ reftable_buf_release(&bw->scratch);
+ reftable_buf_release(&bw->last_key);
+ /* the block is not owned. */
+}
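
The block writer above is driven roughly as follows (a minimal sketch, not
part of the patch; `rec` is assumed to be a populated ref record and error
handling is abbreviated):

	struct block_writer bw = { 0 };
	uint8_t block[4096];
	int err;

	/* header_off is nonzero only for the first block of a table. */
	err = block_writer_init(&bw, REFTABLE_BLOCK_TYPE_REF, block,
				sizeof(block), 0, REFTABLE_HASH_SIZE_SHA1);
	if (!err)
		err = block_writer_add(&bw, &rec);
	if (!err) {
		int n = block_writer_finish(&bw); /* >= 0: encoded length */
		if (n < 0)
			err = n;
	}
	block_writer_release(&bw);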
diff --git a/reftable/block.h b/reftable/block.h
new file mode 100644
index 0000000000..d6dfaae33e
--- /dev/null
+++ b/reftable/block.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef BLOCK_H
+#define BLOCK_H
+
+#include "basics.h"
+#include "record.h"
+#include "reftable-block.h"
+#include "reftable-blocksource.h"
+
+/*
+ * Writes reftable blocks. The block_writer is reused across blocks to minimize
+ * allocation overhead.
+ */
+struct block_writer {
+ struct z_stream_s *zstream;
+ unsigned char *compressed;
+ size_t compressed_cap;
+
+ uint8_t *block;
+ uint32_t block_size;
+
+ /* Offset of the global header. Nonzero in the first block only. */
+ uint32_t header_off;
+
+ /* How often to restart keys. */
+ uint16_t restart_interval;
+ uint32_t hash_size;
+
+ /* Offset of next uint8_t to write. */
+ uint32_t next;
+ uint32_t *restarts;
+ uint32_t restart_len;
+ uint32_t restart_cap;
+
+ struct reftable_buf last_key;
+ /* Scratch buffer used to avoid allocations. */
+ struct reftable_buf scratch;
+ int entries;
+};
+
+/*
+ * initializes the block writer to write `typ` entries, using `block` as temporary
+ * storage. `block` is not owned by the block_writer. */
+int block_writer_init(struct block_writer *bw, uint8_t typ, uint8_t *block,
+ uint32_t block_size, uint32_t header_off, uint32_t hash_size);
+
+/* returns the block type (e.g. 'r' for ref records). */
+uint8_t block_writer_type(struct block_writer *bw);
+
+/* Attempts to append the record. Returns 0 on success or an error code on failure. */
+int block_writer_add(struct block_writer *w, struct reftable_record *rec);
+
+/* appends the key restarts, and compresses the block if necessary. */
+int block_writer_finish(struct block_writer *w);
+
+/* clears out internally allocated block_writer members. */
+void block_writer_release(struct block_writer *bw);
+
+/* Iterator for records contained in a single block. */
+struct block_iter {
+ /* offset within the block of the next entry to read. */
+ uint32_t next_off;
+ const struct reftable_block *block;
+
+ /* key for last entry we read. */
+ struct reftable_buf last_key;
+ struct reftable_buf scratch;
+};
+
+#define BLOCK_ITER_INIT { \
+ .last_key = REFTABLE_BUF_INIT, \
+ .scratch = REFTABLE_BUF_INIT, \
+}
+
+/*
+ * Initialize the block iterator with the given block. The iterator will be
+ * positioned at the first record contained in the block. The block must remain
+ * valid until the end of the iterator's lifetime. It is valid to re-initialize
+ * iterators multiple times.
+ */
+void block_iter_init(struct block_iter *it, const struct reftable_block *block);
+
+/* Position the initialized iterator at the first record of its block. */
+void block_iter_seek_start(struct block_iter *it);
+
+/*
+ * Position the initialized iterator at the desired record key. It is not an
+ * error in case the record cannot be found. If so, a subsequent call to
+ * `block_iter_next()` will indicate that the iterator is exhausted.
+ */
+int block_iter_seek_key(struct block_iter *it, struct reftable_buf *want);
+
+/* return < 0 for error, 0 for OK, > 0 for EOF. */
+int block_iter_next(struct block_iter *it, struct reftable_record *rec);
+
+/* Reset the block iterator to pristine state without releasing its memory. */
+void block_iter_reset(struct block_iter *it);
+
+/* deallocate memory for `it`. The underlying block is left intact. */
+void block_iter_close(struct block_iter *it);
+
+/* size of file header, depending on format version */
+size_t header_size(int version);
+
+/* size of file footer, depending on format version */
+size_t footer_size(int version);
+
+#endif
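
Reading a block back follows the iterator pattern declared above. A sketch
(illustrative only; `block` is assumed to have been set up via
reftable_block_init()):

	struct block_iter it = BLOCK_ITER_INIT;
	struct reftable_record rec;
	int err;

	err = reftable_record_init(&rec, reftable_block_type(block));
	if (!err) {
		block_iter_init(&it, block);
		while (!(err = block_iter_next(&it, &rec))) {
			/* inspect rec */
		}
		/* err > 0: end of block; err < 0: failure. */
		reftable_record_release(&rec);
	}
	block_iter_close(&it);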
diff --git a/reftable/blocksource.c b/reftable/blocksource.c
new file mode 100644
index 0000000000..573c81287f
--- /dev/null
+++ b/reftable/blocksource.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "system.h"
+
+#include "basics.h"
+#include "blocksource.h"
+#include "reftable-blocksource.h"
+#include "reftable-error.h"
+
+void block_source_release_data(struct reftable_block_data *data)
+{
+ struct reftable_block_source source;
+ if (!data)
+ return;
+ source = data->source;
+ if (source.ops)
+ source.ops->release_data(source.arg, data);
+ data->data = NULL;
+ data->len = 0;
+ data->source.ops = NULL;
+ data->source.arg = NULL;
+}
+
+void block_source_close(struct reftable_block_source *source)
+{
+ if (!source->ops) {
+ return;
+ }
+
+ source->ops->close(source->arg);
+ source->ops = NULL;
+}
+
+ssize_t block_source_read_data(struct reftable_block_source *source,
+ struct reftable_block_data *dest, uint64_t off,
+ uint32_t size)
+{
+ ssize_t result = source->ops->read_data(source->arg, dest, off, size);
+ dest->source = *source;
+ return result;
+}
+
+uint64_t block_source_size(struct reftable_block_source *source)
+{
+ return source->ops->size(source->arg);
+}
+
+static void reftable_buf_release_data(void *b REFTABLE_UNUSED, struct reftable_block_data *dest)
+{
+ if (dest->len)
+ memset(dest->data, 0xff, dest->len);
+ reftable_free(dest->data);
+}
+
+static void reftable_buf_close(void *b REFTABLE_UNUSED)
+{
+}
+
+static ssize_t reftable_buf_read_data(void *v, struct reftable_block_data *dest,
+ uint64_t off, uint32_t size)
+{
+ struct reftable_buf *b = v;
+ assert(off + size <= b->len);
+ REFTABLE_CALLOC_ARRAY(dest->data, size);
+ if (!dest->data)
+ return -1;
+ memcpy(dest->data, b->buf + off, size);
+ dest->len = size;
+ return size;
+}
+
+static uint64_t reftable_buf_size(void *b)
+{
+ return ((struct reftable_buf *)b)->len;
+}
+
+static struct reftable_block_source_vtable reftable_buf_vtable = {
+ .size = &reftable_buf_size,
+ .read_data = &reftable_buf_read_data,
+ .release_data = &reftable_buf_release_data,
+ .close = &reftable_buf_close,
+};
+
+void block_source_from_buf(struct reftable_block_source *bs,
+ struct reftable_buf *buf)
+{
+ assert(!bs->ops);
+ bs->ops = &reftable_buf_vtable;
+ bs->arg = buf;
+}
+
+struct file_block_source {
+ uint64_t size;
+ unsigned char *data;
+};
+
+static uint64_t file_size(void *b)
+{
+ return ((struct file_block_source *)b)->size;
+}
+
+static void file_release_data(void *b REFTABLE_UNUSED, struct reftable_block_data *dest REFTABLE_UNUSED)
+{
+}
+
+static void file_close(void *v)
+{
+ struct file_block_source *b = v;
+ munmap(b->data, b->size);
+ reftable_free(b);
+}
+
+static ssize_t file_read_data(void *v, struct reftable_block_data *dest, uint64_t off,
+ uint32_t size)
+{
+ struct file_block_source *b = v;
+ assert(off + size <= b->size);
+ dest->data = b->data + off;
+ dest->len = size;
+ return size;
+}
+
+static struct reftable_block_source_vtable file_vtable = {
+ .size = &file_size,
+ .read_data = &file_read_data,
+ .release_data = &file_release_data,
+ .close = &file_close,
+};
+
+int reftable_block_source_from_file(struct reftable_block_source *bs,
+ const char *name)
+{
+ struct file_block_source *p = NULL;
+ struct stat st;
+ int fd, err;
+
+ fd = open(name, O_RDONLY);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ return REFTABLE_NOT_EXIST_ERROR;
+ err = -1;
+ goto out;
+ }
+
+ if (fstat(fd, &st) < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto out;
+ }
+
+ REFTABLE_CALLOC_ARRAY(p, 1);
+ if (!p) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ p->size = st.st_size;
+ p->data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (p->data == MAP_FAILED) {
+ err = REFTABLE_IO_ERROR;
+ p->data = NULL;
+ goto out;
+ }
+
+ assert(!bs->ops);
+ bs->ops = &file_vtable;
+ bs->arg = p;
+
+ err = 0;
+
+out:
+ if (fd >= 0)
+ close(fd);
+ if (err < 0)
+ reftable_free(p);
+ return err;
+}
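
Typical use of the file-backed source (a sketch; the path is a placeholder,
and a single full-size read is for illustration only, whereas real callers
read block-sized chunks):

	struct reftable_block_source src = { 0 };
	struct reftable_block_data data = { 0 };
	int err = reftable_block_source_from_file(&src, "/path/to/table.ref");
	if (!err) {
		uint64_t sz = block_source_size(&src);
		/* Reads are served straight out of the mmap'd region. */
		if (block_source_read_data(&src, &data, 0, (uint32_t) sz) >= 0)
			block_source_release_data(&data);
		block_source_close(&src);
	}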
diff --git a/reftable/blocksource.h b/reftable/blocksource.h
new file mode 100644
index 0000000000..a110e05958
--- /dev/null
+++ b/reftable/blocksource.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef BLOCKSOURCE_H
+#define BLOCKSOURCE_H
+
+#include "system.h"
+
+struct reftable_block_source;
+struct reftable_block_data;
+struct reftable_buf;
+
+/*
+ * Close the block source and the underlying resource. This is a no-op in case
+ * the block source is zero-initialized.
+ */
+void block_source_close(struct reftable_block_source *source);
+
+/*
+ * Read a block of length `size` from the source at the given `off`.
+ */
+ssize_t block_source_read_data(struct reftable_block_source *source,
+ struct reftable_block_data *dest, uint64_t off,
+ uint32_t size);
+
+/*
+ * Return the total length of the underlying resource.
+ */
+uint64_t block_source_size(struct reftable_block_source *source);
+
+/*
+ * Return a block to its original source, releasing any resources associated
+ * with it.
+ */
+void block_source_release_data(struct reftable_block_data *data);
+
+/* Create an in-memory block source for reading reftables. */
+void block_source_from_buf(struct reftable_block_source *bs,
+ struct reftable_buf *buf);
+
+#endif
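
The in-memory variant works the same way (sketch): the buffer must outlive
the block source, as reads are served from it.

	struct reftable_buf buf = REFTABLE_BUF_INIT;
	struct reftable_block_source bs = { 0 };

	/* ... fill buf with a complete table, e.g. via the writer ... */
	block_source_from_buf(&bs, &buf);
	/* ... read blocks through bs ... */
	block_source_close(&bs);
	reftable_buf_release(&buf);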
diff --git a/reftable/constants.h b/reftable/constants.h
new file mode 100644
index 0000000000..e3b1aaa516
--- /dev/null
+++ b/reftable/constants.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef CONSTANTS_H
+#define CONSTANTS_H
+
+#include "reftable-constants.h"
+
+#define MAX_RESTARTS ((1 << 16) - 1)
+#define DEFAULT_BLOCK_SIZE 4096
+#define DEFAULT_GEOMETRIC_FACTOR 2
+
+#endif
diff --git a/reftable/error.c b/reftable/error.c
new file mode 100644
index 0000000000..c7cab2dbc4
--- /dev/null
+++ b/reftable/error.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "system.h"
+#include "reftable-error.h"
+
+#include <stdio.h>
+
+const char *reftable_error_str(int err)
+{
+ static char buf[250];
+ switch (err) {
+ case REFTABLE_IO_ERROR:
+ return "I/O error";
+ case REFTABLE_FORMAT_ERROR:
+ return "corrupt reftable file";
+ case REFTABLE_NOT_EXIST_ERROR:
+ return "file does not exist";
+ case REFTABLE_LOCK_ERROR:
+ return "data is locked";
+ case REFTABLE_API_ERROR:
+ return "misuse of the reftable API";
+ case REFTABLE_ZLIB_ERROR:
+ return "zlib failure";
+ case REFTABLE_EMPTY_TABLE_ERROR:
+ return "wrote empty table";
+ case REFTABLE_REFNAME_ERROR:
+ return "invalid refname";
+ case REFTABLE_ENTRY_TOO_BIG_ERROR:
+ return "entry too large";
+ case REFTABLE_OUTDATED_ERROR:
+ return "data concurrently modified";
+ case REFTABLE_OUT_OF_MEMORY_ERROR:
+ return "out of memory";
+ case -1:
+ return "general error";
+ default:
+ snprintf(buf, sizeof(buf), "unknown error code %d", err);
+ return buf;
+ }
+}
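
Callers are expected to map error codes through this helper when reporting
failures, along these lines (sketch):

	int err = reftable_block_source_from_file(&src, path);
	if (err < 0)
		fprintf(stderr, "reftable: %s\n", reftable_error_str(err));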
diff --git a/reftable/iter.c b/reftable/iter.c
new file mode 100644
index 0000000000..2ecc52b336
--- /dev/null
+++ b/reftable/iter.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "iter.h"
+
+#include "system.h"
+
+#include "block.h"
+#include "blocksource.h"
+#include "constants.h"
+#include "reftable-error.h"
+#include "table.h"
+
+int iterator_seek(struct reftable_iterator *it, struct reftable_record *want)
+{
+ return it->ops->seek(it->iter_arg, want);
+}
+
+int iterator_next(struct reftable_iterator *it, struct reftable_record *rec)
+{
+ return it->ops->next(it->iter_arg, rec);
+}
+
+static int empty_iterator_seek(void *arg REFTABLE_UNUSED, struct reftable_record *want REFTABLE_UNUSED)
+{
+ return 0;
+}
+
+static int empty_iterator_next(void *arg REFTABLE_UNUSED, struct reftable_record *rec REFTABLE_UNUSED)
+{
+ return 1;
+}
+
+static void empty_iterator_close(void *arg REFTABLE_UNUSED)
+{
+}
+
+static struct reftable_iterator_vtable empty_vtable = {
+ .seek = &empty_iterator_seek,
+ .next = &empty_iterator_next,
+ .close = &empty_iterator_close,
+};
+
+void iterator_set_empty(struct reftable_iterator *it)
+{
+ assert(!it->ops);
+ it->iter_arg = NULL;
+ it->ops = &empty_vtable;
+}
+
+static void filtering_ref_iterator_close(void *iter_arg)
+{
+ struct filtering_ref_iterator *fri = iter_arg;
+ reftable_buf_release(&fri->oid);
+ reftable_iterator_destroy(&fri->it);
+}
+
+static int filtering_ref_iterator_seek(void *iter_arg,
+ struct reftable_record *want)
+{
+ struct filtering_ref_iterator *fri = iter_arg;
+ return iterator_seek(&fri->it, want);
+}
+
+static int filtering_ref_iterator_next(void *iter_arg,
+ struct reftable_record *rec)
+{
+ struct filtering_ref_iterator *fri = iter_arg;
+ struct reftable_ref_record *ref = &rec->u.ref;
+ int err = 0;
+ while (1) {
+ err = reftable_iterator_next_ref(&fri->it, ref);
+ if (err != 0) {
+ break;
+ }
+
+ if (ref->value_type == REFTABLE_REF_VAL2 &&
+ (!memcmp(fri->oid.buf, ref->value.val2.target_value,
+ fri->oid.len) ||
+ !memcmp(fri->oid.buf, ref->value.val2.value,
+ fri->oid.len)))
+ return 0;
+
+ if (ref->value_type == REFTABLE_REF_VAL1 &&
+ !memcmp(fri->oid.buf, ref->value.val1, fri->oid.len)) {
+ return 0;
+ }
+ }
+
+ reftable_ref_record_release(ref);
+ return err;
+}
+
+static struct reftable_iterator_vtable filtering_ref_iterator_vtable = {
+ .seek = &filtering_ref_iterator_seek,
+ .next = &filtering_ref_iterator_next,
+ .close = &filtering_ref_iterator_close,
+};
+
+void iterator_from_filtering_ref_iterator(struct reftable_iterator *it,
+ struct filtering_ref_iterator *fri)
+{
+ assert(!it->ops);
+ it->iter_arg = fri;
+ it->ops = &filtering_ref_iterator_vtable;
+}
+
+static void indexed_table_ref_iter_close(void *p)
+{
+ struct indexed_table_ref_iter *it = p;
+ block_iter_close(&it->cur);
+ block_source_release_data(&it->block.block_data);
+ reftable_free(it->offsets);
+ reftable_buf_release(&it->oid);
+}
+
+static int indexed_table_ref_iter_next_block(struct indexed_table_ref_iter *it)
+{
+ uint64_t off;
+ int err = 0;
+ if (it->offset_idx == it->offset_len) {
+ it->is_finished = 1;
+ return 1;
+ }
+
+ block_source_release_data(&it->block.block_data);
+
+ off = it->offsets[it->offset_idx++];
+ err = table_init_block(it->table, &it->block, off, REFTABLE_BLOCK_TYPE_REF);
+ if (err < 0) {
+ return err;
+ }
+ if (err > 0) {
+ /* indexed block does not exist. */
+ return REFTABLE_FORMAT_ERROR;
+ }
+ block_iter_init(&it->cur, &it->block);
+ return 0;
+}
+
+static int indexed_table_ref_iter_seek(void *p REFTABLE_UNUSED,
+ struct reftable_record *want REFTABLE_UNUSED)
+{
+ return REFTABLE_API_ERROR;
+}
+
+static int indexed_table_ref_iter_next(void *p, struct reftable_record *rec)
+{
+ struct indexed_table_ref_iter *it = p;
+ struct reftable_ref_record *ref = &rec->u.ref;
+
+ while (1) {
+ int err = block_iter_next(&it->cur, rec);
+ if (err < 0) {
+ return err;
+ }
+
+ if (err > 0) {
+ err = indexed_table_ref_iter_next_block(it);
+ if (err < 0) {
+ return err;
+ }
+
+ if (it->is_finished) {
+ return 1;
+ }
+ continue;
+ }
+ /* BUG */
+ if (!memcmp(it->oid.buf, ref->value.val2.target_value,
+ it->oid.len) ||
+ !memcmp(it->oid.buf, ref->value.val2.value, it->oid.len)) {
+ return 0;
+ }
+ }
+}
+
+int indexed_table_ref_iter_new(struct indexed_table_ref_iter **dest,
+ struct reftable_table *t, uint8_t *oid,
+ int oid_len, uint64_t *offsets, int offset_len)
+{
+ struct indexed_table_ref_iter empty = INDEXED_TABLE_REF_ITER_INIT;
+ struct indexed_table_ref_iter *itr;
+ int err = 0;
+
+ itr = reftable_calloc(1, sizeof(*itr));
+ if (!itr) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ *itr = empty;
+ itr->table = t;
+
+ err = reftable_buf_add(&itr->oid, oid, oid_len);
+ if (err < 0)
+ goto out;
+
+ itr->offsets = offsets;
+ itr->offset_len = offset_len;
+
+ err = indexed_table_ref_iter_next_block(itr);
+ if (err < 0)
+ goto out;
+
+ *dest = itr;
+ err = 0;
+
+out:
+ if (err < 0) {
+ *dest = NULL;
+ reftable_free(itr);
+ }
+ return err;
+}
+
+static struct reftable_iterator_vtable indexed_table_ref_iter_vtable = {
+ .seek = &indexed_table_ref_iter_seek,
+ .next = &indexed_table_ref_iter_next,
+ .close = &indexed_table_ref_iter_close,
+};
+
+void iterator_from_indexed_table_ref_iter(struct reftable_iterator *it,
+ struct indexed_table_ref_iter *itr)
+{
+ assert(!it->ops);
+ it->iter_arg = itr;
+ it->ops = &indexed_table_ref_iter_vtable;
+}
+
+void reftable_iterator_destroy(struct reftable_iterator *it)
+{
+ if (!it->ops)
+ return;
+ it->ops->close(it->iter_arg);
+ it->ops = NULL;
+ REFTABLE_FREE_AND_NULL(it->iter_arg);
+}
+
+int reftable_iterator_seek_ref(struct reftable_iterator *it,
+ const char *name)
+{
+ struct reftable_record want = {
+ .type = REFTABLE_BLOCK_TYPE_REF,
+ .u.ref = {
+ .refname = (char *)name,
+ },
+ };
+ return it->ops->seek(it->iter_arg, &want);
+}
+
+int reftable_iterator_next_ref(struct reftable_iterator *it,
+ struct reftable_ref_record *ref)
+{
+ struct reftable_record rec = {
+ .type = REFTABLE_BLOCK_TYPE_REF,
+ .u = {
+ .ref = *ref
+ },
+ };
+ int err = iterator_next(it, &rec);
+ *ref = rec.u.ref;
+ return err;
+}
+
+int reftable_iterator_seek_log_at(struct reftable_iterator *it,
+ const char *name, uint64_t update_index)
+{
+ struct reftable_record want = {
+ .type = REFTABLE_BLOCK_TYPE_LOG,
+ .u.log = {
+ .refname = (char *)name,
+ .update_index = update_index,
+ },
+ };
+ return it->ops->seek(it->iter_arg, &want);
+}
+
+int reftable_iterator_seek_log(struct reftable_iterator *it,
+ const char *name)
+{
+ return reftable_iterator_seek_log_at(it, name, ~((uint64_t) 0));
+}
+
+int reftable_iterator_next_log(struct reftable_iterator *it,
+ struct reftable_log_record *log)
+{
+ struct reftable_record rec = {
+ .type = REFTABLE_BLOCK_TYPE_LOG,
+ .u = {
+ .log = *log,
+ },
+ };
+ int err = iterator_next(it, &rec);
+ *log = rec.u.log;
+ return err;
+}
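
The public iteration loop built from these primitives looks as follows (a
sketch; `it` is assumed to have been obtained from a table or merged table):

	struct reftable_ref_record ref = { 0 };
	int err;

	err = reftable_iterator_seek_ref(&it, "refs/heads/main");
	while (!err && !(err = reftable_iterator_next_ref(&it, &ref))) {
		/* inspect ref.refname, ref.value_type, ... */
	}
	/* err > 0: iteration exhausted; err < 0: failure. */
	reftable_ref_record_release(&ref);
	reftable_iterator_destroy(&it);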
diff --git a/reftable/iter.h b/reftable/iter.h
new file mode 100644
index 0000000000..cc920970a5
--- /dev/null
+++ b/reftable/iter.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef ITER_H
+#define ITER_H
+
+#include "system.h"
+#include "block.h"
+#include "record.h"
+
+#include "reftable-iterator.h"
+
+/*
+ * The virtual function table for implementing generic reftable iterators.
+ */
+struct reftable_iterator_vtable {
+ int (*seek)(void *iter_arg, struct reftable_record *want);
+ int (*next)(void *iter_arg, struct reftable_record *rec);
+ void (*close)(void *iter_arg);
+};
+
+/*
+ * Position the iterator at the wanted record such that a call to
+ * `iterator_next()` would return that record, if it exists.
+ */
+int iterator_seek(struct reftable_iterator *it, struct reftable_record *want);
+
+/*
+ * Yield the next record and advance the iterator. Returns <0 on error, 0 when
+ * a record was yielded, and >0 when the iterator is exhausted.
+ */
+int iterator_next(struct reftable_iterator *it, struct reftable_record *rec);
+
+/*
+ * Set up the iterator such that it behaves the same as an iterator with no
+ * entries.
+ */
+void iterator_set_empty(struct reftable_iterator *it);
+
+/* iterator that produces only ref records that point to `oid` */
+struct filtering_ref_iterator {
+ struct reftable_buf oid;
+ struct reftable_iterator it;
+};
+#define FILTERING_REF_ITERATOR_INIT \
+ { \
+ .oid = REFTABLE_BUF_INIT \
+ }
+
+void iterator_from_filtering_ref_iterator(struct reftable_iterator *,
+ struct filtering_ref_iterator *);
+
+/* iterator that produces only ref records that point to `oid`,
+ * but using the object index.
+ */
+struct indexed_table_ref_iter {
+ struct reftable_table *table;
+ struct reftable_buf oid;
+
+ /* mutable */
+ uint64_t *offsets;
+
+ /* Points to the next offset to read. */
+ int offset_idx;
+ int offset_len;
+ struct reftable_block block;
+ struct block_iter cur;
+ int is_finished;
+};
+
+#define INDEXED_TABLE_REF_ITER_INIT { \
+ .cur = BLOCK_ITER_INIT, \
+ .oid = REFTABLE_BUF_INIT, \
+}
+
+void iterator_from_indexed_table_ref_iter(struct reftable_iterator *it,
+ struct indexed_table_ref_iter *itr);
+
+/* Takes ownership of `offsets` */
+int indexed_table_ref_iter_new(struct indexed_table_ref_iter **dest,
+ struct reftable_table *t, uint8_t *oid,
+ int oid_len, uint64_t *offsets, int offset_len);
+
+#endif
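
Implementing a custom iterator only requires filling in the vtable, as the
empty iterator in iter.c demonstrates; a sketch of the wiring, with a
hypothetical `my_iter` type standing in for the concrete state:

	static struct reftable_iterator_vtable my_iter_vtable = {
		.seek = &my_iter_seek,
		.next = &my_iter_next,
		.close = &my_iter_close,
	};

	void iterator_from_my_iter(struct reftable_iterator *it,
				   struct my_iter *mi)
	{
		assert(!it->ops);
		it->iter_arg = mi;
		it->ops = &my_iter_vtable;
	}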
diff --git a/reftable/merged.c b/reftable/merged.c
new file mode 100644
index 0000000000..733de07454
--- /dev/null
+++ b/reftable/merged.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "merged.h"
+
+#include "constants.h"
+#include "iter.h"
+#include "pq.h"
+#include "record.h"
+#include "reftable-merged.h"
+#include "reftable-error.h"
+#include "system.h"
+#include "table.h"
+
+struct merged_subiter {
+ struct reftable_iterator iter;
+ struct reftable_record rec;
+};
+
+struct merged_iter {
+ struct merged_subiter *subiters;
+ struct merged_iter_pqueue pq;
+ size_t subiters_len;
+ int suppress_deletions;
+ ssize_t advance_index;
+};
+
+static void merged_iter_close(void *p)
+{
+ struct merged_iter *mi = p;
+
+ merged_iter_pqueue_release(&mi->pq);
+ for (size_t i = 0; i < mi->subiters_len; i++) {
+ reftable_iterator_destroy(&mi->subiters[i].iter);
+ reftable_record_release(&mi->subiters[i].rec);
+ }
+ reftable_free(mi->subiters);
+}
+
+static int merged_iter_advance_subiter(struct merged_iter *mi, size_t idx)
+{
+ struct pq_entry e = {
+ .index = idx,
+ .rec = &mi->subiters[idx].rec,
+ };
+ int err;
+
+ err = iterator_next(&mi->subiters[idx].iter, &mi->subiters[idx].rec);
+ if (err)
+ return err;
+
+ err = merged_iter_pqueue_add(&mi->pq, &e);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int merged_iter_seek(struct merged_iter *mi, struct reftable_record *want)
+{
+ int err;
+
+ mi->advance_index = -1;
+ while (!merged_iter_pqueue_is_empty(mi->pq)) {
+ err = merged_iter_pqueue_remove(&mi->pq, NULL);
+ if (err < 0)
+ return err;
+ }
+
+ for (size_t i = 0; i < mi->subiters_len; i++) {
+ err = iterator_seek(&mi->subiters[i].iter, want);
+ if (err < 0)
+ return err;
+ if (err > 0)
+ continue;
+
+ err = merged_iter_advance_subiter(mi, i);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int merged_iter_next_entry(struct merged_iter *mi,
+ struct reftable_record *rec)
+{
+ struct pq_entry entry = { 0 };
+ int err = 0, empty;
+
+ empty = merged_iter_pqueue_is_empty(mi->pq);
+
+ if (mi->advance_index >= 0) {
+ /*
+ * When there are no pqueue entries then we only have a single
+ * subiter left. There is no need to use the pqueue in that
+ * case anymore as we know that the subiter will return entries
+ * in the correct order already.
+ *
+ * While this may sound like a very specific edge case, it may
+ * happen more frequently than you think. Most repositories
+ * will end up having a single large base table that contains
+		 * most of the refs. It's thus likely that we exhaust all
+		 * subiters but the one from that base table.
+ */
+ if (empty)
+ return iterator_next(&mi->subiters[mi->advance_index].iter,
+ rec);
+
+ err = merged_iter_advance_subiter(mi, mi->advance_index);
+ if (err < 0)
+ return err;
+ if (!err)
+ empty = 0;
+ mi->advance_index = -1;
+ }
+
+ if (empty)
+ return 1;
+
+ err = merged_iter_pqueue_remove(&mi->pq, &entry);
+ if (err < 0)
+ return err;
+
+ /*
+	  One can also use reftable as datacenter-local storage, where the ref
+	  database is maintained in a globally consistent database (eg.
+	  CockroachDB or Spanner). In this scenario, replication delays together
+	  with compaction may cause newer tables to contain older entries. In
+	  such a deployment, the loop below must be changed to collect all
+	  entries for the same key, and return the newest one.
+ */
+ while (!merged_iter_pqueue_is_empty(mi->pq)) {
+ struct pq_entry top = merged_iter_pqueue_top(mi->pq);
+ int cmp;
+
+ err = reftable_record_cmp(top.rec, entry.rec, &cmp);
+ if (err < 0)
+ return err;
+ if (cmp > 0)
+ break;
+
+ err = merged_iter_pqueue_remove(&mi->pq, NULL);
+ if (err < 0)
+ return err;
+
+ err = merged_iter_advance_subiter(mi, top.index);
+ if (err < 0)
+ return err;
+ }
+
+ mi->advance_index = entry.index;
+ REFTABLE_SWAP(*rec, *entry.rec);
+ return 0;
+}
+
+static int merged_iter_seek_void(void *it, struct reftable_record *want)
+{
+ return merged_iter_seek(it, want);
+}
+
+static int merged_iter_next_void(void *p, struct reftable_record *rec)
+{
+ struct merged_iter *mi = p;
+ while (1) {
+ int err = merged_iter_next_entry(mi, rec);
+ if (err)
+ return err;
+ if (mi->suppress_deletions && reftable_record_is_deletion(rec))
+ continue;
+ return 0;
+ }
+}
+
+static struct reftable_iterator_vtable merged_iter_vtable = {
+	.seek = &merged_iter_seek_void,
+ .next = &merged_iter_next_void,
+ .close = &merged_iter_close,
+};
+
+static void iterator_from_merged_iter(struct reftable_iterator *it,
+ struct merged_iter *mi)
+{
+ assert(!it->ops);
+ it->iter_arg = mi;
+ it->ops = &merged_iter_vtable;
+}
+
+int reftable_merged_table_new(struct reftable_merged_table **dest,
+ struct reftable_table **tables, size_t n,
+ enum reftable_hash hash_id)
+{
+ struct reftable_merged_table *m = NULL;
+ uint64_t last_max = 0;
+ uint64_t first_min = 0;
+
+ for (size_t i = 0; i < n; i++) {
+ uint64_t min = reftable_table_min_update_index(tables[i]);
+ uint64_t max = reftable_table_max_update_index(tables[i]);
+
+		if (reftable_table_hash_id(tables[i]) != hash_id)
+			return REFTABLE_FORMAT_ERROR;
+		if (i == 0 || min < first_min)
+			first_min = min;
+		if (i == 0 || max > last_max)
+			last_max = max;
+ }
+
+ REFTABLE_CALLOC_ARRAY(m, 1);
+ if (!m)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ m->tables = tables;
+ m->tables_len = n;
+ m->min = first_min;
+ m->max = last_max;
+ m->hash_id = hash_id;
+ *dest = m;
+ return 0;
+}
+
+void reftable_merged_table_free(struct reftable_merged_table *mt)
+{
+ if (!mt)
+ return;
+ reftable_free(mt);
+}
+
+uint64_t
+reftable_merged_table_max_update_index(struct reftable_merged_table *mt)
+{
+ return mt->max;
+}
+
+uint64_t
+reftable_merged_table_min_update_index(struct reftable_merged_table *mt)
+{
+ return mt->min;
+}
+
+int merged_table_init_iter(struct reftable_merged_table *mt,
+ struct reftable_iterator *it,
+ uint8_t typ)
+{
+ struct merged_subiter *subiters = NULL;
+ struct merged_iter *mi = NULL;
+ int ret;
+
+ if (mt->tables_len) {
+ REFTABLE_CALLOC_ARRAY(subiters, mt->tables_len);
+ if (!subiters) {
+ ret = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ }
+
+ for (size_t i = 0; i < mt->tables_len; i++) {
+ ret = reftable_record_init(&subiters[i].rec, typ);
+ if (ret < 0)
+ goto out;
+
+ ret = table_init_iter(mt->tables[i], &subiters[i].iter, typ);
+ if (ret < 0)
+ goto out;
+ }
+
+ REFTABLE_CALLOC_ARRAY(mi, 1);
+ if (!mi) {
+ ret = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ mi->advance_index = -1;
+ mi->suppress_deletions = mt->suppress_deletions;
+ mi->subiters = subiters;
+ mi->subiters_len = mt->tables_len;
+
+ iterator_from_merged_iter(it, mi);
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ for (size_t i = 0; subiters && i < mt->tables_len; i++) {
+ reftable_iterator_destroy(&subiters[i].iter);
+ reftable_record_release(&subiters[i].rec);
+ }
+ reftable_free(subiters);
+ reftable_free(mi);
+ }
+
+ return ret;
+}
+
+int reftable_merged_table_init_ref_iterator(struct reftable_merged_table *mt,
+ struct reftable_iterator *it)
+{
+ return merged_table_init_iter(mt, it, REFTABLE_BLOCK_TYPE_REF);
+}
+
+int reftable_merged_table_init_log_iterator(struct reftable_merged_table *mt,
+ struct reftable_iterator *it)
+{
+ return merged_table_init_iter(mt, it, REFTABLE_BLOCK_TYPE_LOG);
+}
+
+enum reftable_hash reftable_merged_table_hash_id(struct reftable_merged_table *mt)
+{
+ return mt->hash_id;
+}
diff --git a/reftable/merged.h b/reftable/merged.h
new file mode 100644
index 0000000000..4317e5f5f6
--- /dev/null
+++ b/reftable/merged.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef MERGED_H
+#define MERGED_H
+
+#include "system.h"
+#include "reftable-basics.h"
+
+struct reftable_merged_table {
+ struct reftable_table **tables;
+ size_t tables_len;
+ enum reftable_hash hash_id;
+
+	/* If unset, produce deletion records. This is useful when compacting a
+	 * subset of the tables. When reading the full stack, deletions should
+	 * be suppressed. */
+ int suppress_deletions;
+
+ uint64_t min;
+ uint64_t max;
+};
+
+struct reftable_iterator;
+
+int merged_table_init_iter(struct reftable_merged_table *mt,
+ struct reftable_iterator *it,
+ uint8_t typ);
+
+#endif
diff --git a/reftable/pq.c b/reftable/pq.c
new file mode 100644
index 0000000000..9a79f5c5ee
--- /dev/null
+++ b/reftable/pq.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "pq.h"
+
+#include "reftable-error.h"
+#include "reftable-record.h"
+#include "system.h"
+#include "basics.h"
+
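+/*
+ * Check whether entry `a` sorts before entry `b`. Returns 1 if it does, 0 if
+ * it doesn't, and a negative error code when comparing the records fails.
+ * Ties on the record key are broken via the subiterator index such that the
+ * entry with the higher index (by convention the newer table) sorts first.
+ */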
+int pq_less(struct pq_entry *a, struct pq_entry *b)
+{
+ int cmp, err;
+
+ err = reftable_record_cmp(a->rec, b->rec, &cmp);
+ if (err < 0)
+ return err;
+
+ if (cmp == 0)
+ return a->index > b->index;
+ return cmp < 0;
+}
+
+int merged_iter_pqueue_remove(struct merged_iter_pqueue *pq, struct pq_entry *out)
+{
+ size_t i = 0;
+ struct pq_entry e = pq->heap[0];
+ pq->heap[0] = pq->heap[pq->len - 1];
+ pq->len--;
+
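+	/* The root was popped and replaced by the last entry; sift it down to
+	 * restore the min-heap invariant. */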
+ while (i < pq->len) {
+ size_t min = i;
+ size_t j = 2 * i + 1;
+ size_t k = 2 * i + 2;
+ int cmp;
+
+ if (j < pq->len) {
+ cmp = pq_less(&pq->heap[j], &pq->heap[i]);
+ if (cmp < 0)
+ return -1;
+ else if (cmp)
+ min = j;
+ }
+
+ if (k < pq->len) {
+ cmp = pq_less(&pq->heap[k], &pq->heap[min]);
+ if (cmp < 0)
+ return -1;
+ else if (cmp)
+ min = k;
+ }
+
+ if (min == i)
+ break;
+ REFTABLE_SWAP(pq->heap[i], pq->heap[min]);
+ i = min;
+ }
+
+ if (out)
+ *out = e;
+
+ return 0;
+}
+
+int merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry *e)
+{
+ size_t i = 0;
+
+ REFTABLE_ALLOC_GROW_OR_NULL(pq->heap, pq->len + 1, pq->cap);
+ if (!pq->heap)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ pq->heap[pq->len++] = *e;
+
+ i = pq->len - 1;
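+	/* Sift the newly appended entry up until its parent sorts before it. */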
+	while (i > 0) {
+		size_t j = (i - 1) / 2;
+		int cmp = pq_less(&pq->heap[j], &pq->heap[i]);
+		if (cmp < 0)
+			return -1;
+		if (cmp)
+			break;
+		REFTABLE_SWAP(pq->heap[j], pq->heap[i]);
+		i = j;
+	}
+
+ return 0;
+}
+
+void merged_iter_pqueue_release(struct merged_iter_pqueue *pq)
+{
+ REFTABLE_FREE_AND_NULL(pq->heap);
+ memset(pq, 0, sizeof(*pq));
+}
diff --git a/reftable/pq.h b/reftable/pq.h
new file mode 100644
index 0000000000..42310670b0
--- /dev/null
+++ b/reftable/pq.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef PQ_H
+#define PQ_H
+
+#include "record.h"
+
+struct pq_entry {
+ size_t index;
+ struct reftable_record *rec;
+};
+
+struct merged_iter_pqueue {
+ struct pq_entry *heap;
+ size_t len;
+ size_t cap;
+};
+
+int merged_iter_pqueue_remove(struct merged_iter_pqueue *pq, struct pq_entry *out);
+int merged_iter_pqueue_add(struct merged_iter_pqueue *pq, const struct pq_entry *e);
+void merged_iter_pqueue_release(struct merged_iter_pqueue *pq);
+int pq_less(struct pq_entry *a, struct pq_entry *b);
+
+static inline struct pq_entry merged_iter_pqueue_top(struct merged_iter_pqueue pq)
+{
+ return pq.heap[0];
+}
+
+static inline int merged_iter_pqueue_is_empty(struct merged_iter_pqueue pq)
+{
+ return pq.len == 0;
+}
+
+#endif
diff --git a/reftable/record.c b/reftable/record.c
new file mode 100644
index 0000000000..fcd387ba5d
--- /dev/null
+++ b/reftable/record.c
@@ -0,0 +1,1322 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+/* record.c - methods for different types of records. */
+
+#include "record.h"
+
+#include "system.h"
+#include "constants.h"
+#include "reftable-error.h"
+#include "basics.h"
+
+static struct reftable_record_vtable *
+reftable_record_vtable(struct reftable_record *rec);
+static void *reftable_record_data(struct reftable_record *rec);
+
+int get_var_int(uint64_t *dest, struct string_view *in)
+{
+ const unsigned char *buf = in->buf;
+ unsigned char c;
+ uint64_t val;
+
+ if (!in->len)
+ return -1;
+ c = *buf++;
+ val = c & 0x7f;
+
+ while (c & 0x80) {
+ /*
+ * We use a micro-optimization here: whenever we see that the
+ * 0x80 bit is set, we know that the remainder of the value
+		 * cannot be 0. The zero value thus doesn't need to be encoded
+ * at all, which is why we subtract 1 when encoding and add 1
+ * when decoding.
+ *
+ * This allows us to save a byte in some edge cases.
+ */
+ val += 1;
+ if (!val || (val & (uint64_t)(~0ULL << (64 - 7))))
+ return -1; /* overflow */
+ if (buf >= in->buf + in->len)
+ return -1;
+ c = *buf++;
+ val = (val << 7) + (c & 0x7f);
+ }
+
+ *dest = val;
+ return buf - in->buf;
+}
+
+int put_var_int(struct string_view *dest, uint64_t value)
+{
+ unsigned char varint[10];
+ unsigned pos = sizeof(varint) - 1;
+ varint[pos] = value & 0x7f;
+ while (value >>= 7)
+ varint[--pos] = 0x80 | (--value & 0x7f);
+ if (dest->len < sizeof(varint) - pos)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ memcpy(dest->buf, varint + pos, sizeof(varint) - pos);
+ return sizeof(varint) - pos;
+}
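+
+/*
+ * An illustration of the offset trick above: with plain base-128 varints the
+ * value 128 would encode as 0x81 0x00, but because a set 0x80 bit already
+ * implies a nonzero remainder, 0x80 0x00 suffices here. Two bytes thus cover
+ * the range 128..16511 instead of 128..16383, which is where the occasional
+ * saved byte comes from.
+ */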
+
+int reftable_is_block_type(uint8_t typ)
+{
+ switch (typ) {
+ case REFTABLE_BLOCK_TYPE_REF:
+ case REFTABLE_BLOCK_TYPE_LOG:
+ case REFTABLE_BLOCK_TYPE_OBJ:
+ case REFTABLE_BLOCK_TYPE_INDEX:
+ return 1;
+ }
+ return 0;
+}
+
+const unsigned char *reftable_ref_record_val1(const struct reftable_ref_record *rec)
+{
+ switch (rec->value_type) {
+ case REFTABLE_REF_VAL1:
+ return rec->value.val1;
+ case REFTABLE_REF_VAL2:
+ return rec->value.val2.value;
+ default:
+ return NULL;
+ }
+}
+
+const unsigned char *reftable_ref_record_val2(const struct reftable_ref_record *rec)
+{
+ switch (rec->value_type) {
+ case REFTABLE_REF_VAL2:
+ return rec->value.val2.target_value;
+ default:
+ return NULL;
+ }
+}
+
+static int decode_string(struct reftable_buf *dest, struct string_view in)
+{
+ int start_len = in.len;
+ uint64_t tsize = 0;
+ int n, err;
+
+ n = get_var_int(&tsize, &in);
+ if (n <= 0)
+ return -1;
+ string_view_consume(&in, n);
+ if (in.len < tsize)
+ return -1;
+
+ reftable_buf_reset(dest);
+ err = reftable_buf_add(dest, in.buf, tsize);
+ if (err < 0)
+ return err;
+
+ string_view_consume(&in, tsize);
+
+ return start_len - in.len;
+}
+
+static int encode_string(const char *str, struct string_view s)
+{
+ struct string_view start = s;
+ size_t l = strlen(str);
+ int n = put_var_int(&s, l);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+ if (s.len < l)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ memcpy(s.buf, str, l);
+ string_view_consume(&s, l);
+
+ return start.len - s.len;
+}
+
+int reftable_encode_key(int *restart, struct string_view dest,
+ struct reftable_buf prev_key, struct reftable_buf key,
+ uint8_t extra)
+{
+ struct string_view start = dest;
+ size_t prefix_len = common_prefix_size(&prev_key, &key);
+ uint64_t suffix_len = key.len - prefix_len;
+ int n = put_var_int(&dest, prefix_len);
+ if (n < 0)
+ return n;
+ string_view_consume(&dest, n);
+
+ *restart = (prefix_len == 0);
+
+ n = put_var_int(&dest, suffix_len << 3 | (uint64_t)extra);
+ if (n < 0)
+ return n;
+ string_view_consume(&dest, n);
+
+ if (dest.len < suffix_len)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ memcpy(dest.buf, key.buf + prefix_len, suffix_len);
+ string_view_consume(&dest, suffix_len);
+
+ return start.len - dest.len;
+}
+
+int reftable_decode_keylen(struct string_view in,
+ uint64_t *prefix_len,
+ uint64_t *suffix_len,
+ uint8_t *extra)
+{
+ size_t start_len = in.len;
+ int n;
+
+ n = get_var_int(prefix_len, &in);
+ if (n < 0)
+ return -1;
+ string_view_consume(&in, n);
+
+ n = get_var_int(suffix_len, &in);
+ if (n <= 0)
+ return -1;
+ string_view_consume(&in, n);
+
+ *extra = (uint8_t)(*suffix_len & 0x7);
+ *suffix_len >>= 3;
+
+ return start_len - in.len;
+}
+
+int reftable_decode_key(struct reftable_buf *last_key, uint8_t *extra,
+ struct string_view in)
+{
+ int start_len = in.len;
+ uint64_t prefix_len = 0;
+ uint64_t suffix_len = 0;
+ int err, n;
+
+ n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra);
+ if (n < 0)
+ return -1;
+ string_view_consume(&in, n);
+
+ if (in.len < suffix_len ||
+ prefix_len > last_key->len)
+ return -1;
+
+ err = reftable_buf_setlen(last_key, prefix_len);
+ if (err < 0)
+ return err;
+
+ err = reftable_buf_add(last_key, in.buf, suffix_len);
+ if (err < 0)
+ return err;
+
+ string_view_consume(&in, suffix_len);
+
+ return start_len - in.len;
+}
+
+static int reftable_ref_record_key(const void *r, struct reftable_buf *dest)
+{
+ const struct reftable_ref_record *rec =
+ (const struct reftable_ref_record *)r;
+ reftable_buf_reset(dest);
+ return reftable_buf_addstr(dest, rec->refname);
+}
+
+static int reftable_ref_record_copy_from(void *rec, const void *src_rec,
+ uint32_t hash_size)
+{
+ struct reftable_ref_record *ref = rec;
+ const struct reftable_ref_record *src = src_rec;
+ char *refname = NULL;
+ size_t refname_cap = 0;
+ int err;
+
+ REFTABLE_SWAP(refname, ref->refname);
+ REFTABLE_SWAP(refname_cap, ref->refname_cap);
+ reftable_ref_record_release(ref);
+ REFTABLE_SWAP(ref->refname, refname);
+ REFTABLE_SWAP(ref->refname_cap, refname_cap);
+
+ if (src->refname) {
+ size_t refname_len = strlen(src->refname);
+
+ REFTABLE_ALLOC_GROW_OR_NULL(ref->refname, refname_len + 1,
+ ref->refname_cap);
+ if (!ref->refname) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ memcpy(ref->refname, src->refname, refname_len);
+ ref->refname[refname_len] = 0;
+ }
+
+ ref->update_index = src->update_index;
+ ref->value_type = src->value_type;
+ switch (src->value_type) {
+ case REFTABLE_REF_DELETION:
+ break;
+ case REFTABLE_REF_VAL1:
+ memcpy(ref->value.val1, src->value.val1, hash_size);
+ break;
+ case REFTABLE_REF_VAL2:
+ memcpy(ref->value.val2.value, src->value.val2.value, hash_size);
+ memcpy(ref->value.val2.target_value,
+ src->value.val2.target_value, hash_size);
+ break;
+ case REFTABLE_REF_SYMREF:
+ ref->value.symref = reftable_strdup(src->value.symref);
+ if (!ref->value.symref) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ break;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static void reftable_ref_record_release_void(void *rec)
+{
+ reftable_ref_record_release(rec);
+}
+
+void reftable_ref_record_release(struct reftable_ref_record *ref)
+{
+ switch (ref->value_type) {
+ case REFTABLE_REF_SYMREF:
+ reftable_free(ref->value.symref);
+ break;
+ case REFTABLE_REF_VAL2:
+ break;
+ case REFTABLE_REF_VAL1:
+ break;
+ case REFTABLE_REF_DELETION:
+ break;
+ default:
+ abort();
+ }
+
+ reftable_free(ref->refname);
+ memset(ref, 0, sizeof(struct reftable_ref_record));
+}
+
+static uint8_t reftable_ref_record_val_type(const void *rec)
+{
+ const struct reftable_ref_record *r =
+ (const struct reftable_ref_record *)rec;
+ return r->value_type;
+}
+
+static int reftable_ref_record_encode(const void *rec, struct string_view s,
+ uint32_t hash_size)
+{
+ const struct reftable_ref_record *r =
+ (const struct reftable_ref_record *)rec;
+ struct string_view start = s;
+ int n = put_var_int(&s, r->update_index);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ switch (r->value_type) {
+ case REFTABLE_REF_SYMREF:
+ n = encode_string(r->value.symref, s);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+ break;
+ case REFTABLE_REF_VAL2:
+ if (s.len < 2 * hash_size)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ memcpy(s.buf, r->value.val2.value, hash_size);
+ string_view_consume(&s, hash_size);
+ memcpy(s.buf, r->value.val2.target_value, hash_size);
+ string_view_consume(&s, hash_size);
+ break;
+ case REFTABLE_REF_VAL1:
+ if (s.len < hash_size)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+ memcpy(s.buf, r->value.val1, hash_size);
+ string_view_consume(&s, hash_size);
+ break;
+ case REFTABLE_REF_DELETION:
+ break;
+ default:
+ abort();
+ }
+
+ return start.len - s.len;
+}
+
+static int reftable_ref_record_decode(void *rec, struct reftable_buf key,
+ uint8_t val_type, struct string_view in,
+ uint32_t hash_size, struct reftable_buf *scratch)
+{
+ struct reftable_ref_record *r = rec;
+ struct string_view start = in;
+ uint64_t update_index = 0;
+	char *refname = NULL;
+ size_t refname_cap = 0;
+ int n, err;
+
+ n = get_var_int(&update_index, &in);
+ if (n < 0)
+ return n;
+ string_view_consume(&in, n);
+
+ REFTABLE_SWAP(refname, r->refname);
+ REFTABLE_SWAP(refname_cap, r->refname_cap);
+ reftable_ref_record_release(r);
+ REFTABLE_SWAP(r->refname, refname);
+ REFTABLE_SWAP(r->refname_cap, refname_cap);
+
+ REFTABLE_ALLOC_GROW_OR_NULL(r->refname, key.len + 1, r->refname_cap);
+ if (!r->refname) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ memcpy(r->refname, key.buf, key.len);
+ r->refname[key.len] = 0;
+
+ r->update_index = update_index;
+ r->value_type = val_type;
+ switch (val_type) {
+ case REFTABLE_REF_VAL1:
+ if (in.len < hash_size) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ memcpy(r->value.val1, in.buf, hash_size);
+ string_view_consume(&in, hash_size);
+ break;
+
+ case REFTABLE_REF_VAL2:
+ if (in.len < 2 * hash_size) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ memcpy(r->value.val2.value, in.buf, hash_size);
+ string_view_consume(&in, hash_size);
+
+ memcpy(r->value.val2.target_value, in.buf, hash_size);
+ string_view_consume(&in, hash_size);
+ break;
+
+ case REFTABLE_REF_SYMREF: {
+ int n = decode_string(scratch, in);
+ if (n < 0) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ string_view_consume(&in, n);
+ r->value.symref = reftable_buf_detach(scratch);
+ } break;
+
+ case REFTABLE_REF_DELETION:
+ break;
+ default:
+ abort();
+ break;
+ }
+
+ return start.len - in.len;
+
+done:
+ return err;
+}
+
+static int reftable_ref_record_is_deletion_void(const void *p)
+{
+ return reftable_ref_record_is_deletion(
+ (const struct reftable_ref_record *)p);
+}
+
+static int reftable_ref_record_equal_void(const void *a,
+ const void *b, uint32_t hash_size)
+{
+ struct reftable_ref_record *ra = (struct reftable_ref_record *) a;
+ struct reftable_ref_record *rb = (struct reftable_ref_record *) b;
+ return reftable_ref_record_equal(ra, rb, hash_size);
+}
+
+static int reftable_ref_record_cmp_void(const void *_a, const void *_b)
+{
+ const struct reftable_ref_record *a = _a;
+ const struct reftable_ref_record *b = _b;
+ return strcmp(a->refname, b->refname);
+}
+
+static struct reftable_record_vtable reftable_ref_record_vtable = {
+ .key = &reftable_ref_record_key,
+ .type = REFTABLE_BLOCK_TYPE_REF,
+ .copy_from = &reftable_ref_record_copy_from,
+ .val_type = &reftable_ref_record_val_type,
+ .encode = &reftable_ref_record_encode,
+ .decode = &reftable_ref_record_decode,
+ .release = &reftable_ref_record_release_void,
+ .is_deletion = &reftable_ref_record_is_deletion_void,
+ .equal = &reftable_ref_record_equal_void,
+ .cmp = &reftable_ref_record_cmp_void,
+};
+
+static int reftable_obj_record_key(const void *r, struct reftable_buf *dest)
+{
+ const struct reftable_obj_record *rec =
+ (const struct reftable_obj_record *)r;
+ reftable_buf_reset(dest);
+ return reftable_buf_add(dest, rec->hash_prefix, rec->hash_prefix_len);
+}
+
+static void reftable_obj_record_release(void *rec)
+{
+ struct reftable_obj_record *obj = rec;
+ REFTABLE_FREE_AND_NULL(obj->hash_prefix);
+ REFTABLE_FREE_AND_NULL(obj->offsets);
+ memset(obj, 0, sizeof(struct reftable_obj_record));
+}
+
+static int reftable_obj_record_copy_from(void *rec, const void *src_rec,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ struct reftable_obj_record *obj = rec;
+ const struct reftable_obj_record *src = src_rec;
+
+ reftable_obj_record_release(obj);
+
+ REFTABLE_ALLOC_ARRAY(obj->hash_prefix, src->hash_prefix_len);
+ if (!obj->hash_prefix)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ obj->hash_prefix_len = src->hash_prefix_len;
+ if (src->hash_prefix_len)
+ memcpy(obj->hash_prefix, src->hash_prefix, obj->hash_prefix_len);
+
+ if (src->offset_len) {
+ if (sizeof(*src->offsets) > SIZE_MAX / src->offset_len)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ REFTABLE_ALLOC_ARRAY(obj->offsets, src->offset_len);
+ if (!obj->offsets)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ memcpy(obj->offsets, src->offsets, sizeof(*src->offsets) * src->offset_len);
+ obj->offset_len = src->offset_len;
+ }
+
+ return 0;
+}
+
+static uint8_t reftable_obj_record_val_type(const void *rec)
+{
+ const struct reftable_obj_record *r = rec;
+ if (r->offset_len > 0 && r->offset_len < 8)
+ return r->offset_len;
+ return 0;
+}
+
+static int reftable_obj_record_encode(const void *rec, struct string_view s,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ const struct reftable_obj_record *r = rec;
+ struct string_view start = s;
+ int i = 0;
+ int n = 0;
+ uint64_t last = 0;
+ if (r->offset_len == 0 || r->offset_len >= 8) {
+ n = put_var_int(&s, r->offset_len);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+ }
+ if (r->offset_len == 0)
+ return start.len - s.len;
+ n = put_var_int(&s, r->offsets[0]);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ last = r->offsets[0];
+ for (i = 1; i < r->offset_len; i++) {
+ int n = put_var_int(&s, r->offsets[i] - last);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+ last = r->offsets[i];
+ }
+ return start.len - s.len;
+}
+
+static int reftable_obj_record_decode(void *rec, struct reftable_buf key,
+ uint8_t val_type, struct string_view in,
+ uint32_t hash_size REFTABLE_UNUSED,
+ struct reftable_buf *scratch REFTABLE_UNUSED)
+{
+ struct string_view start = in;
+ struct reftable_obj_record *r = rec;
+ uint64_t count = val_type;
+ int n = 0;
+ uint64_t last;
+
+ reftable_obj_record_release(r);
+
+ REFTABLE_ALLOC_ARRAY(r->hash_prefix, key.len);
+ if (!r->hash_prefix)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ memcpy(r->hash_prefix, key.buf, key.len);
+ r->hash_prefix_len = key.len;
+
+ if (val_type == 0) {
+ n = get_var_int(&count, &in);
+ if (n < 0) {
+ return n;
+ }
+
+ string_view_consume(&in, n);
+ }
+
+ r->offsets = NULL;
+ r->offset_len = 0;
+ if (count == 0)
+ return start.len - in.len;
+
+ REFTABLE_ALLOC_ARRAY(r->offsets, count);
+ if (!r->offsets)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ r->offset_len = count;
+
+ n = get_var_int(&r->offsets[0], &in);
+ if (n < 0)
+ return n;
+ string_view_consume(&in, n);
+
+ last = r->offsets[0];
+ for (uint64_t j = 1; j < count; j++) {
+ uint64_t delta = 0;
+ int n = get_var_int(&delta, &in);
+ if (n < 0) {
+ return n;
+ }
+ string_view_consume(&in, n);
+
+ last = r->offsets[j] = (delta + last);
+ }
+ return start.len - in.len;
+}
+
+static int not_a_deletion(const void *p REFTABLE_UNUSED)
+{
+ return 0;
+}
+
+static int reftable_obj_record_equal_void(const void *a, const void *b,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ struct reftable_obj_record *ra = (struct reftable_obj_record *) a;
+ struct reftable_obj_record *rb = (struct reftable_obj_record *) b;
+
+ if (ra->hash_prefix_len != rb->hash_prefix_len
+ || ra->offset_len != rb->offset_len)
+ return 0;
+
+ if (ra->hash_prefix_len &&
+ memcmp(ra->hash_prefix, rb->hash_prefix, ra->hash_prefix_len))
+ return 0;
+ if (ra->offset_len &&
+ memcmp(ra->offsets, rb->offsets, ra->offset_len * sizeof(uint64_t)))
+ return 0;
+
+ return 1;
+}
+
+static int reftable_obj_record_cmp_void(const void *_a, const void *_b)
+{
+ const struct reftable_obj_record *a = _a;
+ const struct reftable_obj_record *b = _b;
+ int cmp;
+
+	cmp = memcmp(a->hash_prefix, b->hash_prefix,
+		     a->hash_prefix_len < b->hash_prefix_len ?
+		     a->hash_prefix_len : b->hash_prefix_len);
+ if (cmp)
+ return cmp;
+
+ /*
+	 * When the shorter prefix matches the longer one, the record with the
+	 * longer prefix is considered to be bigger.
+ */
+ return a->hash_prefix_len - b->hash_prefix_len;
+}
+
+static struct reftable_record_vtable reftable_obj_record_vtable = {
+ .key = &reftable_obj_record_key,
+ .type = REFTABLE_BLOCK_TYPE_OBJ,
+ .copy_from = &reftable_obj_record_copy_from,
+ .val_type = &reftable_obj_record_val_type,
+ .encode = &reftable_obj_record_encode,
+ .decode = &reftable_obj_record_decode,
+ .release = &reftable_obj_record_release,
+ .is_deletion = &not_a_deletion,
+ .equal = &reftable_obj_record_equal_void,
+ .cmp = &reftable_obj_record_cmp_void,
+};
+
+static int reftable_log_record_key(const void *r, struct reftable_buf *dest)
+{
+ const struct reftable_log_record *rec =
+ (const struct reftable_log_record *)r;
+ int len = strlen(rec->refname), err;
+ uint8_t i64[8];
+ uint64_t ts = 0;
+
+ reftable_buf_reset(dest);
+ err = reftable_buf_add(dest, (uint8_t *)rec->refname, len + 1);
+ if (err < 0)
+ return err;
+
+ ts = (~ts) - rec->update_index;
+ reftable_put_be64(&i64[0], ts);
+
+ err = reftable_buf_add(dest, i64, sizeof(i64));
+ if (err < 0)
+ return err;
+
+ return 0;
+}
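+
+/*
+ * For illustration: the log key is the refname including its trailing NUL,
+ * followed by the bitwise-inverted update index as a big-endian 64-bit value
+ * (~0 - idx == ~idx in unsigned arithmetic). The inversion makes newer
+ * entries sort first: update index 2 yields 0xff...fd, while update index 1
+ * yields 0xff...fe.
+ */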
+
+static int reftable_log_record_copy_from(void *rec, const void *src_rec,
+ uint32_t hash_size)
+{
+ struct reftable_log_record *dst = rec;
+ const struct reftable_log_record *src =
+ (const struct reftable_log_record *)src_rec;
+ int ret;
+
+ reftable_log_record_release(dst);
+ *dst = *src;
+
+ if (dst->refname) {
+ dst->refname = reftable_strdup(dst->refname);
+ if (!dst->refname) {
+ ret = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ }
+
+ switch (dst->value_type) {
+ case REFTABLE_LOG_DELETION:
+ break;
+ case REFTABLE_LOG_UPDATE:
+ if (dst->value.update.email)
+ dst->value.update.email =
+ reftable_strdup(dst->value.update.email);
+ if (dst->value.update.name)
+ dst->value.update.name =
+ reftable_strdup(dst->value.update.name);
+ if (dst->value.update.message)
+ dst->value.update.message =
+ reftable_strdup(dst->value.update.message);
+
+ if (!dst->value.update.email ||
+ !dst->value.update.name ||
+ !dst->value.update.message) {
+ ret = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ memcpy(dst->value.update.new_hash,
+ src->value.update.new_hash, hash_size);
+ memcpy(dst->value.update.old_hash,
+ src->value.update.old_hash, hash_size);
+ break;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void reftable_log_record_release_void(void *rec)
+{
+ struct reftable_log_record *r = rec;
+ reftable_log_record_release(r);
+}
+
+void reftable_log_record_release(struct reftable_log_record *r)
+{
+ reftable_free(r->refname);
+ switch (r->value_type) {
+ case REFTABLE_LOG_DELETION:
+ break;
+ case REFTABLE_LOG_UPDATE:
+ reftable_free(r->value.update.name);
+ reftable_free(r->value.update.email);
+ reftable_free(r->value.update.message);
+ break;
+ }
+ memset(r, 0, sizeof(struct reftable_log_record));
+}
+
+static uint8_t reftable_log_record_val_type(const void *rec)
+{
+ const struct reftable_log_record *log =
+ (const struct reftable_log_record *)rec;
+
+ return reftable_log_record_is_deletion(log) ? 0 : 1;
+}
+
+static int reftable_log_record_encode(const void *rec, struct string_view s,
+ uint32_t hash_size)
+{
+ const struct reftable_log_record *r = rec;
+ struct string_view start = s;
+ int n = 0;
+ if (reftable_log_record_is_deletion(r))
+ return 0;
+
+ if (s.len < 2 * hash_size)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+
+ memcpy(s.buf, r->value.update.old_hash, hash_size);
+ memcpy(s.buf + hash_size, r->value.update.new_hash, hash_size);
+ string_view_consume(&s, 2 * hash_size);
+
+ n = encode_string(r->value.update.name ? r->value.update.name : "", s);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ n = encode_string(r->value.update.email ? r->value.update.email : "",
+ s);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ n = put_var_int(&s, r->value.update.time);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ if (s.len < 2)
+ return REFTABLE_ENTRY_TOO_BIG_ERROR;
+
+ reftable_put_be16(s.buf, r->value.update.tz_offset);
+ string_view_consume(&s, 2);
+
+ n = encode_string(
+ r->value.update.message ? r->value.update.message : "", s);
+ if (n < 0)
+ return n;
+ string_view_consume(&s, n);
+
+ return start.len - s.len;
+}
+
+static int reftable_log_record_decode(void *rec, struct reftable_buf key,
+ uint8_t val_type, struct string_view in,
+ uint32_t hash_size, struct reftable_buf *scratch)
+{
+ struct string_view start = in;
+ struct reftable_log_record *r = rec;
+ uint64_t max = 0;
+ uint64_t ts = 0;
+ int err, n;
+
+ if (key.len <= 9 || key.buf[key.len - 9] != 0)
+ return REFTABLE_FORMAT_ERROR;
+
+ REFTABLE_ALLOC_GROW_OR_NULL(r->refname, key.len - 8, r->refname_cap);
+ if (!r->refname) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ memcpy(r->refname, key.buf, key.len - 8);
+ ts = reftable_get_be64((unsigned char *)key.buf + key.len - 8);
+
+ r->update_index = (~max) - ts;
+
+ if (val_type != r->value_type) {
+ switch (r->value_type) {
+ case REFTABLE_LOG_UPDATE:
+ REFTABLE_FREE_AND_NULL(r->value.update.message);
+ r->value.update.message_cap = 0;
+ REFTABLE_FREE_AND_NULL(r->value.update.email);
+ REFTABLE_FREE_AND_NULL(r->value.update.name);
+ break;
+ case REFTABLE_LOG_DELETION:
+ break;
+ }
+ }
+
+ r->value_type = val_type;
+ if (val_type == REFTABLE_LOG_DELETION)
+ return 0;
+
+ if (in.len < 2 * hash_size) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ memcpy(r->value.update.old_hash, in.buf, hash_size);
+ memcpy(r->value.update.new_hash, in.buf + hash_size, hash_size);
+
+ string_view_consume(&in, 2 * hash_size);
+
+ n = decode_string(scratch, in);
+ if (n < 0) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ string_view_consume(&in, n);
+
+ /*
+ * In almost all cases we can expect the reflog name to not change for
+ * reflog entries as they are tied to the local identity, not to the
+ * target commits. As an optimization for this common case we can thus
+ * skip copying over the name in case it's accurate already.
+ */
+ if (!r->value.update.name ||
+ strcmp(r->value.update.name, scratch->buf)) {
+ char *name = reftable_realloc(r->value.update.name, scratch->len + 1);
+ if (!name) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ r->value.update.name = name;
+ memcpy(r->value.update.name, scratch->buf, scratch->len);
+ r->value.update.name[scratch->len] = 0;
+ }
+
+ n = decode_string(scratch, in);
+ if (n < 0) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ string_view_consume(&in, n);
+
+ /* Same as above, but for the reflog email. */
+ if (!r->value.update.email ||
+ strcmp(r->value.update.email, scratch->buf)) {
+ char *email = reftable_realloc(r->value.update.email, scratch->len + 1);
+ if (!email) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ r->value.update.email = email;
+ memcpy(r->value.update.email, scratch->buf, scratch->len);
+ r->value.update.email[scratch->len] = 0;
+ }
+
+ ts = 0;
+ n = get_var_int(&ts, &in);
+ if (n < 0) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ string_view_consume(&in, n);
+ r->value.update.time = ts;
+ if (in.len < 2) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ r->value.update.tz_offset = reftable_get_be16(in.buf);
+ string_view_consume(&in, 2);
+
+ n = decode_string(scratch, in);
+ if (n < 0) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ string_view_consume(&in, n);
+
+ REFTABLE_ALLOC_GROW_OR_NULL(r->value.update.message, scratch->len + 1,
+ r->value.update.message_cap);
+ if (!r->value.update.message) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ memcpy(r->value.update.message, scratch->buf, scratch->len);
+ r->value.update.message[scratch->len] = 0;
+
+ return start.len - in.len;
+
+done:
+ return err;
+}
+
+static int null_streq(const char *a, const char *b)
+{
+ const char *empty = "";
+ if (!a)
+ a = empty;
+
+ if (!b)
+ b = empty;
+
+ return 0 == strcmp(a, b);
+}
+
+static int reftable_log_record_equal_void(const void *a,
+ const void *b, uint32_t hash_size)
+{
+ return reftable_log_record_equal((struct reftable_log_record *) a,
+ (struct reftable_log_record *) b,
+ hash_size);
+}
+
+static int reftable_log_record_cmp_void(const void *_a, const void *_b)
+{
+ const struct reftable_log_record *a = _a;
+ const struct reftable_log_record *b = _b;
+ int cmp = strcmp(a->refname, b->refname);
+ if (cmp)
+ return cmp;
+
+ /*
+ * Note that the comparison here is reversed. This is because the
+	 * update index is reversed when comparing keys. For reference, see how
+	 * we handle this in `reftable_log_record_key()`. Comparing explicitly
+	 * instead of subtracting avoids truncating the 64-bit difference to int.
+	 */
+	if (a->update_index > b->update_index)
+		return -1;
+	return (a->update_index < b->update_index) ? 1 : 0;
+}
+
+int reftable_log_record_equal(const struct reftable_log_record *a,
+ const struct reftable_log_record *b, uint32_t hash_size)
+{
+ if (!(null_streq(a->refname, b->refname) &&
+ a->update_index == b->update_index &&
+ a->value_type == b->value_type))
+ return 0;
+
+ switch (a->value_type) {
+ case REFTABLE_LOG_DELETION:
+ return 1;
+ case REFTABLE_LOG_UPDATE:
+ return null_streq(a->value.update.name, b->value.update.name) &&
+ a->value.update.time == b->value.update.time &&
+ a->value.update.tz_offset == b->value.update.tz_offset &&
+ null_streq(a->value.update.email,
+ b->value.update.email) &&
+ null_streq(a->value.update.message,
+ b->value.update.message) &&
+ !memcmp(a->value.update.old_hash,
+ b->value.update.old_hash, hash_size) &&
+ !memcmp(a->value.update.new_hash,
+ b->value.update.new_hash, hash_size);
+ }
+
+ abort();
+}
+
+static int reftable_log_record_is_deletion_void(const void *p)
+{
+ return reftable_log_record_is_deletion(
+ (const struct reftable_log_record *)p);
+}
+
+static struct reftable_record_vtable reftable_log_record_vtable = {
+ .key = &reftable_log_record_key,
+ .type = REFTABLE_BLOCK_TYPE_LOG,
+ .copy_from = &reftable_log_record_copy_from,
+ .val_type = &reftable_log_record_val_type,
+ .encode = &reftable_log_record_encode,
+ .decode = &reftable_log_record_decode,
+ .release = &reftable_log_record_release_void,
+ .is_deletion = &reftable_log_record_is_deletion_void,
+ .equal = &reftable_log_record_equal_void,
+ .cmp = &reftable_log_record_cmp_void,
+};
+
+static int reftable_index_record_key(const void *r, struct reftable_buf *dest)
+{
+ const struct reftable_index_record *rec = r;
+ reftable_buf_reset(dest);
+ return reftable_buf_add(dest, rec->last_key.buf, rec->last_key.len);
+}
+
+static int reftable_index_record_copy_from(void *rec, const void *src_rec,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ struct reftable_index_record *dst = rec;
+ const struct reftable_index_record *src = src_rec;
+ int err;
+
+ reftable_buf_reset(&dst->last_key);
+ err = reftable_buf_add(&dst->last_key, src->last_key.buf, src->last_key.len);
+ if (err < 0)
+ return err;
+ dst->offset = src->offset;
+
+ return 0;
+}
+
+static void reftable_index_record_release(void *rec)
+{
+ struct reftable_index_record *idx = rec;
+ reftable_buf_release(&idx->last_key);
+}
+
+static uint8_t reftable_index_record_val_type(const void *rec REFTABLE_UNUSED)
+{
+ return 0;
+}
+
+static int reftable_index_record_encode(const void *rec, struct string_view out,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ const struct reftable_index_record *r =
+ (const struct reftable_index_record *)rec;
+ struct string_view start = out;
+
+ int n = put_var_int(&out, r->offset);
+ if (n < 0)
+ return n;
+
+ string_view_consume(&out, n);
+
+ return start.len - out.len;
+}
+
+static int reftable_index_record_decode(void *rec, struct reftable_buf key,
+ uint8_t val_type REFTABLE_UNUSED,
+ struct string_view in,
+ uint32_t hash_size REFTABLE_UNUSED,
+ struct reftable_buf *scratch REFTABLE_UNUSED)
+{
+ struct string_view start = in;
+ struct reftable_index_record *r = rec;
+ int err, n = 0;
+
+ reftable_buf_reset(&r->last_key);
+ err = reftable_buf_add(&r->last_key, key.buf, key.len);
+ if (err < 0)
+ return err;
+
+ n = get_var_int(&r->offset, &in);
+ if (n < 0)
+ return n;
+
+ string_view_consume(&in, n);
+ return start.len - in.len;
+}
+
+static int reftable_index_record_equal(const void *a, const void *b,
+ uint32_t hash_size REFTABLE_UNUSED)
+{
+ struct reftable_index_record *ia = (struct reftable_index_record *) a;
+ struct reftable_index_record *ib = (struct reftable_index_record *) b;
+
+ return ia->offset == ib->offset && !reftable_buf_cmp(&ia->last_key, &ib->last_key);
+}
+
+static int reftable_index_record_cmp(const void *_a, const void *_b)
+{
+ const struct reftable_index_record *a = _a;
+ const struct reftable_index_record *b = _b;
+ return reftable_buf_cmp(&a->last_key, &b->last_key);
+}
+
+static struct reftable_record_vtable reftable_index_record_vtable = {
+ .key = &reftable_index_record_key,
+ .type = REFTABLE_BLOCK_TYPE_INDEX,
+ .copy_from = &reftable_index_record_copy_from,
+ .val_type = &reftable_index_record_val_type,
+ .encode = &reftable_index_record_encode,
+ .decode = &reftable_index_record_decode,
+ .release = &reftable_index_record_release,
+ .is_deletion = &not_a_deletion,
+ .equal = &reftable_index_record_equal,
+ .cmp = &reftable_index_record_cmp,
+};
+
+int reftable_record_key(struct reftable_record *rec, struct reftable_buf *dest)
+{
+ return reftable_record_vtable(rec)->key(reftable_record_data(rec), dest);
+}
+
+int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
+ uint32_t hash_size)
+{
+ return reftable_record_vtable(rec)->encode(reftable_record_data(rec),
+ dest, hash_size);
+}
+
+int reftable_record_copy_from(struct reftable_record *rec,
+ struct reftable_record *src, uint32_t hash_size)
+{
+ assert(src->type == rec->type);
+
+ return reftable_record_vtable(rec)->copy_from(reftable_record_data(rec),
+ reftable_record_data(src),
+ hash_size);
+}
+
+uint8_t reftable_record_val_type(struct reftable_record *rec)
+{
+ return reftable_record_vtable(rec)->val_type(reftable_record_data(rec));
+}
+
+int reftable_record_decode(struct reftable_record *rec, struct reftable_buf key,
+ uint8_t extra, struct string_view src, uint32_t hash_size,
+ struct reftable_buf *scratch)
+{
+ return reftable_record_vtable(rec)->decode(reftable_record_data(rec),
+ key, extra, src, hash_size,
+ scratch);
+}
+
+void reftable_record_release(struct reftable_record *rec)
+{
+ reftable_record_vtable(rec)->release(reftable_record_data(rec));
+}
+
+int reftable_record_is_deletion(struct reftable_record *rec)
+{
+ return reftable_record_vtable(rec)->is_deletion(
+ reftable_record_data(rec));
+}
+
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b,
+ int *cmp)
+{
+ if (a->type != b->type)
+ return -1;
+ *cmp = reftable_record_vtable(a)->cmp(reftable_record_data(a),
+ reftable_record_data(b));
+ return 0;
+}
+
+int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, uint32_t hash_size)
+{
+ if (a->type != b->type)
+ return 0;
+ return reftable_record_vtable(a)->equal(
+ reftable_record_data(a), reftable_record_data(b), hash_size);
+}
+
+static int hash_equal(const unsigned char *a, const unsigned char *b, uint32_t hash_size)
+{
+ if (a && b)
+ return !memcmp(a, b, hash_size);
+
+ return a == b;
+}
+
+int reftable_ref_record_equal(const struct reftable_ref_record *a,
+ const struct reftable_ref_record *b, uint32_t hash_size)
+{
+ if (!null_streq(a->refname, b->refname))
+ return 0;
+
+ if (a->update_index != b->update_index ||
+ a->value_type != b->value_type)
+ return 0;
+
+ switch (a->value_type) {
+ case REFTABLE_REF_SYMREF:
+ return !strcmp(a->value.symref, b->value.symref);
+ case REFTABLE_REF_VAL2:
+ return hash_equal(a->value.val2.value, b->value.val2.value,
+ hash_size) &&
+ hash_equal(a->value.val2.target_value,
+ b->value.val2.target_value, hash_size);
+ case REFTABLE_REF_VAL1:
+ return hash_equal(a->value.val1, b->value.val1, hash_size);
+ case REFTABLE_REF_DELETION:
+ return 1;
+ default:
+ abort();
+ }
+}
+
+int reftable_ref_record_compare_name(const void *a, const void *b)
+{
+ return strcmp(((struct reftable_ref_record *)a)->refname,
+ ((struct reftable_ref_record *)b)->refname);
+}
+
+int reftable_ref_record_is_deletion(const struct reftable_ref_record *ref)
+{
+ return ref->value_type == REFTABLE_REF_DELETION;
+}
+
+int reftable_log_record_compare_key(const void *a, const void *b)
+{
+ const struct reftable_log_record *la = a;
+ const struct reftable_log_record *lb = b;
+
+ int cmp = strcmp(la->refname, lb->refname);
+ if (cmp)
+ return cmp;
+ if (la->update_index > lb->update_index)
+ return -1;
+ return (la->update_index < lb->update_index) ? 1 : 0;
+}
+
+int reftable_log_record_is_deletion(const struct reftable_log_record *log)
+{
+ return (log->value_type == REFTABLE_LOG_DELETION);
+}
+
+static void *reftable_record_data(struct reftable_record *rec)
+{
+ switch (rec->type) {
+ case REFTABLE_BLOCK_TYPE_REF:
+ return &rec->u.ref;
+ case REFTABLE_BLOCK_TYPE_LOG:
+ return &rec->u.log;
+ case REFTABLE_BLOCK_TYPE_INDEX:
+ return &rec->u.idx;
+ case REFTABLE_BLOCK_TYPE_OBJ:
+ return &rec->u.obj;
+ }
+ abort();
+}
+
+static struct reftable_record_vtable *
+reftable_record_vtable(struct reftable_record *rec)
+{
+ switch (rec->type) {
+ case REFTABLE_BLOCK_TYPE_REF:
+ return &reftable_ref_record_vtable;
+ case REFTABLE_BLOCK_TYPE_LOG:
+ return &reftable_log_record_vtable;
+ case REFTABLE_BLOCK_TYPE_INDEX:
+ return &reftable_index_record_vtable;
+ case REFTABLE_BLOCK_TYPE_OBJ:
+ return &reftable_obj_record_vtable;
+ }
+ abort();
+}
+
+int reftable_record_init(struct reftable_record *rec, uint8_t typ)
+{
+ memset(rec, 0, sizeof(*rec));
+ rec->type = typ;
+
+ switch (typ) {
+ case REFTABLE_BLOCK_TYPE_REF:
+ case REFTABLE_BLOCK_TYPE_LOG:
+ case REFTABLE_BLOCK_TYPE_OBJ:
+ return 0;
+ case REFTABLE_BLOCK_TYPE_INDEX:
+ reftable_buf_init(&rec->u.idx.last_key);
+ return 0;
+ default:
+ return REFTABLE_API_ERROR;
+ }
+}
diff --git a/reftable/record.h b/reftable/record.h
new file mode 100644
index 0000000000..7953f352a3
--- /dev/null
+++ b/reftable/record.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef RECORD_H
+#define RECORD_H
+
+#include "basics.h"
+#include "system.h"
+
+#include <stdint.h>
+
+#include "reftable-record.h"
+
+/*
+ * A substring of existing string data. This structure takes no responsibility
+ * for the lifetime of the data it points to.
+ */
+struct string_view {
+ uint8_t *buf;
+ size_t len;
+};
+
+/* Advance `s.buf` by `n`, and decrease length. */
+static inline void string_view_consume(struct string_view *s, int n)
+{
+ s->buf += n;
+ s->len -= n;
+}
+
+/*
+ * Decode and encode a varint. Returns the number of bytes read/written, or a
+ * negative value in case encoding/decoding the varint has failed.
+ */
+int get_var_int(uint64_t *dest, struct string_view *in);
+int put_var_int(struct string_view *dest, uint64_t val);
+
+/* Methods for records. */
+struct reftable_record_vtable {
+	/* encode the key of the record into a reftable_buf. */
+ int (*key)(const void *rec, struct reftable_buf *dest);
+
+	/* The record type ('r' for ref). */
+ uint8_t type;
+
+ int (*copy_from)(void *dest, const void *src, uint32_t hash_size);
+
+	/* a value of [0..7], indicating record subvariants (eg. ref vs. symref
+	 * vs. ref deletion) */
+ uint8_t (*val_type)(const void *rec);
+
+ /* encodes rec into dest, returning how much space was used. */
+ int (*encode)(const void *rec, struct string_view dest, uint32_t hash_size);
+
+ /* decode data from `src` into the record. */
+ int (*decode)(void *rec, struct reftable_buf key, uint8_t extra,
+ struct string_view src, uint32_t hash_size,
+ struct reftable_buf *scratch);
+
+ /* deallocate and null the record. */
+ void (*release)(void *rec);
+
+ /* is this a tombstone? */
+ int (*is_deletion)(const void *rec);
+
+ /* Are two records equal? This assumes they have the same type. Returns 0 for non-equal. */
+ int (*equal)(const void *a, const void *b, uint32_t hash_size);
+
+ /*
+ * Compare keys of two records with each other. The records must have
+ * the same type.
+ */
+ int (*cmp)(const void *a, const void *b);
+};
+
+/* returns true for recognized block types. Blocks start with the block type. */
+int reftable_is_block_type(uint8_t typ);
+
+/* Encode `key` into `dest`. Sets `is_restart` to indicate a restart. Returns
+ * number of bytes written. */
+int reftable_encode_key(int *is_restart, struct string_view dest,
+ struct reftable_buf prev_key, struct reftable_buf key,
+ uint8_t extra);
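+
+/*
+ * A worked example of the prefix compression (the key values are
+ * illustrative): with prev_key "refs/heads/main" and key "refs/heads/master"
+ * the common prefix is "refs/heads/ma" (13 bytes), so the encoder emits
+ * varint(13), varint(4 << 3 | extra) for the 4-byte suffix "ster", and then
+ * the suffix bytes themselves. `is_restart` is set whenever the common prefix
+ * is empty.
+ */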
+
+/* Decode a record's key lengths. */
+int reftable_decode_keylen(struct string_view in,
+ uint64_t *prefix_len,
+ uint64_t *suffix_len,
+ uint8_t *extra);
+
+/*
+ * Decode into `last_key` and `extra` from `in`. `last_key` is expected to
+ * contain the decoded key of the preceding record, if any.
+ */
+int reftable_decode_key(struct reftable_buf *last_key, uint8_t *extra,
+ struct string_view in);
+
+/* reftable_index_record are used internally to speed up lookups. */
+struct reftable_index_record {
+ uint64_t offset; /* Offset of block */
+ struct reftable_buf last_key; /* Last key of the block. */
+};
+
+/* reftable_obj_record stores an object ID => ref mapping. */
+struct reftable_obj_record {
+ uint8_t *hash_prefix; /* leading bytes of the object ID */
+ int hash_prefix_len; /* number of leading bytes. Constant
+ * across a single table. */
+ uint64_t *offsets; /* a vector of file offsets. */
+ int offset_len;
+};
+
+/* record is a generic wrapper for different types of records. It is normally
+ * created on the stack, or embedded within another struct. If the type is
+ * known, a fresh instance can be initialized explicitly. Otherwise, use
+ * `reftable_record_init()` to initialize it generically (as the index record
+ * is not valid as a 0-initialized structure).
+ */
+struct reftable_record {
+ uint8_t type;
+ union {
+ struct reftable_ref_record ref;
+ struct reftable_log_record log;
+ struct reftable_obj_record obj;
+ struct reftable_index_record idx;
+ } u;
+};
+
+/* Initialize the reftable record for the given type. */
+int reftable_record_init(struct reftable_record *rec, uint8_t typ);
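+
+/*
+ * A minimal lifecycle sketch (hypothetical caller code):
+ *
+ *	struct reftable_record rec;
+ *	int err = reftable_record_init(&rec, REFTABLE_BLOCK_TYPE_INDEX);
+ *	if (err < 0)
+ *		return err;
+ *	...
+ *	reftable_record_release(&rec);
+ */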
+
+/* see struct record_vtable */
+int reftable_record_cmp(struct reftable_record *a, struct reftable_record *b, int *cmp);
+int reftable_record_equal(struct reftable_record *a, struct reftable_record *b, uint32_t hash_size);
+int reftable_record_key(struct reftable_record *rec, struct reftable_buf *dest);
+int reftable_record_copy_from(struct reftable_record *rec,
+ struct reftable_record *src, uint32_t hash_size);
+uint8_t reftable_record_val_type(struct reftable_record *rec);
+int reftable_record_encode(struct reftable_record *rec, struct string_view dest,
+ uint32_t hash_size);
+int reftable_record_decode(struct reftable_record *rec, struct reftable_buf key,
+ uint8_t extra, struct string_view src,
+ uint32_t hash_size, struct reftable_buf *scratch);
+int reftable_record_is_deletion(struct reftable_record *rec);
+
+static inline uint8_t reftable_record_type(struct reftable_record *rec)
+{
+ return rec->type;
+}
+
+/* frees and zeroes out the embedded record */
+void reftable_record_release(struct reftable_record *rec);
+
+/* for qsort. */
+int reftable_ref_record_compare_name(const void *a, const void *b);
+
+/* for qsort. */
+int reftable_log_record_compare_key(const void *a, const void *b);
+
+#endif
diff --git a/reftable/reftable-basics.h b/reftable/reftable-basics.h
new file mode 100644
index 0000000000..6d73f19c85
--- /dev/null
+++ b/reftable/reftable-basics.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_BASICS_H
+#define REFTABLE_BASICS_H
+
+#include <stddef.h>
+
+/* A buffer that contains arbitrary byte slices. */
+struct reftable_buf {
+ size_t alloc;
+ size_t len;
+ char *buf;
+};
+#define REFTABLE_BUF_INIT { 0 }
+
+/*
+ * Hash functions understood by the reftable library. Note that the values are
+ * arbitrary and somewhat random such that we can easily detect cases where the
+ * hash hasn't been properly set up.
+ */
+enum reftable_hash {
+ REFTABLE_HASH_SHA1 = 89,
+ REFTABLE_HASH_SHA256 = 247,
+};
+#define REFTABLE_HASH_SIZE_SHA1 20
+#define REFTABLE_HASH_SIZE_SHA256 32
+#define REFTABLE_HASH_SIZE_MAX REFTABLE_HASH_SIZE_SHA256
+
+/* Overrides the functions to use for memory management. */
+void reftable_set_alloc(void *(*malloc)(size_t),
+ void *(*realloc)(void *, size_t), void (*free)(void *));
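+
+/*
+ * For example, to route all reftable allocations through custom functions
+ * (`my_malloc` and friends are hypothetical stand-ins):
+ *
+ *	reftable_set_alloc(my_malloc, my_realloc, my_free);
+ */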
+
+#endif
diff --git a/reftable/reftable-block.h b/reftable/reftable-block.h
new file mode 100644
index 0000000000..0b05a8f7e3
--- /dev/null
+++ b/reftable/reftable-block.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_BLOCK_H
+#define REFTABLE_BLOCK_H
+
+#include <stdint.h>
+
+#include "reftable-basics.h"
+#include "reftable-blocksource.h"
+#include "reftable-iterator.h"
+
+struct z_stream_s;
+
+/*
+ * A block part of a reftable. Contains records as well as some metadata
+ * describing them.
+ */
+struct reftable_block {
+ /*
+ * Offset of the block header; nonzero for the first block in a
+ * reftable.
+ */
+ uint32_t header_off;
+
+ /* The memory block. */
+ struct reftable_block_data block_data;
+ uint32_t hash_size;
+
+ /* Uncompressed data for log entries. */
+ struct z_stream_s *zstream;
+ unsigned char *uncompressed_data;
+ size_t uncompressed_cap;
+
+ /*
+ * Restart point data. Restart points are located after the block's
+ * record data.
+ */
+ uint16_t restart_count;
+ uint32_t restart_off;
+
+ /*
+ * Size of the data in the file. For log blocks, this is the compressed
+ * size.
+ */
+ uint32_t full_block_size;
+ uint8_t block_type;
+};
+
+/* Initialize a reftable block from the given block source. */
+int reftable_block_init(struct reftable_block *b,
+ struct reftable_block_source *source,
+ uint32_t offset, uint32_t header_size,
+ uint32_t table_block_size, uint32_t hash_size,
+ uint8_t want_type);
+
+/* Release resources allocated by the block. */
+void reftable_block_release(struct reftable_block *b);
+
+/* Initialize a generic record iterator from the given block. */
+int reftable_block_init_iterator(const struct reftable_block *b,
+ struct reftable_iterator *it);
+
+/* Returns the block type (eg. 'r' for refs). */
+uint8_t reftable_block_type(const struct reftable_block *b);
+
+/* Decodes the first key in the block. */
+int reftable_block_first_key(const struct reftable_block *b, struct reftable_buf *key);
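+
+/*
+ * A read sketch (not authoritative; the parameter values are assumptions for
+ * a version-1 table with a 24-byte file header and the default 4 KiB block
+ * size):
+ *
+ *	struct reftable_block_source src = { 0 };
+ *	struct reftable_block block = { 0 };
+ *	struct reftable_iterator it = { 0 };
+ *	int err = reftable_block_source_from_file(&src, "example.ref");
+ *	if (!err)
+ *		err = reftable_block_init(&block, &src, 0, 24, 4096,
+ *					  REFTABLE_HASH_SIZE_SHA1,
+ *					  REFTABLE_BLOCK_TYPE_REF);
+ *	if (!err)
+ *		err = reftable_block_init_iterator(&block, &it);
+ */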
+
+#endif /* REFTABLE_BLOCK_H */
diff --git a/reftable/reftable-blocksource.h b/reftable/reftable-blocksource.h
new file mode 100644
index 0000000000..f5ba867bd6
--- /dev/null
+++ b/reftable/reftable-blocksource.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_BLOCKSOURCE_H
+#define REFTABLE_BLOCKSOURCE_H
+
+#include <stdint.h>
+
+/*
+ * Generic wrapper for a seekable readable file.
+ */
+struct reftable_block_source {
+ struct reftable_block_source_vtable *ops;
+ void *arg;
+};
+
+/* a contiguous segment of bytes. It keeps track of its generating block_source
+ * so the data can be released back to it. */
+struct reftable_block_data {
+ uint8_t *data;
+ size_t len;
+ struct reftable_block_source source;
+};
+
+/* block_source_vtable are the operations that make up block_source */
+struct reftable_block_source_vtable {
+ /* Returns the size of a block source. */
+ uint64_t (*size)(void *source);
+
+ /*
+ * Reads a segment from the block source. It is an error to read beyond
+ * the end of the block.
+ */
+ ssize_t (*read_data)(void *source, struct reftable_block_data *dest,
+ uint64_t off, uint32_t size);
+
+ /* Mark the block as read; may release the data. */
+ void (*release_data)(void *source, struct reftable_block_data *data);
+
+ /* Release all resources associated with the block source. */
+ void (*close)(void *source);
+};
+
+/* opens a file on the file system as a block_source */
+int reftable_block_source_from_file(struct reftable_block_source *block_src,
+ const char *name);
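+
+/*
+ * A sketch of a custom in-memory block source (illustrative only; it assumes
+ * the contract that `read_data` fills in `dest->data` and `dest->len` and
+ * returns the number of bytes read):
+ *
+ *	struct mem_source {
+ *		uint8_t *data;
+ *		uint64_t len;
+ *	};
+ *
+ *	static uint64_t mem_size(void *v)
+ *	{
+ *		return ((struct mem_source *)v)->len;
+ *	}
+ *
+ *	static ssize_t mem_read_data(void *v, struct reftable_block_data *dest,
+ *				     uint64_t off, uint32_t size)
+ *	{
+ *		struct mem_source *m = v;
+ *		dest->data = m->data + off;
+ *		dest->len = size;
+ *		return size;
+ *	}
+ *
+ *	static void mem_release_data(void *v, struct reftable_block_data *d) { }
+ *	static void mem_close(void *v) { }
+ */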
+
+#endif
diff --git a/reftable/reftable-constants.h b/reftable/reftable-constants.h
new file mode 100644
index 0000000000..4ae9ba4bac
--- /dev/null
+++ b/reftable/reftable-constants.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_CONSTANTS_H
+#define REFTABLE_CONSTANTS_H
+
+#define REFTABLE_BLOCK_TYPE_LOG 'g'
+#define REFTABLE_BLOCK_TYPE_INDEX 'i'
+#define REFTABLE_BLOCK_TYPE_REF 'r'
+#define REFTABLE_BLOCK_TYPE_OBJ 'o'
+#define REFTABLE_BLOCK_TYPE_ANY 0
+
+#endif /* REFTABLE_CONSTANTS_H */
diff --git a/reftable/reftable-error.h b/reftable/reftable-error.h
new file mode 100644
index 0000000000..d100e0df92
--- /dev/null
+++ b/reftable/reftable-error.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_ERROR_H
+#define REFTABLE_ERROR_H
+
+/*
+ * Errors in reftable calls are signaled with negative integer return values. 0
+ * means success.
+ */
+enum reftable_error {
+ /* Unexpected file system behavior */
+ REFTABLE_IO_ERROR = -2,
+
+ /* Format inconsistency on reading data */
+ REFTABLE_FORMAT_ERROR = -3,
+
+ /* File does not exist. Returned from block_source_from_file(), because
+	 * it needs special handling in the stack.
+ */
+ REFTABLE_NOT_EXIST_ERROR = -4,
+
+ /* Trying to access locked data. */
+ REFTABLE_LOCK_ERROR = -5,
+
+ /* Misuse of the API:
+ * - on writing a record with NULL refname.
+ * - on writing a record before setting the writer limits.
+ * - on writing a reftable_ref_record outside the table limits
+ * - on writing a ref or log record before the stack's
+	 * next_update_index
+ * - on writing a log record with multiline message with
+ * exact_log_message unset
+ * - on reading a reftable_ref_record from log iterator, or vice versa.
+ *
+ * When a call misuses the API, the internal state of the library is
+ * kept unchanged.
+ */
+ REFTABLE_API_ERROR = -6,
+
+ /* Decompression error */
+ REFTABLE_ZLIB_ERROR = -7,
+
+ /* Wrote a table without blocks. */
+ REFTABLE_EMPTY_TABLE_ERROR = -8,
+
+ /* Invalid ref name. */
+ REFTABLE_REFNAME_ERROR = -10,
+
+ /* Entry does not fit. This can happen when writing oversized reflog
+ messages. */
+ REFTABLE_ENTRY_TOO_BIG_ERROR = -11,
+
+ /* Trying to write out-of-date data. */
+ REFTABLE_OUTDATED_ERROR = -12,
+
+ /* An allocation has failed due to an out-of-memory situation. */
+ REFTABLE_OUT_OF_MEMORY_ERROR = -13,
+};
+
+/* convert the numeric error code to a string. The string should not be
+ * deallocated. */
+const char *reftable_error_str(int err);
+
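+
+/*
+ * A sketch of the typical caller-side pattern; refresh_stack() is a
+ * hypothetical stand-in for any reftable call (assumes <stdio.h>):
+ *
+ *   int err = refresh_stack();
+ *   if (err == REFTABLE_LOCK_ERROR) {
+ *           ... benign: another process holds a lock, retry later ...
+ *   } else if (err < 0) {
+ *           fprintf(stderr, "reftable: %s\n", reftable_error_str(err));
+ *   }
+ */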
+#endif
diff --git a/reftable/reftable-iterator.h b/reftable/reftable-iterator.h
new file mode 100644
index 0000000000..af582028c2
--- /dev/null
+++ b/reftable/reftable-iterator.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_ITERATOR_H
+#define REFTABLE_ITERATOR_H
+
+#include "reftable-record.h"
+
+struct reftable_iterator_vtable;
+
+/* iterator is the generic interface for walking over data stored in a
+ * reftable.
+ */
+struct reftable_iterator {
+ struct reftable_iterator_vtable *ops;
+ void *iter_arg;
+};
+
+/*
+ * Position the iterator at the ref record with given name such that the next
+ * call to `next_ref()` would yield the record.
+ */
+int reftable_iterator_seek_ref(struct reftable_iterator *it,
+ const char *name);
+
+/* reads the next reftable_ref_record. Returns < 0 for error, 0 for OK and > 0:
+ * end of iteration.
+ */
+int reftable_iterator_next_ref(struct reftable_iterator *it,
+ struct reftable_ref_record *ref);
+
+/*
+ * Position the iterator at the log record with given name and update index
+ * such that the next call to `next_log()` would yield the record.
+ */
+int reftable_iterator_seek_log_at(struct reftable_iterator *it,
+ const char *name, uint64_t update_index);
+
+/*
+ * Position the iterator at the newest log record with given name such that the
+ * next call to `next_log()` would yield the record.
+ */
+int reftable_iterator_seek_log(struct reftable_iterator *it,
+ const char *name);
+
+/* reads the next reftable_log_record. Returns < 0 for error, 0 for OK and > 0:
+ * end of iteration.
+ */
+int reftable_iterator_next_log(struct reftable_iterator *it,
+ struct reftable_log_record *log);
+
+/* releases resources associated with an iterator. */
+void reftable_iterator_destroy(struct reftable_iterator *it);
+
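+
+/*
+ * A sketch of the seek/next contract, assuming `it` was initialized
+ * elsewhere (e.g. by a merged table or a stack) and printf() stands in
+ * for real processing; seeking at "" positions the iterator at the
+ * first ref:
+ *
+ *   struct reftable_ref_record ref = { 0 };
+ *   int err = reftable_iterator_seek_ref(&it, "");
+ *   while (!err) {
+ *           err = reftable_iterator_next_ref(&it, &ref);
+ *           if (err > 0) {
+ *                   err = 0;
+ *                   break;
+ *           }
+ *           if (err < 0)
+ *                   break;
+ *           printf("%s\n", ref.refname);
+ *   }
+ *   reftable_ref_record_release(&ref);
+ *   reftable_iterator_destroy(&it);
+ */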
+#endif
diff --git a/reftable/reftable-merged.h b/reftable/reftable-merged.h
new file mode 100644
index 0000000000..e5af846b32
--- /dev/null
+++ b/reftable/reftable-merged.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_MERGED_H
+#define REFTABLE_MERGED_H
+
+#include "reftable-iterator.h"
+
+/*
+ * Merged tables
+ *
+ * A ref database kept in a sequence of table files. The merged_table presents a
+ * unified view to reading (seeking, iterating) a sequence of immutable tables.
+ *
+ * The merged tables are deliberately kept disconnected from their actual
+ * storage (e.g. files on disk), because it is useful to merge tables that
+ * aren't files. For example, the per-workspace and global ref namespace can
+ * be implemented as a merged table of two stacks of file-backed reftables.
+ */
+
+/* A merged table implements seeking/iterating over a stack of tables. */
+struct reftable_merged_table;
+
+struct reftable_table;
+
+/*
+ * reftable_merged_table_new creates a new merged table. The tables must be
+ * kept alive as long as the merged table is still in use.
+ */
+int reftable_merged_table_new(struct reftable_merged_table **dest,
+ struct reftable_table **tables, size_t n,
+ enum reftable_hash hash_id);
+
+/* Initialize a merged table iterator for reading refs. */
+int reftable_merged_table_init_ref_iterator(struct reftable_merged_table *mt,
+ struct reftable_iterator *it);
+
+/* Initialize a merged table iterator for reading logs. */
+int reftable_merged_table_init_log_iterator(struct reftable_merged_table *mt,
+ struct reftable_iterator *it);
+
+/* returns the max update_index covered by this merged table. */
+uint64_t
+reftable_merged_table_max_update_index(struct reftable_merged_table *mt);
+
+/* returns the min update_index covered by this merged table. */
+uint64_t
+reftable_merged_table_min_update_index(struct reftable_merged_table *mt);
+
+/* releases memory for the merged_table */
+void reftable_merged_table_free(struct reftable_merged_table *m);
+
+/* return the hash ID of the merged table. */
+enum reftable_hash reftable_merged_table_hash_id(struct reftable_merged_table *m);
+
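+
+/*
+ * Lifecycle sketch, assuming t1 and t2 were opened with
+ * reftable_table_new() (see reftable-table.h) and use SHA1:
+ *
+ *   struct reftable_table *tables[] = { t1, t2 };
+ *   struct reftable_merged_table *mt = NULL;
+ *   struct reftable_iterator it = { 0 };
+ *
+ *   int err = reftable_merged_table_new(&mt, tables, 2,
+ *                                       REFTABLE_HASH_SHA1);
+ *   if (!err) {
+ *           err = reftable_merged_table_init_ref_iterator(mt, &it);
+ *           ... iterate as described in reftable-iterator.h ...
+ *           reftable_iterator_destroy(&it);
+ *   }
+ *   reftable_merged_table_free(mt);
+ *
+ * The tables t1 and t2 must stay alive until after the final
+ * reftable_merged_table_free() call.
+ */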
+#endif
diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h
new file mode 100644
index 0000000000..385a74cc86
--- /dev/null
+++ b/reftable/reftable-record.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_RECORD_H
+#define REFTABLE_RECORD_H
+
+#include "reftable-basics.h"
+#include <stdint.h>
+
+/*
+ * Basic data types
+ *
+ * Reftables store the state of each ref in struct reftable_ref_record, and they
+ * store a sequence of reflog updates in struct reftable_log_record.
+ */
+
+/* reftable_ref_record holds a ref database entry target_value */
+struct reftable_ref_record {
+ char *refname; /* Name of the ref, malloced. */
+ size_t refname_cap;
+ uint64_t update_index; /* Logical timestamp at which this value is
+ * written */
+
+ enum {
+ /* tombstone to hide deletions from earlier tables */
+ REFTABLE_REF_DELETION = 0x0,
+
+ /* a simple ref */
+ REFTABLE_REF_VAL1 = 0x1,
+ /* a tag, plus its peeled hash */
+ REFTABLE_REF_VAL2 = 0x2,
+
+ /* a symbolic reference */
+ REFTABLE_REF_SYMREF = 0x3,
+#define REFTABLE_NR_REF_VALUETYPES 4
+ } value_type;
+ union {
+ unsigned char val1[REFTABLE_HASH_SIZE_MAX];
+ struct {
+ unsigned char value[REFTABLE_HASH_SIZE_MAX]; /* first hash */
+ unsigned char target_value[REFTABLE_HASH_SIZE_MAX]; /* second hash */
+ } val2;
+ char *symref; /* referent, malloced 0-terminated string */
+ } value;
+};
+
+/* Returns the first hash, or NULL if `rec` is not of type
+ * REFTABLE_REF_VAL1 or REFTABLE_REF_VAL2. */
+const unsigned char *reftable_ref_record_val1(const struct reftable_ref_record *rec);
+
+/* Returns the second hash, or NULL if `rec` is not of type
+ * REFTABLE_REF_VAL2. */
+const unsigned char *reftable_ref_record_val2(const struct reftable_ref_record *rec);
+
+/* returns whether 'ref' represents a deletion */
+int reftable_ref_record_is_deletion(const struct reftable_ref_record *ref);
+
+/* frees and nulls all pointer values inside `ref`. */
+void reftable_ref_record_release(struct reftable_ref_record *ref);
+
+/* returns whether two reftable_ref_records are the same. Useful for testing. */
+int reftable_ref_record_equal(const struct reftable_ref_record *a,
+ const struct reftable_ref_record *b, uint32_t hash_size);
+
+/* reftable_log_record holds a reflog entry */
+struct reftable_log_record {
+ char *refname;
+ size_t refname_cap;
+ uint64_t update_index; /* logical timestamp of a transactional update.
+ */
+
+ enum {
+ /* tombstone to hide deletions from earlier tables */
+ REFTABLE_LOG_DELETION = 0x0,
+
+ /* a simple update */
+ REFTABLE_LOG_UPDATE = 0x1,
+#define REFTABLE_NR_LOG_VALUETYPES 2
+ } value_type;
+
+ union {
+ struct {
+ unsigned char new_hash[REFTABLE_HASH_SIZE_MAX];
+ unsigned char old_hash[REFTABLE_HASH_SIZE_MAX];
+ char *name;
+ char *email;
+ uint64_t time;
+ int16_t tz_offset;
+ char *message;
+ size_t message_cap;
+ } update;
+ } value;
+};
+
+/* returns whether 'ref' represents the deletion of a log record. */
+int reftable_log_record_is_deletion(const struct reftable_log_record *log);
+
+/* frees and nulls all pointer values. */
+void reftable_log_record_release(struct reftable_log_record *log);
+
+/* returns whether two records are equal. Useful for testing. */
+int reftable_log_record_equal(const struct reftable_log_record *a,
+ const struct reftable_log_record *b, uint32_t hash_size);
+
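+
+/*
+ * Consumers typically dispatch on value_type; a sketch for refs, with
+ * printf() standing in for real processing:
+ *
+ *   switch (ref.value_type) {
+ *   case REFTABLE_REF_VAL1:
+ *           ... ref.value.val1 holds the object ID ...
+ *           break;
+ *   case REFTABLE_REF_VAL2:
+ *           ... value plus peeled target_value, e.g. for tags ...
+ *           break;
+ *   case REFTABLE_REF_SYMREF:
+ *           printf("-> %s\n", ref.value.symref);
+ *           break;
+ *   case REFTABLE_REF_DELETION:
+ *           ... tombstone hiding the ref in earlier tables ...
+ *           break;
+ *   }
+ */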
+#endif
diff --git a/reftable/reftable-stack.h b/reftable/reftable-stack.h
new file mode 100644
index 0000000000..910ec6ef3a
--- /dev/null
+++ b/reftable/reftable-stack.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_STACK_H
+#define REFTABLE_STACK_H
+
+#include "reftable-writer.h"
+
+/*
+ * The stack presents an interface to a mutable sequence of reftables.
+ *
+ * A stack can be mutated by pushing a table to the top of the stack.
+ *
+ * The reftable_stack automatically compacts files on disk to ensure good
+ * amortized performance.
+ *
+ * For Windows and other platforms that cannot have open files as rename
+ * destinations, concurrent access from multiple processes needs the rand()
+ * random seed to be randomized.
+ */
+struct reftable_stack;
+
+/* open a new reftable stack. The tables along with the table list will be
+ * stored in 'dir'. Typically, this should be .git/reftable.
+ */
+int reftable_new_stack(struct reftable_stack **dest, const char *dir,
+ const struct reftable_write_options *opts);
+
+/* returns the update_index at which a next table should be written. */
+uint64_t reftable_stack_next_update_index(struct reftable_stack *st);
+
+/* holds a transaction to add tables at the top of a stack. */
+struct reftable_addition;
+
+enum {
+ /*
+ * Reload the stack when the stack is out-of-date after locking it.
+ */
+ REFTABLE_STACK_NEW_ADDITION_RELOAD = (1 << 0),
+};
+
+/*
+ * returns a new transaction to add reftables to the given stack. As a side
+ * effect, the ref database is locked. Accepts REFTABLE_STACK_NEW_ADDITION_*
+ * flags.
+ */
+int reftable_stack_new_addition(struct reftable_addition **dest,
+ struct reftable_stack *st,
+ unsigned int flags);
+
+/* Adds a reftable to the transaction. */
+int reftable_addition_add(struct reftable_addition *add,
+ int (*write_table)(struct reftable_writer *wr,
+ void *arg),
+ void *arg);
+
+/* Commits the transaction, releasing the lock. After calling this,
+ * reftable_addition_destroy should still be called.
+ */
+int reftable_addition_commit(struct reftable_addition *add);
+
+/* Release all non-committed data from the transaction, and deallocate the
+ * transaction. Releases the lock if held. */
+void reftable_addition_destroy(struct reftable_addition *add);
+
+/* add a new table to the stack. The write_table function must call
+ * reftable_writer_set_limits, add refs and return an error value. */
+int reftable_stack_add(struct reftable_stack *st,
+ int (*write_table)(struct reftable_writer *wr,
+ void *write_arg),
+ void *write_arg);
+
+struct reftable_iterator;
+
+/*
+ * Initialize an iterator for the merged tables contained in the stack that can
+ * be used to iterate through refs. The iterator is valid until the next reload
+ * or write.
+ */
+int reftable_stack_init_ref_iterator(struct reftable_stack *st,
+ struct reftable_iterator *it);
+
+/*
+ * Initialize an iterator for the merged tables contained in the stack that can
+ * be used to iterate through logs. The iterator is valid until the next reload
+ * or write.
+ */
+int reftable_stack_init_log_iterator(struct reftable_stack *st,
+ struct reftable_iterator *it);
+
+/* returns the merged_table for seeking. This table is valid until the
+ * next write or reload, and should not be closed or deleted.
+ */
+struct reftable_merged_table *
+reftable_stack_merged_table(struct reftable_stack *st);
+
+/* frees all resources associated with the stack. */
+void reftable_stack_destroy(struct reftable_stack *st);
+
+/* Reloads the stack if necessary. This is very cheap to run if the stack was up
+ * to date. */
+int reftable_stack_reload(struct reftable_stack *st);
+
+/* Policy for expiring reflog entries. */
+struct reftable_log_expiry_config {
+ /* Drop entries older than this timestamp */
+ uint64_t time;
+
+ /* Drop older entries */
+ uint64_t min_update_index;
+};
+
+/* compacts all reftables into a giant table. Expire reflog entries if config is
+ * non-NULL */
+int reftable_stack_compact_all(struct reftable_stack *st,
+ struct reftable_log_expiry_config *config);
+
+/* heuristically compact unbalanced table stack. */
+int reftable_stack_auto_compact(struct reftable_stack *st);
+
+/* delete stale .ref tables. */
+int reftable_stack_clean(struct reftable_stack *st);
+
+/* convenience function to read a single ref. Returns < 0 for error, 0 for
+ * success, and 1 if ref not found. */
+int reftable_stack_read_ref(struct reftable_stack *st, const char *refname,
+ struct reftable_ref_record *ref);
+
+/* convenience function to read a single log. Returns < 0 for error, 0 for
+ * success, and 1 if no log record was found. */
+int reftable_stack_read_log(struct reftable_stack *st, const char *refname,
+ struct reftable_log_record *log);
+
+/* statistics on past compactions. */
+struct reftable_compaction_stats {
+ uint64_t bytes; /* total number of bytes written */
+ uint64_t entries_written; /* total number of entries written, including
+ failures. */
+ int attempts; /* how often we tried to compact */
+ int failures; /* failures happen on concurrent updates */
+};
+
+/* return statistics for compaction up till now. */
+struct reftable_compaction_stats *
+reftable_stack_compaction_stats(struct reftable_stack *st);
+
+/* Return the hash of the stack. */
+enum reftable_hash reftable_stack_hash_id(struct reftable_stack *st);
+
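+
+/*
+ * A sketch of a transactional update through reftable_stack_add();
+ * write_one_ref() and the literal refname are illustrative only:
+ *
+ *   static int write_one_ref(struct reftable_writer *wr, void *arg)
+ *   {
+ *           struct reftable_ref_record *ref = arg;
+ *           int err = reftable_writer_set_limits(wr, ref->update_index,
+ *                                                ref->update_index);
+ *           if (err < 0)
+ *                   return err;
+ *           return reftable_writer_add_ref(wr, ref);
+ *   }
+ *
+ *   struct reftable_ref_record ref = {
+ *           .refname = (char *) "refs/heads/example",
+ *           .update_index = reftable_stack_next_update_index(st),
+ *           .value_type = REFTABLE_REF_VAL1,
+ *   };
+ *   ... fill ref.value.val1 with the object ID, then: ...
+ *   int err = reftable_stack_add(st, write_one_ref, &ref);
+ */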
+#endif
diff --git a/reftable/reftable-table.h b/reftable/reftable-table.h
new file mode 100644
index 0000000000..5f935d02e3
--- /dev/null
+++ b/reftable/reftable-table.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_TABLE_H
+#define REFTABLE_TABLE_H
+
+#include "reftable-iterator.h"
+#include "reftable-block.h"
+#include "reftable-blocksource.h"
+
+/*
+ * Reading single tables
+ *
+ * The following routines are for reading single files. For an
+ * application-level interface, skip ahead to struct
+ * reftable_merged_table and struct reftable_stack.
+ */
+
+/* Metadata for a block type. */
+struct reftable_table_offsets {
+ int is_present;
+ uint64_t offset;
+ uint64_t index_offset;
+};
+
+/* The table struct is a handle to an open reftable file. */
+struct reftable_table {
+ /* for convenience, associate a name with the instance. */
+ char *name;
+ struct reftable_block_source source;
+
+ /* Size of the file, excluding the footer. */
+ uint64_t size;
+
+ /* The hash function used for ref records. */
+ enum reftable_hash hash_id;
+
+ uint32_t block_size;
+ uint64_t min_update_index;
+ uint64_t max_update_index;
+ /* Length of the OID keys in the 'o' section */
+ int object_id_len;
+ int version;
+
+ struct reftable_table_offsets ref_offsets;
+ struct reftable_table_offsets obj_offsets;
+ struct reftable_table_offsets log_offsets;
+
+ uint64_t refcount;
+};
+
+/* reftable_table_new opens a reftable for reading. If successful,
+ * returns 0 and sets 'out'. The name is used for creating a
+ * stack. Typically, it is the basename of the file. The block source
+ * `src` is owned by the table, and is closed on calling
+ * reftable_table_destroy(). On error, the block source `src` is
+ * closed as well.
+ */
+int reftable_table_new(struct reftable_table **out,
+ struct reftable_block_source *src, const char *name);
+
+/*
+ * Manage the reference count of the reftable table. A newly initialized
+ * table starts with a refcount of 1 and will be deleted once the refcount has
+ * reached 0.
+ *
+ * This is required because tables may have longer lifetimes than the stack
+ * they belong to. The stack may for example be reloaded while the old tables
+ * are still being accessed by an iterator.
+ */
+void reftable_table_incref(struct reftable_table *table);
+void reftable_table_decref(struct reftable_table *table);
+
+/* Initialize a reftable iterator for reading refs. */
+int reftable_table_init_ref_iterator(struct reftable_table *t,
+ struct reftable_iterator *it);
+
+/* Initialize a reftable iterator for reading logs. */
+int reftable_table_init_log_iterator(struct reftable_table *t,
+ struct reftable_iterator *it);
+
+/* returns the hash ID used in this table. */
+enum reftable_hash reftable_table_hash_id(struct reftable_table *t);
+
+/* return an iterator for the refs pointing to `oid`. */
+int reftable_table_refs_for(struct reftable_table *t,
+ struct reftable_iterator *it, uint8_t *oid);
+
+/* return the max_update_index for a table */
+uint64_t reftable_table_max_update_index(struct reftable_table *t);
+
+/* return the min_update_index for a table */
+uint64_t reftable_table_min_update_index(struct reftable_table *t);
+
+/*
+ * An iterator that iterates through the blocks contained in a given table.
+ */
+struct reftable_table_iterator {
+ void *iter_arg;
+};
+
+int reftable_table_iterator_init(struct reftable_table_iterator *it,
+ struct reftable_table *t);
+
+void reftable_table_iterator_release(struct reftable_table_iterator *it);
+
+int reftable_table_iterator_next(struct reftable_table_iterator *it,
+ const struct reftable_block **out);
+
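+
+/*
+ * A sketch of opening and inspecting a single table; the file name is
+ * a placeholder (assumes <inttypes.h> and <stdio.h>):
+ *
+ *   struct reftable_block_source src = { 0 };
+ *   struct reftable_table *table = NULL;
+ *
+ *   int err = reftable_block_source_from_file(&src, "example.ref");
+ *   if (!err)
+ *           err = reftable_table_new(&table, &src, "example.ref");
+ *   if (!err) {
+ *           printf("updates: [%" PRIu64 ", %" PRIu64 "]\n",
+ *                  reftable_table_min_update_index(table),
+ *                  reftable_table_max_update_index(table));
+ *           reftable_table_decref(table);
+ *   }
+ *
+ * Note that reftable_table_new() takes ownership of the block source
+ * and closes it even on error, as documented above.
+ */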
+#endif
diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h
new file mode 100644
index 0000000000..0fbeff17f4
--- /dev/null
+++ b/reftable/reftable-writer.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef REFTABLE_WRITER_H
+#define REFTABLE_WRITER_H
+
+#include "reftable-record.h"
+
+#include <stdint.h>
+#include <unistd.h> /* ssize_t */
+
+/* Writing single reftables */
+
+/* reftable_write_options sets options for writing a single reftable. */
+struct reftable_write_options {
+ /* boolean: do not pad out blocks to block size. */
+ unsigned unpadded : 1;
+
+ /* the blocksize. Should be less than 2^24. */
+ uint32_t block_size;
+
+ /* boolean: do not generate a SHA1 => ref index. */
+ unsigned skip_index_objects : 1;
+
+ /* how often to write complete keys in each block. */
+ uint16_t restart_interval;
+
+ /* 4-byte identifier ("sha1", "s256") of the hash.
+ * Defaults to SHA1 if unset.
+ */
+ enum reftable_hash hash_id;
+
+ /* Default mode for creating files. If unset, use 0666 (+umask) */
+ unsigned int default_permissions;
+
+ /* boolean: copy log messages exactly. If unset, check that the message
+ * is a single line, and add '\n' if missing.
+ */
+ unsigned exact_log_message : 1;
+
+ /* boolean: Prevent auto-compaction of tables. */
+ unsigned disable_auto_compact : 1;
+
+ /*
+ * Geometric sequence factor used by auto-compaction to decide which
+ * tables to compact. Defaults to 2 if unset.
+ */
+ uint8_t auto_compaction_factor;
+
+ /*
+ * The number of milliseconds to wait when trying to lock "tables.list".
+ * Note that this does not apply to locking individual tables, as these
+ * should only ever be locked when already holding the "tables.list"
+ * lock.
+ *
+ * Passing 0 will fail immediately when the file is locked, passing a
+ * negative value will cause us to block indefinitely.
+ */
+ long lock_timeout_ms;
+
+ /*
+ * Optional callback used to fsync files to disk. Falls back to using
+ * fsync(3P) when unset.
+ */
+ int (*fsync)(int fd);
+
+ /*
+ * Callback function to execute whenever the stack is being reloaded.
+ * This can be used e.g. to discard cached information that relies on
+ * the old stack's data. The payload data will be passed as argument to
+ * the callback.
+ */
+ void (*on_reload)(void *payload);
+ void *on_reload_payload;
+};
+
+/* reftable_block_stats holds statistics for a single block type */
+struct reftable_block_stats {
+ /* total number of entries written */
+ int entries;
+ /* total number of key restarts */
+ uint32_t restarts;
+ /* total number of blocks */
+ int blocks;
+ /* total number of index blocks */
+ int index_blocks;
+ /* depth of the index */
+ int max_index_level;
+
+ /* offset of the first block for this type */
+ uint64_t offset;
+ /* offset of the top level index block for this type, or 0 if not
+ * present */
+ uint64_t index_offset;
+};
+
+/* stats holds overall statistics for a single reftable */
+struct reftable_stats {
+ /* total number of blocks written. */
+ int blocks;
+ /* stats for ref data */
+ struct reftable_block_stats ref_stats;
+ /* stats for the SHA1 to ref map. */
+ struct reftable_block_stats obj_stats;
+ /* stats for index blocks */
+ struct reftable_block_stats idx_stats;
+ /* stats for log blocks */
+ struct reftable_block_stats log_stats;
+
+ /* disambiguation length of shortened object IDs. */
+ int object_id_len;
+};
+
+struct reftable_writer;
+
+/* Create a new writer. */
+int reftable_writer_new(struct reftable_writer **out,
+ ssize_t (*writer_func)(void *, const void *, size_t),
+ int (*flush_func)(void *),
+ void *writer_arg, const struct reftable_write_options *opts);
+
+/*
+ * Set the range of update indices for the records we will add. When writing a
+ * table into a stack, the min should be at least
+ * reftable_stack_next_update_index(), or REFTABLE_API_ERROR is returned.
+ *
+ * For transactional updates to a stack, typically min==max, and the
+ * update_index can be obtained by inspecting the stack. When converting an
+ * existing ref database into a single reftable, this would be a range of
+ * update-index timestamps.
+ *
+ * This function must be called before adding any records to the writer;
+ * otherwise subsequent calls fail with REFTABLE_API_ERROR.
+ */
+int reftable_writer_set_limits(struct reftable_writer *w, uint64_t min,
+ uint64_t max);
+
+/*
+  Add a reftable_ref_record. The record's refname must sort after the
+  refnames of all records added before it.
+
+ The update_index must be within the limits set by
+  reftable_writer_set_limits(), or REFTABLE_API_ERROR is returned. It is a
+  REFTABLE_API_ERROR to write a ref record after a log record.
+*/
+int reftable_writer_add_ref(struct reftable_writer *w,
+ struct reftable_ref_record *ref);
+
+/*
+ Convenience function to add multiple reftable_ref_records; the function sorts
+ the records before adding them, reordering the records array passed in.
+*/
+int reftable_writer_add_refs(struct reftable_writer *w,
+ struct reftable_ref_record *refs, int n);
+
+/*
+ adds reftable_log_records. Log records are keyed by (refname, decreasing
+ update_index). The key for the record added must come after the already added
+ log records.
+*/
+int reftable_writer_add_log(struct reftable_writer *w,
+ struct reftable_log_record *log);
+
+/*
+ Convenience function to add multiple reftable_log_records; the function sorts
+  the records before adding them, reordering the records array passed in.
+*/
+int reftable_writer_add_logs(struct reftable_writer *w,
+ struct reftable_log_record *logs, int n);
+
+/* reftable_writer_close finalizes the reftable. The writer is retained so
+ * statistics can be inspected. */
+int reftable_writer_close(struct reftable_writer *w);
+
+/* writer_stats returns the statistics on the reftable being written.
+ *
+ * This struct becomes invalid when the writer is freed.
+ */
+const struct reftable_stats *reftable_writer_stats(struct reftable_writer *w);
+
+/* reftable_writer_free deallocates memory for the writer */
+void reftable_writer_free(struct reftable_writer *w);
+
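+
+/*
+ * End-to-end sketch for writing a one-record table; write_cb, flush_cb
+ * and state are assumed callbacks/payload matching the signatures
+ * expected by reftable_writer_new() (stack.c's fd_writer, later in
+ * this series, is the in-tree example), and `ref` is a populated
+ * reftable_ref_record:
+ *
+ *   struct reftable_write_options opts = { 0 };
+ *   struct reftable_writer *wr = NULL;
+ *
+ *   int err = reftable_writer_new(&wr, write_cb, flush_cb, &state,
+ *                                 &opts);
+ *   if (!err)
+ *           err = reftable_writer_set_limits(wr, ref.update_index,
+ *                                            ref.update_index);
+ *   if (!err)
+ *           err = reftable_writer_add_ref(wr, &ref);
+ *   if (!err)
+ *           err = reftable_writer_close(wr);
+ *   if (wr)
+ *           reftable_writer_free(wr);
+ */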
+#endif
diff --git a/reftable/stack.c b/reftable/stack.c
new file mode 100644
index 0000000000..4caf96aa1d
--- /dev/null
+++ b/reftable/stack.c
@@ -0,0 +1,1841 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "stack.h"
+
+#include "system.h"
+#include "constants.h"
+#include "merged.h"
+#include "reftable-error.h"
+#include "reftable-record.h"
+#include "reftable-merged.h"
+#include "table.h"
+#include "writer.h"
+
+static int stack_try_add(struct reftable_stack *st,
+ int (*write_table)(struct reftable_writer *wr,
+ void *arg),
+ void *arg);
+static int stack_write_compact(struct reftable_stack *st,
+ struct reftable_writer *wr,
+ size_t first, size_t last,
+ struct reftable_log_expiry_config *config);
+static void reftable_addition_close(struct reftable_addition *add);
+static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
+ int reuse_open);
+
+static int stack_filename(struct reftable_buf *dest, struct reftable_stack *st,
+ const char *name)
+{
+ int err;
+ reftable_buf_reset(dest);
+ if ((err = reftable_buf_addstr(dest, st->reftable_dir)) < 0 ||
+ (err = reftable_buf_addstr(dest, "/")) < 0 ||
+ (err = reftable_buf_addstr(dest, name)) < 0)
+ return err;
+ return 0;
+}
+
+static int stack_fsync(const struct reftable_write_options *opts, int fd)
+{
+ if (opts->fsync)
+ return opts->fsync(fd);
+ return fsync(fd);
+}
+
+static ssize_t reftable_write_data(int fd, const void *data, size_t size)
+{
+ size_t total_written = 0;
+ const char *p = data;
+
+ while (total_written < size) {
+ ssize_t bytes_written = write(fd, p, size - total_written);
+ if (bytes_written < 0 && (errno == EAGAIN || errno == EINTR))
+ continue;
+ if (bytes_written < 0)
+ return REFTABLE_IO_ERROR;
+
+ total_written += bytes_written;
+ p += bytes_written;
+ }
+
+ return total_written;
+}
+
+struct fd_writer {
+ const struct reftable_write_options *opts;
+ int fd;
+};
+
+static ssize_t fd_writer_write(void *arg, const void *data, size_t sz)
+{
+ struct fd_writer *writer = arg;
+ return reftable_write_data(writer->fd, data, sz);
+}
+
+static int fd_writer_flush(void *arg)
+{
+ struct fd_writer *writer = arg;
+ return stack_fsync(writer->opts, writer->fd);
+}
+
+int reftable_new_stack(struct reftable_stack **dest, const char *dir,
+ const struct reftable_write_options *_opts)
+{
+ struct reftable_buf list_file_name = REFTABLE_BUF_INIT;
+ struct reftable_write_options opts = { 0 };
+ struct reftable_stack *p;
+ int err;
+
+ p = reftable_calloc(1, sizeof(*p));
+ if (!p) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ if (_opts)
+ opts = *_opts;
+ if (opts.hash_id == 0)
+ opts.hash_id = REFTABLE_HASH_SHA1;
+
+ *dest = NULL;
+
+ reftable_buf_reset(&list_file_name);
+ if ((err = reftable_buf_addstr(&list_file_name, dir)) < 0 ||
+ (err = reftable_buf_addstr(&list_file_name, "/tables.list")) < 0)
+ goto out;
+
+ p->list_file = reftable_buf_detach(&list_file_name);
+ p->list_fd = -1;
+ p->opts = opts;
+ p->reftable_dir = reftable_strdup(dir);
+ if (!p->reftable_dir) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ err = reftable_stack_reload_maybe_reuse(p, 1);
+ if (err < 0)
+ goto out;
+
+ *dest = p;
+ err = 0;
+
+out:
+ if (err < 0)
+ reftable_stack_destroy(p);
+ return err;
+}
+
+static int fd_read_lines(int fd, char ***namesp)
+{
+ char *buf = NULL;
+ int err = 0;
+ off_t size;
+
+ size = lseek(fd, 0, SEEK_END);
+ if (size < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = lseek(fd, 0, SEEK_SET);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ REFTABLE_ALLOC_ARRAY(buf, size + 1);
+ if (!buf) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ for (off_t total_read = 0; total_read < size; ) {
+ ssize_t bytes_read = read(fd, buf + total_read, size - total_read);
+ if (bytes_read < 0 && (errno == EAGAIN || errno == EINTR))
+ continue;
+ if (bytes_read < 0 || !bytes_read) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ total_read += bytes_read;
+ }
+ buf[size] = 0;
+
+ *namesp = parse_names(buf, size);
+ if (!*namesp) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+done:
+ reftable_free(buf);
+ return err;
+}
+
+int read_lines(const char *filename, char ***namesp)
+{
+ int fd = open(filename, O_RDONLY);
+ int err = 0;
+ if (fd < 0) {
+ if (errno == ENOENT) {
+ REFTABLE_CALLOC_ARRAY(*namesp, 1);
+ if (!*namesp)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ return 0;
+ }
+
+ return REFTABLE_IO_ERROR;
+ }
+ err = fd_read_lines(fd, namesp);
+ close(fd);
+ return err;
+}
+
+int reftable_stack_init_ref_iterator(struct reftable_stack *st,
+ struct reftable_iterator *it)
+{
+ return merged_table_init_iter(reftable_stack_merged_table(st),
+ it, REFTABLE_BLOCK_TYPE_REF);
+}
+
+int reftable_stack_init_log_iterator(struct reftable_stack *st,
+ struct reftable_iterator *it)
+{
+ return merged_table_init_iter(reftable_stack_merged_table(st),
+ it, REFTABLE_BLOCK_TYPE_LOG);
+}
+
+struct reftable_merged_table *
+reftable_stack_merged_table(struct reftable_stack *st)
+{
+ return st->merged;
+}
+
+static int has_name(char **names, const char *name)
+{
+ while (*names) {
+ if (!strcmp(*names, name))
+ return 1;
+ names++;
+ }
+ return 0;
+}
+
+/* Close and free the stack */
+void reftable_stack_destroy(struct reftable_stack *st)
+{
+ char **names = NULL;
+ int err = 0;
+
+ if (!st)
+ return;
+
+ if (st->merged) {
+ reftable_merged_table_free(st->merged);
+ st->merged = NULL;
+ }
+
+ err = read_lines(st->list_file, &names);
+ if (err < 0) {
+ REFTABLE_FREE_AND_NULL(names);
+ }
+
+ if (st->tables) {
+ struct reftable_buf filename = REFTABLE_BUF_INIT;
+
+ for (size_t i = 0; i < st->tables_len; i++) {
+ const char *name = reftable_table_name(st->tables[i]);
+ int try_unlinking = 1;
+
+ reftable_buf_reset(&filename);
+ if (names && !has_name(names, name)) {
+ if (stack_filename(&filename, st, name) < 0)
+ try_unlinking = 0;
+ }
+ reftable_table_decref(st->tables[i]);
+
+ if (try_unlinking && filename.len) {
+ /* On Windows, can only unlink after closing. */
+ unlink(filename.buf);
+ }
+ }
+
+ reftable_buf_release(&filename);
+ st->tables_len = 0;
+ REFTABLE_FREE_AND_NULL(st->tables);
+ }
+
+ if (st->list_fd >= 0) {
+ close(st->list_fd);
+ st->list_fd = -1;
+ }
+
+ REFTABLE_FREE_AND_NULL(st->list_file);
+ REFTABLE_FREE_AND_NULL(st->reftable_dir);
+ reftable_free(st);
+ free_names(names);
+}
+
+static struct reftable_table **stack_copy_tables(struct reftable_stack *st,
+ size_t cur_len)
+{
+ struct reftable_table **cur = reftable_calloc(cur_len, sizeof(*cur));
+ if (!cur)
+ return NULL;
+ for (size_t i = 0; i < cur_len; i++)
+ cur[i] = st->tables[i];
+ return cur;
+}
+
+static int reftable_stack_reload_once(struct reftable_stack *st,
+ const char **names,
+ int reuse_open)
+{
+ size_t cur_len = !st->merged ? 0 : st->merged->tables_len;
+ struct reftable_table **cur = NULL;
+ struct reftable_table **reused = NULL;
+ struct reftable_table **new_tables = NULL;
+ size_t reused_len = 0, reused_alloc = 0, names_len;
+ size_t new_tables_len = 0;
+ struct reftable_merged_table *new_merged = NULL;
+ struct reftable_buf table_path = REFTABLE_BUF_INIT;
+ int err = 0;
+ size_t i;
+
+ if (cur_len) {
+ cur = stack_copy_tables(st, cur_len);
+ if (!cur) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ }
+
+ names_len = names_length(names);
+
+ if (names_len) {
+ new_tables = reftable_calloc(names_len, sizeof(*new_tables));
+ if (!new_tables) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ }
+
+ while (*names) {
+ struct reftable_table *table = NULL;
+ const char *name = *names++;
+
+ /* this is linear; we assume compaction keeps the number of
+ tables under control so this is not quadratic. */
+ for (i = 0; reuse_open && i < cur_len; i++) {
+ if (cur[i] && 0 == strcmp(cur[i]->name, name)) {
+ table = cur[i];
+ cur[i] = NULL;
+
+ /*
+ * When reloading the stack fails, we end up
+ * releasing all new tables. This also
+ * includes the reused tables, even though
+ * they are still in use by the old stack. We
+ * thus need to keep them alive here, which we
+ * do by bumping their refcount.
+ */
+ REFTABLE_ALLOC_GROW_OR_NULL(reused,
+ reused_len + 1,
+ reused_alloc);
+ if (!reused) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ reused[reused_len++] = table;
+ reftable_table_incref(table);
+ break;
+ }
+ }
+
+ if (!table) {
+ struct reftable_block_source src = { NULL };
+
+ err = stack_filename(&table_path, st, name);
+ if (err < 0)
+ goto done;
+
+ err = reftable_block_source_from_file(&src,
+ table_path.buf);
+ if (err < 0)
+ goto done;
+
+ err = reftable_table_new(&table, &src, name);
+ if (err < 0)
+ goto done;
+ }
+
+ new_tables[new_tables_len] = table;
+ new_tables_len++;
+ }
+
+ /* success! */
+ err = reftable_merged_table_new(&new_merged, new_tables,
+ new_tables_len, st->opts.hash_id);
+ if (err < 0)
+ goto done;
+
+ /*
+ * Close the old, non-reused tables and proactively try to unlink
+ * them. This is done for systems like Windows, where the underlying
+ * file of such an open table wouldn't have been possible to be
+ * unlinked by the compacting process.
+ */
+ for (i = 0; i < cur_len; i++) {
+ if (cur[i]) {
+ const char *name = reftable_table_name(cur[i]);
+
+ err = stack_filename(&table_path, st, name);
+ if (err < 0)
+ goto done;
+
+ reftable_table_decref(cur[i]);
+ unlink(table_path.buf);
+ }
+ }
+
+ /* Update the stack to point to the new tables. */
+ if (st->merged)
+ reftable_merged_table_free(st->merged);
+ new_merged->suppress_deletions = 1;
+ st->merged = new_merged;
+
+ if (st->tables)
+ reftable_free(st->tables);
+ st->tables = new_tables;
+ st->tables_len = new_tables_len;
+ new_tables = NULL;
+ new_tables_len = 0;
+
+ /*
+ * Decrement the refcount of reused tables again. This only needs to
+ * happen on the successful case, because on the unsuccessful one we
+ * decrement their refcount via `new_tables`.
+ */
+ for (i = 0; i < reused_len; i++)
+ reftable_table_decref(reused[i]);
+
+done:
+ for (i = 0; i < new_tables_len; i++)
+ reftable_table_decref(new_tables[i]);
+ reftable_free(new_tables);
+ reftable_free(reused);
+ reftable_free(cur);
+ reftable_buf_release(&table_path);
+ return err;
+}
+
+/* return negative if a before b. */
+static int tv_cmp(struct timeval *a, struct timeval *b)
+{
+ time_t diff = a->tv_sec - b->tv_sec;
+ int udiff = a->tv_usec - b->tv_usec;
+
+ if (diff != 0)
+ return diff;
+
+ return udiff;
+}
+
+static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
+ int reuse_open)
+{
+ char **names = NULL, **names_after = NULL;
+ struct timeval deadline;
+ int64_t delay = 0;
+ int tries = 0, err;
+ int fd = -1;
+
+ err = gettimeofday(&deadline, NULL);
+ if (err < 0)
+ goto out;
+ deadline.tv_sec += 3;
+
+ while (1) {
+ struct timeval now;
+
+ err = gettimeofday(&now, NULL);
+ if (err < 0)
+ goto out;
+
+ /*
+ * Only look at deadlines after the first few times. This
+ * simplifies debugging in GDB.
+ */
+ tries++;
+ if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
+ goto out;
+
+ fd = open(st->list_file, O_RDONLY);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ err = REFTABLE_IO_ERROR;
+ goto out;
+ }
+
+ REFTABLE_CALLOC_ARRAY(names, 1);
+ if (!names) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ } else {
+ err = fd_read_lines(fd, &names);
+ if (err < 0)
+ goto out;
+ }
+
+ err = reftable_stack_reload_once(st, (const char **) names, reuse_open);
+ if (!err)
+ break;
+ if (err != REFTABLE_NOT_EXIST_ERROR)
+ goto out;
+
+ /*
+ * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
+ * writer. Check if there was one by checking if the name list
+ * changed.
+ */
+ err = read_lines(st->list_file, &names_after);
+ if (err < 0)
+ goto out;
+ if (names_equal((const char **) names_after,
+ (const char **) names)) {
+ err = REFTABLE_NOT_EXIST_ERROR;
+ goto out;
+ }
+
+ free_names(names);
+ names = NULL;
+ free_names(names_after);
+ names_after = NULL;
+ close(fd);
+ fd = -1;
+
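+ /*
+ * Grow the delay by a random factor between one and two, plus
+ * 1ms: starting from zero this yields 1ms after the first
+ * failed attempt, 2-3ms after the second, 3-7ms after the
+ * third, i.e. roughly doubling with jitter until the deadline
+ * above cuts the loop off.
+ */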
+ delay = delay + (delay * reftable_rand()) / UINT32_MAX + 1;
+ poll(NULL, 0, delay);
+ }
+
+out:
+ /*
+ * Invalidate the stat cache. It is sufficient to only close the file
+ * descriptor and keep the cached stat info because we never use the
+ * latter when the former is negative.
+ */
+ if (st->list_fd >= 0) {
+ close(st->list_fd);
+ st->list_fd = -1;
+ }
+
+ /*
+ * Cache stat information in case it provides a useful signal to us.
+ * According to POSIX, "The st_ino and st_dev fields taken together
+ * uniquely identify the file within the system." That being said,
+ * Windows is not POSIX compliant and we do not have these fields
+ * available. So the information we have there is insufficient to
+ * determine whether two file descriptors point to the same file.
+ *
+ * While we could fall back to using other signals like the file's
+ * mtime, those are not sufficient to avoid races. We thus refrain from
+ * using the stat cache on such systems and fall back to the secondary
+ * caching mechanism, which is to check whether contents of the file
+ * have changed.
+ *
+ * On other systems which are POSIX compliant we must keep the file
+ * descriptor open. This is to avoid a race condition where two
+ * processes access the reftable stack at the same point in time:
+ *
+ * 1. A reads the reftable stack and caches its stat info.
+ *
+ * 2. B updates the stack, appending a new table to "tables.list".
+ * This will both use a new inode and result in a different file
+ * size, thus invalidating A's cache in theory.
+ *
+ * 3. B decides to auto-compact the stack and merges two tables. The
+ * file size now matches what A has cached again. Furthermore, the
+ * filesystem may decide to recycle the inode number of the file
+ * we have replaced in (2) because it is not in use anymore.
+ *
+ * 4. A reloads the reftable stack. Neither the inode number nor the
+ * file size changed. If the timestamps did not change either then
+ * we think the cached copy of our stack is up-to-date.
+ *
+ * By keeping the file descriptor open the inode number cannot be
+ * recycled, mitigating the race.
+ */
+ if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
+ st->list_st.st_dev && st->list_st.st_ino) {
+ st->list_fd = fd;
+ fd = -1;
+ }
+
+ if (fd >= 0)
+ close(fd);
+ free_names(names);
+ free_names(names_after);
+
+ if (st->opts.on_reload)
+ st->opts.on_reload(st->opts.on_reload_payload);
+
+ return err;
+}
+
+/* < 0 = error
+ * 0 = up to date
+ * 1 = changed. */
+static int stack_uptodate(struct reftable_stack *st)
+{
+ char **names = NULL;
+ int err;
+
+ /*
+ * When we have cached stat information available then we use it to
+ * verify whether the file has been rewritten.
+ *
+ * Note that we explicitly do not want to use `stat_validity_check()`
+ * and friends here because they may end up not comparing the `st_dev`
+ * and `st_ino` fields. These functions thus cannot guarantee that we
+ * indeed still have the same file.
+ */
+ if (st->list_fd >= 0) {
+ struct stat list_st;
+
+ if (stat(st->list_file, &list_st) < 0) {
+ /*
+ * It's fine for "tables.list" to not exist. In that
+ * case, we have to refresh when the loaded stack has
+ * any tables.
+ */
+ if (errno == ENOENT)
+ return !!st->tables_len;
+ return REFTABLE_IO_ERROR;
+ }
+
+ /*
+ * When "tables.list" refers to the same file we can assume
+ * that it didn't change. This is because we always use
+ * rename(3P) to update the file and never write to it
+ * directly.
+ */
+ if (st->list_st.st_dev == list_st.st_dev &&
+ st->list_st.st_ino == list_st.st_ino)
+ return 0;
+ }
+
+ err = read_lines(st->list_file, &names);
+ if (err < 0)
+ return err;
+
+ for (size_t i = 0; i < st->tables_len; i++) {
+ if (!names[i]) {
+ err = 1;
+ goto done;
+ }
+
+ if (strcmp(st->tables[i]->name, names[i])) {
+ err = 1;
+ goto done;
+ }
+ }
+
+ if (names[st->merged->tables_len]) {
+ err = 1;
+ goto done;
+ }
+
+done:
+ free_names(names);
+ return err;
+}
+
+int reftable_stack_reload(struct reftable_stack *st)
+{
+ int err = stack_uptodate(st);
+ if (err > 0)
+ return reftable_stack_reload_maybe_reuse(st, 1);
+ return err;
+}
+
+int reftable_stack_add(struct reftable_stack *st,
+ int (*write)(struct reftable_writer *wr, void *arg),
+ void *arg)
+{
+ int err = stack_try_add(st, write, arg);
+ if (err < 0) {
+ if (err == REFTABLE_OUTDATED_ERROR) {
+ /* Ignore error return, we want to propagate
+ REFTABLE_OUTDATED_ERROR.
+ */
+ reftable_stack_reload(st);
+ }
+ return err;
+ }
+
+ return 0;
+}
+
+static int format_name(struct reftable_buf *dest, uint64_t min, uint64_t max)
+{
+ char buf[100];
+ uint32_t rnd = reftable_rand();
+ snprintf(buf, sizeof(buf), "0x%012" PRIx64 "-0x%012" PRIx64 "-%08x",
+ min, max, rnd);
+ reftable_buf_reset(dest);
+ return reftable_buf_addstr(dest, buf);
+}
+
+struct reftable_addition {
+ struct reftable_flock tables_list_lock;
+ struct reftable_stack *stack;
+
+ char **new_tables;
+ size_t new_tables_len, new_tables_cap;
+ uint64_t next_update_index;
+};
+
+#define REFTABLE_ADDITION_INIT {0}
+
+static int reftable_stack_init_addition(struct reftable_addition *add,
+ struct reftable_stack *st,
+ unsigned int flags)
+{
+ struct reftable_buf lock_file_name = REFTABLE_BUF_INIT;
+ int err;
+
+ add->stack = st;
+
+ err = flock_acquire(&add->tables_list_lock, st->list_file,
+ st->opts.lock_timeout_ms);
+ if (err < 0) {
+ if (errno == EEXIST) {
+ err = REFTABLE_LOCK_ERROR;
+ } else {
+ err = REFTABLE_IO_ERROR;
+ }
+ goto done;
+ }
+ if (st->opts.default_permissions) {
+ if (chmod(add->tables_list_lock.path,
+ st->opts.default_permissions) < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+ }
+
+ err = stack_uptodate(st);
+ if (err < 0)
+ goto done;
+ if (err > 0 && flags & REFTABLE_STACK_NEW_ADDITION_RELOAD) {
+ err = reftable_stack_reload_maybe_reuse(add->stack, 1);
+ if (err)
+ goto done;
+ }
+ if (err > 0) {
+ err = REFTABLE_OUTDATED_ERROR;
+ goto done;
+ }
+
+ add->next_update_index = reftable_stack_next_update_index(st);
+done:
+ if (err)
+ reftable_addition_close(add);
+ reftable_buf_release(&lock_file_name);
+ return err;
+}
+
+static void reftable_addition_close(struct reftable_addition *add)
+{
+ struct reftable_buf nm = REFTABLE_BUF_INIT;
+ size_t i;
+
+ for (i = 0; i < add->new_tables_len; i++) {
+ if (!stack_filename(&nm, add->stack, add->new_tables[i]))
+ unlink(nm.buf);
+ reftable_free(add->new_tables[i]);
+ add->new_tables[i] = NULL;
+ }
+ reftable_free(add->new_tables);
+ add->new_tables = NULL;
+ add->new_tables_len = 0;
+ add->new_tables_cap = 0;
+
+ flock_release(&add->tables_list_lock);
+ reftable_buf_release(&nm);
+}
+
+void reftable_addition_destroy(struct reftable_addition *add)
+{
+ if (!add) {
+ return;
+ }
+ reftable_addition_close(add);
+ reftable_free(add);
+}
+
+int reftable_addition_commit(struct reftable_addition *add)
+{
+ struct reftable_buf table_list = REFTABLE_BUF_INIT;
+ int err = 0;
+ size_t i;
+
+ if (add->new_tables_len == 0)
+ goto done;
+
+ for (i = 0; i < add->stack->merged->tables_len; i++) {
+ if ((err = reftable_buf_addstr(&table_list, add->stack->tables[i]->name)) < 0 ||
+ (err = reftable_buf_addstr(&table_list, "\n")) < 0)
+ goto done;
+ }
+ for (i = 0; i < add->new_tables_len; i++) {
+ if ((err = reftable_buf_addstr(&table_list, add->new_tables[i])) < 0 ||
+ (err = reftable_buf_addstr(&table_list, "\n")) < 0)
+ goto done;
+ }
+
+ err = reftable_write_data(add->tables_list_lock.fd,
+ table_list.buf, table_list.len);
+ reftable_buf_release(&table_list);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = stack_fsync(&add->stack->opts, add->tables_list_lock.fd);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = flock_commit(&add->tables_list_lock);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ /* success, no more state to clean up. */
+ for (i = 0; i < add->new_tables_len; i++)
+ reftable_free(add->new_tables[i]);
+ reftable_free(add->new_tables);
+ add->new_tables = NULL;
+ add->new_tables_len = 0;
+ add->new_tables_cap = 0;
+
+ err = reftable_stack_reload_maybe_reuse(add->stack, 1);
+ if (err)
+ goto done;
+
+ if (!add->stack->opts.disable_auto_compact) {
+ /*
+ * Auto-compact the stack to keep the number of tables in
+ * control. It is possible that a concurrent writer is already
+ * trying to compact parts of the stack, which would lead to a
+ * `REFTABLE_LOCK_ERROR` because parts of the stack are locked
+ * already. This is a benign error though, so we ignore it.
+ */
+ err = reftable_stack_auto_compact(add->stack);
+ if (err < 0 && err != REFTABLE_LOCK_ERROR)
+ goto done;
+ err = 0;
+ }
+
+done:
+ reftable_addition_close(add);
+ return err;
+}
+
+int reftable_stack_new_addition(struct reftable_addition **dest,
+ struct reftable_stack *st,
+ unsigned int flags)
+{
+ int err = 0;
+ struct reftable_addition empty = REFTABLE_ADDITION_INIT;
+
+ REFTABLE_CALLOC_ARRAY(*dest, 1);
+ if (!*dest)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ **dest = empty;
+ err = reftable_stack_init_addition(*dest, st, flags);
+ if (err) {
+ reftable_free(*dest);
+ *dest = NULL;
+ }
+ return err;
+}
+
+static int stack_try_add(struct reftable_stack *st,
+ int (*write_table)(struct reftable_writer *wr,
+ void *arg),
+ void *arg)
+{
+ struct reftable_addition add = REFTABLE_ADDITION_INIT;
+ int err = reftable_stack_init_addition(&add, st, 0);
+ if (err < 0)
+ goto done;
+
+ err = reftable_addition_add(&add, write_table, arg);
+ if (err < 0)
+ goto done;
+
+ err = reftable_addition_commit(&add);
+done:
+ reftable_addition_close(&add);
+ return err;
+}
+
+int reftable_addition_add(struct reftable_addition *add,
+ int (*write_table)(struct reftable_writer *wr,
+ void *arg),
+ void *arg)
+{
+ struct reftable_buf temp_tab_file_name = REFTABLE_BUF_INIT;
+ struct reftable_buf tab_file_name = REFTABLE_BUF_INIT;
+ struct reftable_buf next_name = REFTABLE_BUF_INIT;
+ struct reftable_writer *wr = NULL;
+ struct reftable_tmpfile tab_file = REFTABLE_TMPFILE_INIT;
+ struct fd_writer writer = {
+ .opts = &add->stack->opts,
+ };
+ int err = 0;
+
+ reftable_buf_reset(&next_name);
+
+ err = format_name(&next_name, add->next_update_index, add->next_update_index);
+ if (err < 0)
+ goto done;
+
+ err = stack_filename(&temp_tab_file_name, add->stack, next_name.buf);
+ if (err < 0)
+ goto done;
+
+ err = reftable_buf_addstr(&temp_tab_file_name, ".temp.XXXXXX");
+ if (err < 0)
+ goto done;
+
+ err = tmpfile_from_pattern(&tab_file, temp_tab_file_name.buf);
+ if (err < 0)
+ goto done;
+ if (add->stack->opts.default_permissions) {
+ if (chmod(tab_file.path,
+ add->stack->opts.default_permissions)) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+ }
+
+ writer.fd = tab_file.fd;
+ err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
+ &writer, &add->stack->opts);
+ if (err < 0)
+ goto done;
+
+ err = write_table(wr, arg);
+ if (err < 0)
+ goto done;
+
+ err = reftable_writer_close(wr);
+ if (err == REFTABLE_EMPTY_TABLE_ERROR) {
+ err = 0;
+ goto done;
+ }
+ if (err < 0)
+ goto done;
+
+ err = tmpfile_close(&tab_file);
+ if (err < 0)
+ goto done;
+
+ if (wr->min_update_index < add->next_update_index) {
+ err = REFTABLE_API_ERROR;
+ goto done;
+ }
+
+ err = format_name(&next_name, wr->min_update_index, wr->max_update_index);
+ if (err < 0)
+ goto done;
+
+ err = reftable_buf_addstr(&next_name, ".ref");
+ if (err < 0)
+ goto done;
+
+ err = stack_filename(&tab_file_name, add->stack, next_name.buf);
+ if (err < 0)
+ goto done;
+
+ /*
+ On Windows, this relies on rand() picking a unique destination name.
+ Maybe we should do a retry loop as well?
+ */
+ err = tmpfile_rename(&tab_file, tab_file_name.buf);
+ if (err < 0)
+ goto done;
+
+ REFTABLE_ALLOC_GROW_OR_NULL(add->new_tables, add->new_tables_len + 1,
+ add->new_tables_cap);
+ if (!add->new_tables) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ add->new_tables[add->new_tables_len++] = reftable_buf_detach(&next_name);
+
+done:
+ tmpfile_delete(&tab_file);
+ reftable_buf_release(&temp_tab_file_name);
+ reftable_buf_release(&tab_file_name);
+ reftable_buf_release(&next_name);
+ reftable_writer_free(wr);
+ return err;
+}
+
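+
+/*
+ * For example, if the newest table in the stack covers update indices
+ * [5, 7], the next table must be written with update index 8; an empty
+ * stack starts at 1.
+ */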
+uint64_t reftable_stack_next_update_index(struct reftable_stack *st)
+{
+ int sz = st->merged->tables_len;
+ if (sz > 0)
+ return reftable_table_max_update_index(st->tables[sz - 1]) +
+ 1;
+ return 1;
+}
+
+static int stack_compact_locked(struct reftable_stack *st,
+ size_t first, size_t last,
+ struct reftable_log_expiry_config *config,
+ struct reftable_tmpfile *tab_file_out)
+{
+ struct reftable_buf next_name = REFTABLE_BUF_INIT;
+ struct reftable_buf tab_file_path = REFTABLE_BUF_INIT;
+ struct reftable_writer *wr = NULL;
+ struct fd_writer writer = {
+ .opts = &st->opts,
+ };
+ struct reftable_tmpfile tab_file = REFTABLE_TMPFILE_INIT;
+ int err = 0;
+
+ err = format_name(&next_name, reftable_table_min_update_index(st->tables[first]),
+ reftable_table_max_update_index(st->tables[last]));
+ if (err < 0)
+ goto done;
+
+ err = stack_filename(&tab_file_path, st, next_name.buf);
+ if (err < 0)
+ goto done;
+
+ err = reftable_buf_addstr(&tab_file_path, ".temp.XXXXXX");
+ if (err < 0)
+ goto done;
+
+ err = tmpfile_from_pattern(&tab_file, tab_file_path.buf);
+ if (err < 0)
+ goto done;
+
+ if (st->opts.default_permissions &&
+ chmod(tab_file.path, st->opts.default_permissions) < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ writer.fd = tab_file.fd;
+ err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
+ &writer, &st->opts);
+ if (err < 0)
+ goto done;
+
+ err = stack_write_compact(st, wr, first, last, config);
+ if (err < 0)
+ goto done;
+
+ err = reftable_writer_close(wr);
+ if (err < 0)
+ goto done;
+
+ err = tmpfile_close(&tab_file);
+ if (err < 0)
+ goto done;
+
+ *tab_file_out = tab_file;
+ tab_file = REFTABLE_TMPFILE_INIT;
+
+done:
+ tmpfile_delete(&tab_file);
+ reftable_writer_free(wr);
+ reftable_buf_release(&next_name);
+ reftable_buf_release(&tab_file_path);
+ return err;
+}
+
+static int stack_write_compact(struct reftable_stack *st,
+ struct reftable_writer *wr,
+ size_t first, size_t last,
+ struct reftable_log_expiry_config *config)
+{
+ struct reftable_merged_table *mt = NULL;
+ struct reftable_iterator it = { NULL };
+ struct reftable_ref_record ref = { NULL };
+ struct reftable_log_record log = { NULL };
+ size_t subtabs_len = last - first + 1;
+ uint64_t entries = 0;
+ int err = 0;
+
+ for (size_t i = first; i <= last; i++)
+ st->stats.bytes += st->tables[i]->size;
+ err = reftable_writer_set_limits(wr, st->tables[first]->min_update_index,
+ st->tables[last]->max_update_index);
+ if (err < 0)
+ goto done;
+
+ err = reftable_merged_table_new(&mt, st->tables + first, subtabs_len,
+ st->opts.hash_id);
+ if (err < 0)
+ goto done;
+
+ err = merged_table_init_iter(mt, &it, REFTABLE_BLOCK_TYPE_REF);
+ if (err < 0)
+ goto done;
+
+ err = reftable_iterator_seek_ref(&it, "");
+ if (err < 0)
+ goto done;
+
+ while (1) {
+ err = reftable_iterator_next_ref(&it, &ref);
+ if (err > 0) {
+ err = 0;
+ break;
+ }
+ if (err < 0)
+ goto done;
+
+ if (first == 0 && reftable_ref_record_is_deletion(&ref)) {
+ continue;
+ }
+
+ err = reftable_writer_add_ref(wr, &ref);
+ if (err < 0)
+ goto done;
+ entries++;
+ }
+ reftable_iterator_destroy(&it);
+
+ err = merged_table_init_iter(mt, &it, REFTABLE_BLOCK_TYPE_LOG);
+ if (err < 0)
+ goto done;
+
+ err = reftable_iterator_seek_log(&it, "");
+ if (err < 0)
+ goto done;
+
+ while (1) {
+ err = reftable_iterator_next_log(&it, &log);
+ if (err > 0) {
+ err = 0;
+ break;
+ }
+ if (err < 0)
+ goto done;
+ if (first == 0 && reftable_log_record_is_deletion(&log)) {
+ continue;
+ }
+
+ if (config && config->min_update_index > 0 &&
+ log.update_index < config->min_update_index) {
+ continue;
+ }
+
+ if (config && config->time > 0 &&
+ log.value.update.time < config->time) {
+ continue;
+ }
+
+ err = reftable_writer_add_log(wr, &log);
+ if (err < 0)
+ goto done;
+ entries++;
+ }
+
+done:
+ reftable_iterator_destroy(&it);
+ if (mt)
+ reftable_merged_table_free(mt);
+ reftable_ref_record_release(&ref);
+ reftable_log_record_release(&log);
+ st->stats.entries_written += entries;
+ return err;
+}
+
+enum stack_compact_range_flags {
+ /*
+ * Perform a best-effort compaction. That is, even if we cannot lock
+ * all tables in the specified range, we will try to compact the
+ * remaining slice.
+ */
+ STACK_COMPACT_RANGE_BEST_EFFORT = (1 << 0),
+};
+
+/*
+ * Compact all tables in the range `[first, last)` into a single new table.
+ *
+ * This function returns `0` on success or a code `< 0` on failure. When the
+ * stack or any of the tables in the specified range are already locked then
+ * this function returns `REFTABLE_LOCK_ERROR`. This is a benign error that
+ * callers can either ignore, or they may choose to retry compaction after some
+ * amount of time.
+ */
+static int stack_compact_range(struct reftable_stack *st,
+ size_t first, size_t last,
+ struct reftable_log_expiry_config *expiry,
+ unsigned int flags)
+{
+ struct reftable_buf tables_list_buf = REFTABLE_BUF_INIT;
+ struct reftable_buf new_table_name = REFTABLE_BUF_INIT;
+ struct reftable_buf new_table_path = REFTABLE_BUF_INIT;
+ struct reftable_buf table_name = REFTABLE_BUF_INIT;
+ struct reftable_flock tables_list_lock = REFTABLE_FLOCK_INIT;
+ struct reftable_flock *table_locks = NULL;
+ struct reftable_tmpfile new_table = REFTABLE_TMPFILE_INIT;
+ int is_empty_table = 0, err = 0;
+ size_t first_to_replace, last_to_replace;
+ size_t i, nlocks = 0;
+ char **names = NULL;
+
+ if (first > last || (!expiry && first == last)) {
+ err = 0;
+ goto done;
+ }
+
+ st->stats.attempts++;
+
+ /*
+ * Hold the lock so that we can read "tables.list" and lock all tables
+ * which are part of the user-specified range.
+ */
+ err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
+ if (err < 0) {
+ if (errno == EEXIST)
+ err = REFTABLE_LOCK_ERROR;
+ else
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = stack_uptodate(st);
+ if (err)
+ goto done;
+
+ /*
+ * Lock all tables in the user-provided range. This is the slice of our
+ * stack which we'll compact.
+ *
+ * Note that we lock tables in reverse order from last to first. The
+ * intent behind this is to allow a newer process to perform best
+ * effort compaction of tables that it has added in the case where an
+ * older process is still busy compacting tables which are preexisting
+ * from the point of view of the newer process.
+ */
+ REFTABLE_ALLOC_ARRAY(table_locks, last - first + 1);
+ if (!table_locks) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ for (i = 0; i < last - first + 1; i++)
+ table_locks[i] = REFTABLE_FLOCK_INIT;
+
+ for (i = last + 1; i > first; i--) {
+ err = stack_filename(&table_name, st, reftable_table_name(st->tables[i - 1]));
+ if (err < 0)
+ goto done;
+
+ err = flock_acquire(&table_locks[nlocks], table_name.buf, 0);
+ if (err < 0) {
+ /*
+ * When the table is locked already we may do a
+ * best-effort compaction and compact only the tables
+ * that we have managed to lock so far. This of course
+ * requires that we have been able to lock at least two
+ * tables, otherwise there would be nothing to compact.
+ * In that case, we return a lock error to our caller.
+ */
+ if (errno == EEXIST && last - (i - 1) >= 2 &&
+ flags & STACK_COMPACT_RANGE_BEST_EFFORT) {
+ err = 0;
+ /*
+ * The subtraction is to offset the index, the
+ * addition is to only compact up to the table
+ * of the preceding iteration. They obviously
+ * cancel each other out, but that may be
+ * non-obvious if they were omitted.
+ */
+ first = (i - 1) + 1;
+ break;
+ } else if (errno == EEXIST) {
+ err = REFTABLE_LOCK_ERROR;
+ goto done;
+ } else {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+ }
+
+ /*
+ * We need to close the lockfiles as we might otherwise easily
+ * run into file descriptor exhaustion when we compress a lot
+ * of tables.
+ */
+ err = flock_close(&table_locks[nlocks++]);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+ }
+
+ /*
+ * We have locked all tables in our range and can thus release the
+ * "tables.list" lock while compacting the locked tables. This allows
+ * concurrent updates to the stack to proceed.
+ */
+ err = flock_release(&tables_list_lock);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ /*
+ * Compact the now-locked tables into a new table. Note that compacting
+ * these tables may end up with an empty new table in case tombstones
+ * end up cancelling out all refs in that range.
+ */
+ err = stack_compact_locked(st, first, last, expiry, &new_table);
+ if (err < 0) {
+ if (err != REFTABLE_EMPTY_TABLE_ERROR)
+ goto done;
+ is_empty_table = 1;
+ }
+
+ /*
+ * Now that we have written the new, compacted table we need to re-lock
+ * "tables.list". We'll then replace the compacted range of tables with
+ * the new table.
+ */
+ err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
+ if (err < 0) {
+ if (errno == EEXIST)
+ err = REFTABLE_LOCK_ERROR;
+ else
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ if (st->opts.default_permissions) {
+ if (chmod(tables_list_lock.path,
+ st->opts.default_permissions) < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+ }
+
+ /*
+ * As we have unlocked the stack while compacting our slice of tables
+ * it may have happened that a concurrently running process has updated
+ * the stack while we were compacting. In that case, we need to check
+ * whether the tables that we have just compacted still exist in the
+ * stack in the exact same order as we have compacted them.
+ *
+ * If they do exist, then it is fine to continue and replace those
+ * tables with our compacted version. If they don't, then we need to
+ * abort.
+ */
+ err = stack_uptodate(st);
+ if (err < 0)
+ goto done;
+ if (err > 0) {
+ ssize_t new_offset = -1;
+ int fd;
+
+ fd = open(st->list_file, O_RDONLY);
+ if (fd < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = fd_read_lines(fd, &names);
+ close(fd);
+ if (err < 0)
+ goto done;
+
+ /*
+ * Search for the offset of the first table that we have
+ * compacted in the updated "tables.list" file.
+ */
+ for (size_t i = 0; names[i]; i++) {
+ if (strcmp(names[i], st->tables[first]->name))
+ continue;
+
+ /*
+ * We have found the first entry. Verify that all the
+ * subsequent tables we have compacted still exist in
+ * the modified stack in the exact same order as we
+ * have compacted them.
+ */
+ for (size_t j = 1; j < last - first + 1; j++) {
+ const char *old = first + j < st->merged->tables_len ?
+ st->tables[first + j]->name : NULL;
+ const char *new = names[i + j];
+
+ /*
+ * If some entries are missing or in case the tables
+ * have changed then we need to bail out. Again, this
+ * shouldn't ever happen because we have locked the
+ * tables we are compacting.
+ */
+ if (!old || !new || strcmp(old, new)) {
+ err = REFTABLE_OUTDATED_ERROR;
+ goto done;
+ }
+ }
+
+ new_offset = i;
+ break;
+ }
+
+ /*
+ * In case we didn't find our compacted tables in the stack we
+		 * need to bail out. In theory, this should never happen
+		 * because we have locked the tables we are compacting.
+ */
+ if (new_offset < 0) {
+ err = REFTABLE_OUTDATED_ERROR;
+ goto done;
+ }
+
+ /*
+ * We have found the new range that we want to replace, so
+ * let's update the range of tables that we want to replace.
+ */
+ first_to_replace = new_offset;
+ last_to_replace = last + (new_offset - first);
+ } else {
+ /*
+ * `fd_read_lines()` uses a `NULL` sentinel to indicate that
+ * the array is at its end. As we use `free_names()` to free
+ * the array, we need to include this sentinel value here and
+		 * thus have to allocate `tables_len + 1` entries.
+ */
+ REFTABLE_CALLOC_ARRAY(names, st->merged->tables_len + 1);
+ if (!names) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ for (size_t i = 0; i < st->merged->tables_len; i++) {
+ names[i] = reftable_strdup(st->tables[i]->name);
+ if (!names[i]) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ }
+ first_to_replace = first;
+ last_to_replace = last;
+ }
+
+ /*
+ * If the resulting compacted table is not empty, then we need to move
+ * it into place now.
+ */
+ if (!is_empty_table) {
+ err = format_name(&new_table_name, st->tables[first]->min_update_index,
+ st->tables[last]->max_update_index);
+ if (err < 0)
+ goto done;
+
+ err = reftable_buf_addstr(&new_table_name, ".ref");
+ if (err < 0)
+ goto done;
+
+ err = stack_filename(&new_table_path, st, new_table_name.buf);
+ if (err < 0)
+ goto done;
+
+ err = tmpfile_rename(&new_table, new_table_path.buf);
+ if (err < 0)
+ goto done;
+ }
+
+ /*
+ * Write the new "tables.list" contents with the compacted table we
+ * have just written. In case the compacted table became empty we
+ * simply skip writing it.
+ */
+ for (i = 0; i < first_to_replace; i++) {
+ if ((err = reftable_buf_addstr(&tables_list_buf, names[i])) < 0 ||
+ (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
+ goto done;
+ }
+ if (!is_empty_table) {
+ if ((err = reftable_buf_addstr(&tables_list_buf, new_table_name.buf)) < 0 ||
+ (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
+ goto done;
+ }
+ for (i = last_to_replace + 1; names[i]; i++) {
+ if ((err = reftable_buf_addstr(&tables_list_buf, names[i])) < 0 ||
+ (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
+ goto done;
+ }
+
+ err = reftable_write_data(tables_list_lock.fd,
+ tables_list_buf.buf, tables_list_buf.len);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ unlink(new_table_path.buf);
+ goto done;
+ }
+
+ err = stack_fsync(&st->opts, tables_list_lock.fd);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ unlink(new_table_path.buf);
+ goto done;
+ }
+
+ err = flock_commit(&tables_list_lock);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ unlink(new_table_path.buf);
+ goto done;
+ }
+
+ /*
+ * Reload the stack before deleting the compacted tables. We can only
+ * delete the files after we closed them on Windows, so this needs to
+ * happen first.
+ */
+ err = reftable_stack_reload_maybe_reuse(st, first < last);
+ if (err < 0)
+ goto done;
+
+ /*
+ * Delete the old tables. They may still be in use by concurrent
+ * readers, so it is expected that unlinking tables may fail.
+ */
+ for (i = 0; i < nlocks; i++) {
+ struct reftable_flock *table_lock = &table_locks[i];
+
+ reftable_buf_reset(&table_name);
+ err = reftable_buf_add(&table_name, table_lock->path,
+ strlen(table_lock->path) - strlen(".lock"));
+ if (err)
+ continue;
+
+ unlink(table_name.buf);
+ }
+
+done:
+ flock_release(&tables_list_lock);
+ for (i = 0; table_locks && i < nlocks; i++)
+ flock_release(&table_locks[i]);
+ reftable_free(table_locks);
+
+ tmpfile_delete(&new_table);
+ reftable_buf_release(&new_table_name);
+ reftable_buf_release(&new_table_path);
+ reftable_buf_release(&tables_list_buf);
+ reftable_buf_release(&table_name);
+ free_names(names);
+
+ if (err == REFTABLE_LOCK_ERROR)
+ st->stats.failures++;
+
+ return err;
+}
+
+int reftable_stack_compact_all(struct reftable_stack *st,
+ struct reftable_log_expiry_config *config)
+{
+ size_t last = st->merged->tables_len ? st->merged->tables_len - 1 : 0;
+ return stack_compact_range(st, 0, last, config, 0);
+}
+
+static int segment_size(struct segment *s)
+{
+ return s->end - s->start;
+}
+
+struct segment suggest_compaction_segment(uint64_t *sizes, size_t n,
+ uint8_t factor)
+{
+ struct segment seg = { 0 };
+ uint64_t bytes;
+ size_t i;
+
+ if (!factor)
+ factor = DEFAULT_GEOMETRIC_FACTOR;
+
+ /*
+ * If there are no tables or only a single one then we don't have to
+ * compact anything. The sequence is geometric by definition already.
+ */
+ if (n <= 1)
+ return seg;
+
+ /*
+ * Find the ending table of the compaction segment needed to restore the
+ * geometric sequence. Note that the segment end is exclusive.
+ *
+ * To do so, we iterate backwards starting from the most recent table
+ * until a valid segment end is found. If the preceding table is smaller
+	 * than the current table multiplied by the geometric factor (2 by
+	 * default), the compaction segment end has been identified.
+ *
+ * Tables after the ending point are not added to the byte count because
+ * they are already valid members of the geometric sequence. Due to the
+ * properties of a geometric sequence, it is not possible for the sum of
+ * these tables to exceed the value of the ending point table.
+ *
+ * Example table size sequence requiring no compaction:
+ * 64, 32, 16, 8, 4, 2, 1
+ *
+ * Example table size sequence where compaction segment end is set to
+ * the last table. Since the segment end is exclusive, the last table is
+ * excluded during subsequent compaction and the table with size 3 is
+ * the final table included:
+ * 64, 32, 16, 8, 4, 3, 1
+ */
+ for (i = n - 1; i > 0; i--) {
+ if (sizes[i - 1] < sizes[i] * factor) {
+ seg.end = i + 1;
+ bytes = sizes[i];
+ break;
+ }
+ }
+
+ /*
+ * Find the starting table of the compaction segment by iterating
+ * through the remaining tables and keeping track of the accumulated
+ * size of all tables seen from the segment end table. The previous
+ * table is compared to the accumulated size because the tables from the
+ * segment end are merged backwards recursively.
+ *
+ * Note that we keep iterating even after we have found the first
+ * starting point. This is because there may be tables in the stack
+ * preceding that first starting point which violate the geometric
+ * sequence.
+ *
+ * Example compaction segment start set to table with size 32:
+ * 128, 32, 16, 8, 4, 3, 1
+ */
+ for (; i > 0; i--) {
+ uint64_t curr = bytes;
+ bytes += sizes[i - 1];
+
+ if (sizes[i - 1] < curr * factor) {
+ seg.start = i - 1;
+ seg.bytes = bytes;
+ }
+ }
+
+ return seg;
+}
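+
+/*
+ * A worked example of the two passes above, assuming the default factor of
+ * 2 and the hypothetical table sizes { 128, 32, 16, 8, 4, 3, 1 }:
+ *
+ * The backwards scan stops at the pair (4, 3) because 4 < 3 * 2, so the
+ * exclusive segment end is 6. The second pass then accumulates
+ * 3 + 4 + 8 + 16 + 32 = 63 while each preceding table passes the factor
+ * check, moving the segment start down to the table of size 32. The table
+ * of size 128 fails the check (128 >= 63 * 2) and stays out. The suggested
+ * segment is thus { .start = 1, .end = 6, .bytes = 63 }, and compacting it
+ * restores the geometric sequence 128, 63, 1.
+ */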
+
+static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st)
+{
+ int version = (st->opts.hash_id == REFTABLE_HASH_SHA1) ? 1 : 2;
+ int overhead = header_size(version) - 1;
+ uint64_t *sizes;
+
+ REFTABLE_CALLOC_ARRAY(sizes, st->merged->tables_len);
+ if (!sizes)
+ return NULL;
+
+ for (size_t i = 0; i < st->merged->tables_len; i++)
+ sizes[i] = st->tables[i]->size - overhead;
+
+ return sizes;
+}
+
+int reftable_stack_auto_compact(struct reftable_stack *st)
+{
+ struct segment seg;
+ uint64_t *sizes;
+
+ if (st->merged->tables_len < 2)
+ return 0;
+
+ sizes = stack_table_sizes_for_compaction(st);
+ if (!sizes)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ seg = suggest_compaction_segment(sizes, st->merged->tables_len,
+ st->opts.auto_compaction_factor);
+ reftable_free(sizes);
+
+ if (segment_size(&seg) > 0)
+ return stack_compact_range(st, seg.start, seg.end - 1,
+ NULL, STACK_COMPACT_RANGE_BEST_EFFORT);
+
+ return 0;
+}
+
+struct reftable_compaction_stats *
+reftable_stack_compaction_stats(struct reftable_stack *st)
+{
+ return &st->stats;
+}
+
+int reftable_stack_read_ref(struct reftable_stack *st, const char *refname,
+ struct reftable_ref_record *ref)
+{
+ struct reftable_iterator it = { 0 };
+ int ret;
+
+ ret = reftable_merged_table_init_ref_iterator(st->merged, &it);
+ if (ret)
+ goto out;
+
+ ret = reftable_iterator_seek_ref(&it, refname);
+ if (ret)
+ goto out;
+
+ ret = reftable_iterator_next_ref(&it, ref);
+ if (ret)
+ goto out;
+
+ if (strcmp(ref->refname, refname) ||
+ reftable_ref_record_is_deletion(ref)) {
+ reftable_ref_record_release(ref);
+ ret = 1;
+ goto out;
+ }
+
+out:
+ reftable_iterator_destroy(&it);
+ return ret;
+}
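+
+/*
+ * A minimal usage sketch for the lookup above; the refname is hypothetical
+ * and error handling is trimmed. Note that a return value of 1 means "not
+ * found", not an error:
+ *
+ *	struct reftable_ref_record ref = { 0 };
+ *	int err = reftable_stack_read_ref(st, "refs/heads/main", &ref);
+ *	if (!err) {
+ *		... use `ref`, then release it ...
+ *		reftable_ref_record_release(&ref);
+ *	}
+ */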
+
+int reftable_stack_read_log(struct reftable_stack *st, const char *refname,
+ struct reftable_log_record *log)
+{
+ struct reftable_iterator it = {0};
+ int err;
+
+ err = reftable_stack_init_log_iterator(st, &it);
+ if (err)
+ goto done;
+
+ err = reftable_iterator_seek_log(&it, refname);
+ if (err)
+ goto done;
+
+ err = reftable_iterator_next_log(&it, log);
+ if (err)
+ goto done;
+
+ if (strcmp(log->refname, refname) ||
+ reftable_log_record_is_deletion(log)) {
+ err = 1;
+ goto done;
+ }
+
+done:
+ if (err) {
+ reftable_log_record_release(log);
+ }
+ reftable_iterator_destroy(&it);
+ return err;
+}
+
+static int is_table_name(const char *s)
+{
+ const char *dot = strrchr(s, '.');
+ return dot && !strcmp(dot, ".ref");
+}
+
+static void remove_maybe_stale_table(struct reftable_stack *st, uint64_t max,
+ const char *name)
+{
+ int err = 0;
+ uint64_t update_idx = 0;
+ struct reftable_block_source src = { NULL };
+ struct reftable_table *table = NULL;
+ struct reftable_buf table_path = REFTABLE_BUF_INIT;
+
+ err = stack_filename(&table_path, st, name);
+ if (err < 0)
+ goto done;
+
+ err = reftable_block_source_from_file(&src, table_path.buf);
+ if (err < 0)
+ goto done;
+
+ err = reftable_table_new(&table, &src, name);
+ if (err < 0)
+ goto done;
+
+ update_idx = reftable_table_max_update_index(table);
+ reftable_table_decref(table);
+
+ if (update_idx <= max) {
+ unlink(table_path.buf);
+ }
+done:
+ reftable_buf_release(&table_path);
+}
+
+static int reftable_stack_clean_locked(struct reftable_stack *st)
+{
+ uint64_t max = reftable_merged_table_max_update_index(
+ reftable_stack_merged_table(st));
+ DIR *dir = opendir(st->reftable_dir);
+ struct dirent *d = NULL;
+ if (!dir) {
+ return REFTABLE_IO_ERROR;
+ }
+
+ while ((d = readdir(dir))) {
+ int found = 0;
+ if (!is_table_name(d->d_name))
+ continue;
+
+ for (size_t i = 0; !found && i < st->tables_len; i++)
+ found = !strcmp(reftable_table_name(st->tables[i]), d->d_name);
+ if (found)
+ continue;
+
+ remove_maybe_stale_table(st, max, d->d_name);
+ }
+
+ closedir(dir);
+ return 0;
+}
+
+int reftable_stack_clean(struct reftable_stack *st)
+{
+ struct reftable_addition *add = NULL;
+ int err = reftable_stack_new_addition(&add, st, 0);
+ if (err < 0) {
+ goto done;
+ }
+
+ err = reftable_stack_reload(st);
+ if (err < 0) {
+ goto done;
+ }
+
+ err = reftable_stack_clean_locked(st);
+
+done:
+ reftable_addition_destroy(add);
+ return err;
+}
+
+enum reftable_hash reftable_stack_hash_id(struct reftable_stack *st)
+{
+ return reftable_merged_table_hash_id(st->merged);
+}
diff --git a/reftable/stack.h b/reftable/stack.h
new file mode 100644
index 0000000000..bc28f2998a
--- /dev/null
+++ b/reftable/stack.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef STACK_H
+#define STACK_H
+
+#include "system.h"
+#include "reftable-writer.h"
+#include "reftable-stack.h"
+
+struct reftable_stack {
+ struct stat list_st;
+ char *list_file;
+ int list_fd;
+
+ char *reftable_dir;
+
+ struct reftable_write_options opts;
+
+ struct reftable_table **tables;
+ size_t tables_len;
+ struct reftable_merged_table *merged;
+ struct reftable_compaction_stats stats;
+};
+
+int read_lines(const char *filename, char ***lines);
+
+struct segment {
+ size_t start, end;
+ uint64_t bytes;
+};
+
+struct segment suggest_compaction_segment(uint64_t *sizes, size_t n,
+ uint8_t factor);
+
+#endif
diff --git a/reftable/system.c b/reftable/system.c
new file mode 100644
index 0000000000..1ee268b125
--- /dev/null
+++ b/reftable/system.c
@@ -0,0 +1,133 @@
+#include "../git-compat-util.h"
+
+#include "system.h"
+#include "basics.h"
+#include "reftable-error.h"
+#include "../lockfile.h"
+#include "../tempfile.h"
+
+uint32_t reftable_rand(void)
+{
+ return git_rand(CSPRNG_BYTES_INSECURE);
+}
+
+int tmpfile_from_pattern(struct reftable_tmpfile *out, const char *pattern)
+{
+ struct tempfile *tempfile;
+
+ tempfile = mks_tempfile(pattern);
+ if (!tempfile)
+ return REFTABLE_IO_ERROR;
+
+ out->path = tempfile->filename.buf;
+ out->fd = tempfile->fd;
+ out->priv = tempfile;
+
+ return 0;
+}
+
+int tmpfile_close(struct reftable_tmpfile *t)
+{
+ struct tempfile *tempfile = t->priv;
+ int ret = close_tempfile_gently(tempfile);
+ t->fd = -1;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+ return 0;
+}
+
+int tmpfile_delete(struct reftable_tmpfile *t)
+{
+ struct tempfile *tempfile = t->priv;
+ int ret = delete_tempfile(&tempfile);
+ *t = REFTABLE_TMPFILE_INIT;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+ return 0;
+}
+
+int tmpfile_rename(struct reftable_tmpfile *t, const char *path)
+{
+ struct tempfile *tempfile = t->priv;
+ int ret = rename_tempfile(&tempfile, path);
+ *t = REFTABLE_TMPFILE_INIT;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+ return 0;
+}
+
+int flock_acquire(struct reftable_flock *l, const char *target_path,
+ long timeout_ms)
+{
+ struct lock_file *lockfile;
+ int err;
+
+ lockfile = reftable_malloc(sizeof(*lockfile));
+ if (!lockfile)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ err = hold_lock_file_for_update_timeout(lockfile, target_path, LOCK_NO_DEREF,
+ timeout_ms);
+ if (err < 0) {
+ reftable_free(lockfile);
+ if (errno == EEXIST)
+ return REFTABLE_LOCK_ERROR;
+ return -1;
+ }
+
+ l->fd = get_lock_file_fd(lockfile);
+ l->path = get_lock_file_path(lockfile);
+ l->priv = lockfile;
+
+ return 0;
+}
+
+int flock_close(struct reftable_flock *l)
+{
+ struct lock_file *lockfile = l->priv;
+ int ret;
+
+ if (!lockfile)
+ return REFTABLE_API_ERROR;
+
+ ret = close_lock_file_gently(lockfile);
+ l->fd = -1;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+
+ return 0;
+}
+
+int flock_release(struct reftable_flock *l)
+{
+ struct lock_file *lockfile = l->priv;
+ int ret;
+
+ if (!lockfile)
+ return 0;
+
+ ret = rollback_lock_file(lockfile);
+ reftable_free(lockfile);
+ *l = REFTABLE_FLOCK_INIT;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+
+ return 0;
+}
+
+int flock_commit(struct reftable_flock *l)
+{
+ struct lock_file *lockfile = l->priv;
+ int ret;
+
+ if (!lockfile)
+ return REFTABLE_API_ERROR;
+
+ ret = commit_lock_file(lockfile);
+ reftable_free(lockfile);
+ *l = REFTABLE_FLOCK_INIT;
+ if (ret < 0)
+ return REFTABLE_IO_ERROR;
+
+ return 0;
+}
diff --git a/reftable/system.h b/reftable/system.h
new file mode 100644
index 0000000000..beb9d2431f
--- /dev/null
+++ b/reftable/system.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef SYSTEM_H
+#define SYSTEM_H
+
+/* This header glues the reftable library to the rest of Git */
+
+#define MINGW_DONT_HANDLE_IN_USE_ERROR
+#include "compat/posix.h"
+#include "compat/zlib-compat.h"
+
+/*
+ * Return a random 32-bit integer. The implementation is expected to seed
+ * the underlying generator itself, so callers never have to.
+ */
+uint32_t reftable_rand(void);
+
+/*
+ * An implementation-specific temporary file. By making this specific to the
+ * implementation it becomes possible to tie temporary files into any kind of
+ * signal or atexit handlers for cleanup on abnormal situations.
+ */
+struct reftable_tmpfile {
+ const char *path;
+ int fd;
+ void *priv;
+};
+#define REFTABLE_TMPFILE_INIT ((struct reftable_tmpfile) { .fd = -1, })
+
+/*
+ * Create a temporary file from a pattern similar to how mkstemp(3p) would.
+ * The `pattern` shall not be modified. On success, the structure at `out` has
+ * been initialized such that it is ready for use. Returns 0 on success, a
+ * reftable error code on error.
+ */
+int tmpfile_from_pattern(struct reftable_tmpfile *out, const char *pattern);
+
+/*
+ * Close the temporary file's file descriptor without removing the file itself.
+ * This is a no-op in case the file has already been closed beforehand. Returns
+ * 0 on success, a reftable error code on error.
+ */
+int tmpfile_close(struct reftable_tmpfile *t);
+
+/*
+ * Close the temporary file and delete it. This is a no-op in case the file has
+ * already been deleted or renamed beforehand. Returns 0 on success, a reftable
+ * error code on error.
+ */
+int tmpfile_delete(struct reftable_tmpfile *t);
+
+/*
+ * Rename the temporary file to the provided path. The temporary file must be
+ * active and is deactivated by this call. Returns 0 on success, a reftable
+ * error code on error.
+ */
+int tmpfile_rename(struct reftable_tmpfile *t, const char *path);
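+
+/*
+ * A hedged sketch of the intended lifecycle, matching how the stack code
+ * uses these helpers; the paths are made up for illustration, and patterns
+ * are mkstemp(3)-style, ending in "XXXXXX":
+ *
+ *	struct reftable_tmpfile t = REFTABLE_TMPFILE_INIT;
+ *
+ *	tmpfile_from_pattern(&t, "/tmp/new-table-XXXXXX");
+ *	... write the new table to t.fd ...
+ *	tmpfile_close(&t);
+ *	tmpfile_rename(&t, "/tmp/0x000001-0x000002.ref");
+ *
+ * On error paths, tmpfile_delete() removes the file instead; per the
+ * comments above, it is a no-op once the file was renamed into place.
+ */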
+
+/*
+ * An implementation-specific file lock. Same as with `reftable_tmpfile`,
+ * making this specific to the implementation makes it possible to tie this
+ * into signal or atexit handlers such that we know to clean up stale locks on
+ * abnormal exits.
+ */
+struct reftable_flock {
+ const char *path;
+ int fd;
+ void *priv;
+};
+#define REFTABLE_FLOCK_INIT ((struct reftable_flock){ .fd = -1, })
+
+/*
+ * Acquire the lock for the given target path by exclusively creating a file
+ * with ".lock" appended to it. If that lock exists, we wait up to `timeout_ms`
+ * to acquire the lock. If `timeout_ms` is 0 we don't wait; if it is negative
+ * we block indefinitely.
+ *
+ * Returns 0 on success, a reftable error code on error.
+ */
+int flock_acquire(struct reftable_flock *l, const char *target_path,
+ long timeout_ms);
+
+/*
+ * Close the lockfile's file descriptor without removing the lock itself. This
+ * is a no-op in case the lockfile has already been closed beforehand. Returns
+ * 0 on success, a reftable error code on error.
+ */
+int flock_close(struct reftable_flock *l);
+
+/*
+ * Release the lock by unlinking the lockfile. This is a no-op in case the
+ * lockfile has already been released or committed beforehand. Returns 0 on
+ * success, a reftable error code on error.
+ */
+int flock_release(struct reftable_flock *l);
+
+/*
+ * Commit the lock by renaming the lockfile into place. Returns 0 on success, a
+ * reftable error code on error.
+ */
+int flock_commit(struct reftable_flock *l);
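+
+/*
+ * A hedged sketch of the intended lock lifecycle, mirroring the way the
+ * stack code uses this API; the target path and timeout are illustrative:
+ *
+ *	struct reftable_flock lock = REFTABLE_FLOCK_INIT;
+ *	int err;
+ *
+ *	err = flock_acquire(&lock, "tables.list", 1000);
+ *	if (err < 0)
+ *		return err;
+ *	... write the new contents to lock.fd ...
+ *	err = flock_commit(&lock);
+ *	if (err < 0)
+ *		return err;
+ *
+ * On error paths, flock_release() discards the lockfile instead; calling it
+ * after a successful commit is a safe no-op because committing resets the
+ * structure.
+ */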
+
+#endif
diff --git a/reftable/table.c b/reftable/table.c
new file mode 100644
index 0000000000..56362df0ed
--- /dev/null
+++ b/reftable/table.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "table.h"
+
+#include "system.h"
+#include "block.h"
+#include "blocksource.h"
+#include "constants.h"
+#include "iter.h"
+#include "record.h"
+#include "reftable-error.h"
+
+static struct reftable_table_offsets *
+table_offsets_for(struct reftable_table *t, uint8_t typ)
+{
+ switch (typ) {
+ case REFTABLE_BLOCK_TYPE_REF:
+ return &t->ref_offsets;
+ case REFTABLE_BLOCK_TYPE_LOG:
+ return &t->log_offsets;
+ case REFTABLE_BLOCK_TYPE_OBJ:
+ return &t->obj_offsets;
+ }
+ abort();
+}
+
+enum reftable_hash reftable_table_hash_id(struct reftable_table *t)
+{
+ return t->hash_id;
+}
+
+const char *reftable_table_name(struct reftable_table *t)
+{
+ return t->name;
+}
+
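+/*
+ * Parse the footer of a table. As read out below, the footer repeats the
+ * header verbatim and then appends, in big-endian encoding:
+ *
+ *	ref index offset (8 bytes)
+ *	obj offset (8 bytes, low 5 bits encoding the object ID prefix length)
+ *	obj index offset (8 bytes)
+ *	log offset (8 bytes)
+ *	log index offset (8 bytes)
+ *	CRC-32 of the preceding footer bytes (4 bytes)
+ */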
+static int parse_footer(struct reftable_table *t, uint8_t *footer,
+ uint8_t *header)
+{
+ uint8_t *f = footer;
+ uint8_t first_block_typ;
+ int err = 0;
+ uint32_t computed_crc;
+ uint32_t file_crc;
+
+ if (memcmp(f, "REFT", 4)) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ f += 4;
+
+ if (memcmp(footer, header, header_size(t->version))) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ f++;
+ t->block_size = reftable_get_be24(f);
+
+ f += 3;
+ t->min_update_index = reftable_get_be64(f);
+ f += 8;
+ t->max_update_index = reftable_get_be64(f);
+ f += 8;
+
+ if (t->version == 1) {
+ t->hash_id = REFTABLE_HASH_SHA1;
+ } else {
+ switch (reftable_get_be32(f)) {
+ case REFTABLE_FORMAT_ID_SHA1:
+ t->hash_id = REFTABLE_HASH_SHA1;
+ break;
+ case REFTABLE_FORMAT_ID_SHA256:
+ t->hash_id = REFTABLE_HASH_SHA256;
+ break;
+ default:
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ f += 4;
+ }
+
+ t->ref_offsets.index_offset = reftable_get_be64(f);
+ f += 8;
+
+ t->obj_offsets.offset = reftable_get_be64(f);
+ f += 8;
+
+ t->object_id_len = t->obj_offsets.offset & ((1 << 5) - 1);
+ t->obj_offsets.offset >>= 5;
+
+ t->obj_offsets.index_offset = reftable_get_be64(f);
+ f += 8;
+ t->log_offsets.offset = reftable_get_be64(f);
+ f += 8;
+ t->log_offsets.index_offset = reftable_get_be64(f);
+ f += 8;
+
+ computed_crc = crc32(0, footer, f - footer);
+ file_crc = reftable_get_be32(f);
+ f += 4;
+ if (computed_crc != file_crc) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ first_block_typ = header[header_size(t->version)];
+ t->ref_offsets.is_present = (first_block_typ == REFTABLE_BLOCK_TYPE_REF);
+ t->ref_offsets.offset = 0;
+ t->log_offsets.is_present = (first_block_typ == REFTABLE_BLOCK_TYPE_LOG ||
+ t->log_offsets.offset > 0);
+ t->obj_offsets.is_present = t->obj_offsets.offset > 0;
+ if (t->obj_offsets.is_present && !t->object_id_len) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ err = 0;
+done:
+ return err;
+}
+
+struct table_iter {
+ struct reftable_table *table;
+ uint8_t typ;
+ uint64_t block_off;
+ struct reftable_block block;
+ struct block_iter bi;
+ int is_finished;
+};
+
+static int table_iter_init(struct table_iter *ti, struct reftable_table *t)
+{
+ struct block_iter bi = BLOCK_ITER_INIT;
+ memset(ti, 0, sizeof(*ti));
+ reftable_table_incref(t);
+ ti->table = t;
+ ti->bi = bi;
+ return 0;
+}
+
+static int table_iter_next_in_block(struct table_iter *ti,
+ struct reftable_record *rec)
+{
+ int res = block_iter_next(&ti->bi, rec);
+ if (res == 0 && reftable_record_type(rec) == REFTABLE_BLOCK_TYPE_REF) {
+ rec->u.ref.update_index += ti->table->min_update_index;
+ }
+
+ return res;
+}
+
+static void table_iter_block_done(struct table_iter *ti)
+{
+ reftable_block_release(&ti->block);
+ block_iter_reset(&ti->bi);
+}
+
+int table_init_block(struct reftable_table *t, struct reftable_block *block,
+ uint64_t next_off, uint8_t want_typ)
+{
+ uint32_t header_off = next_off ? 0 : header_size(t->version);
+ int err;
+
+ if (next_off >= t->size)
+ return 1;
+
+ err = reftable_block_init(block, &t->source, next_off, header_off,
+ t->block_size, hash_size(t->hash_id), want_typ);
+ if (err)
+ reftable_block_release(block);
+ return err;
+}
+
+static void table_iter_close(struct table_iter *ti)
+{
+ table_iter_block_done(ti);
+ block_iter_close(&ti->bi);
+ reftable_table_decref(ti->table);
+}
+
+static int table_iter_next_block(struct table_iter *ti)
+{
+ uint64_t next_block_off = ti->block_off + ti->block.full_block_size;
+ int err;
+
+ err = table_init_block(ti->table, &ti->block, next_block_off, ti->typ);
+ if (err > 0)
+ ti->is_finished = 1;
+ if (err)
+ return err;
+
+ ti->block_off = next_block_off;
+ ti->is_finished = 0;
+ block_iter_init(&ti->bi, &ti->block);
+
+ return 0;
+}
+
+static int table_iter_next(struct table_iter *ti, struct reftable_record *rec)
+{
+ if (reftable_record_type(rec) != ti->typ)
+ return REFTABLE_API_ERROR;
+
+ while (1) {
+ int err;
+
+ if (ti->is_finished)
+ return 1;
+
+ /*
+ * Check whether the current block still has more records. If
+ * so, return it. If the iterator returns positive then the
+ * current block has been exhausted.
+ */
+ err = table_iter_next_in_block(ti, rec);
+ if (err <= 0)
+ return err;
+
+ /*
+ * Otherwise, we need to continue to the next block in the
+ * table and retry. If there are no more blocks then the
+ * iterator is drained.
+ */
+ err = table_iter_next_block(ti);
+ if (err) {
+ ti->is_finished = 1;
+ return err;
+ }
+ }
+}
+
+static int table_iter_seek_to(struct table_iter *ti, uint64_t off, uint8_t typ)
+{
+ int err;
+
+ err = table_init_block(ti->table, &ti->block, off, typ);
+ if (err != 0)
+ return err;
+
+ ti->typ = reftable_block_type(&ti->block);
+ ti->block_off = off;
+ block_iter_init(&ti->bi, &ti->block);
+ ti->is_finished = 0;
+ return 0;
+}
+
+static int table_iter_seek_start(struct table_iter *ti, uint8_t typ, int index)
+{
+ struct reftable_table_offsets *offs = table_offsets_for(ti->table, typ);
+ uint64_t off = offs->offset;
+ if (index) {
+ off = offs->index_offset;
+ if (off == 0) {
+ return 1;
+ }
+ typ = REFTABLE_BLOCK_TYPE_INDEX;
+ }
+
+ return table_iter_seek_to(ti, off, typ);
+}
+
+static int table_iter_seek_linear(struct table_iter *ti,
+ struct reftable_record *want)
+{
+ struct reftable_buf want_key = REFTABLE_BUF_INIT;
+ struct reftable_buf got_key = REFTABLE_BUF_INIT;
+ struct reftable_record rec;
+ int err;
+
+ err = reftable_record_init(&rec, reftable_record_type(want));
+ if (err < 0)
+ goto done;
+
+ err = reftable_record_key(want, &want_key);
+ if (err < 0)
+ goto done;
+
+ /*
+ * First we need to locate the block that must contain our record. To
+ * do so we scan through blocks linearly until we find the first block
+ * whose first key is bigger than our wanted key. Once we have found
+ * that block we know that the key must be contained in the preceding
+ * block.
+ *
+ * This algorithm is somewhat unfortunate because it means that we
+ * always have to seek one block too far and then back up. But as we
+ * can only decode the _first_ key of a block but not its _last_ key we
+ * have no other way to do this.
+ */
+ while (1) {
+ struct table_iter next = *ti;
+
+ /*
+ * We must be careful to not modify underlying data of `ti`
+ * because we may find that `next` does not contain our desired
+ * block, but that `ti` does. In that case, we would discard
+ * `next` and continue with `ti`.
+ *
+ * This also means that we cannot reuse allocated memory for
+ * `next` here. While it would be great if we could, it should
+ * in practice not be too bad given that we should only ever
+ * end up doing linear seeks with at most three blocks. As soon
+ * as we have more than three blocks we would have an index, so
+ * we would not do a linear search there anymore.
+ */
+ memset(&next.block.block_data, 0, sizeof(next.block.block_data));
+ next.block.zstream = NULL;
+ next.block.uncompressed_data = NULL;
+ next.block.uncompressed_cap = 0;
+
+ err = table_iter_next_block(&next);
+ if (err < 0)
+ goto done;
+ if (err > 0)
+ break;
+
+ err = reftable_block_first_key(&next.block, &got_key);
+ if (err < 0)
+ goto done;
+
+ if (reftable_buf_cmp(&got_key, &want_key) > 0) {
+ table_iter_block_done(&next);
+ break;
+ }
+
+ table_iter_block_done(ti);
+ *ti = next;
+ }
+
+ /*
+ * We have located the block that must contain our record, so we seek
+ * the wanted key inside of it. If the block does not contain our key
+ * we know that the corresponding record does not exist.
+ */
+ block_iter_init(&ti->bi, &ti->block);
+ err = block_iter_seek_key(&ti->bi, &want_key);
+ if (err < 0)
+ goto done;
+ err = 0;
+
+done:
+ reftable_record_release(&rec);
+ reftable_buf_release(&want_key);
+ reftable_buf_release(&got_key);
+ return err;
+}
+
+static int table_iter_seek_indexed(struct table_iter *ti,
+ struct reftable_record *rec)
+{
+ struct reftable_record want_index = {
+ .type = REFTABLE_BLOCK_TYPE_INDEX, .u.idx = { .last_key = REFTABLE_BUF_INIT }
+ };
+ struct reftable_record index_result = {
+ .type = REFTABLE_BLOCK_TYPE_INDEX,
+ .u.idx = { .last_key = REFTABLE_BUF_INIT },
+ };
+ int err;
+
+ err = reftable_record_key(rec, &want_index.u.idx.last_key);
+ if (err < 0)
+ goto done;
+
+ /*
+ * The index may consist of multiple levels, where each level may have
+ * multiple index blocks. We start by doing a linear search in the
+ * highest layer that identifies the relevant index block as well as
+ * the record inside that block that corresponds to our wanted key.
+ */
+ err = table_iter_seek_linear(ti, &want_index);
+ if (err < 0)
+ goto done;
+
+ /*
+ * Traverse down the levels until we find a non-index entry.
+ */
+ while (1) {
+ /*
+ * In case we seek a record that does not exist the index iter
+ * will tell us that the iterator is over. This works because
+ * the last index entry of the current level will contain the
+		 * last key it knows about. So in case the key we seek is larger
+		 * than the last indexed key, we know that it won't exist.
+		 *
+		 * There is one subtlety in the layout of the index section
+		 * that makes this work as expected: the highest-level index is
+		 * at the end of the section and points backwards, and thus we
+ * start reading from the end of the index section, not the
+ * beginning.
+ *
+ * If that wasn't the case and the order was reversed then the
+ * linear seek would seek into the lower levels and traverse
+ * all levels of the index only to find out that the key does
+ * not exist.
+ */
+ err = table_iter_next(ti, &index_result);
+ if (err != 0)
+ goto done;
+
+ err = table_iter_seek_to(ti, index_result.u.idx.offset, 0);
+ if (err != 0)
+ goto done;
+
+ block_iter_init(&ti->bi, &ti->block);
+
+ err = block_iter_seek_key(&ti->bi, &want_index.u.idx.last_key);
+ if (err < 0)
+ goto done;
+
+ if (ti->typ == reftable_record_type(rec)) {
+ err = 0;
+ break;
+ }
+
+ if (ti->typ != REFTABLE_BLOCK_TYPE_INDEX) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ }
+
+done:
+ reftable_record_release(&want_index);
+ reftable_record_release(&index_result);
+ return err;
+}
+
+static int table_iter_seek(struct table_iter *ti,
+ struct reftable_record *want)
+{
+ uint8_t typ = reftable_record_type(want);
+ struct reftable_table_offsets *offs = table_offsets_for(ti->table, typ);
+ int err;
+
+ err = table_iter_seek_start(ti, reftable_record_type(want),
+ !!offs->index_offset);
+ if (err < 0)
+ goto out;
+
+ if (offs->index_offset)
+ err = table_iter_seek_indexed(ti, want);
+ else
+ err = table_iter_seek_linear(ti, want);
+ if (err)
+ goto out;
+
+out:
+ return err;
+}
+
+static int table_iter_seek_void(void *ti, struct reftable_record *want)
+{
+ return table_iter_seek(ti, want);
+}
+
+static int table_iter_next_void(void *ti, struct reftable_record *rec)
+{
+ return table_iter_next(ti, rec);
+}
+
+static void table_iter_close_void(void *ti)
+{
+ table_iter_close(ti);
+}
+
+static struct reftable_iterator_vtable table_iter_vtable = {
+ .seek = &table_iter_seek_void,
+ .next = &table_iter_next_void,
+ .close = &table_iter_close_void,
+};
+
+static void iterator_from_table_iter(struct reftable_iterator *it,
+ struct table_iter *ti)
+{
+ assert(!it->ops);
+ it->iter_arg = ti;
+ it->ops = &table_iter_vtable;
+}
+
+int table_init_iter(struct reftable_table *t,
+ struct reftable_iterator *it,
+ uint8_t typ)
+{
+ struct reftable_table_offsets *offs = table_offsets_for(t, typ);
+
+ if (offs->is_present) {
+ struct table_iter *ti;
+ REFTABLE_ALLOC_ARRAY(ti, 1);
+ if (!ti)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ table_iter_init(ti, t);
+ iterator_from_table_iter(it, ti);
+ } else {
+ iterator_set_empty(it);
+ }
+
+ return 0;
+}
+
+int reftable_table_init_ref_iterator(struct reftable_table *t,
+ struct reftable_iterator *it)
+{
+ return table_init_iter(t, it, REFTABLE_BLOCK_TYPE_REF);
+}
+
+int reftable_table_init_log_iterator(struct reftable_table *t,
+ struct reftable_iterator *it)
+{
+ return table_init_iter(t, it, REFTABLE_BLOCK_TYPE_LOG);
+}
+
+int reftable_table_new(struct reftable_table **out,
+ struct reftable_block_source *source, char const *name)
+{
+ struct reftable_block_data footer = { 0 };
+ struct reftable_block_data header = { 0 };
+ struct reftable_table *t;
+ uint64_t file_size = block_source_size(source);
+ uint32_t read_size;
+ ssize_t bytes_read;
+ int err;
+
+ REFTABLE_CALLOC_ARRAY(t, 1);
+ if (!t) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+
+ /*
+	 * We need one extra byte to read the type of the first block. We also
+	 * pretend to always be reading v2 of the format because its header is larger.
+ */
+ read_size = header_size(2) + 1;
+ if (read_size > file_size) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ bytes_read = block_source_read_data(source, &header, 0, read_size);
+ if (bytes_read < 0 || (size_t)bytes_read != read_size) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ if (memcmp(header.data, "REFT", 4)) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+ t->version = header.data[4];
+ if (t->version != 1 && t->version != 2) {
+ err = REFTABLE_FORMAT_ERROR;
+ goto done;
+ }
+
+ t->size = file_size - footer_size(t->version);
+ t->source = *source;
+ t->name = reftable_strdup(name);
+ if (!t->name) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto done;
+ }
+ t->hash_id = 0;
+ t->refcount = 1;
+
+ bytes_read = block_source_read_data(source, &footer, t->size,
+ footer_size(t->version));
+ if (bytes_read < 0 || (size_t)bytes_read != footer_size(t->version)) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = parse_footer(t, footer.data, header.data);
+ if (err)
+ goto done;
+
+ *out = t;
+
+done:
+ block_source_release_data(&footer);
+ block_source_release_data(&header);
+ if (err) {
+ if (t)
+ reftable_free(t->name);
+ reftable_free(t);
+ block_source_close(source);
+ }
+ return err;
+}
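+
+/*
+ * A hedged sketch of opening a table and looking up a single ref via the
+ * functions in this file; the file name is hypothetical and error handling
+ * is trimmed:
+ *
+ *	struct reftable_block_source src = { 0 };
+ *	struct reftable_table *t;
+ *	struct reftable_iterator it = { 0 };
+ *	struct reftable_ref_record ref = { 0 };
+ *
+ *	reftable_block_source_from_file(&src, "0x000001-0x000002.ref");
+ *	reftable_table_new(&t, &src, "0x000001-0x000002.ref");
+ *	reftable_table_init_ref_iterator(t, &it);
+ *	reftable_iterator_seek_ref(&it, "refs/heads/main");
+ *	if (!reftable_iterator_next_ref(&it, &ref)) {
+ *		... the record's refname must still be compared, as seeking
+ *		    positions the iterator at the first key >= the wanted one ...
+ *		reftable_ref_record_release(&ref);
+ *	}
+ *	reftable_iterator_destroy(&it);
+ *	reftable_table_decref(t);
+ */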
+
+void reftable_table_incref(struct reftable_table *t)
+{
+ t->refcount++;
+}
+
+void reftable_table_decref(struct reftable_table *t)
+{
+ if (!t)
+ return;
+ if (--t->refcount)
+ return;
+ block_source_close(&t->source);
+ REFTABLE_FREE_AND_NULL(t->name);
+ reftable_free(t);
+}
+
+static int reftable_table_refs_for_indexed(struct reftable_table *t,
+ struct reftable_iterator *it,
+ uint8_t *oid)
+{
+ struct reftable_record want = {
+ .type = REFTABLE_BLOCK_TYPE_OBJ,
+ .u.obj = {
+ .hash_prefix = oid,
+ .hash_prefix_len = t->object_id_len,
+ },
+ };
+ struct reftable_iterator oit = { NULL };
+ struct reftable_record got = {
+ .type = REFTABLE_BLOCK_TYPE_OBJ,
+ .u.obj = { 0 },
+ };
+ int err = 0;
+ struct indexed_table_ref_iter *itr = NULL;
+
+ /* Look through the reverse index. */
+ err = table_init_iter(t, &oit, REFTABLE_BLOCK_TYPE_OBJ);
+ if (err < 0)
+ goto done;
+
+ err = iterator_seek(&oit, &want);
+ if (err != 0)
+ goto done;
+
+ /* read out the reftable_obj_record */
+ err = iterator_next(&oit, &got);
+ if (err < 0)
+ goto done;
+
+ if (err > 0 || memcmp(want.u.obj.hash_prefix, got.u.obj.hash_prefix,
+ t->object_id_len)) {
+ /* didn't find it; return empty iterator */
+ iterator_set_empty(it);
+ err = 0;
+ goto done;
+ }
+
+ err = indexed_table_ref_iter_new(&itr, t, oid, hash_size(t->hash_id),
+ got.u.obj.offsets,
+ got.u.obj.offset_len);
+ if (err < 0)
+ goto done;
+ got.u.obj.offsets = NULL;
+ iterator_from_indexed_table_ref_iter(it, itr);
+
+done:
+ reftable_iterator_destroy(&oit);
+ reftable_record_release(&got);
+ return err;
+}
+
+static int reftable_table_refs_for_unindexed(struct reftable_table *t,
+ struct reftable_iterator *it,
+ uint8_t *oid)
+{
+ struct table_iter *ti;
+ struct filtering_ref_iterator *filter = NULL;
+ struct filtering_ref_iterator empty = FILTERING_REF_ITERATOR_INIT;
+ uint32_t oid_len = hash_size(t->hash_id);
+ int err;
+
+ REFTABLE_ALLOC_ARRAY(ti, 1);
+ if (!ti) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+
+ table_iter_init(ti, t);
+ err = table_iter_seek_start(ti, REFTABLE_BLOCK_TYPE_REF, 0);
+ if (err < 0)
+ goto out;
+
+ filter = reftable_malloc(sizeof(*filter));
+ if (!filter) {
+ err = REFTABLE_OUT_OF_MEMORY_ERROR;
+ goto out;
+ }
+ *filter = empty;
+
+ err = reftable_buf_add(&filter->oid, oid, oid_len);
+ if (err < 0)
+ goto out;
+
+ iterator_from_table_iter(&filter->it, ti);
+
+ iterator_from_filtering_ref_iterator(it, filter);
+
+ err = 0;
+
+out:
+ if (err < 0) {
+ if (ti)
+ table_iter_close(ti);
+ reftable_free(ti);
+ }
+ return err;
+}
+
+int reftable_table_refs_for(struct reftable_table *t,
+ struct reftable_iterator *it, uint8_t *oid)
+{
+ if (t->obj_offsets.is_present)
+ return reftable_table_refs_for_indexed(t, it, oid);
+ return reftable_table_refs_for_unindexed(t, it, oid);
+}
+
+uint64_t reftable_table_max_update_index(struct reftable_table *t)
+{
+ return t->max_update_index;
+}
+
+uint64_t reftable_table_min_update_index(struct reftable_table *t)
+{
+ return t->min_update_index;
+}
+
+int reftable_table_iterator_init(struct reftable_table_iterator *it,
+ struct reftable_table *t)
+{
+ struct table_iter *ti;
+ int err;
+
+ REFTABLE_ALLOC_ARRAY(ti, 1);
+ if (!ti)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ err = table_iter_init(ti, t);
+ if (err < 0)
+ goto out;
+
+ it->iter_arg = ti;
+ err = 0;
+
+out:
+ if (err < 0)
+ reftable_free(ti);
+ return err;
+}
+
+void reftable_table_iterator_release(struct reftable_table_iterator *it)
+{
+ if (!it->iter_arg)
+ return;
+ table_iter_close(it->iter_arg);
+ reftable_free(it->iter_arg);
+ it->iter_arg = NULL;
+}
+
+int reftable_table_iterator_next(struct reftable_table_iterator *it,
+ const struct reftable_block **out)
+{
+ struct table_iter *ti = it->iter_arg;
+ int err;
+
+ err = table_iter_next_block(ti);
+ if (err)
+ return err;
+
+ *out = &ti->block;
+
+ return 0;
+}
diff --git a/reftable/table.h b/reftable/table.h
new file mode 100644
index 0000000000..c54703e621
--- /dev/null
+++ b/reftable/table.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef TABLE_H
+#define TABLE_H
+
+#include "block.h"
+#include "record.h"
+#include "reftable-iterator.h"
+#include "reftable-table.h"
+
+const char *reftable_table_name(struct reftable_table *t);
+
+int table_init_iter(struct reftable_table *t,
+ struct reftable_iterator *it,
+ uint8_t typ);
+
+/*
+ * Initialize a block by reading from the given table and offset.
+ */
+int table_init_block(struct reftable_table *t, struct reftable_block *block,
+ uint64_t next_off, uint8_t want_typ);
+
+#endif
diff --git a/reftable/tree.c b/reftable/tree.c
new file mode 100644
index 0000000000..a52f7c0c7d
--- /dev/null
+++ b/reftable/tree.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "system.h"
+#include "tree.h"
+
+#include "basics.h"
+
+struct tree_node *tree_search(struct tree_node *tree,
+ void *key,
+ int (*compare)(const void *, const void *))
+{
+ int res;
+ if (!tree)
+ return NULL;
+ res = compare(key, tree->key);
+ if (res < 0)
+ return tree_search(tree->left, key, compare);
+ else if (res > 0)
+ return tree_search(tree->right, key, compare);
+ return tree;
+}
+
+struct tree_node *tree_insert(struct tree_node **rootp,
+ void *key,
+ int (*compare)(const void *, const void *))
+{
+ int res;
+
+ if (!*rootp) {
+ struct tree_node *n;
+
+ REFTABLE_CALLOC_ARRAY(n, 1);
+ if (!n)
+ return NULL;
+
+ n->key = key;
+ *rootp = n;
+ return *rootp;
+ }
+
+ res = compare(key, (*rootp)->key);
+ if (res < 0)
+ return tree_insert(&(*rootp)->left, key, compare);
+ else if (res > 0)
+ return tree_insert(&(*rootp)->right, key, compare);
+ return *rootp;
+}
+
+void infix_walk(struct tree_node *t, void (*action)(void *arg, void *key),
+ void *arg)
+{
+ if (t->left)
+ infix_walk(t->left, action, arg);
+ action(arg, t->key);
+ if (t->right)
+ infix_walk(t->right, action, arg);
+}
+
+void tree_free(struct tree_node *t)
+{
+ if (!t)
+ return;
+ if (t->left)
+ tree_free(t->left);
+ if (t->right)
+ tree_free(t->right);
+ reftable_free(t);
+}
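+
+/*
+ * A small, hypothetical usage sketch for the tree above, with string keys
+ * and error handling elided:
+ *
+ *	static int cmp_str(const void *a, const void *b)
+ *	{
+ *		return strcmp(a, b);
+ *	}
+ *
+ *	static void print_key(void *arg, void *key)
+ *	{
+ *		printf("%s\n", (const char *)key);
+ *	}
+ *
+ *	struct tree_node *root = NULL;
+ *	tree_insert(&root, "banana", cmp_str);
+ *	tree_insert(&root, "apple", cmp_str);
+ *	infix_walk(root, print_key, NULL);	prints "apple", then "banana"
+ *	tree_free(root);			the keys are not freed
+ */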
diff --git a/reftable/tree.h b/reftable/tree.h
new file mode 100644
index 0000000000..2c9c465299
--- /dev/null
+++ b/reftable/tree.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef TREE_H
+#define TREE_H
+
+/* tree_node is a generic binary search tree. */
+struct tree_node {
+ void *key;
+ struct tree_node *left, *right;
+};
+
+/*
+ * Search the tree for the node matching the given key using `compare` as
+ * comparison function. Returns the node whose key matches or `NULL` in case
+ * the key does not exist in the tree.
+ */
+struct tree_node *tree_search(struct tree_node *tree,
+ void *key,
+ int (*compare)(const void *, const void *));
+
+/*
+ * Insert a node into the tree. Returns the newly inserted node if the key does
+ * not yet exist. Otherwise it returns the preexisting node. Returns `NULL`
+ * when allocating the new node fails.
+ */
+struct tree_node *tree_insert(struct tree_node **rootp,
+ void *key,
+ int (*compare)(const void *, const void *));
+
+/* performs an infix walk of the tree. */
+void infix_walk(struct tree_node *t, void (*action)(void *arg, void *key),
+ void *arg);
+
+/*
+ * deallocates the tree nodes recursively. Keys should be deallocated separately
+ * by walking over the tree. */
+void tree_free(struct tree_node *t);
+
+#endif
diff --git a/reftable/writer.c b/reftable/writer.c
new file mode 100644
index 0000000000..3b4ebdd6dc
--- /dev/null
+++ b/reftable/writer.c
@@ -0,0 +1,884 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#include "writer.h"
+
+#include "system.h"
+
+#include "block.h"
+#include "constants.h"
+#include "record.h"
+#include "tree.h"
+#include "reftable-error.h"
+
+/* finishes a block, and writes it to storage */
+static int writer_flush_block(struct reftable_writer *w);
+
+/* deallocates memory related to the index */
+static void writer_clear_index(struct reftable_writer *w);
+
+/* finishes writing an 'r' (refs) or 'g' (reflogs) section */
+static int writer_finish_public_section(struct reftable_writer *w);
+
+static struct reftable_block_stats *
+writer_reftable_block_stats(struct reftable_writer *w, uint8_t typ)
+{
+ switch (typ) {
+ case 'r':
+ return &w->stats.ref_stats;
+ case 'o':
+ return &w->stats.obj_stats;
+ case 'i':
+ return &w->stats.idx_stats;
+ case 'g':
+ return &w->stats.log_stats;
+ }
+ abort();
+ return NULL;
+}
+
+/* write data, queuing the padding for the next write. Deferring the padding
+ * allows it to be cancelled before it is ever written, which the reflog
+ * writing path below relies on. Returns negative for error. */
+static int padded_write(struct reftable_writer *w, uint8_t *data, size_t len,
+ int padding)
+{
+ int n = 0;
+ if (w->pending_padding > 0) {
+ uint8_t *zeroed;
+ int n;
+
+ zeroed = reftable_calloc(w->pending_padding, sizeof(*zeroed));
+ if (!zeroed)
+ return -1;
+
+ n = w->write(w->write_arg, zeroed, w->pending_padding);
+ if (n < 0) {
+ reftable_free(zeroed);
+ return n;
+ }
+
+ w->pending_padding = 0;
+ reftable_free(zeroed);
+ }
+
+ w->pending_padding = padding;
+ n = w->write(w->write_arg, data, len);
+ if (n < 0)
+ return n;
+ n += padding;
+ return 0;
+}
+
+static void options_set_defaults(struct reftable_write_options *opts)
+{
+ if (opts->restart_interval == 0) {
+ opts->restart_interval = 16;
+ }
+
+ if (opts->hash_id == 0) {
+ opts->hash_id = REFTABLE_HASH_SHA1;
+ }
+ if (opts->block_size == 0) {
+ opts->block_size = DEFAULT_BLOCK_SIZE;
+ }
+}
+
+static int writer_version(struct reftable_writer *w)
+{
+ return (w->opts.hash_id == 0 || w->opts.hash_id == REFTABLE_HASH_SHA1) ?
+ 1 :
+ 2;
+}
+
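+/*
+ * Write the file header. As laid out below this is, in order: the magic
+ * "REFT", a version byte, the block size as a 24-bit big-endian integer,
+ * and the minimum and maximum update index as 64-bit big-endian integers.
+ * Version 2 appends a 32-bit hash format ID, which is why its header is
+ * four bytes larger.
+ */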
+static int writer_write_header(struct reftable_writer *w, uint8_t *dest)
+{
+ memcpy(dest, "REFT", 4);
+
+ dest[4] = writer_version(w);
+
+ reftable_put_be24(dest + 5, w->opts.block_size);
+ reftable_put_be64(dest + 8, w->min_update_index);
+ reftable_put_be64(dest + 16, w->max_update_index);
+ if (writer_version(w) == 2) {
+ uint32_t hash_id;
+
+ switch (w->opts.hash_id) {
+ case REFTABLE_HASH_SHA1:
+ hash_id = REFTABLE_FORMAT_ID_SHA1;
+ break;
+ case REFTABLE_HASH_SHA256:
+ hash_id = REFTABLE_FORMAT_ID_SHA256;
+ break;
+ default:
+ return -1;
+ }
+
+ reftable_put_be32(dest + 24, hash_id);
+ }
+
+ return header_size(writer_version(w));
+}
+
+static int writer_reinit_block_writer(struct reftable_writer *w, uint8_t typ)
+{
+ int block_start = 0, ret;
+
+ if (w->next == 0)
+ block_start = header_size(writer_version(w));
+
+ reftable_buf_reset(&w->last_key);
+ ret = block_writer_init(&w->block_writer_data, typ, w->block,
+ w->opts.block_size, block_start,
+ hash_size(w->opts.hash_id));
+ if (ret < 0)
+ return ret;
+
+ w->block_writer = &w->block_writer_data;
+ w->block_writer->restart_interval = w->opts.restart_interval;
+
+ return 0;
+}
+
+int reftable_writer_new(struct reftable_writer **out,
+ ssize_t (*writer_func)(void *, const void *, size_t),
+ int (*flush_func)(void *),
+ void *writer_arg, const struct reftable_write_options *_opts)
+{
+ struct reftable_write_options opts = {0};
+ struct reftable_writer *wp;
+
+ wp = reftable_calloc(1, sizeof(*wp));
+ if (!wp)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ if (_opts)
+ opts = *_opts;
+ options_set_defaults(&opts);
+	if (opts.block_size >= (1 << 24)) {
+		/*
+		 * The block size is stored in 24 bits (see the header
+		 * layout), so larger sizes cannot be represented. Avoid
+		 * leaking `wp` on this early exit.
+		 */
+		reftable_free(wp);
+		return REFTABLE_API_ERROR;
+	}
+
+ reftable_buf_init(&wp->block_writer_data.last_key);
+ reftable_buf_init(&wp->last_key);
+ reftable_buf_init(&wp->scratch);
+ REFTABLE_CALLOC_ARRAY(wp->block, opts.block_size);
+ if (!wp->block) {
+ reftable_free(wp);
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ }
+ wp->write = writer_func;
+ wp->write_arg = writer_arg;
+ wp->opts = opts;
+ wp->flush = flush_func;
+ writer_reinit_block_writer(wp, REFTABLE_BLOCK_TYPE_REF);
+
+ *out = wp;
+
+ return 0;
+}
+
+int reftable_writer_set_limits(struct reftable_writer *w, uint64_t min,
+ uint64_t max)
+{
+ /*
+ * Set the min/max update index limits for the reftable writer.
+ * This must be called before adding any records, since:
+ * - The 'next' field gets set after writing the first block.
+ * - The 'last_key' field updates with each new record (but resets
+ * after sections).
+ * Returns REFTABLE_API_ERROR if called after writing has begun.
+ */
+ if (w->next || w->last_key.len)
+ return REFTABLE_API_ERROR;
+
+ w->min_update_index = min;
+ w->max_update_index = max;
+
+ return 0;
+}
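+
+/*
+ * A hedged sketch of the write path as exposed above. The callbacks, file
+ * descriptor and options are illustrative, and the table would still have to
+ * be finalized (reftable_writer_close() in reftable-writer.h) before freeing
+ * the writer:
+ *
+ *	struct reftable_writer *w;
+ *	struct reftable_ref_record ref = {
+ *		.refname = (char *) "refs/heads/main",
+ *		.update_index = 1,
+ *		.value_type = REFTABLE_REF_VAL1,
+ *	};
+ *
+ *	reftable_writer_new(&w, write_cb, flush_cb, &fd, &opts);
+ *	reftable_writer_set_limits(w, 1, 1);	must precede any record
+ *	reftable_writer_add_ref(w, &ref);
+ *	reftable_writer_free(w);
+ */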
+
+static void writer_release(struct reftable_writer *w)
+{
+ if (w) {
+ reftable_free(w->block);
+ w->block = NULL;
+ block_writer_release(&w->block_writer_data);
+ w->block_writer = NULL;
+ writer_clear_index(w);
+ reftable_buf_release(&w->last_key);
+ reftable_buf_release(&w->scratch);
+ }
+}
+
+void reftable_writer_free(struct reftable_writer *w)
+{
+ writer_release(w);
+ reftable_free(w);
+}
+
+struct obj_index_tree_node {
+ struct reftable_buf hash;
+ uint64_t *offsets;
+ size_t offset_len;
+ size_t offset_cap;
+};
+
+#define OBJ_INDEX_TREE_NODE_INIT \
+ { \
+ .hash = REFTABLE_BUF_INIT \
+ }
+
+static int obj_index_tree_node_compare(const void *a, const void *b)
+{
+ return reftable_buf_cmp(&((const struct obj_index_tree_node *)a)->hash,
+ &((const struct obj_index_tree_node *)b)->hash);
+}
+
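+/*
+ * Record that the block beginning at the current write offset contains a
+ * reference pointing at `hash`. The tree built up here maps each object ID
+ * to the list of block offsets mentioning it and is later written out as
+ * the obj ('o') section, which is what lets readers find all refs that
+ * point at a given object.
+ */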
+static int writer_index_hash(struct reftable_writer *w, struct reftable_buf *hash)
+{
+ uint64_t off = w->next;
+ struct obj_index_tree_node want = { .hash = *hash };
+ struct obj_index_tree_node *key;
+ struct tree_node *node;
+
+ node = tree_search(w->obj_index_tree, &want, &obj_index_tree_node_compare);
+ if (!node) {
+ struct obj_index_tree_node empty = OBJ_INDEX_TREE_NODE_INIT;
+ int err;
+
+ key = reftable_malloc(sizeof(*key));
+ if (!key)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ *key = empty;
+
+ reftable_buf_reset(&key->hash);
+ err = reftable_buf_add(&key->hash, hash->buf, hash->len);
+ if (err < 0) {
+ reftable_free(key);
+ return err;
+ }
+ tree_insert(&w->obj_index_tree, key,
+ &obj_index_tree_node_compare);
+ } else {
+ key = node->key;
+ }
+
+ if (key->offset_len > 0 && key->offsets[key->offset_len - 1] == off)
+ return 0;
+
+ REFTABLE_ALLOC_GROW_OR_NULL(key->offsets, key->offset_len + 1,
+ key->offset_cap);
+ if (!key->offsets)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+ key->offsets[key->offset_len++] = off;
+
+ return 0;
+}
+
+static int writer_add_record(struct reftable_writer *w,
+ struct reftable_record *rec)
+{
+ int err;
+
+ err = reftable_record_key(rec, &w->scratch);
+ if (err < 0)
+ goto done;
+
+ if (reftable_buf_cmp(&w->last_key, &w->scratch) >= 0) {
+ err = REFTABLE_API_ERROR;
+ goto done;
+ }
+
+ reftable_buf_reset(&w->last_key);
+ err = reftable_buf_add(&w->last_key, w->scratch.buf, w->scratch.len);
+ if (err < 0)
+ goto done;
+
+ if (!w->block_writer) {
+ err = writer_reinit_block_writer(w, reftable_record_type(rec));
+ if (err < 0)
+ goto done;
+ }
+
+ if (block_writer_type(w->block_writer) != reftable_record_type(rec))
+ return REFTABLE_API_ERROR;
+
+ /*
+ * Try to add the record to the writer. If this succeeds then we're
+ * done. Otherwise the block writer may have hit the block size limit
+ * and needs to be flushed.
+ */
+ err = block_writer_add(w->block_writer, rec);
+ if (err == 0)
+ goto done;
+
+ if (err != REFTABLE_ENTRY_TOO_BIG_ERROR)
+ goto done;
+ /*
+ * The current block is full, so we need to flush and reinitialize the
+ * writer to start writing the next block.
+ */
+ err = writer_flush_block(w);
+ if (err < 0)
+ goto done;
+ err = writer_reinit_block_writer(w, reftable_record_type(rec));
+ if (err < 0)
+ goto done;
+
+ /*
+ * Try to add the record to the writer again. If this still fails then
+ * the record does not fit into the block size.
+ */
+ err = block_writer_add(w->block_writer, rec);
+ if (err)
+ goto done;
+
+done:
+ return err;
+}
+
+int reftable_writer_add_ref(struct reftable_writer *w,
+ struct reftable_ref_record *ref)
+{
+ struct reftable_record rec = {
+ .type = REFTABLE_BLOCK_TYPE_REF,
+ .u = {
+ .ref = *ref
+ },
+ };
+ int err;
+
+ if (!ref->refname ||
+ ref->update_index < w->min_update_index ||
+ ref->update_index > w->max_update_index)
+ return REFTABLE_API_ERROR;
+
+ rec.u.ref.update_index -= w->min_update_index;
+
+ err = writer_add_record(w, &rec);
+ if (err < 0)
+ goto out;
+
+ if (!w->opts.skip_index_objects && reftable_ref_record_val1(ref)) {
+ reftable_buf_reset(&w->scratch);
+ err = reftable_buf_add(&w->scratch, (char *)reftable_ref_record_val1(ref),
+ hash_size(w->opts.hash_id));
+ if (err < 0)
+ goto out;
+
+ err = writer_index_hash(w, &w->scratch);
+ if (err < 0)
+ goto out;
+ }
+
+ if (!w->opts.skip_index_objects && reftable_ref_record_val2(ref)) {
+ reftable_buf_reset(&w->scratch);
+ err = reftable_buf_add(&w->scratch, reftable_ref_record_val2(ref),
+ hash_size(w->opts.hash_id));
+ if (err < 0)
+ goto out;
+
+ err = writer_index_hash(w, &w->scratch);
+ if (err < 0)
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ return err;
+}
+
+int reftable_writer_add_refs(struct reftable_writer *w,
+ struct reftable_ref_record *refs, int n)
+{
+ int err = 0;
+ int i = 0;
+ QSORT(refs, n, reftable_ref_record_compare_name);
+ for (i = 0; err == 0 && i < n; i++) {
+ err = reftable_writer_add_ref(w, &refs[i]);
+ }
+ return err;
+}
+
+static int reftable_writer_add_log_verbatim(struct reftable_writer *w,
+ struct reftable_log_record *log)
+{
+ struct reftable_record rec = {
+ .type = REFTABLE_BLOCK_TYPE_LOG,
+ .u = {
+ .log = *log,
+ },
+ };
+ if (w->block_writer &&
+ block_writer_type(w->block_writer) == REFTABLE_BLOCK_TYPE_REF) {
+ int err = writer_finish_public_section(w);
+ if (err < 0)
+ return err;
+ }
+
+ w->next -= w->pending_padding;
+ w->pending_padding = 0;
+ return writer_add_record(w, &rec);
+}
+
+int reftable_writer_add_log(struct reftable_writer *w,
+ struct reftable_log_record *log)
+{
+ char *input_log_message = NULL;
+ struct reftable_buf cleaned_message = REFTABLE_BUF_INIT;
+ int err = 0;
+
+ if (log->value_type == REFTABLE_LOG_DELETION)
+ return reftable_writer_add_log_verbatim(w, log);
+
+ /*
+ * Verify only the upper limit of the update_index. Each reflog entry
+ * is tied to a specific update_index. Entries in the reflog can be
+ * replaced by adding a new entry with the same update_index,
+ * effectively canceling the old one.
+ *
+ * Consequently, reflog updates may include update_index values lower
+ * than the writer's min_update_index.
+ */
+ if (log->update_index > w->max_update_index)
+ return REFTABLE_API_ERROR;
+
+ if (!log->refname)
+ return REFTABLE_API_ERROR;
+
+ input_log_message = log->value.update.message;
+ if (!w->opts.exact_log_message && log->value.update.message) {
+ err = reftable_buf_addstr(&cleaned_message, log->value.update.message);
+ if (err < 0)
+ goto done;
+
+ while (cleaned_message.len &&
+ cleaned_message.buf[cleaned_message.len - 1] == '\n') {
+ err = reftable_buf_setlen(&cleaned_message,
+ cleaned_message.len - 1);
+ if (err < 0)
+ goto done;
+ }
+ if (strchr(cleaned_message.buf, '\n')) {
+ /* multiple lines not allowed. */
+ err = REFTABLE_API_ERROR;
+ goto done;
+ }
+
+ err = reftable_buf_addstr(&cleaned_message, "\n");
+ if (err < 0)
+ goto done;
+
+ log->value.update.message = cleaned_message.buf;
+ }
+
+ err = reftable_writer_add_log_verbatim(w, log);
+ log->value.update.message = input_log_message;
+done:
+ reftable_buf_release(&cleaned_message);
+ return err;
+}
+
+int reftable_writer_add_logs(struct reftable_writer *w,
+ struct reftable_log_record *logs, int n)
+{
+ int err = 0;
+ int i = 0;
+ QSORT(logs, n, reftable_log_record_compare_key);
+
+ for (i = 0; err == 0 && i < n; i++) {
+ err = reftable_writer_add_log(w, &logs[i]);
+ }
+ return err;
+}
+
+static int writer_finish_section(struct reftable_writer *w)
+{
+ struct reftable_block_stats *bstats = NULL;
+ uint8_t typ = block_writer_type(w->block_writer);
+ uint64_t index_start = 0;
+ int max_level = 0;
+ size_t threshold = w->opts.unpadded ? 1 : 3;
+ int before_blocks = w->stats.idx_stats.blocks;
+ int err;
+
+ err = writer_flush_block(w);
+ if (err < 0)
+ return err;
+
+ /*
+ * When the section we are about to index has a lot of blocks then the
+ * index itself may span across multiple blocks, as well. This would
+ * require a linear scan over index blocks only to find the desired
+ * indexed block, which is inefficient. Instead, we write a multi-level
+ * index where index records of level N+1 will refer to index blocks of
+ * level N. This isn't constant time, either, but at least logarithmic.
+ *
+ * This loop handles writing this multi-level index. Note that we write
+ * the lowest-level index pointing to the indexed blocks first. We then
+ * continue writing additional index levels until the current level has
+ * less blocks than the threshold so that the highest level will be at
+ * the end of the index section.
+ *
+ * Readers are thus required to start reading the index section from
+ * its end, which is why we set `index_start` to the beginning of the
+ * last index section.
+ */
+ while (w->index_len > threshold) {
+ struct reftable_index_record *idx = NULL;
+ size_t i, idx_len;
+
+ max_level++;
+ index_start = w->next;
+ err = writer_reinit_block_writer(w, REFTABLE_BLOCK_TYPE_INDEX);
+ if (err < 0)
+ return err;
+
+ idx = w->index;
+ idx_len = w->index_len;
+
+ w->index = NULL;
+ w->index_len = 0;
+ w->index_cap = 0;
+ for (i = 0; i < idx_len; i++) {
+ struct reftable_record rec = {
+ .type = REFTABLE_BLOCK_TYPE_INDEX,
+ .u = {
+ .idx = idx[i],
+ },
+ };
+
+ err = writer_add_record(w, &rec);
+ if (err < 0)
+ return err;
+ }
+
+ err = writer_flush_block(w);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i < idx_len; i++)
+ reftable_buf_release(&idx[i].last_key);
+ reftable_free(idx);
+ }
+
+ /*
+ * The index may still contain fewer index records than the threshold.
+ * Clear it so that these entries don't leak into the next
+ * index section.
+ */
+ writer_clear_index(w);
+
+ bstats = writer_reftable_block_stats(w, typ);
+ bstats->index_blocks = w->stats.idx_stats.blocks - before_blocks;
+ bstats->index_offset = index_start;
+ bstats->max_index_level = max_level;
+
+ /* Reset last_key, as the next section can start with any key. */
+ reftable_buf_reset(&w->last_key);
+
+ return 0;
+}
+
+struct common_prefix_arg {
+ struct reftable_buf *last;
+ size_t max;
+};
+
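+/*
+ * Tree-walk callback: track the longest common prefix shared by any two
+ * consecutive object hashes. The result determines how many bytes of
+ * each object ID the object index needs to store.
+ */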
+static void update_common(void *void_arg, void *key)
+{
+ struct common_prefix_arg *arg = void_arg;
+ struct obj_index_tree_node *entry = key;
+ if (arg->last) {
+ size_t n = common_prefix_size(&entry->hash, arg->last);
+ if (n > arg->max)
+ arg->max = n;
+ }
+ arg->last = &entry->hash;
+}
+
+struct write_record_arg {
+ struct reftable_writer *w;
+ int err;
+};
+
+static void write_object_record(void *void_arg, void *key)
+{
+ struct write_record_arg *arg = void_arg;
+ struct obj_index_tree_node *entry = key;
+ struct reftable_record
+ rec = { .type = REFTABLE_BLOCK_TYPE_OBJ,
+ .u.obj = {
+ .hash_prefix = (uint8_t *)entry->hash.buf,
+ .hash_prefix_len = arg->w->stats.object_id_len,
+ .offsets = entry->offsets,
+ .offset_len = entry->offset_len,
+ } };
+ if (arg->err < 0)
+ goto done;
+
+ /*
+ * Try to add the record to the writer. If this succeeds then we're
+ * done. Otherwise the block writer may have hit the block size limit
+ * and needs to be flushed.
+ */
+ arg->err = block_writer_add(arg->w->block_writer, &rec);
+ if (arg->err == 0)
+ goto done;
+
+ if (arg->err != REFTABLE_ENTRY_TOO_BIG_ERROR)
+ goto done;
+
+ /*
+ * The current block is full, so we need to flush and reinitialize the
+ * writer to start writing the next block.
+ */
+ arg->err = writer_flush_block(arg->w);
+ if (arg->err < 0)
+ goto done;
+
+ arg->err = writer_reinit_block_writer(arg->w, REFTABLE_BLOCK_TYPE_OBJ);
+ if (arg->err < 0)
+ goto done;
+
+ /*
+ * If this still fails then we may need to reset the record's offset
+ * length to reduce the amount of data to be written.
+ */
+ arg->err = block_writer_add(arg->w->block_writer, &rec);
+ if (arg->err == 0)
+ goto done;
+
+ if (arg->err != REFTABLE_ENTRY_TOO_BIG_ERROR)
+ goto done;
+
+ rec.u.obj.offset_len = 0;
+ arg->err = block_writer_add(arg->w->block_writer, &rec);
+
+ /* Should be able to write into a fresh block. */
+ assert(arg->err == 0);
+
+done:;
+}
+
+static void object_record_free(void *void_arg REFTABLE_UNUSED, void *key)
+{
+ struct obj_index_tree_node *entry = key;
+
+ REFTABLE_FREE_AND_NULL(entry->offsets);
+ reftable_buf_release(&entry->hash);
+ reftable_free(entry);
+}
+
+static int writer_dump_object_index(struct reftable_writer *w)
+{
+ struct write_record_arg closure = { .w = w };
+ struct common_prefix_arg common = {
+ .max = 1, /* obj_id_len should be >= 2. */
+ };
+ int err;
+
+ if (w->obj_index_tree)
+ infix_walk(w->obj_index_tree, &update_common, &common);
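+ /*
+ * One byte more than the longest shared prefix is enough to tell any
+ * two object IDs in the table apart.
+ */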
+ w->stats.object_id_len = common.max + 1;
+
+ err = writer_reinit_block_writer(w, REFTABLE_BLOCK_TYPE_OBJ);
+ if (err < 0)
+ return err;
+
+ if (w->obj_index_tree)
+ infix_walk(w->obj_index_tree, &write_object_record, &closure);
+
+ if (closure.err < 0)
+ return closure.err;
+ return writer_finish_section(w);
+}
+
+static int writer_finish_public_section(struct reftable_writer *w)
+{
+ uint8_t typ = 0;
+ int err = 0;
+
+ if (!w->block_writer)
+ return 0;
+
+ typ = block_writer_type(w->block_writer);
+ err = writer_finish_section(w);
+ if (err < 0)
+ return err;
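+
+ /*
+ * Ref sections that are large enough to have an index also get an
+ * object index, which maps object IDs back to the refs that point at
+ * them, unless the user opted out of it.
+ */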
+ if (typ == REFTABLE_BLOCK_TYPE_REF && !w->opts.skip_index_objects &&
+ w->stats.ref_stats.index_blocks > 0) {
+ err = writer_dump_object_index(w);
+ if (err < 0)
+ return err;
+ }
+
+ if (w->obj_index_tree) {
+ infix_walk(w->obj_index_tree, &object_record_free, NULL);
+ tree_free(w->obj_index_tree);
+ w->obj_index_tree = NULL;
+ }
+
+ w->block_writer = NULL;
+ return 0;
+}
+
+int reftable_writer_close(struct reftable_writer *w)
+{
+ uint8_t footer[72];
+ uint8_t *p = footer;
+ int err = writer_finish_public_section(w);
+ int empty_table = w->next == 0;
+ if (err != 0)
+ goto done;
+ w->pending_padding = 0;
+ if (empty_table) {
+ /* Empty tables need a header anyway. */
+ uint8_t header[28];
+ int n = writer_write_header(w, header);
+ err = padded_write(w, header, n, 0);
+ if (err < 0)
+ goto done;
+ }
+
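+ /*
+ * The footer repeats the header and then stores, in order: the ref
+ * index offset, the object section offset combined with the object ID
+ * prefix length, the object index offset, the log section offset, the
+ * log index offset, and finally a CRC-32 checksum over the footer.
+ */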
+ p += writer_write_header(w, footer);
+ reftable_put_be64(p, w->stats.ref_stats.index_offset);
+ p += 8;
+ reftable_put_be64(p, (w->stats.obj_stats.offset) << 5 | w->stats.object_id_len);
+ p += 8;
+ reftable_put_be64(p, w->stats.obj_stats.index_offset);
+ p += 8;
+
+ reftable_put_be64(p, w->stats.log_stats.offset);
+ p += 8;
+ reftable_put_be64(p, w->stats.log_stats.index_offset);
+ p += 8;
+
+ reftable_put_be32(p, crc32(0, footer, p - footer));
+ p += 4;
+
+ err = w->flush(w->write_arg);
+ if (err < 0) {
+ err = REFTABLE_IO_ERROR;
+ goto done;
+ }
+
+ err = padded_write(w, footer, footer_size(writer_version(w)), 0);
+ if (err < 0)
+ goto done;
+
+ if (empty_table) {
+ err = REFTABLE_EMPTY_TABLE_ERROR;
+ goto done;
+ }
+
+done:
+ writer_release(w);
+ return err;
+}
+
+static void writer_clear_index(struct reftable_writer *w)
+{
+ for (size_t i = 0; w->index && i < w->index_len; i++)
+ reftable_buf_release(&w->index[i].last_key);
+ REFTABLE_FREE_AND_NULL(w->index);
+ w->index_len = 0;
+ w->index_cap = 0;
+}
+
+static int writer_flush_nonempty_block(struct reftable_writer *w)
+{
+ struct reftable_index_record index_record = {
+ .last_key = REFTABLE_BUF_INIT,
+ };
+ uint8_t typ = block_writer_type(w->block_writer);
+ struct reftable_block_stats *bstats;
+ int raw_bytes, padding = 0, err;
+ uint64_t block_typ_off;
+
+ /*
+ * Finish the current block. This will cause the block writer to emit
+ * restart points and potentially compress records in case we are
+ * writing a log block.
+ *
+ * Note that this is still happening in memory.
+ */
+ raw_bytes = block_writer_finish(w->block_writer);
+ if (raw_bytes < 0)
+ return raw_bytes;
+
+ /*
+ * By default, all records except for log records are padded to the
+ * block size.
+ */
+ if (!w->opts.unpadded && typ != REFTABLE_BLOCK_TYPE_LOG)
+ padding = w->opts.block_size - raw_bytes;
+
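+ /*
+ * Update the statistics for this block type. The offset of the first
+ * block of a type marks where its section starts in the table.
+ */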
+ bstats = writer_reftable_block_stats(w, typ);
+ block_typ_off = (bstats->blocks == 0) ? w->next : 0;
+ if (block_typ_off > 0)
+ bstats->offset = block_typ_off;
+ bstats->entries += w->block_writer->entries;
+ bstats->restarts += w->block_writer->restart_len;
+ bstats->blocks++;
+ w->stats.blocks++;
+
+ /*
+ * If this is the first block we're writing to the table then we need
+ * to also write the reftable header.
+ */
+ if (!w->next)
+ writer_write_header(w, w->block);
+
+ err = padded_write(w, w->block, raw_bytes, padding);
+ if (err < 0)
+ return err;
+
+ /*
+ * Add an index record for every block that we're writing. If we end up
+ * with more index records than the threshold we will write an index
+ * section in `writer_finish_section()`. Each index record
+ * contains the last record key of the block it is indexing as well as
+ * the offset of that block.
+ *
+ * Note that this also applies when flushing index blocks, in which
+ * case we will end up with a multi-level index.
+ */
+ REFTABLE_ALLOC_GROW_OR_NULL(w->index, w->index_len + 1, w->index_cap);
+ if (!w->index)
+ return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+ index_record.offset = w->next;
+ reftable_buf_reset(&index_record.last_key);
+ err = reftable_buf_add(&index_record.last_key, w->block_writer->last_key.buf,
+ w->block_writer->last_key.len);
+ if (err < 0)
+ return err;
+ w->index[w->index_len] = index_record;
+ w->index_len++;
+
+ w->next += padding + raw_bytes;
+ w->block_writer = NULL;
+
+ return 0;
+}
+
+static int writer_flush_block(struct reftable_writer *w)
+{
+ if (!w->block_writer)
+ return 0;
+ if (w->block_writer->entries == 0)
+ return 0;
+ return writer_flush_nonempty_block(w);
+}
+
+const struct reftable_stats *reftable_writer_stats(struct reftable_writer *w)
+{
+ return &w->stats;
+}
diff --git a/reftable/writer.h b/reftable/writer.h
new file mode 100644
index 0000000000..9f53610b27
--- /dev/null
+++ b/reftable/writer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file or at
+ * https://developers.google.com/open-source/licenses/bsd
+ */
+
+#ifndef WRITER_H
+#define WRITER_H
+
+#include "basics.h"
+#include "block.h"
+#include "tree.h"
+#include "reftable-writer.h"
+
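+/*
+ * A rough sketch of how a writer is typically driven (error handling
+ * omitted; the entry points below are declared in reftable-writer.h):
+ *
+ *   reftable_writer_set_limits(w, min_update_index, max_update_index);
+ *   reftable_writer_add_ref(w, &ref);    - refs first, sorted by name
+ *   reftable_writer_add_log(w, &log);    - then logs
+ *   reftable_writer_close(w);
+ */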
+struct reftable_writer {
+ ssize_t (*write)(void *, const void *, size_t);
+ int (*flush)(void *);
+ void *write_arg;
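+ /* Padding owed after the last written block. It is dropped instead
+ * of written when a log section follows or the table is closed. */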
+ int pending_padding;
+ struct reftable_buf last_key;
+ /* Scratch buffer used to avoid allocations. */
+ struct reftable_buf scratch;
+
+ /* offset of next block to write. */
+ uint64_t next;
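+ /* Range of update indices for the records written to this table. */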
+ uint64_t min_update_index, max_update_index;
+ struct reftable_write_options opts;
+
+ /* memory buffer for writing */
+ uint8_t *block;
+
+ /* Writer for the current section; NULL or points to
+ * block_writer_data. */
+ struct block_writer *block_writer;
+
+ struct block_writer block_writer_data;
+
+ /* pending index records for the current section */
+ struct reftable_index_record *index;
+ size_t index_len;
+ size_t index_cap;
+
+ /*
+ * Tree for use with tsearch(3); used to populate the 'o' inverse OID
+ * map.
+ */
+ struct tree_node *obj_index_tree;
+
+ struct reftable_stats stats;
+};
+
+#endif