Merge branch 'hx/unpack-streaming'
Allow large objects read from a packstream to be streamed into a
loose object file straight, without having to keep it in-core as a
whole.

* hx/unpack-streaming:
  unpack-objects: use stream_loose_object() to unpack large objects
  core doc: modernize core.bigFileThreshold documentation
  object-file.c: add "stream_loose_object()" to handle large object
  object-file.c: factor out deflate part of write_loose_object()
  object-file.c: refactor write_loose_object() to several steps
  unpack-objects: low memory footprint for get_data() in dry_run mode
commit 73b9ef6ab1
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -444,17 +444,32 @@ You probably do not need to adjust this value.
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
 core.bigFileThreshold::
-	Files larger than this size are stored deflated, without
-	attempting delta compression. Storing large files without
-	delta compression avoids excessive memory usage, at the
-	slight expense of increased disk usage. Additionally files
-	larger than this size are always treated as binary.
+	The size of files considered "big", which as discussed below
+	changes the behavior of numerous git commands, as well as how
+	such files are stored within the repository. The default is
+	512 MiB. Common unit suffixes of 'k', 'm', or 'g' are
+	supported.
 +
-Default is 512 MiB on all platforms. This should be reasonable
-for most projects as source code and other text files can still
-be delta compressed, but larger binary media files won't be.
+Files above the configured limit will be:
 +
-Common unit suffixes of 'k', 'm', or 'g' are supported.
+* Stored deflated in packfiles, without attempting delta compression.
++
+The default limit is primarily set with this use-case in mind. With it,
+most projects will have their source code and other text files delta
+compressed, but not larger binary media files.
++
+Storing large files without delta compression avoids excessive memory
+usage, at the slight expense of increased disk usage.
++
+* Will be treated as if they were labeled "binary" (see
+  linkgit:gitattributes[5]). e.g. linkgit:git-log[1] and
+  linkgit:git-diff[1] will not compute diffs for files above this limit.
++
+* Will generally be streamed when written, which avoids excessive
+  memory usage, at the cost of some fixed overhead. Commands that make
+  use of this include linkgit:git-archive[1],
+  linkgit:git-fast-import[1], linkgit:git-index-pack[1],
+  linkgit:git-unpack-objects[1] and linkgit:git-fsck[1].
 
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
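Note: the revised text makes explicit that the threshold is configurable with the usual unit suffixes. For example, `git config core.bigFileThreshold 100m` lowers the bar so more blobs take the no-delta, streamed paths described above (the t5351 tests added below set it as low as 1m).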
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -97,15 +97,27 @@ static void use(int bytes)
 		display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress zstream from the standard input into a newly
+ * allocated buffer of specified size and return the buffer.
+ * The caller is responsible to free the returned buffer.
+ *
+ * But for dry_run mode, "get_data()" is only used to check the
+ * integrity of data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" will release the small
+ * allocated buffer which is reused to hold temporary zstream output
+ * and return NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
+	void *buf = xmallocz(bufsize);
 
 	memset(&stream, 0, sizeof(stream));
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize > size - stream.total_out ?
+						   size - stream.total_out :
+						   bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
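The dry_run change above caps get_data()'s allocation at 8K and keeps reusing that one buffer just to drive the inflate loop, discarding the output. The same idea in plain zlib terms, as a standalone sketch (not part of this commit; check_inflate() is a hypothetical name):

    #include <string.h>
    #include <zlib.h>

    /*
     * Inflate "in" through a small fixed buffer, discarding the output,
     * and report whether the stream is intact and inflates to exactly
     * "expect" bytes. Memory use stays at the size of one small buffer.
     */
    static int check_inflate(const unsigned char *in, size_t inlen,
                             unsigned long expect)
    {
        unsigned char out[8192];
        z_stream zs;
        int status;

        memset(&zs, 0, sizeof(zs));
        if (inflateInit(&zs) != Z_OK)
            return -1;
        zs.next_in = (unsigned char *)in;
        zs.avail_in = inlen;
        do {
            /* rewind the output pointers; the bytes are thrown away */
            zs.next_out = out;
            zs.avail_out = sizeof(out);
            status = inflate(&zs, Z_NO_FLUSH);
        } while (status == Z_OK);
        inflateEnd(&zs);
        return (status == Z_STREAM_END && zs.total_out == expect) ? 0 : -1;
    }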
@@ -326,10 +347,70 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
+}
+
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (in_stream->is_finished) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+
+	in_stream->is_finished = data->status != Z_OK;
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void stream_blob(unsigned long size, unsigned nr)
+{
+	git_zstream zstream = { 0 };
+	struct input_zstream_data data = { 0 };
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+	struct obj_info *info = &obj_list[nr];
+
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (stream_loose_object(&in_stream, size, &info->oid))
+		die(_("failed to write object in stream"));
+
+	if (data.status != Z_STREAM_END)
+		die(_("inflate returned (%d)"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob = lookup_blob(the_repository, &info->oid);
+
+		if (!blob)
+			die(_("invalid blob object from stream"));
+		blob->object.flags |= FLAG_WRITTEN;
+	}
+	info->obj = NULL;
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +440,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 	oidread(&base_oid, fill(the_hash_algo->rawsz));
 	use(the_hash_algo->rawsz);
 	delta_data = get_data(delta_size);
-	if (dry_run || !delta_data) {
-		free(delta_data);
+	if (!delta_data)
 		return;
-	}
 	if (has_object_file(&base_oid))
 		; /* Ok we have this one */
 	else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +477,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
@@ -468,9 +545,14 @@ static void unpack_one(unsigned nr)
 	}
 
 	switch (type) {
+	case OBJ_BLOB:
+		if (!dry_run && size > big_file_threshold) {
+			stream_blob(size, nr);
+			return;
+		}
+		/* fallthrough */
 	case OBJ_COMMIT:
 	case OBJ_TREE:
-	case OBJ_BLOB:
 	case OBJ_TAG:
 		unpack_non_delta_entry(type, size, nr);
 		return;
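With this dispatch, a blob above core.bigFileThreshold never goes through get_data() in a real run: stream_blob() inflates the pack data 8K at a time via feed_input_zstream() and lets stream_loose_object() write it out incrementally. Only dry-run blobs still fall through to unpack_non_delta_entry(), where get_data() now verifies the zstream with its small reusable buffer and returns NULL.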
--- a/object-file.c
+++ b/object-file.c
@@ -1951,6 +1951,96 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+/**
+ * Common steps for loose object writers to start writing loose
+ * objects:
+ *
+ * - Create tmpfile for the loose object.
+ * - Setup zlib stream for compression.
+ * - Start to feed header to zlib stream.
+ *
+ * Returns a "fd", which should later be provided to
+ * end_loose_object_common().
+ */
+static int start_loose_object_common(struct strbuf *tmp_file,
+				     const char *filename, unsigned flags,
+				     git_zstream *stream,
+				     unsigned char *buf, size_t buflen,
+				     git_hash_ctx *c,
+				     char *hdr, int hdrlen)
+{
+	int fd;
+
+	fd = create_tmpfile(tmp_file, filename);
+	if (fd < 0) {
+		if (flags & HASH_SILENT)
+			return -1;
+		else if (errno == EACCES)
+			return error(_("insufficient permission for adding "
+				       "an object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(
+				_("unable to create temporary file"));
+	}
+
+	/* Setup zlib stream for compression */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = buf;
+	stream->avail_out = buflen;
+	the_hash_algo->init_fn(c);
+
+	/* Start to feed header to zlib stream */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+
+	return fd;
+}
+
+/**
+ * Common steps for the inner git_deflate() loop for writing loose
+ * objects. Returns what git_deflate() returns.
+ */
+static int write_loose_object_common(git_hash_ctx *c,
+				     git_zstream *stream, const int flush,
+				     unsigned char *in0, const int fd,
+				     unsigned char *compressed,
+				     const size_t compressed_len)
+{
+	int ret;
+
+	ret = git_deflate(stream, flush ? Z_FINISH : 0);
+	the_hash_algo->update_fn(c, in0, stream->next_in - in0);
+	if (write_buffer(fd, compressed, stream->next_out - compressed) < 0)
+		die(_("unable to write loose object file"));
+	stream->next_out = compressed;
+	stream->avail_out = compressed_len;
+
+	return ret;
+}
+
+/**
+ * Common steps for loose object writers to end writing loose objects:
+ *
+ * - End the compression of zlib stream.
+ * - Get the calculated oid to "oid".
+ */
+static int end_loose_object_common(git_hash_ctx *c, git_zstream *stream,
+				   struct object_id *oid)
+{
+	int ret;
+
+	ret = git_deflate_end_gently(stream);
+	if (ret != Z_OK)
+		return ret;
+	the_hash_algo->final_oid_fn(oid, c);
+
+	return Z_OK;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
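The split is deliberately three-phase: start_loose_object_common() creates the tmpfile, initializes deflate, and hashes and compresses the header; write_loose_object_common() is one step of the deflate-hash-write loop; end_loose_object_common() finishes the zlib stream and finalizes the oid. write_loose_object() below and the new stream_loose_object() differ only in where their input bytes come from between start and end.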
@@ -1968,50 +2058,29 @@ static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
-
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0)
+		return -1;
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
 		unsigned char *in0 = stream.next_in;
-		ret = git_deflate(&stream, Z_FINISH);
-		the_hash_algo->update_fn(&c, in0, stream.next_in - in0);
-		if (write_buffer(fd, compressed, stream.next_out - compressed) < 0)
-			die(_("unable to write loose object file"));
-		stream.next_out = compressed;
-		stream.avail_out = sizeof(compressed);
+
+		ret = write_loose_object_common(&c, &stream, 1, in0, fd,
+						compressed, sizeof(compressed));
 	} while (ret == Z_OK);
 
 	if (ret != Z_STREAM_END)
 		die(_("unable to deflate new object %s (%d)"), oid_to_hex(oid),
 		    ret);
-	ret = git_deflate_end_gently(&stream);
+	ret = end_loose_object_common(&c, &stream, &parano_oid);
 	if (ret != Z_OK)
 		die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid),
 		    ret);
-	the_hash_algo->final_oid_fn(&parano_oid, &c);
 	if (!oideq(oid, &parano_oid))
 		die(_("confused by unstable object source data for %s"),
 		    oid_to_hex(oid));
@@ -2050,6 +2119,110 @@ static int freshen_packed_object(const struct object_id *oid)
 	return 1;
 }
 
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid)
+{
+	int fd, ret, err = 0, flush = 0;
+	unsigned char compressed[4096];
+	git_zstream stream;
+	git_hash_ctx c;
+	struct strbuf tmp_file = STRBUF_INIT;
+	struct strbuf filename = STRBUF_INIT;
+	int dirlen;
+	char hdr[MAX_HEADER_LEN];
+	int hdrlen;
+
+	if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT))
+		prepare_loose_object_bulk_checkin();
+
+	/* Since oid is not determined, save tmp file to odb path. */
+	strbuf_addf(&filename, "%s/", get_object_directory());
+	hdrlen = format_object_header(hdr, sizeof(hdr), OBJ_BLOB, len);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * start writing loose objects:
+	 *
+	 *  - Create tmpfile for the loose object.
+	 *  - Setup zlib stream for compression.
+	 *  - Start to feed header to zlib stream.
+	 */
+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
+				       &stream, compressed, sizeof(compressed),
+				       &c, hdr, hdrlen);
+	if (fd < 0) {
+		err = -1;
+		goto cleanup;
+	}
+
+	/* Then the data itself.. */
+	do {
+		unsigned char *in0 = stream.next_in;
+
+		if (!stream.avail_in && !in_stream->is_finished) {
+			const void *in = in_stream->read(in_stream, &stream.avail_in);
+			stream.next_in = (void *)in;
+			in0 = (unsigned char *)in;
+			/* All data has been read. */
+			if (in_stream->is_finished)
+				flush = 1;
+		}
+		ret = write_loose_object_common(&c, &stream, flush, in0, fd,
+						compressed, sizeof(compressed));
+		/*
+		 * Unlike write_loose_object(), we do not have the entire
+		 * buffer. If we get Z_BUF_ERROR due to too few input bytes,
+		 * then we'll replenish them in the next input_stream->read()
+		 * call when we loop.
+		 */
+	} while (ret == Z_OK || ret == Z_BUF_ERROR);
+
+	if (stream.total_in != len + hdrlen)
+		die(_("write stream object %ld != %"PRIuMAX), stream.total_in,
+		    (uintmax_t)len + hdrlen);
+
+	/*
+	 * Common steps for write_loose_object and stream_loose_object to
+	 * end writing loose object:
+	 *
+	 *  - End the compression of zlib stream.
+	 *  - Get the calculated oid.
+	 */
+	if (ret != Z_STREAM_END)
+		die(_("unable to stream deflate new object (%d)"), ret);
+	ret = end_loose_object_common(&c, &stream, oid);
+	if (ret != Z_OK)
+		die(_("deflateEnd on stream object failed (%d)"), ret);
+	close_loose_object(fd, tmp_file.buf);
+
+	if (freshen_packed_object(oid) || freshen_loose_object(oid)) {
+		unlink_or_warn(tmp_file.buf);
+		goto cleanup;
+	}
+
+	loose_object_path(the_repository, &filename, oid);
+
+	/* We finally know the object path, and create the missing dir. */
+	dirlen = directory_size(filename.buf);
+	if (dirlen) {
+		struct strbuf dir = STRBUF_INIT;
+		strbuf_add(&dir, filename.buf, dirlen);
+
+		if (mkdir_in_gitdir(dir.buf) && errno != EEXIST) {
+			err = error_errno(_("unable to create directory %s"), dir.buf);
+			strbuf_release(&dir);
+			goto cleanup;
+		}
+		strbuf_release(&dir);
+	}
+
+	err = finalize_object_file(tmp_file.buf, filename.buf);
+cleanup:
+	strbuf_release(&tmp_file);
+	strbuf_release(&filename);
+	return err;
+}
+
 int write_object_file_flags(const void *buf, unsigned long len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
--- a/object-store.h
+++ b/object-store.h
@@ -46,6 +46,12 @@ struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	int is_finished;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)
 
@@ -269,6 +275,8 @@ static inline int write_object_file(const void *buf, unsigned long len,
 int write_object_file_literally(const void *buf, unsigned long len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
+int stream_loose_object(struct input_stream *in_stream, size_t len,
+			struct object_id *oid);
 
 /*
  * Add an object file to the in-memory object store, without writing it
--- /dev/null
+++ b/t/t5351-unpack-large-objects.sh
new file mode 100755
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# Copyright (c) 2022 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git &&
+	git -C dest.git config core.bigFileThreshold "$1"
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs pack) &&
+	git verify-pack -v pack-$PACK.pack >out &&
+	sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
+		<out >obj-list
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest 2m &&
+	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest 2m &&
+	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
+	test_stdout_line_count = 0 find dest.git/objects -type f &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+	test_dir_is_empty dest.git/objects/pack
+'
+
+BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
+
+test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
+	prepare_dest 1m &&
+	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
+	git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
+	grep fsync/hardware-flush trace2.txt &&
+	test_dir_is_empty dest.git/objects/pack &&
+	git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
+	cmp obj-list current
+'
+
+test_expect_success 'do not unpack existing large objects' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <pack-$PACK.pack &&
+	git -C dest.git unpack-objects <pack-$PACK.pack &&
+
+	# The destination came up with the exact same pack...
+	DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
+	test_cmp pack-$PACK.pack $DEST_PACK &&
+
+	# ...and wrote no loose objects
+	test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
+'
+
+test_done
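The memory-limit tests lean on GIT_ALLOC_LIMIT, which makes xmalloc() and friends die ("attempting to allocate ... over limit") for any single allocation above the limit; unpacking 1.5 MB blobs under a 1 MB cap therefore only passes if no code path ever buffers a whole object.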