From 47a02ff2ca821c52268197dd5fa46cd60a2e94bc Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 7 Mar 2012 17:54:15 +0700 Subject: [PATCH 1/7] streaming: make streaming-write-entry to be more reusable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The static function in entry.c takes a cache entry and streams its blob contents to a file in the working tree. Refactor the logic to a new API function stream_blob_to_fd() that takes an object name and an open file descriptor, so that it can be reused by other callers. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- entry.c | 53 +++++---------------------------------------------- streaming.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++ streaming.h | 2 ++ 3 files changed, 62 insertions(+), 48 deletions(-) diff --git a/entry.c b/entry.c index 852fea1395..17a6bccec6 100644 --- a/entry.c +++ b/entry.c @@ -120,58 +120,15 @@ static int streaming_write_entry(struct cache_entry *ce, char *path, const struct checkout *state, int to_tempfile, int *fstat_done, struct stat *statbuf) { - struct git_istream *st; - enum object_type type; - unsigned long sz; int result = -1; - ssize_t kept = 0; - int fd = -1; - - st = open_istream(ce->sha1, &type, &sz, filter); - if (!st) - return -1; - if (type != OBJ_BLOB) - goto close_and_exit; + int fd; fd = open_output_fd(path, ce, to_tempfile); - if (fd < 0) - goto close_and_exit; - - for (;;) { - char buf[1024 * 16]; - ssize_t wrote, holeto; - ssize_t readlen = read_istream(st, buf, sizeof(buf)); - - if (!readlen) - break; - if (sizeof(buf) == readlen) { - for (holeto = 0; holeto < readlen; holeto++) - if (buf[holeto]) - break; - if (readlen == holeto) { - kept += holeto; - continue; - } - } - - if (kept && lseek(fd, kept, SEEK_CUR) == (off_t) -1) - goto close_and_exit; - else - kept = 0; - wrote = write_in_full(fd, buf, readlen); - - if (wrote != readlen) - goto close_and_exit; - } - if (kept && (lseek(fd, kept - 1, SEEK_CUR) == (off_t) -1 || - write(fd, "", 1) != 1)) - goto close_and_exit; - *fstat_done = fstat_output(fd, state, statbuf); - -close_and_exit: - close_istream(st); - if (0 <= fd) + if (0 <= fd) { + result = stream_blob_to_fd(fd, ce->sha1, filter, 1); + *fstat_done = fstat_output(fd, state, statbuf); result = close(fd); + } if (result && 0 <= fd) unlink(path); return result; diff --git a/streaming.c b/streaming.c index 71072e1b1d..7e7ee2be6f 100644 --- a/streaming.c +++ b/streaming.c @@ -489,3 +489,58 @@ static open_method_decl(incore) return st->u.incore.buf ? 
0 : -1; } + + +/**************************************************************** + * Users of streaming interface + ****************************************************************/ + +int stream_blob_to_fd(int fd, unsigned const char *sha1, struct stream_filter *filter, + int can_seek) +{ + struct git_istream *st; + enum object_type type; + unsigned long sz; + ssize_t kept = 0; + int result = -1; + + st = open_istream(sha1, &type, &sz, filter); + if (!st) + return result; + if (type != OBJ_BLOB) + goto close_and_exit; + for (;;) { + char buf[1024 * 16]; + ssize_t wrote, holeto; + ssize_t readlen = read_istream(st, buf, sizeof(buf)); + + if (!readlen) + break; + if (can_seek && sizeof(buf) == readlen) { + for (holeto = 0; holeto < readlen; holeto++) + if (buf[holeto]) + break; + if (readlen == holeto) { + kept += holeto; + continue; + } + } + + if (kept && lseek(fd, kept, SEEK_CUR) == (off_t) -1) + goto close_and_exit; + else + kept = 0; + wrote = write_in_full(fd, buf, readlen); + + if (wrote != readlen) + goto close_and_exit; + } + if (kept && (lseek(fd, kept - 1, SEEK_CUR) == (off_t) -1 || + write(fd, "", 1) != 1)) + goto close_and_exit; + result = 0; + + close_and_exit: + close_istream(st); + return result; +} diff --git a/streaming.h b/streaming.h index 589e857b8c..3e827709c8 100644 --- a/streaming.h +++ b/streaming.h @@ -12,4 +12,6 @@ extern struct git_istream *open_istream(const unsigned char *, enum object_type extern int close_istream(struct git_istream *); extern ssize_t read_istream(struct git_istream *, char *, size_t); +extern int stream_blob_to_fd(int fd, const unsigned char *, struct stream_filter *, int can_seek); + #endif /* STREAMING_H */ From d41489a6424308dc9a0409bc2f6845aa08bd4f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:16 +0700 Subject: [PATCH 2/7] Add more large blob test cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New test cases list commands that should work when memory is limited. All memory allocation functions (*) learn to reject any allocation larger than $GIT_ALLOC_LIMIT if set. (*) Not exactly all. Some places do not use x* functions, but malloc/calloc directly, notably diff-delta. These code path should never be run on large blobs. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t1050-large.sh | 38 ++++++++++++++++++++++++++++++++++++-- wrapper.c | 27 ++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 29d6024b7f..ded66b3228 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -6,11 +6,15 @@ test_description='adding and checking out large blobs' . 
./test-lib.sh test_expect_success setup ' - git config core.bigfilethreshold 200k && + # clone does not allow us to pass core.bigfilethreshold to + # new repos, so set core.bigfilethreshold globally + git config --global core.bigfilethreshold 200k && echo X | dd of=large1 bs=1k seek=2000 && echo X | dd of=large2 bs=1k seek=2000 && echo X | dd of=large3 bs=1k seek=2000 && - echo Y | dd of=huge bs=1k seek=2500 + echo Y | dd of=huge bs=1k seek=2500 && + GIT_ALLOC_LIMIT=1500 && + export GIT_ALLOC_LIMIT ' test_expect_success 'add a large file or two' ' @@ -100,4 +104,34 @@ test_expect_success 'packsize limit' ' ) ' +test_expect_success 'diff --raw' ' + git commit -q -m initial && + echo modified >>large1 && + git add large1 && + git commit -q -m modified && + git diff --raw HEAD^ +' + +test_expect_success 'hash-object' ' + git hash-object large1 +' + +test_expect_failure 'cat-file a large file' ' + git cat-file blob :large1 >/dev/null +' + +test_expect_failure 'cat-file a large file from a tag' ' + git tag -m largefile largefiletag :large1 && + git cat-file blob largefiletag >/dev/null +' + +test_expect_failure 'git-show a large file' ' + git show :large1 >/dev/null + +' + +test_expect_failure 'repack' ' + git repack -ad +' + test_done diff --git a/wrapper.c b/wrapper.c index 85f09df747..6ccd0595f4 100644 --- a/wrapper.c +++ b/wrapper.c @@ -9,6 +9,18 @@ static void do_nothing(size_t size) static void (*try_to_free_routine)(size_t size) = do_nothing; +static void memory_limit_check(size_t size) +{ + static int limit = -1; + if (limit == -1) { + const char *env = getenv("GIT_ALLOC_LIMIT"); + limit = env ? atoi(env) * 1024 : 0; + } + if (limit && size > limit) + die("attempting to allocate %"PRIuMAX" over limit %d", + (intmax_t)size, limit); +} + try_to_free_t set_try_to_free_routine(try_to_free_t routine) { try_to_free_t old = try_to_free_routine; @@ -32,7 +44,10 @@ char *xstrdup(const char *str) void *xmalloc(size_t size) { - void *ret = malloc(size); + void *ret; + + memory_limit_check(size); + ret = malloc(size); if (!ret && !size) ret = malloc(1); if (!ret) { @@ -79,7 +94,10 @@ char *xstrndup(const char *str, size_t len) void *xrealloc(void *ptr, size_t size) { - void *ret = realloc(ptr, size); + void *ret; + + memory_limit_check(size); + ret = realloc(ptr, size); if (!ret && !size) ret = realloc(ptr, 1); if (!ret) { @@ -95,7 +113,10 @@ void *xrealloc(void *ptr, size_t size) void *xcalloc(size_t nmemb, size_t size) { - void *ret = calloc(nmemb, size); + void *ret; + + memory_limit_check(size * nmemb); + ret = calloc(nmemb, size); if (!ret && (!nmemb || !size)) ret = calloc(1, 1); if (!ret) { From 00c8fd493afbd1620febf2b895fb2365f76d5875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:17 +0700 Subject: [PATCH 3/7] cat-file: use streaming API to print blobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 25 +++++++++++++++++++++++++ t/t1050-large.sh | 4 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 8ed501f220..36a9104433 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -11,6 +11,7 @@ #include "parse-options.h" #include "diff.h" #include "userdiff.h" +#include "streaming.h" #define BATCH 1 #define BATCH_CHECK 2 @@ -127,6 +128,8 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name) return 
cmd_ls_tree(2, ls_args, NULL); } + if (type == OBJ_BLOB) + return stream_blob_to_fd(1, sha1, NULL, 0); buf = read_sha1_file(sha1, &type, &size); if (!buf) die("Cannot read object %s", obj_name); @@ -149,6 +152,28 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name) break; case 0: + if (type_from_string(exp_type) == OBJ_BLOB) { + unsigned char blob_sha1[20]; + if (sha1_object_info(sha1, NULL) == OBJ_TAG) { + enum object_type type; + unsigned long size; + char *buffer = read_sha1_file(sha1, &type, &size); + if (memcmp(buffer, "object ", 7) || + get_sha1_hex(buffer + 7, blob_sha1)) + die("%s not a valid tag", sha1_to_hex(sha1)); + free(buffer); + } else + hashcpy(blob_sha1, sha1); + + if (sha1_object_info(blob_sha1, NULL) == OBJ_BLOB) + return stream_blob_to_fd(1, blob_sha1, NULL, 0); + /* + * we attempted to dereference a tag to a blob + * and failed; there may be new dereference + * mechanisms this code is not aware of. + * fall-back to the usual case. + */ + } buf = read_object_with_reference(sha1, exp_type, &size, NULL); break; diff --git a/t/t1050-large.sh b/t/t1050-large.sh index ded66b3228..f662fefa67 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -116,11 +116,11 @@ test_expect_success 'hash-object' ' git hash-object large1 ' -test_expect_failure 'cat-file a large file' ' +test_expect_success 'cat-file a large file' ' git cat-file blob :large1 >/dev/null ' -test_expect_failure 'cat-file a large file from a tag' ' +test_expect_success 'cat-file a large file from a tag' ' git tag -m largefile largefiletag :large1 && git cat-file blob largefiletag >/dev/null ' From 090ea12671b2971b1c613f0a3d2657e8cdd35134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:18 +0700 Subject: [PATCH 4/7] parse_object: avoid putting whole blob in core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Traditionally, all the callers of check_sha1_signature() first called read_sha1_file() to prepare the whole object data in core, and called this function. The function is used to revalidate what we read from the object database actually matches the object name we used to ask for the data from the object database. Update the API to allow callers to pass NULL as the object data, and have the function read and hash the object data using streaming API to recompute the object name, without having to hold everything in core at the same time. This is most useful in parse_object() that parses a blob object, because this caller does not have to keep the actual blob data around in memory after a "struct blob" is returned. 
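For illustration only (the helper below is made up for this message and is not part of the patch), a caller that wants to revalidate a blob without keeping its contents in core can now pass NULL for the object data:

    /*
     * Sketch, not part of this patch: map == NULL asks
     * check_sha1_signature() to stream the object from the object
     * database and rehash it, so the whole blob never sits in core.
     */
    static int verify_blob_streaming(const unsigned char *sha1)
    {
            if (check_sha1_signature(sha1, NULL, 0, NULL) < 0)
                    return error("sha1 mismatch %s", sha1_to_hex(sha1));
            return 0;
    }

The parse_object() change in this patch uses exactly this calling convention for blobs; commits, trees and tags keep going through the in-core path.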
Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- object.c | 11 +++++++++++ sha1_file.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/object.c b/object.c index 6b06297a5f..0498b18d45 100644 --- a/object.c +++ b/object.c @@ -198,6 +198,17 @@ struct object *parse_object(const unsigned char *sha1) if (obj && obj->parsed) return obj; + if ((obj && obj->type == OBJ_BLOB) || + (!obj && has_sha1_file(sha1) && + sha1_object_info(sha1, NULL) == OBJ_BLOB)) { + if (check_sha1_signature(repl, NULL, 0, NULL) < 0) { + error("sha1 mismatch %s\n", sha1_to_hex(repl)); + return NULL; + } + parse_blob_buffer(lookup_blob(sha1), NULL, 0); + return lookup_object(sha1); + } + buffer = read_sha1_file(sha1, &type, &size); if (buffer) { if (check_sha1_signature(repl, buffer, size, typename(type)) < 0) { diff --git a/sha1_file.c b/sha1_file.c index 4f06a0e450..ad314f08b9 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -19,6 +19,7 @@ #include "pack-revindex.h" #include "sha1-lookup.h" #include "bulk-checkin.h" +#include "streaming.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -1146,10 +1147,47 @@ static const struct packed_git *has_packed_and_bad(const unsigned char *sha1) return NULL; } -int check_sha1_signature(const unsigned char *sha1, void *map, unsigned long size, const char *type) +/* + * With an in-core object data in "map", rehash it to make sure the + * object name actually matches "sha1" to detect object corruption. + * With "map" == NULL, try reading the object named with "sha1" using + * the streaming interface and rehash it to do the same. + */ +int check_sha1_signature(const unsigned char *sha1, void *map, + unsigned long size, const char *type) { unsigned char real_sha1[20]; - hash_sha1_file(map, size, type, real_sha1); + enum object_type obj_type; + struct git_istream *st; + git_SHA_CTX c; + char hdr[32]; + int hdrlen; + + if (map) { + hash_sha1_file(map, size, type, real_sha1); + return hashcmp(sha1, real_sha1) ? -1 : 0; + } + + st = open_istream(sha1, &obj_type, &size, NULL); + if (!st) + return -1; + + /* Generate the header */ + hdrlen = sprintf(hdr, "%s %lu", typename(obj_type), size) + 1; + + /* Sha1.. */ + git_SHA1_Init(&c); + git_SHA1_Update(&c, hdr, hdrlen); + for (;;) { + char buf[1024 * 16]; + ssize_t readlen = read_istream(st, buf, sizeof(buf)); + + if (!readlen) + break; + git_SHA1_Update(&c, buf, readlen); + } + git_SHA1_Final(real_sha1, &c); + close_istream(st); return hashcmp(sha1, real_sha1) ? 
-1 : 0; } From 74775a09b166fb85f5dce816548e337f11124c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:19 +0700 Subject: [PATCH 5/7] show: use streaming API for showing blobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/log.c | 34 ++++++++++++++++++++-------------- t/t1050-large.sh | 2 +- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/builtin/log.c b/builtin/log.c index 7d1f6f88a0..d1702e7580 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -20,6 +20,7 @@ #include "string-list.h" #include "parse-options.h" #include "branch.h" +#include "streaming.h" /* Set a default date-time format for git log ("log.date" config variable) */ static const char *default_date_mode = NULL; @@ -381,8 +382,13 @@ static void show_tagger(char *buf, int len, struct rev_info *rev) strbuf_release(&out); } -static int show_object(const unsigned char *sha1, int show_tag_object, - struct rev_info *rev) +static int show_blob_object(const unsigned char *sha1, struct rev_info *rev) +{ + fflush(stdout); + return stream_blob_to_fd(1, sha1, NULL, 0); +} + +static int show_tag_object(const unsigned char *sha1, struct rev_info *rev) { unsigned long size; enum object_type type; @@ -392,16 +398,16 @@ static int show_object(const unsigned char *sha1, int show_tag_object, if (!buf) return error(_("Could not read object %s"), sha1_to_hex(sha1)); - if (show_tag_object) - while (offset < size && buf[offset] != '\n') { - int new_offset = offset + 1; - while (new_offset < size && buf[new_offset++] != '\n') - ; /* do nothing */ - if (!prefixcmp(buf + offset, "tagger ")) - show_tagger(buf + offset + 7, - new_offset - offset - 7, rev); - offset = new_offset; - } + assert(type == OBJ_TAG); + while (offset < size && buf[offset] != '\n') { + int new_offset = offset + 1; + while (new_offset < size && buf[new_offset++] != '\n') + ; /* do nothing */ + if (!prefixcmp(buf + offset, "tagger ")) + show_tagger(buf + offset + 7, + new_offset - offset - 7, rev); + offset = new_offset; + } if (offset < size) fwrite(buf + offset, size - offset, 1, stdout); @@ -459,7 +465,7 @@ int cmd_show(int argc, const char **argv, const char *prefix) const char *name = objects[i].name; switch (o->type) { case OBJ_BLOB: - ret = show_object(o->sha1, 0, NULL); + ret = show_blob_object(o->sha1, NULL); break; case OBJ_TAG: { struct tag *t = (struct tag *)o; @@ -470,7 +476,7 @@ int cmd_show(int argc, const char **argv, const char *prefix) diff_get_color_opt(&rev.diffopt, DIFF_COMMIT), t->tag, diff_get_color_opt(&rev.diffopt, DIFF_RESET)); - ret = show_object(o->sha1, 1, &rev); + ret = show_tag_object(o->sha1, &rev); rev.shown_one = 1; if (ret) break; diff --git a/t/t1050-large.sh b/t/t1050-large.sh index f662fefa67..dd1bb8422c 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -125,7 +125,7 @@ test_expect_success 'cat-file a large file from a tag' ' git cat-file blob largefiletag >/dev/null ' -test_expect_failure 'git-show a large file' ' +test_expect_success 'git-show a large file' ' git show :large1 >/dev/null ' From 6f7f3beb2d19ab772729fc599d4a92ebf9140c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:20 +0700 Subject: [PATCH 6/7] fsck: use streaming API for writing lost-found blobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/fsck.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index 67eb553c7d..a710227a64 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -12,6 +12,7 @@ #include "parse-options.h" #include "dir.h" #include "progress.h" +#include "streaming.h" #define REACHABLE 0x0001 #define SEEN 0x0002 @@ -238,13 +239,8 @@ static void check_unreachable_object(struct object *obj) if (!(f = fopen(filename, "w"))) die_errno("Could not open '%s'", filename); if (obj->type == OBJ_BLOB) { - enum object_type type; - unsigned long size; - char *buf = read_sha1_file(obj->sha1, - &type, &size); - if (buf && fwrite(buf, 1, size, f) != size) + if (stream_blob_to_fd(fileno(f), obj->sha1, NULL, 1)) die_errno("Could not write '%s'", filename); - free(buf); } else fprintf(f, "%s\n", sha1_to_hex(obj->sha1)); if (fclose(f)) From da591a7f4bbe1a208cc5f955523506eb857c45ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 7 Mar 2012 17:54:21 +0700 Subject: [PATCH 7/7] update-server-info: respect core.bigfilethreshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This command indirectly calls check_sha1_signature() (add_info_ref -> deref_tag -> parse_object -> ..) , which may put whole blob in memory if the blob's size is under core.bigfilethreshold. As config is not read, the threshold is always 512MB. Respect user settings here. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/update-server-info.c | 1 + t/t1050-large.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/builtin/update-server-info.c b/builtin/update-server-info.c index b90dce6358..0d63c4498c 100644 --- a/builtin/update-server-info.c +++ b/builtin/update-server-info.c @@ -15,6 +15,7 @@ int cmd_update_server_info(int argc, const char **argv, const char *prefix) OPT_END() }; + git_config(git_default_config, NULL); argc = parse_options(argc, argv, prefix, options, update_server_info_usage, 0); if (argc > 0) diff --git a/t/t1050-large.sh b/t/t1050-large.sh index dd1bb8422c..4d127f19b7 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -130,7 +130,7 @@ test_expect_success 'git-show a large file' ' ' -test_expect_failure 'repack' ' +test_expect_success 'repack' ' git repack -ad '
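
Taken together, the series funnels every large-blob write through the stream_blob_to_fd() helper introduced in patch 1/7. A minimal caller, sketched below for illustration (the dump_blob_to_stdout() wrapper is hypothetical and appears nowhere in the series), mirrors what cat-file and show now do:

    /*
     * Sketch only; not part of this series.  Hand the object name and
     * an already-open descriptor to stream_blob_to_fd() and let the
     * streaming machinery copy the blob in 16kB chunks instead of
     * loading it whole.
     */
    static int dump_blob_to_stdout(const unsigned char *sha1)
    {
            /*
             * can_seek = 0: stdout may be a pipe, so runs of NUL bytes
             * must be written out rather than skipped with lseek().
             */
            return stream_blob_to_fd(1, sha1, NULL, 0);
    }

Callers that own a regular file, such as the checkout path in entry.c and the lost-found writer in fsck, pass can_seek = 1 instead, so long runs of zero bytes become holes in the output file.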