From 200ebe362cda2a520219f998d4b2c44767992bdb Mon Sep 17 00:00:00 2001 From: Jeff King Date: Sat, 26 Jan 2013 04:42:45 -0500 Subject: [PATCH 1/3] commit: drop useless xstrdup of commit message When git-commit is asked to reuse a commit message via "-c", we call read_commit_message, which looks up the commit and hands back either the re-encoded result, or a copy of the original. We make a copy in the latter case so that the ownership semantics of the return value are clear (in either case, it can be freed). However, since we return a "const char *", and since the resulting buffer's lifetime is the same as that of the whole program, we never bother to free it at all. Let's just drop the copy. That saves us a copy in the common case. While it does mean we leak in the re-encode case, it doesn't matter, since we are relying on program exit to free the memory anyway. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/commit.c b/builtin/commit.c index 38b9a9cc0d..fbbb40fff9 100644 --- a/builtin/commit.c +++ b/builtin/commit.c @@ -962,7 +962,7 @@ static const char *read_commit_message(const char *name) * encodings are identical. */ if (out == NULL) - out = xstrdup(commit->buffer); + out = commit->buffer; return out; } From dd0d388c44c28ebc021a24eeddc60287d4ea249c Mon Sep 17 00:00:00 2001 From: Jeff King Date: Sat, 26 Jan 2013 04:44:06 -0500 Subject: [PATCH 2/3] logmsg_reencode: never return NULL The logmsg_reencode function will return the reencoded commit buffer, or NULL if reencoding failed or no reencoding was necessary. Since every caller then ends up checking for NULL and just using the commit's original buffer, anyway, we can be a bit more helpful and just return that buffer when we would have returned NULL. Since the resulting string may or may not need to be freed, we introduce a logmsg_free, which checks whether the buffer came from the commit object or not (callers either implemented the same check already, or kept two separate pointers, one to mark the buffer to be used, and one for the to-be-freed string). Pushing this logic into logmsg_* simplifies the callers, and will let future patches lazily load the commit buffer in a single place. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/blame.c | 9 ++++----- builtin/commit.c | 14 ++------------ commit.h | 1 + pretty.c | 38 ++++++++++++++++++++++---------------- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/builtin/blame.c b/builtin/blame.c index b431ba3209..962e4e3cd1 100644 --- a/builtin/blame.c +++ b/builtin/blame.c @@ -1420,7 +1420,7 @@ static void get_commit_info(struct commit *commit, { int len; const char *subject, *encoding; - char *reencoded, *message; + char *message; commit_info_init(ret); @@ -1438,14 +1438,13 @@ static void get_commit_info(struct commit *commit, sha1_to_hex(commit->object.sha1)); } encoding = get_log_output_encoding(); - reencoded = logmsg_reencode(commit, encoding); - message = reencoded ? reencoded : commit->buffer; + message = logmsg_reencode(commit, encoding); get_ac_line(message, "\nauthor ", &ret->author, &ret->author_mail, &ret->author_time, &ret->author_tz); if (!detailed) { - free(reencoded); + logmsg_free(message, commit); return; } @@ -1459,7 +1458,7 @@ static void get_commit_info(struct commit *commit, else strbuf_addf(&ret->summary, "(%s)", sha1_to_hex(commit->object.sha1)); - free(reencoded); + logmsg_free(message, commit); } /* diff --git a/builtin/commit.c b/builtin/commit.c index fbbb40fff9..6169f1e2dc 100644 --- a/builtin/commit.c +++ b/builtin/commit.c @@ -946,24 +946,14 @@ static void handle_untracked_files_arg(struct wt_status *s) static const char *read_commit_message(const char *name) { - const char *out_enc, *out; + const char *out_enc; struct commit *commit; commit = lookup_commit_reference_by_name(name); if (!commit) die(_("could not lookup commit %s"), name); out_enc = get_commit_output_encoding(); - out = logmsg_reencode(commit, out_enc); - - /* - * If we failed to reencode the buffer, just copy it - * byte for byte so the user can try to fix it up. - * This also handles the case where input and output - * encodings are identical. - */ - if (out == NULL) - out = commit->buffer; - return out; + return logmsg_reencode(commit, out_enc); } static int parse_and_validate_options(int argc, const char *argv[], diff --git a/commit.h b/commit.h index c16c8a7534..e770649731 100644 --- a/commit.h +++ b/commit.h @@ -101,6 +101,7 @@ extern int has_non_ascii(const char *text); struct rev_info; /* in revision.h, it circularly uses enum cmit_fmt */ extern char *logmsg_reencode(const struct commit *commit, const char *output_encoding); +extern void logmsg_free(char *msg, const struct commit *commit); extern void get_commit_format(const char *arg, struct rev_info *); extern const char *format_subject(struct strbuf *sb, const char *msg, const char *line_separator); diff --git a/pretty.c b/pretty.c index 07fc062865..c675349438 100644 --- a/pretty.c +++ b/pretty.c @@ -524,10 +524,11 @@ static void add_merge_info(const struct pretty_print_context *pp, strbuf_addch(sb, '\n'); } -static char *get_header(const struct commit *commit, const char *key) +static char *get_header(const struct commit *commit, const char *msg, + const char *key) { int key_len = strlen(key); - const char *line = commit->buffer; + const char *line = msg; while (line) { const char *eol = strchr(line, '\n'), *next; @@ -588,17 +589,18 @@ char *logmsg_reencode(const struct commit *commit, static const char *utf8 = "UTF-8"; const char *use_encoding; char *encoding; + char *msg = commit->buffer; char *out; if (!output_encoding || !*output_encoding) - return NULL; - encoding = get_header(commit, "encoding"); + return msg; + encoding = get_header(commit, msg, "encoding"); use_encoding = encoding ? encoding : utf8; if (same_encoding(use_encoding, output_encoding)) if (encoding) /* we'll strip encoding header later */ out = xstrdup(commit->buffer); else - return NULL; /* nothing to do */ + return msg; /* nothing to do */ else out = reencode_string(commit->buffer, output_encoding, use_encoding); @@ -606,7 +608,17 @@ char *logmsg_reencode(const struct commit *commit, out = replace_encoding_header(out, output_encoding); free(encoding); - return out; + /* + * If the re-encoding failed, out might be NULL here; in that + * case we just return the commit message verbatim. + */ + return out ? out : msg; +} + +void logmsg_free(char *msg, const struct commit *commit) +{ + if (msg != commit->buffer) + free(msg); } static int mailmap_name(const char **email, size_t *email_len, @@ -1278,14 +1290,11 @@ void format_commit_message(const struct commit *commit, context.pretty_ctx = pretty_ctx; context.wrap_start = sb->len; context.message = logmsg_reencode(commit, output_enc); - if (!context.message) - context.message = commit->buffer; strbuf_expand(sb, format, format_commit_item, &context); rewrap_message_tail(sb, &context, 0, 0, 0); - if (context.message != commit->buffer) - free(context.message); + logmsg_free(context.message, commit); free(context.signature.gpg_output); free(context.signature.signer); } @@ -1432,7 +1441,7 @@ void pretty_print_commit(const struct pretty_print_context *pp, { unsigned long beginning_of_body; int indent = 4; - const char *msg = commit->buffer; + const char *msg; char *reencoded; const char *encoding; int need_8bit_cte = pp->need_8bit_cte; @@ -1443,10 +1452,7 @@ void pretty_print_commit(const struct pretty_print_context *pp, } encoding = get_log_output_encoding(); - reencoded = logmsg_reencode(commit, encoding); - if (reencoded) { - msg = reencoded; - } + msg = reencoded = logmsg_reencode(commit, encoding); if (pp->fmt == CMIT_FMT_ONELINE || pp->fmt == CMIT_FMT_EMAIL) indent = 0; @@ -1503,7 +1509,7 @@ void pretty_print_commit(const struct pretty_print_context *pp, if (pp->fmt == CMIT_FMT_EMAIL && sb->len <= beginning_of_body) strbuf_addch(sb, '\n'); - free(reencoded); + logmsg_free(reencoded, commit); } void pp_commit_easy(enum cmit_fmt fmt, const struct commit *commit, From be5c9fb9049ed470e7005f159bb923a5f4de1309 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Sat, 26 Jan 2013 04:44:28 -0500 Subject: [PATCH 3/3] logmsg_reencode: lazily load missing commit buffers Usually a commit that makes it to logmsg_reencode will have been parsed, and the commit->buffer struct member will be valid. However, some code paths will free commit buffers after having used them (for example, the log traversal machinery will do so to keep memory usage down). Most of the time this is fine; log should only show a commit once, and then exits. However, there are some code paths where this does not work. At least two are known: 1. A commit may be shown as part of a regular ref, and then it may be shown again as part of a submodule diff (e.g., if a repo contains refs to both the superproject and subproject). 2. A notes-cache commit may be shown during "log --all", and then later used to access a textconv cache during a diff. Lazily loading in logmsg_reencode does not necessarily catch all such cases, but it should catch most of them. Users of the commit buffer tend to be either parsing for structure (in which they will call parse_commit, and either we will already have parsed, or we will load commit->buffer lazily there), or outputting (either to the user, or fetching a part of the commit message via format_commit_message). In the latter case, we should always be using logmsg_reencode anyway (and typically we do so via the pretty-print machinery). If there are any cases that this misses, we can fix them up to use logmsg_reencode (or handle them on a case-by-case basis if that is inappropriate). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/blame.c | 13 -------- pretty.c | 57 +++++++++++++++++++++++++++----- t/t4042-diff-textconv-caching.sh | 8 +++++ 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/builtin/blame.c b/builtin/blame.c index 962e4e3cd1..86100e9662 100644 --- a/builtin/blame.c +++ b/builtin/blame.c @@ -1424,19 +1424,6 @@ static void get_commit_info(struct commit *commit, commit_info_init(ret); - /* - * We've operated without save_commit_buffer, so - * we now need to populate them for output. - */ - if (!commit->buffer) { - enum object_type type; - unsigned long size; - commit->buffer = - read_sha1_file(commit->object.sha1, &type, &size); - if (!commit->buffer) - die("Cannot read commit %s", - sha1_to_hex(commit->object.sha1)); - } encoding = get_log_output_encoding(); message = logmsg_reencode(commit, encoding); get_ac_line(message, "\nauthor ", diff --git a/pretty.c b/pretty.c index c675349438..eae57ad9d7 100644 --- a/pretty.c +++ b/pretty.c @@ -592,18 +592,59 @@ char *logmsg_reencode(const struct commit *commit, char *msg = commit->buffer; char *out; + if (!msg) { + enum object_type type; + unsigned long size; + + msg = read_sha1_file(commit->object.sha1, &type, &size); + if (!msg) + die("Cannot read commit object %s", + sha1_to_hex(commit->object.sha1)); + if (type != OBJ_COMMIT) + die("Expected commit for '%s', got %s", + sha1_to_hex(commit->object.sha1), typename(type)); + } + if (!output_encoding || !*output_encoding) return msg; encoding = get_header(commit, msg, "encoding"); use_encoding = encoding ? encoding : utf8; - if (same_encoding(use_encoding, output_encoding)) - if (encoding) /* we'll strip encoding header later */ - out = xstrdup(commit->buffer); - else - return msg; /* nothing to do */ - else - out = reencode_string(commit->buffer, - output_encoding, use_encoding); + if (same_encoding(use_encoding, output_encoding)) { + /* + * No encoding work to be done. If we have no encoding header + * at all, then there's nothing to do, and we can return the + * message verbatim (whether newly allocated or not). + */ + if (!encoding) + return msg; + + /* + * Otherwise, we still want to munge the encoding header in the + * result, which will be done by modifying the buffer. If we + * are using a fresh copy, we can reuse it. But if we are using + * the cached copy from commit->buffer, we need to duplicate it + * to avoid munging commit->buffer. + */ + out = msg; + if (out == commit->buffer) + out = xstrdup(out); + } + else { + /* + * There's actual encoding work to do. Do the reencoding, which + * still leaves the header to be replaced in the next step. At + * this point, we are done with msg. If we allocated a fresh + * copy, we can free it. + */ + out = reencode_string(msg, output_encoding, use_encoding); + if (out && msg != commit->buffer) + free(msg); + } + + /* + * This replacement actually consumes the buffer we hand it, so we do + * not have to worry about freeing the old "out" here. + */ if (out) out = replace_encoding_header(out, output_encoding); diff --git a/t/t4042-diff-textconv-caching.sh b/t/t4042-diff-textconv-caching.sh index 91f8198f05..04a44d5c61 100755 --- a/t/t4042-diff-textconv-caching.sh +++ b/t/t4042-diff-textconv-caching.sh @@ -106,4 +106,12 @@ test_expect_success 'switching diff driver produces correct results' ' test_cmp expect actual ' +# The point here is to test that we can log the notes cache and still use it to +# produce a diff later (older versions of git would segfault on this). It's +# much more likely to come up in the real world with "log --all -p", but using +# --no-walk lets us reliably reproduce the order of traversal. +test_expect_success 'log notes cache and still use cache for -p' ' + git log --no-walk -p refs/notes/textconv/magic HEAD +' + test_done