From 3efb988098858bf6b974b1e673a190f9d2965d1d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 24 Sep 2015 19:12:23 -0400 Subject: [PATCH 1/3] react to errors in xdi_diff When we call into xdiff to perform a diff, we generally lose the return code completely. Typically by ignoring the return of our xdi_diff wrapper, but sometimes we even propagate that return value up and then ignore it later. This can lead to us silently producing incorrect diffs (e.g., "git log" might produce no output at all, not even a diff header, for a content-level diff). In practice this does not happen very often, because the typical reason for xdiff to report failure is that it malloc() failed (it uses straight malloc, and not our xmalloc wrapper). But it could also happen when xdiff triggers one our callbacks, which returns an error (e.g., outf() in builtin/rerere.c tries to report a write failure in this way). And the next patch also plans to add more failure modes. Let's notice an error return from xdiff and react appropriately. In most of the diff.c code, we can simply die(), which matches the surrounding code (e.g., that is what we do if we fail to load a file for diffing in the first place). This is not that elegant, but we are probably better off dying to let the user know there was a problem, rather than simply generating bogus output. We could also just die() directly in xdi_diff, but the callers typically have a bit more context, and can provide a better message (and if we do later decide to pass errors up, we're one step closer to doing so). There is one interesting case, which is in diff_grep(). Here if we cannot generate the diff, there is nothing to match, and we silently return "no hits". This is actually what the existing code does already, but we make it a little more explicit. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/blame.c | 9 +++++++-- builtin/merge-tree.c | 3 ++- builtin/rerere.c | 10 ++++++---- combine-diff.c | 6 ++++-- diff.c | 26 ++++++++++++++++---------- diffcore-pickaxe.c | 4 ++-- line-log.c | 7 ++++--- 7 files changed, 41 insertions(+), 24 deletions(-) diff --git a/builtin/blame.c b/builtin/blame.c index 2b1f9dd6cd..66b4fbf466 100644 --- a/builtin/blame.c +++ b/builtin/blame.c @@ -972,7 +972,10 @@ static void pass_blame_to_parent(struct scoreboard *sb, fill_origin_blob(&sb->revs->diffopt, target, &file_o); num_get_patch++; - diff_hunks(&file_p, &file_o, 0, blame_chunk_cb, &d); + if (diff_hunks(&file_p, &file_o, 0, blame_chunk_cb, &d)) + die("unable to generate diff (%s -> %s)", + sha1_to_hex(parent->commit->object.sha1), + sha1_to_hex(target->commit->object.sha1)); /* The rest are the same as the parent */ blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, parent); *d.dstq = NULL; @@ -1118,7 +1121,9 @@ static void find_copy_in_blob(struct scoreboard *sb, * file_p partially may match that image. */ memset(split, 0, sizeof(struct blame_entry [3])); - diff_hunks(file_p, &file_o, 1, handle_split_cb, &d); + if (diff_hunks(file_p, &file_o, 1, handle_split_cb, &d)) + die("unable to generate diff (%s)", + sha1_to_hex(parent->commit->object.sha1)); /* remainder, if any, all match the preimage */ handle_split(sb, ent, d.tlno, d.plno, ent->num_lines, parent, split); } diff --git a/builtin/merge-tree.c b/builtin/merge-tree.c index f9ab48597e..2a4aafec6a 100644 --- a/builtin/merge-tree.c +++ b/builtin/merge-tree.c @@ -118,7 +118,8 @@ static void show_diff(struct merge_list *entry) if (!dst.ptr) size = 0; dst.size = size; - xdi_diff(&src, &dst, &xpp, &xecfg, &ecb); + if (xdi_diff(&src, &dst, &xpp, &xecfg, &ecb)) + die("unable to generate diff"); free(src.ptr); free(dst.ptr); } diff --git a/builtin/rerere.c b/builtin/rerere.c index 98eb8c5404..aab8f3b1f0 100644 --- a/builtin/rerere.c +++ b/builtin/rerere.c @@ -29,9 +29,10 @@ static int diff_two(const char *file1, const char *label1, xdemitconf_t xecfg; xdemitcb_t ecb; mmfile_t minus, plus; + int ret; if (read_mmfile(&minus, file1) || read_mmfile(&plus, file2)) - return 1; + return -1; printf("--- a/%s\n+++ b/%s\n", label1, label2); fflush(stdout); @@ -40,11 +41,11 @@ static int diff_two(const char *file1, const char *label1, memset(&xecfg, 0, sizeof(xecfg)); xecfg.ctxlen = 3; ecb.outf = outf; - xdi_diff(&minus, &plus, &xpp, &xecfg, &ecb); + ret = xdi_diff(&minus, &plus, &xpp, &xecfg, &ecb); free(minus.ptr); free(plus.ptr); - return 0; + return ret; } int cmd_rerere(int argc, const char **argv, const char *prefix) @@ -104,7 +105,8 @@ int cmd_rerere(int argc, const char **argv, const char *prefix) for (i = 0; i < merge_rr.nr; i++) { const char *path = merge_rr.items[i].string; const char *name = (const char *)merge_rr.items[i].util; - diff_two(rerere_path(name, "preimage"), path, path, path); + if (diff_two(rerere_path(name, "preimage"), path, path, path)) + die("unable to generate diff for %s", name); } else usage_with_options(rerere_usage, options); diff --git a/combine-diff.c b/combine-diff.c index 91edce58e1..284bec6ad5 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -419,8 +419,10 @@ static void combine_diff(const unsigned char *parent, unsigned int mode, state.num_parent = num_parent; state.n = n; - xdi_diff_outf(&parent_file, result_file, consume_line, &state, - &xpp, &xecfg); + if (xdi_diff_outf(&parent_file, result_file, consume_line, &state, + &xpp, &xecfg)) + die("unable to generate combined diff for %s", + sha1_to_hex(parent)); free(parent_file.ptr); /* Assign line numbers for this parent. diff --git a/diff.c b/diff.c index 7500c55095..6bbf28bff2 100644 --- a/diff.c +++ b/diff.c @@ -1002,8 +1002,9 @@ static void diff_words_show(struct diff_words_data *diff_words) xpp.flags = 0; /* as only the hunk header will be parsed, we need a 0-context */ xecfg.ctxlen = 0; - xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, - &xpp, &xecfg); + if (xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, + &xpp, &xecfg)) + die("unable to generate word diff"); free(minus.ptr); free(plus.ptr); if (diff_words->current_plus != diff_words->plus.text.ptr + @@ -2400,8 +2401,9 @@ static void builtin_diff(const char *name_a, xecfg.ctxlen = strtoul(v, NULL, 10); if (o->word_diff) init_diff_words_data(&ecbdata, o, one, two); - xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata, - &xpp, &xecfg); + if (xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata, + &xpp, &xecfg)) + die("unable to generate diff for %s", one->path); if (o->word_diff) free_diff_words_data(&ecbdata); if (textconv_one) @@ -2478,8 +2480,9 @@ static void builtin_diffstat(const char *name_a, const char *name_b, xpp.flags = o->xdl_opts; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; - xdi_diff_outf(&mf1, &mf2, diffstat_consume, diffstat, - &xpp, &xecfg); + if (xdi_diff_outf(&mf1, &mf2, diffstat_consume, diffstat, + &xpp, &xecfg)) + die("unable to generate diffstat for %s", one->path); } diff_free_filespec_data(one); @@ -2525,8 +2528,9 @@ static void builtin_checkdiff(const char *name_a, const char *name_b, memset(&xecfg, 0, sizeof(xecfg)); xecfg.ctxlen = 1; /* at least one context line */ xpp.flags = 0; - xdi_diff_outf(&mf1, &mf2, checkdiff_consume, &data, - &xpp, &xecfg); + if (xdi_diff_outf(&mf1, &mf2, checkdiff_consume, &data, + &xpp, &xecfg)) + die("unable to generate checkdiff for %s", one->path); if (data.ws_rule & WS_BLANK_AT_EOF) { struct emit_callback ecbdata; @@ -4425,8 +4429,10 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1) xpp.flags = 0; xecfg.ctxlen = 3; xecfg.flags = 0; - xdi_diff_outf(&mf1, &mf2, patch_id_consume, &data, - &xpp, &xecfg); + if (xdi_diff_outf(&mf1, &mf2, patch_id_consume, &data, + &xpp, &xecfg)) + return error("unable to generate patch-id diff for %s", + p->one->path); } git_SHA1_Final(sha1, &ctx); diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 185f86b284..7715c13ec4 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -62,8 +62,8 @@ static int diff_grep(mmfile_t *one, mmfile_t *two, ecbdata.hit = 0; xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; - xdi_diff_outf(one, two, diffgrep_consume, &ecbdata, - &xpp, &xecfg); + if (xdi_diff_outf(one, two, diffgrep_consume, &ecbdata, &xpp, &xecfg)) + return 0; return ecbdata.hit; } diff --git a/line-log.c b/line-log.c index 1a6bc5921b..d4e29a574f 100644 --- a/line-log.c +++ b/line-log.c @@ -325,7 +325,7 @@ static int collect_diff_cb(long start_a, long count_a, return 0; } -static void collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges *out) +static int collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges *out) { struct collect_diff_cbdata cbdata = {NULL}; xpparam_t xpp; @@ -340,7 +340,7 @@ static void collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges xecfg.hunk_func = collect_diff_cb; memset(&ecb, 0, sizeof(ecb)); ecb.priv = &cbdata; - xdi_diff(parent, target, &xpp, &xecfg, &ecb); + return xdi_diff(parent, target, &xpp, &xecfg, &ecb); } /* @@ -1030,7 +1030,8 @@ static int process_diff_filepair(struct rev_info *rev, } diff_ranges_init(&diff); - collect_diff(&file_parent, &file_target, &diff); + if (collect_diff(&file_parent, &file_target, &diff)) + die("unable to generate diff for %s", pair->one->path); /* NEEDSWORK should apply some heuristics to prevent mismatches */ free(rg->path); From dcd1742e56ebb944c4ff62346da4548e1e3be675 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 24 Sep 2015 19:12:45 -0400 Subject: [PATCH 2/3] xdiff: reject files larger than ~1GB The xdiff code is not prepared to handle extremely large files. It uses "int" in many places, which can overflow if we have a very large number of lines or even bytes in our input files. This can cause us to produce incorrect diffs, with no indication that the output is wrong. Or worse, we may even underallocate a buffer whose size is the result of an overflowing addition. We're much better off to tell the user that we cannot diff or merge such a large file. This patch covers both cases, but in slightly different ways: 1. For merging, we notice the large file and cleanly fall back to a binary merge (which is effectively "we cannot merge this"). 2. For diffing, we make the binary/text distinction much earlier, and in many different places. For this case, we'll use the xdi_diff as our choke point, and reject any diff there before it hits the xdiff code. This means in most cases we'll die() immediately after. That's not ideal, but in practice we shouldn't generally hit this code path unless the user is trying to do something tricky. We already consider files larger than core.bigfilethreshold to be binary, so this code would only kick in when that is circumvented (either by bumping that value, or by using a .gitattribute to mark a file as diffable). In other words, we can avoid being "nice" here, because there is already nice code that tries to do the right thing. We are adding the suspenders to the nice code's belt, so notice when it has been worked around (both to protect the user from malicious inputs, and because it is better to die() than generate bogus output). The maximum size was chosen after experimenting with feeding large files to the xdiff code. It's just under a gigabyte, which leaves room for two obvious cases: - a diff3 merge conflict result on files of maximum size X could be 3*X plus the size of the markers, which would still be only about 3G, which fits in a 32-bit int. - some of the diff code allocates arrays of one int per record. Even if each file consists only of blank lines, then a file smaller than 1G will have fewer than 1G records, and therefore the int array will fit in 4G. Since the limit is arbitrary anyway, I chose to go under a gigabyte, to leave a safety margin (e.g., we would not want to overflow by allocating "(records + 1) * sizeof(int)" or similar. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- ll-merge.c | 5 ++++- xdiff-interface.c | 3 +++ xdiff-interface.h | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ll-merge.c b/ll-merge.c index 8ea03e536a..4e789f5330 100644 --- a/ll-merge.c +++ b/ll-merge.c @@ -88,7 +88,10 @@ static int ll_xdl_merge(const struct ll_merge_driver *drv_unused, xmparam_t xmp; assert(opts); - if (buffer_is_binary(orig->ptr, orig->size) || + if (orig->size > MAX_XDIFF_SIZE || + src1->size > MAX_XDIFF_SIZE || + src2->size > MAX_XDIFF_SIZE || + buffer_is_binary(orig->ptr, orig->size) || buffer_is_binary(src1->ptr, src1->size) || buffer_is_binary(src2->ptr, src2->size)) { return ll_binary_merge(drv_unused, result, diff --git a/xdiff-interface.c b/xdiff-interface.c index ecfa05f616..cb67c1c42b 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -131,6 +131,9 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co mmfile_t a = *mf1; mmfile_t b = *mf2; + if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE) + return -1; + trim_common_tail(&a, &b, xecfg->ctxlen); return xdl_diff(&a, &b, xpp, xecfg, xecb); diff --git a/xdiff-interface.h b/xdiff-interface.h index eff7762ee1..fbb5a1c394 100644 --- a/xdiff-interface.h +++ b/xdiff-interface.h @@ -3,6 +3,13 @@ #include "xdiff/xdiff.h" +/* + * xdiff isn't equipped to handle content over a gigabyte; + * we make the cutoff 1GB - 1MB to give some breathing + * room for constant-sized additions (e.g., merge markers) + */ +#define MAX_XDIFF_SIZE (1024UL * 1024 * 1023) + typedef void (*xdiff_emit_consume_fn)(void *, char *, unsigned long); int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t const *xecfg, xdemitcb_t *ecb); From 83c4d380171a2ecd24dd2e04072692ec54a7aaa5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 25 Sep 2015 17:58:09 -0400 Subject: [PATCH 3/3] merge-file: enforce MAX_XDIFF_SIZE on incoming files The previous commit enforces MAX_XDIFF_SIZE at the interfaces to xdiff: xdi_diff (which calls xdl_diff) and ll_xdl_merge (which calls xdl_merge). But we have another direct call to xdl_merge in merge-file.c. If it were written today, this probably would just use the ll_merge machinery. But it predates that code, and uses slightly different options to xdl_merge (e.g., ZEALOUS_ALNUM). We could try to abstract out an xdi_merge to match the existing xdi_diff, but even that is difficult. Rather than simply report error, we try to treat large files as binary, and that distinction would happen outside of xdi_merge. The simplest fix is to just replicate the MAX_XDIFF_SIZE check in merge-file.c. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/merge-file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtin/merge-file.c b/builtin/merge-file.c index 232b76857c..04ae36a452 100644 --- a/builtin/merge-file.c +++ b/builtin/merge-file.c @@ -75,7 +75,8 @@ int cmd_merge_file(int argc, const char **argv, const char *prefix) names[i] = argv[i]; if (read_mmfile(mmfs + i, fname)) return -1; - if (buffer_is_binary(mmfs[i].ptr, mmfs[i].size)) + if (mmfs[i].size > MAX_XDIFF_SIZE || + buffer_is_binary(mmfs[i].ptr, mmfs[i].size)) return error("Cannot merge binary files: %s", argv[i]); }