From f4f19fb63449e1beee02b0ec845319f7115fa9d0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 16 Nov 2009 10:56:25 -0500 Subject: [PATCH 1/3] diffcore-break: free filespec data as we go As we look at each changed file and consider breaking it, we load the blob data and make a decision about whether to break, which is independent of any other blobs that might have changed. However, we keep the data in memory while we consider breaking all of the other files, which means that both versions of every file you are diffing are in memory at the same time. This patch instead frees the blob data as we finish with each file pair, leading to much lower memory usage. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diffcore-break.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/diffcore-break.c b/diffcore-break.c index d7097bb576..15562e4556 100644 --- a/diffcore-break.c +++ b/diffcore-break.c @@ -204,12 +204,16 @@ void diffcore_break(int break_score) dp->score = score; dp->broken_pair = 1; + diff_free_filespec_data(p->one); + diff_free_filespec_data(p->two); free(p); /* not diff_free_filepair(), we are * reusing one and two here. */ continue; } } + diff_free_filespec_data(p->one); + diff_free_filespec_data(p->two); diff_q(&outq, p); } free(q->queue); From 8282de94bc76360e0bf76da4076755696b049d23 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 16 Nov 2009 11:02:02 -0500 Subject: [PATCH 2/3] diffcore-break: save cnt_data for other phases The "break" phase works by counting changes between two blobs with the same path. We do this by splitting the file into chunks (or lines for text-oriented files) and then keeping a count of chunk hashes. The "rename" phase counts changes between blobs at two different paths. However, it uses the exact same set of chunk hashes (which are immutable for a given sha1). The rename phase can therefore use the same hash data as break. Unfortunately, we were throwing this data away after computing it in the break phase. 
This patch instead attaches it to the filespec and lets it live through the rename phase, working under the assumption that, most of the time when breaks are being computed, renames will be computed too. We only do this optimization for files which have actually been broken, as those will be candidates for rename detection (and it is a time-space tradeoff, so we don't want to waste space keeping useless data). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diffcore-break.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/diffcore-break.c b/diffcore-break.c index 15562e4556..3a7b60a037 100644 --- a/diffcore-break.c +++ b/diffcore-break.c @@ -69,7 +69,7 @@ static int should_break(struct diff_filespec *src, return 0; /* we do not break too small filepair */ if (diffcore_count_changes(src, dst, - NULL, NULL, + &src->cnt_data, &dst->cnt_data, 0, &src_copied, &literal_added)) return 0; @@ -204,8 +204,8 @@ void diffcore_break(int break_score) dp->score = score; dp->broken_pair = 1; - diff_free_filespec_data(p->one); - diff_free_filespec_data(p->two); + diff_free_filespec_blob(p->one); + diff_free_filespec_blob(p->two); free(p); /* not diff_free_filepair(), we are * reusing one and two here. */ From 809809bb75e8a65ef543ab706aab4791459be95c Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 20 Nov 2009 22:13:47 -0800 Subject: [PATCH 3/3] diffcore-rename: reduce memory footprint by freeing blob data early After running one round of estimate_similarity(), filespecs on either side will have populated their cnt_data fields, and we do not need the blob text anymore. We used to retain the blob data to optimize for smaller projects (not freeing the blob data here would mean that the final output phase would not have to re-read it), but we are efficient enough without such optimization for smaller projects anyway, and freeing memory early will help larger projects. 
Signed-off-by: Junio C Hamano --- diffcore-rename.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 63ac998bfa..d6fd3cacd6 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -523,10 +523,13 @@ void diffcore_rename(struct diff_options *options) this_src.dst = i; this_src.src = j; record_if_better(m, &this_src); + /* + * Once we run estimate_similarity, + * We do not need the text anymore. + */ diff_free_filespec_blob(one); + diff_free_filespec_blob(two); } - /* We do not need the text anymore */ - diff_free_filespec_blob(two); dst_cnt++; }