patch-id: make it stable against hunk reordering

Patch id changes if users reorder file diffs that make up a patch.

As the result is functionally equivalent, a different patch id is
surprising to many users.
In particular, reordering files using diff -O is helpful to make patches
more readable (e.g. API header diff before implementation diff).

Add an option to change patch-id behaviour, making it stable against
this kind of patch change:
calculate a SHA-1 hash for each file diff separately and sum all hashes
(using a symmetrical sum) to get the patch ID.

We use a 20-byte sum and not xor, since xor would give 0 output
for patches that have two identical diffs, which isn't all that
unlikely (e.g. append the same line in two places).

The new behaviour is enabled
- when patchid.stable is true
- when --stable flag is present

Using the new flag --unstable or setting patchid.stable to false forces
the historical behaviour.

In the documentation, clarify that patch ID can now be a sum of hashes,
not a hash.
Document how command line and config options affect the
behaviour.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Michael S. Tsirkin 2014-04-27 21:15:44 +03:00 committed by Junio C Hamano
parent bb98b01ee8
commit 30e12b924b
2 changed files with 91 additions and 20 deletions

View File

@ -8,14 +8,14 @@ git-patch-id - Compute unique ID for a patch
SYNOPSIS
--------
[verse]
'git patch-id' < <patch>
'git patch-id' [--stable | --unstable] < <patch>
DESCRIPTION
-----------
A "patch ID" is nothing but a SHA-1 of the diff associated with a patch, with
whitespace and line numbers ignored. As such, it's "reasonably stable", but at
the same time also reasonably unique, i.e., two patches that have the same "patch
ID" are almost guaranteed to be the same thing.
A "patch ID" is nothing but a sum of SHA-1 of the file diffs associated with a
patch, with whitespace and line numbers ignored. As such, it's "reasonably
stable", but at the same time also reasonably unique, i.e., two patches that
have the same "patch ID" are almost guaranteed to be the same thing.
IOW, you can use this thing to look for likely duplicate commits.
@ -27,6 +27,33 @@ This can be used to make a mapping from patch ID to commit ID.
OPTIONS
-------
--stable::
Use a "stable" sum of hashes as the patch ID. With this option:
- Reordering file diffs that make up a patch does not affect the ID.
In particular, two patches produced by comparing the same two trees
with two different settings for "-O<orderfile>" result in the same
patch ID signature, thereby allowing the computed result to be used
as a key to index some meta-information about the change between
the two trees;
- Result is different from the value produced by git 1.9 and older
or produced when an "unstable" hash (see --unstable below) is
configured - even when used on a diff output taken without any use
of "-O<orderfile>", thereby making existing databases storing such
"unstable" or historical patch-ids unusable.
This is the default if patchid.stable is set to true.
--unstable::
Use an "unstable" hash as the patch ID. With this option,
the result produced is compatible with the patch-id value produced
by git 1.9 and older. Users with pre-existing databases storing
patch-ids produced by git 1.9 and older (who do not deal with reordered
patches) may want to use this option.
This is the default unless patchid.stable is set to true.
<patch>::
The diff to create the ID of.

View File

@ -1,17 +1,14 @@
#include "builtin.h"
/*
 * Print one output line of the form "<hex of result> <hex of id>" for the
 * patch that was just read. Nothing is printed when patchlen is zero.
 *
 * NOTE(review): this listing shows the removed and the added lines of the
 * change side by side, so both the old signature (taking a git_SHA_CTX)
 * and the new one (taking the already-computed result) appear below.
 */
static void flush_current_id(int patchlen, unsigned char *id, git_SHA_CTX *c)
static void flush_current_id(int patchlen, unsigned char *id, unsigned char *result)
{
unsigned char result[20];
char name[50];
if (!patchlen)
return;
git_SHA1_Final(result, c);
/*
 * Keep a private copy of the hex form of id (41 bytes copied) before
 * sha1_to_hex() is called a second time in the printf() below --
 * presumably because sha1_to_hex() reuses its return buffer; TODO confirm.
 */
memcpy(name, sha1_to_hex(id), 41);
printf("%s %s\n", sha1_to_hex(result), name);
git_SHA1_Init(c);
}
static int remove_space(char *line)
@ -56,10 +53,31 @@ static int scan_hunk_header(const char *p, int *p_before, int *p_after)
return 1;
}
static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct strbuf *line_buf)
/*
 * Finish the SHA-1 being accumulated in ctx, re-initialise ctx so it can
 * be reused, and add the finished digest into result.
 *
 * result is treated as a 20-byte number whose byte 0 is the least
 * significant; a carry out of the last byte is discarded. Because the
 * digests are combined by addition, the total does not depend on the
 * order in which they are added.
 */
static void flush_one_hunk(unsigned char *result, git_SHA_CTX *ctx)
{
	unsigned char digest[20];
	unsigned int acc = 0;
	int pos = 0;

	git_SHA1_Final(digest, ctx);
	git_SHA1_Init(ctx);

	while (pos < 20) {
		/* acc holds the carry from the previous byte (0 or 1) */
		acc += (unsigned int)result[pos] + digest[pos];
		result[pos] = (unsigned char)(acc & 0xff);
		acc >>= 8;
		pos++;
	}
}
/*
 * Read lines from stdin for one patch and compute its ID into result.
 *
 * result is cleared on entry and receives the final value through
 * flush_one_hunk() just before returning. When stable is non-zero the
 * running hash is additionally flushed into result each time another
 * header is parsed, so result becomes a sum of several hashes.
 * next_sha1 is cleared when found_next was never set.
 *
 * Returns patchlen, the total length of the whitespace-stripped lines
 * that were fed to the hash.
 *
 * NOTE(review): parts of the loop body are elided by the "@ ... @@" hunk
 * header lines below, and the two git_SHA1_Update() calls are the removed
 * and added form of the same line.
 */
static int get_one_patchid(unsigned char *next_sha1, unsigned char *result,
struct strbuf *line_buf, int stable)
{
int patchlen = 0, found_next = 0;
int before = -1, after = -1;
git_SHA_CTX ctx;
git_SHA1_Init(&ctx);
hashclr(result);
while (strbuf_getwholeline(line_buf, stdin, '\n') != EOF) {
char *line = line_buf->buf;
@ -107,6 +125,8 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
break;
/* Else we're parsing another header. */
if (stable)
flush_one_hunk(result, &ctx);
before = after = -1;
}
@ -119,39 +139,63 @@ static int get_one_patchid(unsigned char *next_sha1, git_SHA_CTX *ctx, struct st
/* Compute the sha without whitespace */
len = remove_space(line);
patchlen += len;
git_SHA1_Update(ctx, line, len);
git_SHA1_Update(&ctx, line, len);
}
if (!found_next)
hashclr(next_sha1);
/* Fold whatever is still pending in ctx into result (done in both modes). */
flush_one_hunk(result, &ctx);
return patchlen;
}
/*
 * Process stdin until end of file: each iteration reads one patch with
 * get_one_patchid() and hands the outcome to flush_current_id() together
 * with sha1, then copies n (filled in by get_one_patchid()) into sha1 for
 * the next iteration. sha1 starts out cleared.
 *
 * NOTE(review): this listing shows removed and added lines of the change
 * side by side (old/new signature, old/new calls inside the loop).
 */
static void generate_id_list(void)
static void generate_id_list(int stable)
{
unsigned char sha1[20], n[20];
git_SHA_CTX ctx;
unsigned char sha1[20], n[20], result[20];
int patchlen;
struct strbuf line_buf = STRBUF_INIT;
git_SHA1_Init(&ctx);
hashclr(sha1);
while (!feof(stdin)) {
patchlen = get_one_patchid(n, &ctx, &line_buf);
flush_current_id(patchlen, sha1, &ctx);
patchlen = get_one_patchid(n, result, &line_buf, stable);
flush_current_id(patchlen, sha1, result);
hashcpy(sha1, n);
}
strbuf_release(&line_buf);
}
/* Usage text passed to usage(); the two lines are the removed and added form. */
static const char patch_id_usage[] = "git patch-id < patch";
static const char patch_id_usage[] = "git patch-id [--stable | --unstable] < patch";
/*
 * Config callback for patch-id.
 *
 * For the key "patchid.stable", store its boolean value in the int that
 * cb points to and return 0. Every other key is passed on unchanged to
 * git_default_config(), whose return value is returned.
 */
static int git_patch_id_config(const char *var, const char *value, void *cb)
{
	if (strcmp(var, "patchid.stable") != 0)
		return git_default_config(var, value, cb);

	*(int *)cb = git_config_bool(var, value);
	return 0;
}
/*
 * Entry point of the patch-id command.
 *
 * The mode starts as "unset" (-1), is filled in from the patchid.stable
 * config key by git_patch_id_config(), and falls back to unstable (0)
 * when the config did not set it. A single "--stable" or "--unstable"
 * argument then overrides the configured value; any other argument list
 * prints the usage text. Returns 0 after generate_id_list() has run.
 *
 * NOTE(review): this listing shows removed and added lines of the change
 * side by side (the bare "if (argc != 1)" and "generate_id_list();" are
 * the removed forms).
 */
int cmd_patch_id(int argc, const char **argv, const char *prefix)
{
if (argc != 1)
int stable = -1;
git_config(git_patch_id_config, &stable);
/* If nothing is set, default to unstable. */
if (stable < 0)
stable = 0;
/* Command-line flags take precedence over the configuration. */
if (argc == 2 && !strcmp(argv[1], "--stable"))
stable = 1;
else if (argc == 2 && !strcmp(argv[1], "--unstable"))
stable = 0;
else if (argc != 1)
usage(patch_id_usage);
generate_id_list();
generate_id_list(stable);
return 0;
}