Makefile + hash.h: remove PPC_SHA1 implementation

Remove the PPC_SHA1 implementation added in a6ef3518f9 ([PATCH] PPC
assembly implementation of SHA1, 2005-04-22). When this was added
Apple consumer hardware used the PPC architecture, and the
implementation was intended to improve SHA-1 speed there.

Since it was added, we've moved to using sha1collisiondetection by
default, and anyone wanting a non-DC SHA-1 implementation can use
OpenSSL's via the OPENSSL_SHA1 knob.
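
For example (just a sketch; as with the other SHA-1 knobs in the
Makefile, any non-empty value enables it):

    make OPENSSL_SHA1=Y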

The PPC_SHA1 code originally targeted 32-bit PPC, and later the 64-bit
PPC 970 (a.k.a. Apple PowerPC G5). See 926172c5e4 (block-sha1:
improve code on large-register-set machines, 2009-08-10) for a
reference about its performance on the G5 (in a comment in
block-sha1/sha1.c that is being removed here).

I can't get it to do anything but segfault on both the BE and LE POWER
machines in the GCC compile farm[1]. Anyone who's concerned about
performance on PPC these days is likely to be using the IBM POWER
processors.

There have been proposals to entirely remove non-sha1collisiondetection
implementations from the tree[2]. I think per [3] that would be a bit
overzealous. I.e. there are various set-ups where git's speed is going to be
more important than the relatively implausible SHA-1 collision attack,
or where such attacks are entirely mitigated by other means (e.g. by
incoming objects being checked with DC_SHA1).

But that really doesn't apply to PPC_SHA1 in particular, which seems
to have outlived its usefulness.

As this gets rid of the only in-tree *.S assembly file, we can remove
the small bits of logic from the Makefile needed to build objects
from *.S files (as opposed to *.c).

The code being removed here was also throwing warnings with the
"-pedantic" flag. It could have been fixed, as 544d93bc3b (block-sha1:
remove use of obsolete x86 assembly, 2022-03-10) did for block-sha1/*,
but as noted above let's remove it instead.

1. https://cfarm.tetaneutral.net/machines/list/
   Tested on gcc{110,112,135,203}, a mixture of POWER [789] ppc64 and
   ppc64le. All segfault in anything needing object
   hashing (e.g. t/t1007-hash-object.sh) when compiled with
   PPC_SHA1=Y (see the reproduction sketch after these references).
2. https://lore.kernel.org/git/20200223223758.120941-1-mh@glandium.org/
3. https://lore.kernel.org/git/20200224044732.GK1018190@coredump.intra.peff.net/
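
(Rough reproduction sketch for [1], assuming a git.git checkout on one
of those machines; the knob and test script are named above, and "-v"
is just the test suite's usual verbose flag:)

    make PPC_SHA1=Y
    cd t && ./t1007-hash-object.sh -v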

Acked-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Ævar Arnfjörð Bjarmason, 2022-08-31 11:18:43 +02:00 (committed by Junio C Hamano)
parent d42b38dfb5
commit 9dc523aa0e
8 changed files with 8 additions and 347 deletions

INSTALL

@@ -135,8 +135,7 @@ Issues of note:
 	  By default, git uses OpenSSL for SHA1 but it will use its own
 	  library (inspired by Mozilla's) with either NO_OPENSSL or
-	  BLK_SHA1.  Also included is a version optimized for PowerPC
-	  (PPC_SHA1).
+	  BLK_SHA1.
 
 	- "libcurl" library is used for fetching and pushing
 	  repositories over http:// or https://, as well as by

Makefile

@@ -155,9 +155,6 @@ include shared.mak
 # Define BLK_SHA1 environment variable to make use of the bundled
 # optimized C SHA1 routine.
 #
-# Define PPC_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for PowerPC.
-#
 # Define DC_SHA1 to unconditionally enable the collision-detecting sha1
 # algorithm. This is slower, but may detect attempted collision attacks.
 # Takes priority over other *_SHA1 knobs.
@@ -1802,6 +1799,10 @@ ifdef APPLE_COMMON_CRYPTO
 	SHA1_MAX_BLOCK_SIZE = 1024L*1024L*1024L
 endif
 
+ifdef PPC_SHA1
+$(error the PPC_SHA1 flag has been removed along with the PowerPC-specific SHA-1 implementation.)
+endif
+
 ifdef OPENSSL_SHA1
 	EXTLIBS += $(LIB_4_CRYPTO)
 	BASIC_CFLAGS += -DSHA1_OPENSSL
@@ -1810,10 +1811,6 @@ ifdef BLK_SHA1
 	LIB_OBJS += block-sha1/sha1.o
 	BASIC_CFLAGS += -DSHA1_BLK
 else
-ifdef PPC_SHA1
-	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
-	BASIC_CFLAGS += -DSHA1_PPC
-else
 ifdef APPLE_COMMON_CRYPTO
 	COMPAT_CFLAGS += -DCOMMON_DIGEST_FOR_OPENSSL
 	BASIC_CFLAGS += -DSHA1_APPLE
@@ -1847,7 +1844,6 @@ endif
 endif
 endif
 endif
-endif
 
 ifdef OPENSSL_SHA256
 	EXTLIBS += $(LIB_4_CRYPTO)
@@ -2594,14 +2590,10 @@ missing_compdb_dir =
 compdb_args =
 endif
 
-ASM_SRC := $(wildcard $(OBJECTS:o=S))
-ASM_OBJ := $(ASM_SRC:S=o)
-C_OBJ := $(filter-out $(ASM_OBJ),$(OBJECTS))
+C_OBJ := $(OBJECTS)
 
 $(C_OBJ): %.o: %.c GIT-CFLAGS $(missing_dep_dirs) $(missing_compdb_dir)
 	$(QUIET_CC)$(CC) -o $*.o -c $(dep_args) $(compdb_args) $(ALL_CFLAGS) $(EXTRA_CPPFLAGS) $<
-$(ASM_OBJ): %.o: %.S GIT-CFLAGS $(missing_dep_dirs) $(missing_compdb_dir)
-	$(QUIET_CC)$(CC) -o $*.o -c $(dep_args) $(compdb_args) $(ALL_CFLAGS) $(EXTRA_CPPFLAGS) $<
 
 %.s: %.c GIT-CFLAGS FORCE
 	$(QUIET_CC)$(CC) -o $@ -S $(ALL_CFLAGS) $(EXTRA_CPPFLAGS) $<

block-sha1/sha1.c

@@ -28,10 +28,6 @@
  * try to do the silly "optimize away loads" part because it won't
  * see what the value will be).
  *
- * Ben Herrenschmidt reports that on PPC, the C version comes close
- * to the optimized asm with this (ie on PPC you don't want that
- * 'volatile', since there are lots of registers).
- *
  * On ARM we get the best code generation by forcing a full memory barrier
  * between each SHA_ROUND, otherwise gcc happily get wild with spilling and
  * the stack frame size simply explode and performance goes down the drain.

configure.ac

@@ -237,9 +237,6 @@ AC_MSG_NOTICE([CHECKS for site configuration])
 # tests. These tests take up a significant amount of the total test time
 # but are not needed unless you plan to talk to SVN repos.
 #
-# Define PPC_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for PowerPC.
-#
 # Define NO_OPENSSL environment variable if you do not have OpenSSL.
 #
 # Define OPENSSLDIR=/foo/bar if your openssl header and library files are in

hash.h

@@ -4,9 +4,7 @@
 #include "git-compat-util.h"
 #include "repository.h"
 
-#if defined(SHA1_PPC)
-#include "ppc/sha1.h"
-#elif defined(SHA1_APPLE)
+#if defined(SHA1_APPLE)
 #include <CommonCrypto/CommonDigest.h>
 #elif defined(SHA1_OPENSSL)
 #include <openssl/sha.h>
@@ -32,7 +30,7 @@
  * platform's underlying implementation of SHA-1; could be OpenSSL,
  * blk_SHA, Apple CommonCrypto, etc... Note that the relevant
  * SHA-1 header may have already defined platform_SHA_CTX for our
- * own implementations like block-sha1 and ppc-sha1, so we list
+ * own implementations like block-sha1, so we list
  * the default for OpenSSL compatible SHA-1 implementations here.
  */
 #define platform_SHA_CTX	SHA_CTX

ppc/sha1.c

@@ -1,72 +0,0 @@
/*
 * SHA-1 implementation.
 *
 * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
 *
 * This version assumes we are running on a big-endian machine.
 * It calls an external sha1_core() to process blocks of 64 bytes.
 */
#include <stdio.h>
#include <string.h>
#include "sha1.h"

void ppc_sha1_core(uint32_t *hash, const unsigned char *p,
		   unsigned int nblocks);

int ppc_SHA1_Init(ppc_SHA_CTX *c)
{
	c->hash[0] = 0x67452301;
	c->hash[1] = 0xEFCDAB89;
	c->hash[2] = 0x98BADCFE;
	c->hash[3] = 0x10325476;
	c->hash[4] = 0xC3D2E1F0;
	c->len = 0;
	c->cnt = 0;
	return 0;
}

int ppc_SHA1_Update(ppc_SHA_CTX *c, const void *ptr, unsigned long n)
{
	unsigned long nb;
	const unsigned char *p = ptr;

	c->len += (uint64_t) n << 3;
	while (n != 0) {
		if (c->cnt || n < 64) {
			nb = 64 - c->cnt;
			if (nb > n)
				nb = n;
			memcpy(&c->buf.b[c->cnt], p, nb);
			if ((c->cnt += nb) == 64) {
				ppc_sha1_core(c->hash, c->buf.b, 1);
				c->cnt = 0;
			}
		} else {
			nb = n >> 6;
			ppc_sha1_core(c->hash, p, nb);
			nb <<= 6;
		}
		n -= nb;
		p += nb;
	}
	return 0;
}

int ppc_SHA1_Final(unsigned char *hash, ppc_SHA_CTX *c)
{
	unsigned int cnt = c->cnt;

	c->buf.b[cnt++] = 0x80;
	if (cnt > 56) {
		if (cnt < 64)
			memset(&c->buf.b[cnt], 0, 64 - cnt);
		ppc_sha1_core(c->hash, c->buf.b, 1);
		cnt = 0;
	}
	if (cnt < 56)
		memset(&c->buf.b[cnt], 0, 56 - cnt);
	c->buf.l[7] = c->len;
	ppc_sha1_core(c->hash, c->buf.b, 1);
	memcpy(hash, c->hash, 20);
	return 0;
}

ppc/sha1.h

@@ -1,25 +0,0 @@
/*
 * SHA-1 implementation.
 *
 * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
 */
#include <stdint.h>

typedef struct {
	uint32_t hash[5];
	uint32_t cnt;
	uint64_t len;
	union {
		unsigned char b[64];
		uint64_t l[8];
	} buf;
} ppc_SHA_CTX;

int ppc_SHA1_Init(ppc_SHA_CTX *c);
int ppc_SHA1_Update(ppc_SHA_CTX *c, const void *p, unsigned long n);
int ppc_SHA1_Final(unsigned char *hash, ppc_SHA_CTX *c);

#define platform_SHA_CTX	ppc_SHA_CTX
#define platform_SHA1_Init	ppc_SHA1_Init
#define platform_SHA1_Update	ppc_SHA1_Update
#define platform_SHA1_Final	ppc_SHA1_Final

ppc/sha1ppc.S

@@ -1,224 +0,0 @@
/*
* SHA-1 implementation for PowerPC.
*
* Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
*/
/*
* PowerPC calling convention:
* %r0 - volatile temp
* %r1 - stack pointer.
* %r2 - reserved
* %r3-%r12 - Incoming arguments & return values; volatile.
* %r13-%r31 - Callee-save registers
* %lr - Return address, volatile
* %ctr - volatile
*
* Register usage in this routine:
* %r0 - temp
* %r3 - argument (pointer to 5 words of SHA state)
* %r4 - argument (pointer to data to hash)
* %r5 - Constant K in SHA round (initially number of blocks to hash)
* %r6-%r10 - Working copies of SHA variables A..E (actually E..A order)
* %r11-%r26 - Data being hashed W[].
* %r27-%r31 - Previous copies of A..E, for final add back.
* %ctr - loop count
*/
/*
* We roll the registers for A, B, C, D, E around on each
* iteration; E on iteration t is D on iteration t+1, and so on.
* We use registers 6 - 10 for this. (Registers 27 - 31 hold
* the previous values.)
*/
#define RA(t) (((t)+4)%5+6)
#define RB(t) (((t)+3)%5+6)
#define RC(t) (((t)+2)%5+6)
#define RD(t) (((t)+1)%5+6)
#define RE(t) (((t)+0)%5+6)
/* We use registers 11 - 26 for the W values */
#define W(t) ((t)%16+11)
/* Register 5 is used for the constant k */
/*
* The basic SHA-1 round function is:
* E += ROTL(A,5) + F(B,C,D) + W[i] + K; B = ROTL(B,30)
* Then the variables are renamed: (A,B,C,D,E) = (E,A,B,C,D).
*
* Every 20 rounds, the function F() and the constant K changes:
* - 20 rounds of f0(b,c,d) = "bit wise b ? c : d" = (^b & d) + (b & c)
* - 20 rounds of f1(b,c,d) = b^c^d = (b^d)^c
* - 20 rounds of f2(b,c,d) = majority(b,c,d) = (b&d) + ((b^d)&c)
* - 20 more rounds of f1(b,c,d)
*
* These are all scheduled for near-optimal performance on a G4.
* The G4 is a 3-issue out-of-order machine with 3 ALUs, but it can only
* *consider* starting the oldest 3 instructions per cycle. So to get
* maximum performance out of it, you have to treat it as an in-order
* machine. Which means interleaving the computation round t with the
* computation of W[t+4].
*
* The first 16 rounds use W values loaded directly from memory, while the
* remaining 64 use values computed from those first 16. We preload
* 4 values before starting, so there are three kinds of rounds:
* - The first 12 (all f0) also load the W values from memory.
* - The next 64 compute W(i+4) in parallel. 8*f0, 20*f1, 20*f2, 16*f1.
* - The last 4 (all f1) do not do anything with W.
*
* Therefore, we have 6 different round functions:
* STEPD0_LOAD(t,s) - Perform round t and load W(s). s < 16
* STEPD0_UPDATE(t,s) - Perform round t and compute W(s). s >= 16.
* STEPD1_UPDATE(t,s)
* STEPD2_UPDATE(t,s)
* STEPD1(t) - Perform round t with no load or update.
*
* The G5 is more fully out-of-order, and can find the parallelism
* by itself. The big limit is that it has a 2-cycle ALU latency, so
* even though it's 2-way, the code has to be scheduled as if it's
* 4-way, which can be a limit. To help it, we try to schedule the
* read of RA(t) as late as possible so it doesn't stall waiting for
* the previous round's RE(t-1), and we try to rotate RB(t) as early
* as possible while reading RC(t) (= RB(t-1)) as late as possible.
*/
/* the initial loads. */
#define LOADW(s) \
lwz W(s),(s)*4(%r4)
/*
* Perform a step with F0, and load W(s). Uses W(s) as a temporary
* before loading it.
* This is actually 10 instructions, which is an awkward fit.
* It can execute grouped as listed, or delayed one instruction.
* (If delayed two instructions, there is a stall before the start of the
* second line.) Thus, two iterations take 7 cycles, 3.5 cycles per round.
*/
#define STEPD0_LOAD(t,s) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); and W(s),RC(t),RB(t); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),W(s); add %r0,%r0,%r5; lwz W(s),(s)*4(%r4); \
add RE(t),RE(t),%r0
/*
* This is likewise awkward, 13 instructions. However, it can also
* execute starting with 2 out of 3 possible moduli, so it does 2 rounds
* in 9 cycles, 4.5 cycles/round.
*/
#define STEPD0_UPDATE(t,s,loadk...) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; and %r0,RC(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r5; loadk; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0
/* Nicely optimal. Conveniently, also the most common. */
#define STEPD1_UPDATE(t,s,loadk...) \
add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r5; loadk; xor %r0,%r0,RC(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1
/*
* The naked version, no UPDATE, for the last 4 rounds. 3 cycles per.
* We could use W(s) as a temp register, but we don't need it.
*/
#define STEPD1(t) \
add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); \
rotlwi RB(t),RB(t),30; add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; /* spare slot */ \
add RE(t),RE(t),%r0
/*
* 14 instructions, 5 cycles per. The majority function is a bit
* awkward to compute. This can execute with a 1-instruction delay,
* but it causes a 2-instruction delay, which triggers a stall.
*/
#define STEPD2_UPDATE(t,s,loadk...) \
add RE(t),RE(t),W(t); and %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; xor %r0,RD(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r5; loadk; and %r0,%r0,RC(t); xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30
#define STEP0_LOAD4(t,s) \
STEPD0_LOAD(t,s); \
STEPD0_LOAD((t+1),(s)+1); \
STEPD0_LOAD((t)+2,(s)+2); \
STEPD0_LOAD((t)+3,(s)+3)
#define STEPUP4(fn, t, s, loadk...) \
STEP##fn##_UPDATE(t,s,); \
STEP##fn##_UPDATE((t)+1,(s)+1,); \
STEP##fn##_UPDATE((t)+2,(s)+2,); \
STEP##fn##_UPDATE((t)+3,(s)+3,loadk)
#define STEPUP20(fn, t, s, loadk...) \
STEPUP4(fn, t, s,); \
STEPUP4(fn, (t)+4, (s)+4,); \
STEPUP4(fn, (t)+8, (s)+8,); \
STEPUP4(fn, (t)+12, (s)+12,); \
STEPUP4(fn, (t)+16, (s)+16, loadk)
.globl ppc_sha1_core
ppc_sha1_core:
stwu %r1,-80(%r1)
stmw %r13,4(%r1)
/* Load up A - E */
lmw %r27,0(%r3)
mtctr %r5
1:
LOADW(0)
lis %r5,0x5a82
mr RE(0),%r31
LOADW(1)
mr RD(0),%r30
mr RC(0),%r29
LOADW(2)
ori %r5,%r5,0x7999 /* K0-19 */
mr RB(0),%r28
LOADW(3)
mr RA(0),%r27
STEP0_LOAD4(0, 4)
STEP0_LOAD4(4, 8)
STEP0_LOAD4(8, 12)
STEPUP4(D0, 12, 16,)
STEPUP4(D0, 16, 20, lis %r5,0x6ed9)
ori %r5,%r5,0xeba1 /* K20-39 */
STEPUP20(D1, 20, 24, lis %r5,0x8f1b)
ori %r5,%r5,0xbcdc /* K40-59 */
STEPUP20(D2, 40, 44, lis %r5,0xca62)
ori %r5,%r5,0xc1d6 /* K60-79 */
STEPUP4(D1, 60, 64,)
STEPUP4(D1, 64, 68,)
STEPUP4(D1, 68, 72,)
STEPUP4(D1, 72, 76,)
addi %r4,%r4,64
STEPD1(76)
STEPD1(77)
STEPD1(78)
STEPD1(79)
/* Add results to original values */
add %r31,%r31,RE(0)
add %r30,%r30,RD(0)
add %r29,%r29,RC(0)
add %r28,%r28,RB(0)
add %r27,%r27,RA(0)
bdnz 1b
/* Save final hash, restore registers, and return */
stmw %r27,0(%r3)
lmw %r13,4(%r1)
addi %r1,%r1,80
blr