diff --git a/convert.c b/convert.c index 33373b3ac0..4534e2c2b0 100644 --- a/convert.c +++ b/convert.c @@ -196,9 +196,17 @@ static int crlf_to_git(const char *path, const char *src, size_t len, char *dst; if (crlf_action == CRLF_BINARY || - (crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || !len) + (crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || + (src && !len)) return 0; + /* + * If we are doing a dry-run and have no source buffer, there is + * nothing to analyze; we must assume we would convert. + */ + if (!buf && !src) + return 1; + gather_stats(src, len, &stats); if (crlf_action == CRLF_AUTO || crlf_action == CRLF_GUESS) { @@ -232,6 +240,13 @@ static int crlf_to_git(const char *path, const char *src, size_t len, if (!stats.cr) return 0; + /* + * At this point all of our source analysis is done, and we are sure we + * would convert. If we are in dry-run mode, we can give an answer. + */ + if (!buf) + return 1; + /* only grow if not in place */ if (strbuf_avail(buf) + buf->len < len) strbuf_grow(buf, len - buf->len); @@ -396,6 +411,9 @@ static int apply_filter(const char *path, const char *src, size_t len, if (!cmd) return 0; + if (!dst) + return 1; + memset(&async, 0, sizeof(async)); async.proc = filter_buffer; async.data = &params; @@ -527,9 +545,12 @@ static int ident_to_git(const char *path, const char *src, size_t len, { char *dst, *dollar; - if (!ident || !count_ident(src, len)) + if (!ident || (src && !count_ident(src, len))) return 0; + if (!buf) + return 1; + /* only grow if not in place */ if (strbuf_avail(buf) + buf->len < len) strbuf_grow(buf, len - buf->len); @@ -759,13 +780,13 @@ int convert_to_git(const char *path, const char *src, size_t len, filter = ca.drv->clean; ret |= apply_filter(path, src, len, dst, filter); - if (ret) { + if (ret && dst) { src = dst->buf; len = dst->len; } ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr); ret |= crlf_to_git(path, src, len, dst, ca.crlf_action, checksafe); - if (ret) { 
+ if (ret && dst) { src = dst->buf; len = dst->len; } diff --git a/convert.h b/convert.h index d799a165b4..ec5fd69430 100644 --- a/convert.h +++ b/convert.h @@ -40,6 +40,11 @@ extern int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst); extern int renormalize_buffer(const char *path, const char *src, size_t len, struct strbuf *dst); +static inline int would_convert_to_git(const char *path, const char *src, + size_t len, enum safe_crlf checksafe) +{ + return convert_to_git(path, src, len, NULL, checksafe); +} /***************************************************************** * diff --git a/sha1_file.c b/sha1_file.c index f9f8d5e91c..4f06a0e450 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2700,10 +2700,13 @@ static int index_core(unsigned char *sha1, int fd, size_t size, * This also bypasses the usual "convert-to-git" dance, and that is on * purpose. We could write a streaming version of the converting * functions and insert that before feeding the data to fast-import - * (or equivalent in-core API described above), but the primary - * motivation for trying to stream from the working tree file and to - * avoid mmaping it in core is to deal with large binary blobs, and - * by definition they do _not_ want to get any conversion. + * (or equivalent in-core API described above). However, that is + * somewhat complicated, as we do not know the size of the filter + * result, which we need to know beforehand when writing a git object. + * Since the primary motivation for trying to stream from the working + * tree file and to avoid mmaping it in core is to deal with large + * binary blobs, they generally do not want to get any conversion, and + * callers should avoid this code path when filters are requested. 
*/ static int index_stream(unsigned char *sha1, int fd, size_t size, enum object_type type, const char *path, @@ -2720,7 +2723,8 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); - else if (size <= big_file_threshold || type != OBJ_BLOB) + else if (size <= big_file_threshold || type != OBJ_BLOB || + (path && would_convert_to_git(path, NULL, 0, 0))) ret = index_core(sha1, fd, size, type, path, flags); else ret = index_stream(sha1, fd, size, type, path, flags); diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh new file mode 100755 index 0000000000..8b7640b3ba --- /dev/null +++ b/t/t1051-large-conversion.sh @@ -0,0 +1,86 @@ +#!/bin/sh + +test_description='test conversion filters on large files' +. ./test-lib.sh + +set_attr() { + test_when_finished 'rm -f .gitattributes' && + echo "* $*" >.gitattributes +} + +check_input() { + git read-tree --empty && + git add small large && + git cat-file blob :small >small.index && + git cat-file blob :large | head -n 1 >large.index && + test_cmp small.index large.index +} + +check_output() { + rm -f small large && + git checkout small large && + head -n 1 large >large.head && + test_cmp small large.head +} + +test_expect_success 'setup input tests' ' + printf "\$Id: foo\$\\r\\n" >small && + cat small small >large && + git config core.bigfilethreshold 20 && + git config filter.test.clean "sed s/.*/CLEAN/" +' + +test_expect_success 'autocrlf=true converts on input' ' + test_config core.autocrlf true && + check_input +' + +test_expect_success 'eol=crlf converts on input' ' + set_attr eol=crlf && + check_input +' + +test_expect_success 'ident converts on input' ' + set_attr ident && + check_input +' + +test_expect_success 'user-defined filters convert on input' ' + set_attr filter=test && + check_input +' + +test_expect_success 'setup output tests' ' + echo "\$Id\$" >small && + cat small small >large && + git add small large && + 
+ git config core.bigfilethreshold 7 && + git config filter.test.smudge "sed s/.*/SMUDGE/" +' + +test_expect_success 'autocrlf=true converts on output' ' + test_config core.autocrlf true && + check_output +' + +test_expect_success 'eol=crlf converts on output' ' + set_attr eol=crlf && + check_output +' + +test_expect_success 'user-defined filters convert on output' ' + set_attr filter=test && + check_output +' + +test_expect_success 'ident converts on output' ' + set_attr ident && + rm -f small large && + git checkout small large && + sed -n "s/Id: .*/Id: SHA/p" <small >small.clean && + head -n 1 large >large.head && + sed -n "s/Id: .*/Id: SHA/p" <large.head >large.clean && + test_cmp small.clean large.clean +' + +test_done