From 92ac3197e4859ba8c19e3e7f7b8cf5dc38e4669d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 24 Feb 2012 17:02:37 -0500 Subject: [PATCH 1/3] teach convert_to_git a "dry run" mode Some callers may want to know whether convert_to_git will actually do anything before performing the conversion itself (e.g., to decide whether to stream or handle blobs in-core). This patch lets callers specify the dry run mode by passing a NULL destination buffer. The return value, instead of indicating whether conversion happened, will indicate whether conversion would occur. For readability, we also include a wrapper function which makes it more obvious we are not actually performing the conversion. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- convert.c | 17 +++++++++++++++-- convert.h | 5 +++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/convert.c b/convert.c index 12868ed7bd..65fa9d5eda 100644 --- a/convert.c +++ b/convert.c @@ -231,6 +231,13 @@ static int crlf_to_git(const char *path, const char *src, size_t len, if (!stats.cr) return 0; + /* + * At this point all of our source analysis is done, and we are sure we + * would convert. If we are in dry-run mode, we can give an answer. + */ + if (!buf) + return 1; + /* only grow if not in place */ if (strbuf_avail(buf) + buf->len < len) strbuf_grow(buf, len - buf->len); @@ -391,6 +398,9 @@ static int apply_filter(const char *path, const char *src, size_t len, if (!cmd) return 0; + if (!dst) + return 1; + memset(&async, 0, sizeof(async)); async.proc = filter_buffer; async.data = ¶ms; @@ -525,6 +535,9 @@ static int ident_to_git(const char *path, const char *src, size_t len, if (!ident || !count_ident(src, len)) return 0; + if (!buf) + return 1; + /* only grow if not in place */ if (strbuf_avail(buf) + buf->len < len) strbuf_grow(buf, len - buf->len); @@ -754,13 +767,13 @@ int convert_to_git(const char *path, const char *src, size_t len, filter = ca.drv->clean; ret |= apply_filter(path, src, len, dst, filter); - if (ret) { + if (ret && dst) { src = dst->buf; len = dst->len; } ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr); ret |= crlf_to_git(path, src, len, dst, ca.crlf_action, checksafe); - if (ret) { + if (ret && dst) { src = dst->buf; len = dst->len; } diff --git a/convert.h b/convert.h index d799a165b4..ec5fd69430 100644 --- a/convert.h +++ b/convert.h @@ -40,6 +40,11 @@ extern int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst); extern int renormalize_buffer(const char *path, const char *src, size_t len, struct strbuf *dst); +static inline int would_convert_to_git(const char *path, const char *src, + size_t len, enum safe_crlf checksafe) +{ + return convert_to_git(path, src, len, NULL, checksafe); +} /***************************************************************** * From 4c3b57b98bccfdb740b8f1d0ca574dd51deacfad Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 24 Feb 2012 17:05:03 -0500 Subject: [PATCH 2/3] teach dry-run convert_to_git not to require a src buffer When we call convert_to_git in dry-run mode, it may still want to look at the source buffer, because some CRLF conversion modes depend on analyzing the source to determine whether it is in fact convertible CRLF text. However, the main motivation for convert_to_git's dry-run mode is that we would decide which method to use to acquire the blob's data (streaming versus in-core). Requiring this source analysis creates a chicken-and-egg problem. We are better off simply guessing that anything we can't analyze will end up needing conversion. This patch lets a caller specify a NULL src buffer when using dry-run mode (and only dry-run mode). A non-zero return value goes from "we would convert" to "we might convert"; a zero return value remains "we would definitely not convert". Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- convert.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/convert.c b/convert.c index 65fa9d5eda..aa7f72d8a0 100644 --- a/convert.c +++ b/convert.c @@ -195,9 +195,17 @@ static int crlf_to_git(const char *path, const char *src, size_t len, char *dst; if (crlf_action == CRLF_BINARY || - (crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || !len) + (crlf_action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || + (src && !len)) return 0; + /* + * If we are doing a dry-run and have no source buffer, there is + * nothing to analyze; we must assume we would convert. + */ + if (!buf && !src) + return 1; + gather_stats(src, len, &stats); if (crlf_action == CRLF_AUTO || crlf_action == CRLF_GUESS) { @@ -532,7 +540,7 @@ static int ident_to_git(const char *path, const char *src, size_t len, { char *dst, *dollar; - if (!ident || !count_ident(src, len)) + if (!ident || (src && !count_ident(src, len))) return 0; if (!buf) From 4f22b1015d4203ccdf2b66f27ee5946504342ace Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 24 Feb 2012 17:10:17 -0500 Subject: [PATCH 3/3] do not stream large files to pack when filters are in use Because git's object format requires us to specify the number of bytes in the object in its header, we must know the size before streaming a blob into the object database. This is not a problem when adding a regular file, as we can get the size from stat(). However, when filters are in use (such as autocrlf, or the ident, filter, or eol gitattributes), we have no idea what the ultimate size will be. The current code just punts on the whole issue and ignores filter configuration entirely for files larger than core.bigfilethreshold. This can generate confusing results if you use filters for large binary files, as the filter will suddenly stop working as the file goes over a certain size. Rather than try to handle unknown input sizes with streaming, this patch just turns off the streaming optimization when filters are in use. This has a slight performance regression in a very specific case: if you have autocrlf on, but no gitattributes, a large binary file will avoid the streaming code path because we don't know beforehand whether it will need conversion or not. But if you are handling large binary files, you should be marking them as such via attributes (or at least not using autocrlf, and instead marking your text files as such). And the flip side is that if you have a large _non_-binary file, there is a correctness improvement; before we did not apply the conversion at all. The first half of the new t1051 script covers these failures on input. The second half tests the matching output code paths. These already work correctly, and do not need any adjustment. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- sha1_file.c | 14 +++--- t/t1051-large-conversion.sh | 86 +++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 5 deletions(-) create mode 100755 t/t1051-large-conversion.sh diff --git a/sha1_file.c b/sha1_file.c index 956422ba4a..c9ae7ec2ae 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2688,10 +2688,13 @@ static int index_core(unsigned char *sha1, int fd, size_t size, * This also bypasses the usual "convert-to-git" dance, and that is on * purpose. We could write a streaming version of the converting * functions and insert that before feeding the data to fast-import - * (or equivalent in-core API described above), but the primary - * motivation for trying to stream from the working tree file and to - * avoid mmaping it in core is to deal with large binary blobs, and - * by definition they do _not_ want to get any conversion. + * (or equivalent in-core API described above). However, that is + * somewhat complicated, as we do not know the size of the filter + * result, which we need to know beforehand when writing a git object. + * Since the primary motivation for trying to stream from the working + * tree file and to avoid mmaping it in core is to deal with large + * binary blobs, they generally do not want to get any conversion, and + * callers should avoid this code path when filters are requested. */ static int index_stream(unsigned char *sha1, int fd, size_t size, enum object_type type, const char *path, @@ -2766,7 +2769,8 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); - else if (size <= big_file_threshold || type != OBJ_BLOB) + else if (size <= big_file_threshold || type != OBJ_BLOB || + (path && would_convert_to_git(path, NULL, 0, 0))) ret = index_core(sha1, fd, size, type, path, flags); else ret = index_stream(sha1, fd, size, type, path, flags); diff --git a/t/t1051-large-conversion.sh b/t/t1051-large-conversion.sh new file mode 100755 index 0000000000..8b7640b3ba --- /dev/null +++ b/t/t1051-large-conversion.sh @@ -0,0 +1,86 @@ +#!/bin/sh + +test_description='test conversion filters on large files' +. ./test-lib.sh + +set_attr() { + test_when_finished 'rm -f .gitattributes' && + echo "* $*" >.gitattributes +} + +check_input() { + git read-tree --empty && + git add small large && + git cat-file blob :small >small.index && + git cat-file blob :large | head -n 1 >large.index && + test_cmp small.index large.index +} + +check_output() { + rm -f small large && + git checkout small large && + head -n 1 large >large.head && + test_cmp small large.head +} + +test_expect_success 'setup input tests' ' + printf "\$Id: foo\$\\r\\n" >small && + cat small small >large && + git config core.bigfilethreshold 20 && + git config filter.test.clean "sed s/.*/CLEAN/" +' + +test_expect_success 'autocrlf=true converts on input' ' + test_config core.autocrlf true && + check_input +' + +test_expect_success 'eol=crlf converts on input' ' + set_attr eol=crlf && + check_input +' + +test_expect_success 'ident converts on input' ' + set_attr ident && + check_input +' + +test_expect_success 'user-defined filters convert on input' ' + set_attr filter=test && + check_input +' + +test_expect_success 'setup output tests' ' + echo "\$Id\$" >small && + cat small small >large && + git add small large && + git config core.bigfilethreshold 7 && + git config filter.test.smudge "sed s/.*/SMUDGE/" +' + +test_expect_success 'autocrlf=true converts on output' ' + test_config core.autocrlf true && + check_output +' + +test_expect_success 'eol=crlf converts on output' ' + set_attr eol=crlf && + check_output +' + +test_expect_success 'user-defined filters convert on output' ' + set_attr filter=test && + check_output +' + +test_expect_success 'ident converts on output' ' + set_attr ident && + rm -f small large && + git checkout small large && + sed -n "s/Id: .*/Id: SHA/p" small.clean && + head -n 1 large >large.head && + sed -n "s/Id: .*/Id: SHA/p" large.clean && + test_cmp small.clean large.clean +' + +test_done