git-commit-vandalism/t/t5351-unpack-large-objects.sh

77 lines
2.3 KiB
Bash
Raw Normal View History

#!/bin/sh
#
# Copyright (c) 2022 Han Xin
#
test_description='git unpack-objects with large objects'
. ./test-lib.sh
prepare_dest () {
test_when_finished "rm -rf dest.git" &&
unpack-objects: use stream_loose_object() to unpack large objects Make use of the stream_loose_object() function introduced in the preceding commit to unpack large objects. Before this we'd need to malloc() the size of the blob before unpacking it, which could cause OOM with very large blobs. We could use the new streaming interface to unpack all blobs, but doing so would be much slower, as demonstrated e.g. with this benchmark using git-hyperfine[0]: rm -rf /tmp/scalar.git && git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git && mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack && git hyperfine \ -r 2 --warmup 1 \ -L rev origin/master,HEAD -L v "10,512,1k,1m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack' Here we'll perform worse with lower core.bigFileThreshold settings with this change in terms of speed, but we're getting lower memory use in return: Summary './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' A better benchmark to demonstrate the benefits of that this one, which creates an artificial repo with a 1, 25, 50, 75 and 100MB blob: rm -rf /tmp/repo && git init /tmp/repo && ( cd /tmp/repo && for i in 1 25 50 75 100 do dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024 done && git add blob.* && git commit -mblobs && git gc && PACK=$(echo .git/objects/pack/pack-*.pack) && cp "$PACK" my.pack ) && git hyperfine \ --show-output \ -L rev origin/master,HEAD -L v "512,50m,100m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' Using this test we'll always use >100MB of memory on origin/master (around ~105MB), but max out at e.g. ~55MB if we set core.bigFileThreshold=50m. The relevant "Maximum resident set size" lines were manually added below the relevant benchmark: '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran Maximum resident set size (kbytes): 107080 1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 106968 1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 107032 1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 107072 1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 55704 2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 4564 This shows that if you have enough memory this new streaming method is slower the lower you set the streaming threshold, but the benefit is more bounded memory use. An earlier version of this patch introduced a new "core.bigFileStreamingThreshold" instead of re-using the existing "core.bigFileThreshold" variable[1]. As noted in a detailed overview of its users in [2] using it has several different meanings. Still, we consider it good enough to simply re-use it. While it's possible that someone might want to e.g. consider objects "small" for the purposes of diffing but "big" for the purposes of writing them such use-cases are probably too obscure to worry about. We can always split up "core.bigFileThreshold" in the future if there's a need for that. 0. https://github.com/avar/git-hyperfine/ 1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/ 2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/ Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Helped-by: Derrick Stolee <stolee@gmail.com> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> Signed-off-by: Han Xin <chiyutianyi@gmail.com> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-06-11 04:44:21 +02:00
git init --bare dest.git &&
git -C dest.git config core.bigFileThreshold "$1"
}
test_expect_success "create large objects (1.5 MB) and PACK" '
test-tool genrandom foo 1500000 >big-blob &&
test_commit --append foo big-blob &&
test-tool genrandom bar 1500000 >big-blob &&
test_commit --append bar big-blob &&
unpack-objects: use stream_loose_object() to unpack large objects Make use of the stream_loose_object() function introduced in the preceding commit to unpack large objects. Before this we'd need to malloc() the size of the blob before unpacking it, which could cause OOM with very large blobs. We could use the new streaming interface to unpack all blobs, but doing so would be much slower, as demonstrated e.g. with this benchmark using git-hyperfine[0]: rm -rf /tmp/scalar.git && git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git && mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack && git hyperfine \ -r 2 --warmup 1 \ -L rev origin/master,HEAD -L v "10,512,1k,1m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack' Here we'll perform worse with lower core.bigFileThreshold settings with this change in terms of speed, but we're getting lower memory use in return: Summary './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' A better benchmark to demonstrate the benefits of that this one, which creates an artificial repo with a 1, 25, 50, 75 and 100MB blob: rm -rf /tmp/repo && git init /tmp/repo && ( cd /tmp/repo && for i in 1 25 50 75 100 do dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024 done && git add blob.* && git commit -mblobs && git gc && PACK=$(echo .git/objects/pack/pack-*.pack) && cp "$PACK" my.pack ) && git hyperfine \ --show-output \ -L rev origin/master,HEAD -L v "512,50m,100m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' Using this test we'll always use >100MB of memory on origin/master (around ~105MB), but max out at e.g. ~55MB if we set core.bigFileThreshold=50m. The relevant "Maximum resident set size" lines were manually added below the relevant benchmark: '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran Maximum resident set size (kbytes): 107080 1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 106968 1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 107032 1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 107072 1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 55704 2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 4564 This shows that if you have enough memory this new streaming method is slower the lower you set the streaming threshold, but the benefit is more bounded memory use. An earlier version of this patch introduced a new "core.bigFileStreamingThreshold" instead of re-using the existing "core.bigFileThreshold" variable[1]. As noted in a detailed overview of its users in [2] using it has several different meanings. Still, we consider it good enough to simply re-use it. While it's possible that someone might want to e.g. consider objects "small" for the purposes of diffing but "big" for the purposes of writing them such use-cases are probably too obscure to worry about. We can always split up "core.bigFileThreshold" in the future if there's a need for that. 0. https://github.com/avar/git-hyperfine/ 1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/ 2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/ Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Helped-by: Derrick Stolee <stolee@gmail.com> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> Signed-off-by: Han Xin <chiyutianyi@gmail.com> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-06-11 04:44:21 +02:00
PACK=$(echo HEAD | git pack-objects --revs pack) &&
git verify-pack -v pack-$PACK.pack >out &&
sed -n -e "s/^\([0-9a-f][0-9a-f]*\).*\(commit\|tree\|blob\).*/\1/p" \
<out >obj-list
'
test_expect_success 'set memory limitation to 1MB' '
GIT_ALLOC_LIMIT=1m &&
export GIT_ALLOC_LIMIT
'
test_expect_success 'unpack-objects failed under memory limitation' '
unpack-objects: use stream_loose_object() to unpack large objects Make use of the stream_loose_object() function introduced in the preceding commit to unpack large objects. Before this we'd need to malloc() the size of the blob before unpacking it, which could cause OOM with very large blobs. We could use the new streaming interface to unpack all blobs, but doing so would be much slower, as demonstrated e.g. with this benchmark using git-hyperfine[0]: rm -rf /tmp/scalar.git && git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git && mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack && git hyperfine \ -r 2 --warmup 1 \ -L rev origin/master,HEAD -L v "10,512,1k,1m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack' Here we'll perform worse with lower core.bigFileThreshold settings with this change in terms of speed, but we're getting lower memory use in return: Summary './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' A better benchmark to demonstrate the benefits of that this one, which creates an artificial repo with a 1, 25, 50, 75 and 100MB blob: rm -rf /tmp/repo && git init /tmp/repo && ( cd /tmp/repo && for i in 1 25 50 75 100 do dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024 done && git add blob.* && git commit -mblobs && git gc && PACK=$(echo .git/objects/pack/pack-*.pack) && cp "$PACK" my.pack ) && git hyperfine \ --show-output \ -L rev origin/master,HEAD -L v "512,50m,100m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' Using this test we'll always use >100MB of memory on origin/master (around ~105MB), but max out at e.g. ~55MB if we set core.bigFileThreshold=50m. The relevant "Maximum resident set size" lines were manually added below the relevant benchmark: '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran Maximum resident set size (kbytes): 107080 1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 106968 1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 107032 1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 107072 1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 55704 2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 4564 This shows that if you have enough memory this new streaming method is slower the lower you set the streaming threshold, but the benefit is more bounded memory use. An earlier version of this patch introduced a new "core.bigFileStreamingThreshold" instead of re-using the existing "core.bigFileThreshold" variable[1]. As noted in a detailed overview of its users in [2] using it has several different meanings. Still, we consider it good enough to simply re-use it. While it's possible that someone might want to e.g. consider objects "small" for the purposes of diffing but "big" for the purposes of writing them such use-cases are probably too obscure to worry about. We can always split up "core.bigFileThreshold" in the future if there's a need for that. 0. https://github.com/avar/git-hyperfine/ 1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/ 2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/ Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Helped-by: Derrick Stolee <stolee@gmail.com> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> Signed-off-by: Han Xin <chiyutianyi@gmail.com> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-06-11 04:44:21 +02:00
prepare_dest 2m &&
test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
grep "fatal: attempting to allocate" err
'
test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
unpack-objects: use stream_loose_object() to unpack large objects Make use of the stream_loose_object() function introduced in the preceding commit to unpack large objects. Before this we'd need to malloc() the size of the blob before unpacking it, which could cause OOM with very large blobs. We could use the new streaming interface to unpack all blobs, but doing so would be much slower, as demonstrated e.g. with this benchmark using git-hyperfine[0]: rm -rf /tmp/scalar.git && git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git && mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack && git hyperfine \ -r 2 --warmup 1 \ -L rev origin/master,HEAD -L v "10,512,1k,1m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack' Here we'll perform worse with lower core.bigFileThreshold settings with this change in terms of speed, but we're getting lower memory use in return: Summary './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' A better benchmark to demonstrate the benefits of that this one, which creates an artificial repo with a 1, 25, 50, 75 and 100MB blob: rm -rf /tmp/repo && git init /tmp/repo && ( cd /tmp/repo && for i in 1 25 50 75 100 do dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024 done && git add blob.* && git commit -mblobs && git gc && PACK=$(echo .git/objects/pack/pack-*.pack) && cp "$PACK" my.pack ) && git hyperfine \ --show-output \ -L rev origin/master,HEAD -L v "512,50m,100m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' Using this test we'll always use >100MB of memory on origin/master (around ~105MB), but max out at e.g. ~55MB if we set core.bigFileThreshold=50m. The relevant "Maximum resident set size" lines were manually added below the relevant benchmark: '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran Maximum resident set size (kbytes): 107080 1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 106968 1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 107032 1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 107072 1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 55704 2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 4564 This shows that if you have enough memory this new streaming method is slower the lower you set the streaming threshold, but the benefit is more bounded memory use. An earlier version of this patch introduced a new "core.bigFileStreamingThreshold" instead of re-using the existing "core.bigFileThreshold" variable[1]. As noted in a detailed overview of its users in [2] using it has several different meanings. Still, we consider it good enough to simply re-use it. While it's possible that someone might want to e.g. consider objects "small" for the purposes of diffing but "big" for the purposes of writing them such use-cases are probably too obscure to worry about. We can always split up "core.bigFileThreshold" in the future if there's a need for that. 0. https://github.com/avar/git-hyperfine/ 1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/ 2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/ Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Helped-by: Derrick Stolee <stolee@gmail.com> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> Signed-off-by: Han Xin <chiyutianyi@gmail.com> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-06-11 04:44:21 +02:00
prepare_dest 2m &&
git -C dest.git unpack-objects -n <pack-$PACK.pack &&
test_stdout_line_count = 0 find dest.git/objects -type f &&
test_dir_is_empty dest.git/objects/pack
'
unpack-objects: use stream_loose_object() to unpack large objects Make use of the stream_loose_object() function introduced in the preceding commit to unpack large objects. Before this we'd need to malloc() the size of the blob before unpacking it, which could cause OOM with very large blobs. We could use the new streaming interface to unpack all blobs, but doing so would be much slower, as demonstrated e.g. with this benchmark using git-hyperfine[0]: rm -rf /tmp/scalar.git && git clone --bare https://github.com/Microsoft/scalar.git /tmp/scalar.git && mv /tmp/scalar.git/objects/pack/*.pack /tmp/scalar.git/my.pack && git hyperfine \ -r 2 --warmup 1 \ -L rev origin/master,HEAD -L v "10,512,1k,1m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ './git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/scalar.git/my.pack' Here we'll perform worse with lower core.bigFileThreshold settings with this change in terms of speed, but we're getting lower memory use in return: Summary './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' ran 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.01 ± 0.02 times faster than './git -C dest.git -c core.bigFileThreshold=1m unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.02 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'origin/master' 1.09 ± 0.01 times faster than './git -C dest.git -c core.bigFileThreshold=1k unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.10 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' 1.11 ± 0.00 times faster than './git -C dest.git -c core.bigFileThreshold=10 unpack-objects </tmp/scalar.git/my.pack' in 'HEAD' A better benchmark to demonstrate the benefits of that this one, which creates an artificial repo with a 1, 25, 50, 75 and 100MB blob: rm -rf /tmp/repo && git init /tmp/repo && ( cd /tmp/repo && for i in 1 25 50 75 100 do dd if=/dev/urandom of=blob.$i count=$(($i*1024)) bs=1024 done && git add blob.* && git commit -mblobs && git gc && PACK=$(echo .git/objects/pack/pack-*.pack) && cp "$PACK" my.pack ) && git hyperfine \ --show-output \ -L rev origin/master,HEAD -L v "512,50m,100m" \ -s 'make' \ -p 'git init --bare dest.git' \ -c 'rm -rf dest.git' \ '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold={v} unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' Using this test we'll always use >100MB of memory on origin/master (around ~105MB), but max out at e.g. ~55MB if we set core.bigFileThreshold=50m. The relevant "Maximum resident set size" lines were manually added below the relevant benchmark: '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' ran Maximum resident set size (kbytes): 107080 1.02 ± 0.78 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 106968 1.09 ± 0.79 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'origin/master' Maximum resident set size (kbytes): 107032 1.42 ± 1.07 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=100m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 107072 1.83 ± 1.02 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=50m unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 55704 2.16 ± 1.19 times faster than '/usr/bin/time -v ./git -C dest.git -c core.bigFileThreshold=512 unpack-objects </tmp/repo/my.pack 2>&1 | grep Maximum' in 'HEAD' Maximum resident set size (kbytes): 4564 This shows that if you have enough memory this new streaming method is slower the lower you set the streaming threshold, but the benefit is more bounded memory use. An earlier version of this patch introduced a new "core.bigFileStreamingThreshold" instead of re-using the existing "core.bigFileThreshold" variable[1]. As noted in a detailed overview of its users in [2] using it has several different meanings. Still, we consider it good enough to simply re-use it. While it's possible that someone might want to e.g. consider objects "small" for the purposes of diffing but "big" for the purposes of writing them such use-cases are probably too obscure to worry about. We can always split up "core.bigFileThreshold" in the future if there's a need for that. 0. https://github.com/avar/git-hyperfine/ 1. https://lore.kernel.org/git/20211210103435.83656-1-chiyutianyi@gmail.com/ 2. https://lore.kernel.org/git/20220120112114.47618-5-chiyutianyi@gmail.com/ Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Helped-by: Derrick Stolee <stolee@gmail.com> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> Signed-off-by: Han Xin <chiyutianyi@gmail.com> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-06-11 04:44:21 +02:00
test_expect_success 'unpack big object in stream' '
prepare_dest 1m &&
git -C dest.git unpack-objects <pack-$PACK.pack &&
test_dir_is_empty dest.git/objects/pack
'
BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch'
test_expect_success 'unpack big object in stream (core.fsyncmethod=batch)' '
prepare_dest 1m &&
GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
git -C dest.git $BATCH_CONFIGURATION unpack-objects <pack-$PACK.pack &&
grep fsync/hardware-flush trace2.txt &&
test_dir_is_empty dest.git/objects/pack &&
git -C dest.git cat-file --batch-check="%(objectname)" <obj-list >current &&
cmp obj-list current
'
test_expect_success 'do not unpack existing large objects' '
prepare_dest 1m &&
git -C dest.git index-pack --stdin <pack-$PACK.pack &&
git -C dest.git unpack-objects <pack-$PACK.pack &&
# The destination came up with the exact same pack...
DEST_PACK=$(echo dest.git/objects/pack/pack-*.pack) &&
test_cmp pack-$PACK.pack $DEST_PACK &&
# ...and wrote no loose objects
test_stdout_line_count = 0 find dest.git/objects -type f ! -name "pack-*"
'
test_done