From 58325b93c5b6212697b088371809e9948fee8052 Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Tue, 24 Jan 2023 19:43:45 -0500
Subject: [PATCH 1/3] t5619: demonstrate clone_local() with ambiguous transport

When cloning a repository, Git must determine (a) what transport
mechanism to use, and (b) whether or not the clone is local.

Since f38aa83f9a (use local cloning if insteadOf makes a local URL,
2014-07-17), the latter check happens after the remote has been
initialized, and references the remote's URL instead of the local path.
This is done to make it possible for a `url.<base>.insteadOf` rule to
convert a remote URL into a local one, in which case the `clone_local()`
mechanism should be used.

However, with a specially crafted repository, Git can be tricked into
using a non-local transport while still setting `is_local` to "1" and
using the `clone_local()` optimization. The below test case
demonstrates such an instance, and shows that it can be used to include
arbitrary (known) paths in the working copy of a cloned repository on a
victim's machine[^1], even if local file clones are forbidden by
`protocol.file.allow`.

This happens in a few parts:

 1. We first call `get_repo_path()` to see if the remote is a local
    path. If it is, we replace the repo name with its absolute path.

 2. We then call `transport_get()` on the repo name and decide how to
    access it. If it was turned into an absolute path in the previous
    step, then we should always treat it like a file.

 3. We use `get_repo_path()` again, and set `is_local` as appropriate.
    But it's already too late to rewrite the repo name as an absolute
    path, since we've already fed it to the transport code.

The attack works by including a submodule whose URL corresponds to a
path on disk. In the below example, the repository "sub" is reachable
via the dumb HTTP protocol at (something like):

    http://127.0.0.1:NNNN/dumb/sub.git

However, the path "http:/127.0.0.1:NNNN/dumb" (that is, a top-level
directory called "http:", then nested directories "127.0.0.1:NNNN", and
"dumb") exists within the repository, too.

To determine this, it first picks the appropriate transport, which is
dumb HTTP. It then uses the remote's URL in order to determine whether
the repository exists locally on disk. However, the malicious repository
also contains an embedded stub repository which is the target of a
symbolic link at the local path corresponding to the "sub" repository on
disk (i.e., there is a symbolic link at "http:/127.0.0.1/dumb/sub.git",
pointing to the stub repository via ".git/modules/sub/../../../repo").

This stub repository fools Git into thinking that a local repository
exists at that URL and thus can be cloned locally. The affected call is
in `get_repo_path()`, which in turn calls `get_repo_path_1()`, which
locates a valid repository at that target.

This then causes Git to set the `is_local` variable to "1", and in turn
instructs Git to clone the repository using its local clone optimization
via the `clone_local()` function.

The exploit comes into play because the stub repository's top-level
"$GIT_DIR/objects" directory is a symbolic link which can point to an
arbitrary path on the victim's machine. `clone_local()` resolves the
top-level "objects" directory through a `stat(2)` call, meaning that we
read through the symbolic link and copy or hardlink the directory
contents at the destination of the link.

In other words, we can get steps (1) and (3) to disagree by leveraging
the dangling symlink to pick a non-local transport in the first step,
and then set is_local to "1" in the third step when cloning with
`--separate-git-dir`, which makes the symlink non-dangling.

This can result in data-exfiltration on the victim's machine when
sensitive data is at a known path (e.g., "/home/$USER/.ssh").

The appropriate fix is two-fold:

 - Resolve the transport later on (to avoid using the local
   clone optimization with a non-local transport).

 - Avoid reading through the top-level "objects" directory when
   (correctly) using the clone_local() optimization.

This patch merely demonstrates the issue. The following two patches will
implement each part of the above fix, respectively.

[^1]: Provided that any target directory does not contain symbolic
  links, in which case the changes from 6f054f9fb3 (builtin/clone.c:
  disallow `--local` clones with symlinks, 2022-07-28) will abort the
  clone.

Reported-by: yvvdwf <yvvdwf@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t5619-clone-local-ambiguous-transport.sh | 63 ++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100755 t/t5619-clone-local-ambiguous-transport.sh

diff --git a/t/t5619-clone-local-ambiguous-transport.sh b/t/t5619-clone-local-ambiguous-transport.sh
new file mode 100755
index 0000000000..7ebd31a150
--- /dev/null
+++ b/t/t5619-clone-local-ambiguous-transport.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+
+test_description='test local clone with ambiguous transport'
+
+. ./test-lib.sh
+. "$TEST_DIRECTORY/lib-httpd.sh"
+
+if ! test_have_prereq SYMLINKS
+then
+	skip_all='skipping test, symlink support unavailable'
+	test_done
+fi
+
+start_httpd
+
+REPO="$HTTPD_DOCUMENT_ROOT_PATH/sub.git"
+URI="$HTTPD_URL/dumb/sub.git"
+
+test_expect_success 'setup' '
+	mkdir -p sensitive &&
+	echo "secret" >sensitive/secret &&
+
+	git init --bare "$REPO" &&
+	test_commit_bulk -C "$REPO" --ref=main 1 &&
+
+	git -C "$REPO" update-ref HEAD main &&
+	git -C "$REPO" update-server-info &&
+
+	git init malicious &&
+	(
+		cd malicious &&
+
+		git submodule add "$URI" &&
+
+		mkdir -p repo/refs &&
+		touch repo/refs/.gitkeep &&
+		printf "ref: refs/heads/a" >repo/HEAD &&
+		ln -s "$(cd .. && pwd)/sensitive" repo/objects &&
+
+		mkdir -p "$HTTPD_URL/dumb" &&
+		ln -s "../../../.git/modules/sub/../../../repo/" "$URI" &&
+
+		git add . &&
+		git commit -m "initial commit"
+	) &&
+
+	# Delete all of the references in our malicious submodule to
+	# avoid the client attempting to checkout any objects (which
+	# will be missing, and thus will cause the clone to fail before
+	# we can trigger the exploit).
+	git -C "$REPO" for-each-ref --format="delete %(refname)" >in &&
+	git -C "$REPO" update-ref --stdin <in &&
+	git -C "$REPO" update-server-info
+'
+
+test_expect_failure 'ambiguous transport does not lead to arbitrary file-inclusion' '
+	git clone malicious clone &&
+	git -C clone submodule update --init &&
+
+	test_path_is_missing clone/.git/modules/sub/objects/secret
+'
+
+test_done

From cf8f6ce02a13f4d1979a53241afbee15a293fce9 Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Tue, 24 Jan 2023 19:43:48 -0500
Subject: [PATCH 2/3] clone: delay picking a transport until after
 get_repo_path()

In the previous commit, t5619 demonstrates an issue where two calls to
`get_repo_path()` could trick Git into using its local clone mechanism
in conjunction with a non-local transport.

That sequence is:

 - the starting state is that the local path https:/example.com/foo is a
   symlink that points to ../../../.git/modules/foo. So it's dangling.

 - get_repo_path() sees that no such path exists (because it's
   dangling), and thus we do not canonicalize it into an absolute path

 - because we're using --separate-git-dir, we create .git/modules/foo.
   Now our symlink is no longer dangling!

 - we pass the url to transport_get(), which sees it as an https URL.

 - we call get_repo_path() again, on the url. This second call was
   introduced by f38aa83f9a (use local cloning if insteadOf makes a
   local URL, 2014-07-17). The idea is that we want to pull the url
   fresh from the remote.c API, because it will apply any aliases.

And of course now it sees that there is a local file, which is a
mismatch with the transport we already selected.

The issue in the above sequence is calling `transport_get()` before
deciding whether or not the repository is indeed local, and not passing
in an absolute path if it is local.

This is reminiscent of a similar bug report in [1], where it was
suggested to perform the `insteadOf` lookup earlier. Taking that
approach may not be as straightforward, since the intent is to store the
original URL in the config, but to actually fetch from the insteadOf
one, so conflating the two early on is a non-starter.

Note: we pass the path returned by `get_repo_path(remote->url[0])`,
which should be the same as `repo_name` (aside from any `insteadOf`
rewrites).

We *could* pass `absolute_pathdup()` of the same argument, which
86521acaca (Bring local clone's origin URL in line with that of a remote
clone, 2008-09-01) indicates may differ depending on the presence of
".git/" for a non-bare repo. That matters for forming relative submodule
paths, but doesn't matter for the second call, since we're just feeding
it to the transport code, which is fine either way.

[1]: https://lore.kernel.org/git/CAMoD=Bi41mB3QRn3JdZL-FGHs4w3C2jGpnJB-CqSndO7FMtfzA@mail.gmail.com/

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/clone.c                            |  8 ++++----
 t/t5619-clone-local-ambiguous-transport.sh | 13 ++++++++++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/builtin/clone.c b/builtin/clone.c
index e626073b1f..c042b2e256 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -1201,10 +1201,6 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	refspec_appendf(&remote->fetch, "+%s*:%s*", src_ref_prefix,
 			branch_top.buf);
 
-	transport = transport_get(remote, remote->url[0]);
-	transport_set_verbosity(transport, option_verbosity, option_progress);
-	transport->family = family;
-
 	path = get_repo_path(remote->url[0], &is_bundle);
 	is_local = option_local != 0 && path && !is_bundle;
 	if (is_local) {
@@ -1224,6 +1220,10 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
 	}
 	if (option_local > 0 && !is_local)
 		warning(_("--local is ignored"));
+
+	transport = transport_get(remote, path ? path : remote->url[0]);
+	transport_set_verbosity(transport, option_verbosity, option_progress);
+	transport->family = family;
 	transport->cloning = 1;
 
 	transport_set_option(transport, TRANS_OPT_KEEP, "yes");
diff --git a/t/t5619-clone-local-ambiguous-transport.sh b/t/t5619-clone-local-ambiguous-transport.sh
index 7ebd31a150..cce62bf78d 100755
--- a/t/t5619-clone-local-ambiguous-transport.sh
+++ b/t/t5619-clone-local-ambiguous-transport.sh
@@ -53,11 +53,18 @@ test_expect_success 'setup' '
 	git -C "$REPO" update-server-info
 '
 
-test_expect_failure 'ambiguous transport does not lead to arbitrary file-inclusion' '
+test_expect_success 'ambiguous transport does not lead to arbitrary file-inclusion' '
 	git clone malicious clone &&
-	git -C clone submodule update --init &&
+	test_must_fail git -C clone submodule update --init 2>err &&
 
-	test_path_is_missing clone/.git/modules/sub/objects/secret
+	test_path_is_missing clone/.git/modules/sub/objects/secret &&
+	# We would actually expect "transport .file. not allowed" here,
+	# but due to quirks of the URL detection in Git, we mis-parse
+	# the absolute path as a bogus URL and die before that step.
+	#
+	# This works for now, and if we ever fix the URL detection, it
+	# is OK to change this to detect the transport error.
+	grep "protocol .* is not supported" err
 '
 
 test_done

From bffc762f87ae8d18c6001bf0044a76004245754c Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Tue, 24 Jan 2023 19:43:51 -0500
Subject: [PATCH 3/3] dir-iterator: prevent top-level symlinks without
 FOLLOW_SYMLINKS

When using the dir_iterator API, we first stat(2) the base path, and
then use that as a starting point to enumerate the directory's contents.

If the directory contains symbolic links, we will immediately die() upon
encountering them without the `FOLLOW_SYMLINKS` flag. The same is not
true when resolving the top-level directory, though.

As explained in a previous commit, this oversight in 6f054f9fb3
(builtin/clone.c: disallow `--local` clones with symlinks, 2022-07-28)
can be used as an attack vector to include arbitrary files on a victim's
filesystem from outside of the repository.

Prevent resolving top-level symlinks unless the FOLLOW_SYMLINKS flag is
given, which will cause clones of a repository with a symlink'd
"$GIT_DIR/objects" directory to fail.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 dir-iterator.c             | 13 +++++++++----
 dir-iterator.h             |  5 +++++
 t/t0066-dir-iterator.sh    | 27 ++++++++++++++++++++++++++-
 t/t5604-clone-reference.sh | 16 ++++++++++++++++
 4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/dir-iterator.c b/dir-iterator.c
index b17e9f970a..3764dd81a1 100644
--- a/dir-iterator.c
+++ b/dir-iterator.c
@@ -203,7 +203,7 @@ struct dir_iterator *dir_iterator_begin(const char *path, unsigned int flags)
 {
 	struct dir_iterator_int *iter = xcalloc(1, sizeof(*iter));
 	struct dir_iterator *dir_iterator = &iter->base;
-	int saved_errno;
+	int saved_errno, err;
 
 	strbuf_init(&iter->base.path, PATH_MAX);
 	strbuf_addstr(&iter->base.path, path);
@@ -213,10 +213,15 @@ struct dir_iterator *dir_iterator_begin(const char *path, unsigned int flags)
 	iter->flags = flags;
 
 	/*
-	 * Note: stat already checks for NULL or empty strings and
-	 * inexistent paths.
+	 * Note: stat/lstat already checks for NULL or empty strings and
+	 * nonexistent paths.
 	 */
-	if (stat(iter->base.path.buf, &iter->base.st) < 0) {
+	if (iter->flags & DIR_ITERATOR_FOLLOW_SYMLINKS)
+		err = stat(iter->base.path.buf, &iter->base.st);
+	else
+		err = lstat(iter->base.path.buf, &iter->base.st);
+
+	if (err < 0) {
 		saved_errno = errno;
 		goto error_out;
 	}
diff --git a/dir-iterator.h b/dir-iterator.h
index 08229157c6..e3b6ff2800 100644
--- a/dir-iterator.h
+++ b/dir-iterator.h
@@ -61,6 +61,11 @@
  *   not the symlinks themselves, which is the default behavior. Broken
  *   symlinks are ignored.
  *
+ *   Note: setting DIR_ITERATOR_FOLLOW_SYMLINKS affects resolving the
+ *   starting path as well (e.g., attempting to iterate starting at a
+ *   symbolic link pointing to a directory without FOLLOW_SYMLINKS will
+ *   result in an error).
+ *
  * Warning: circular symlinks are also followed when
  * DIR_ITERATOR_FOLLOW_SYMLINKS is set. The iteration may end up with
  * an ELOOP if they happen and DIR_ITERATOR_PEDANTIC is set.
diff --git a/t/t0066-dir-iterator.sh b/t/t0066-dir-iterator.sh
index 92910e4e6c..c826f60f6d 100755
--- a/t/t0066-dir-iterator.sh
+++ b/t/t0066-dir-iterator.sh
@@ -109,7 +109,9 @@ test_expect_success SYMLINKS 'setup dirs with symlinks' '
 	mkdir -p dir5/a/c &&
 	ln -s ../c dir5/a/b/d &&
 	ln -s ../ dir5/a/b/e &&
-	ln -s ../../ dir5/a/b/f
+	ln -s ../../ dir5/a/b/f &&
+
+	ln -s dir4 dir6
 '
 
 test_expect_success SYMLINKS 'dir-iterator should not follow symlinks by default' '
@@ -145,4 +147,27 @@ test_expect_success SYMLINKS 'dir-iterator should follow symlinks w/ follow flag
 	test_cmp expected-follow-sorted-output actual-follow-sorted-output
 '
 
+test_expect_success SYMLINKS 'dir-iterator does not resolve top-level symlinks' '
+	test_must_fail test-tool dir-iterator ./dir6 >out &&
+
+	grep "ENOTDIR" out
+'
+
+test_expect_success SYMLINKS 'dir-iterator resolves top-level symlinks w/ follow flag' '
+	cat >expected-follow-sorted-output <<-EOF &&
+	[d] (a) [a] ./dir6/a
+	[d] (a/f) [f] ./dir6/a/f
+	[d] (a/f/c) [c] ./dir6/a/f/c
+	[d] (b) [b] ./dir6/b
+	[d] (b/c) [c] ./dir6/b/c
+	[f] (a/d) [d] ./dir6/a/d
+	[f] (a/e) [e] ./dir6/a/e
+	EOF
+
+	test-tool dir-iterator --follow-symlinks ./dir6 >out &&
+	sort out >actual-follow-sorted-output &&
+
+	test_cmp expected-follow-sorted-output actual-follow-sorted-output
+'
+
 test_done
diff --git a/t/t5604-clone-reference.sh b/t/t5604-clone-reference.sh
index 9d32f1c4a4..4ff21d7ccf 100755
--- a/t/t5604-clone-reference.sh
+++ b/t/t5604-clone-reference.sh
@@ -341,4 +341,20 @@ test_expect_success SYMLINKS 'clone repo with symlinked or unknown files at obje
 	test_must_be_empty T--shared.objects-symlinks.raw
 '
 
+test_expect_success SYMLINKS 'clone repo with symlinked objects directory' '
+	test_when_finished "rm -fr sensitive malicious" &&
+
+	mkdir -p sensitive &&
+	echo "secret" >sensitive/file &&
+
+	git init malicious &&
+	rm -fr malicious/.git/objects &&
+	ln -s "$(pwd)/sensitive" ./malicious/.git/objects &&
+
+	test_must_fail git clone --local malicious clone 2>err &&
+
+	test_path_is_missing clone &&
+	grep "failed to start iterator over" err
+'
+
 test_done