gitweb: add "e-mail privacy" feature to redact e-mail addresses

Gitweb extracts content from the Git log and makes it accessible
over HTTP. As a result, e-mail addresses found in commits are
exposed to web crawlers and they may not respect robots.txt.
This can result in unsolicited messages.

Introduce an 'email-privacy' feature which redacts e-mail addresses
from the generated HTML content. Specifically, obscure addresses
retrieved from the author/committer and comment sections of the
Git log. The feature is off by default.

This feature does not prevent someone from downloading the
unredacted commit log, e.g., by cloning the repository, and
extracting information from it. It aims to hinder the low-
effort, bulk collection of e-mail addresses by web crawlers.

Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>
Acked-by: Eric Wong <e@80x24.org>
Acked-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Georgios Kontaxis 2021-03-28 23:26:03 +00:00 committed by Junio C Hamano
parent 84d06cdc06
commit 0996dd3d6d
2 changed files with 38 additions and 7 deletions

View File

@ -751,6 +751,17 @@ default font sizes or lineheights are changed (e.g. via adding extra
CSS stylesheet in `@stylesheets`), it may be appropriate to change CSS stylesheet in `@stylesheets`), it may be appropriate to change
these values. these values.
email-privacy::
Redact e-mail addresses from the generated content (HTML, etc.).
This obscures e-mail addresses retrieved from the author/committer
and comment sections of the Git log.
It is meant to hinder web crawlers that harvest and abuse addresses.
Such crawlers may not respect robots.txt.
Note that users and user tools also see the addresses as redacted.
If Gitweb is not the final step in a workflow then subsequent steps
may misbehave because of the redacted information they receive.
Disabled by default.
highlight:: highlight::
Server-side syntax highlight support in "blob" view. It requires Server-side syntax highlight support in "blob" view. It requires
`$highlight_bin` program to be available (see the description of `$highlight_bin` program to be available (see the description of

View File

@ -569,6 +569,15 @@ our %feature = (
'sub' => \&feature_extra_branch_refs, 'sub' => \&feature_extra_branch_refs,
'override' => 0, 'override' => 0,
'default' => []}, 'default' => []},
# Redact e-mail addresses.
# To enable system wide have in $GITWEB_CONFIG
# $feature{'email-privacy'}{'default'} = [1];
'email-privacy' => {
'sub' => sub { feature_bool('email-privacy', @_) },
'override' => 1,
'default' => [0]},
); );
sub gitweb_get_feature { sub gitweb_get_feature {
@ -3449,6 +3458,13 @@ sub parse_date {
return %date; return %date;
} }
# Redact e-mail addresses in a line of text when the 'email-privacy'
# feature is enabled.
#
# Takes a single string and returns a copy of it.  When
# gitweb_check_feature('email-privacy') is false the string is returned
# unchanged; otherwise every '<local@domain>' span is replaced with the
# literal '<redacted>'.
sub hide_mailaddrs_if_private {
	my $line = shift;
	return $line unless gitweb_check_feature('email-privacy');
	# The local part must not contain '<' either: otherwise a stray '<'
	# earlier on the line (e.g. "a<b ... <user@host>") would start the
	# match and unrelated text up to the address would be swallowed.
	$line =~ s/<[^@<>]+@[^>]+>/<redacted>/g;
	return $line;
}
sub parse_tag { sub parse_tag {
my $tag_id = shift; my $tag_id = shift;
my %tag; my %tag;
@ -3465,7 +3481,7 @@ sub parse_tag {
} elsif ($line =~ m/^tag (.+)$/) { } elsif ($line =~ m/^tag (.+)$/) {
$tag{'name'} = $1; $tag{'name'} = $1;
} elsif ($line =~ m/^tagger (.*) ([0-9]+) (.*)$/) { } elsif ($line =~ m/^tagger (.*) ([0-9]+) (.*)$/) {
$tag{'author'} = $1; $tag{'author'} = hide_mailaddrs_if_private($1);
$tag{'author_epoch'} = $2; $tag{'author_epoch'} = $2;
$tag{'author_tz'} = $3; $tag{'author_tz'} = $3;
if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) { if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) {
@ -3513,7 +3529,7 @@ sub parse_commit_text {
} elsif ((!defined $withparents) && ($line =~ m/^parent ($oid_regex)$/)) { } elsif ((!defined $withparents) && ($line =~ m/^parent ($oid_regex)$/)) {
push @parents, $1; push @parents, $1;
} elsif ($line =~ m/^author (.*) ([0-9]+) (.*)$/) { } elsif ($line =~ m/^author (.*) ([0-9]+) (.*)$/) {
$co{'author'} = to_utf8($1); $co{'author'} = hide_mailaddrs_if_private(to_utf8($1));
$co{'author_epoch'} = $2; $co{'author_epoch'} = $2;
$co{'author_tz'} = $3; $co{'author_tz'} = $3;
if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) { if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) {
@ -3523,7 +3539,7 @@ sub parse_commit_text {
$co{'author_name'} = $co{'author'}; $co{'author_name'} = $co{'author'};
} }
} elsif ($line =~ m/^committer (.*) ([0-9]+) (.*)$/) { } elsif ($line =~ m/^committer (.*) ([0-9]+) (.*)$/) {
$co{'committer'} = to_utf8($1); $co{'committer'} = hide_mailaddrs_if_private(to_utf8($1));
$co{'committer_epoch'} = $2; $co{'committer_epoch'} = $2;
$co{'committer_tz'} = $3; $co{'committer_tz'} = $3;
if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) { if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) {
@ -3568,9 +3584,10 @@ sub parse_commit_text {
if (! defined $co{'title'} || $co{'title'} eq "") { if (! defined $co{'title'} || $co{'title'} eq "") {
$co{'title'} = $co{'title_short'} = '(no commit message)'; $co{'title'} = $co{'title_short'} = '(no commit message)';
} }
# remove added spaces # remove added spaces, redact e-mail addresses if applicable.
foreach my $line (@commit_lines) { foreach my $line (@commit_lines) {
$line =~ s/^ //; $line =~ s/^ //;
$line = hide_mailaddrs_if_private($line);
} }
$co{'comment'} = \@commit_lines; $co{'comment'} = \@commit_lines;
@ -7489,7 +7506,8 @@ sub git_log_generic {
-accesskey => "n", -title => "Alt-n"}, "next"); -accesskey => "n", -title => "Alt-n"}, "next");
} }
my $patch_max = gitweb_get_feature('patches'); my $patch_max = gitweb_get_feature('patches');
if ($patch_max && !defined $file_name) { if ($patch_max && !defined $file_name &&
!gitweb_check_feature('email-privacy')) {
if ($patch_max < 0 || @commitlist <= $patch_max) { if ($patch_max < 0 || @commitlist <= $patch_max) {
$paging_nav .= " &sdot; " . $paging_nav .= " &sdot; " .
$cgi->a({-href => href(action=>"patches", -replay=>1)}, $cgi->a({-href => href(action=>"patches", -replay=>1)},
@ -7550,7 +7568,8 @@ sub git_commit {
} @$parents ) . } @$parents ) .
')'; ')';
} }
if (gitweb_check_feature('patches') && @$parents <= 1) { if (gitweb_check_feature('patches') && @$parents <= 1 &&
!gitweb_check_feature('email-privacy')) {
$formats_nav .= " | " . $formats_nav .= " | " .
$cgi->a({-href => href(action=>"patch", -replay=>1)}, $cgi->a({-href => href(action=>"patch", -replay=>1)},
"patch"); "patch");
@ -7863,7 +7882,8 @@ sub git_commitdiff {
$formats_nav = $formats_nav =
$cgi->a({-href => href(action=>"commitdiff_plain", -replay=>1)}, $cgi->a({-href => href(action=>"commitdiff_plain", -replay=>1)},
"raw"); "raw");
if ($patch_max && @{$co{'parents'}} <= 1) { if ($patch_max && @{$co{'parents'}} <= 1 &&
!gitweb_check_feature('email-privacy')) {
$formats_nav .= " | " . $formats_nav .= " | " .
$cgi->a({-href => href(action=>"patch", -replay=>1)}, $cgi->a({-href => href(action=>"patch", -replay=>1)},
"patch"); "patch");