2016-06-25 07:22:30 +02:00
|
|
|
#!/bin/sh
|
|
|
|
|
|
|
|
test_description='grep icase on non-English locales'
|
|
|
|
|
|
|
|
. ./lib-gettext.sh
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE 'setup' '
|
|
|
|
test_write_lines "TILRAUN: Halló Heimur!" >file &&
|
|
|
|
git add file &&
|
|
|
|
LC_ALL="$is_IS_locale" &&
|
|
|
|
export LC_ALL
|
|
|
|
'
|
|
|
|
|
|
|
|
test_have_prereq GETTEXT_LOCALE &&
|
2018-03-24 08:44:53 +01:00
|
|
|
test-tool regex "HALLÓ" "Halló" ICASE &&
|
2016-06-25 07:22:30 +02:00
|
|
|
test_set_prereq REGEX_LOCALE
|
|
|
|
|
|
|
|
test_expect_success REGEX_LOCALE 'grep literal string, no -F' '
|
|
|
|
git grep -i "TILRAUN: Halló Heimur!" &&
|
|
|
|
git grep -i "TILRAUN: HALLÓ HEIMUR!"
|
|
|
|
'
|
|
|
|
|
2017-05-20 23:42:06 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'grep pcre utf-8 icase' '
|
2016-06-25 07:22:35 +02:00
|
|
|
git grep --perl-regexp "TILRAUN: H.lló Heimur!" &&
|
|
|
|
git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" &&
|
|
|
|
git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!"
|
|
|
|
'
|
|
|
|
|
2017-05-20 23:42:06 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'grep pcre utf-8 string with "+"' '
|
2016-06-25 07:22:35 +02:00
|
|
|
test_write_lines "TILRAUN: Hallóó Heimur!" >file2 &&
|
|
|
|
git add file2 &&
|
|
|
|
git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual &&
|
|
|
|
echo file >expected &&
|
|
|
|
echo file2 >>expected &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
2016-06-25 07:22:31 +02:00
|
|
|
test_expect_success REGEX_LOCALE 'grep literal string, with -F' '
|
2017-05-20 23:42:11 +02:00
|
|
|
git grep -i -F "TILRAUN: Halló Heimur!" &&
|
|
|
|
git grep -i -F "TILRAUN: HALLÓ HEIMUR!"
|
2016-06-25 07:22:31 +02:00
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success REGEX_LOCALE 'grep string with regex, with -F' '
|
2017-05-20 23:42:11 +02:00
|
|
|
test_write_lines "TILRAUN: Halló Heimur [abc]!" >file3 &&
|
|
|
|
git add file3 &&
|
|
|
|
git grep -i -F "TILRAUN: Halló Heimur [abc]!" file3
|
2016-06-25 07:22:31 +02:00
|
|
|
'
|
|
|
|
|
2016-06-25 07:22:37 +02:00
|
|
|
test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
|
|
|
|
git commit -m first &&
|
|
|
|
git log --format=%f -i -S"TILRAUN: HALLÓ HEIMUR!" >actual &&
|
|
|
|
echo first >expected &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
2021-10-15 18:13:56 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
|
|
|
|
cat >expected <<-\EOF &&
|
|
|
|
Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
|
|
|
|
EOF
|
|
|
|
test_write_lines "forth" >file4 &&
|
|
|
|
git add file4 &&
|
|
|
|
git commit --author="À Ú Thor <author@example.com>" -m sécond &&
|
|
|
|
git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
|
|
|
|
grep Author log >actual.raw &&
|
|
|
|
test_decode_color <actual.raw >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
|
|
|
|
cat >expected <<-\EOF &&
|
|
|
|
Commit: Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
|
|
|
|
EOF
|
|
|
|
test_write_lines "fifth" >file5 &&
|
|
|
|
git add file5 &&
|
|
|
|
GIT_COMMITTER_NAME="Ç O Mîtter" &&
|
|
|
|
GIT_COMMITTER_EMAIL="committer@example.com" &&
|
|
|
|
git -c i18n.commitEncoding=latin1 commit -m thïrd &&
|
|
|
|
git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
|
|
|
|
grep Commit: log >actual.raw &&
|
|
|
|
test_decode_color <actual.raw >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
|
|
|
|
cat >expected <<-\EOF &&
|
|
|
|
sé<BOLD;RED>con<RESET>d
|
|
|
|
EOF
|
|
|
|
git log -1 --color=always --perl-regexp --grep="con" >log &&
|
|
|
|
grep con log >actual.raw &&
|
|
|
|
test_decode_color <actual.raw >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
|
|
|
|
cat >expected <<-\EOF &&
|
|
|
|
<BOLD;RED>thïrd<RESET>
|
|
|
|
EOF
|
|
|
|
git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
|
|
|
|
grep "th.*rd" log >actual.raw &&
|
|
|
|
test_decode_color <actual.raw >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
|
|
|
|
printf "\\200\\n" >invalid-0x80 &&
|
|
|
|
echo "ævar" >expected &&
|
|
|
|
cat expected >>invalid-0x80 &&
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
git add invalid-0x80 &&
|
|
|
|
|
|
|
|
# Test for PCRE2_MATCH_INVALID_UTF bug
|
|
|
|
# https://bugs.exim.org/show_bug.cgi?id=2642
|
|
|
|
printf "\\345Aæ\\n" >invalid-0xe5 &&
|
|
|
|
git add invalid-0xe5
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UTF-8 data' '
|
|
|
|
git grep -h "var" invalid-0x80 >actual &&
|
|
|
|
test_cmp expected actual &&
|
|
|
|
git grep -h "(*NO_JIT)var" invalid-0x80 >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep ASCII from invalid UTF-8 data (PCRE2 bug #2642)' '
|
|
|
|
git grep -h "Aæ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual &&
|
|
|
|
git grep -h "(*NO_JIT)Aæ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual
|
|
|
|
'
|
|
|
|
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data' '
|
2019-07-26 17:08:17 +02:00
|
|
|
git grep -h "æ" invalid-0x80 >actual &&
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
test_cmp expected actual &&
|
2019-11-26 22:50:51 +01:00
|
|
|
git grep -h "(*NO_JIT)æ" invalid-0x80 >actual &&
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data (PCRE2 bug #2642)' '
|
|
|
|
git grep -h "Aæ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual &&
|
|
|
|
git grep -h "(*NO_JIT)Aæ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual
|
|
|
|
'
|
|
|
|
|
|
|
|
test_lazy_prereq PCRE2_MATCH_INVALID_UTF '
|
|
|
|
test-tool pcre2-config has-PCRE2_MATCH_INVALID_UTF
|
|
|
|
'
|
|
|
|
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
|
|
|
|
test_might_fail git grep -hi "Æ" invalid-0x80 >actual &&
|
2021-01-24 18:28:12 +01:00
|
|
|
test_might_fail git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual
|
grep: stess test PCRE v2 on invalid UTF-8 data
Since my b65abcafc7 ("grep: use PCRE v2 for optimized fixed-string
search", 2019-07-01) we've been dying on invalid UTF-8 data when
grepping for fixed strings if the following are all true:
* The subject string is non-ASCII (e.g. "ævar")
* We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
* We compiled with PCRE v2
* That PCRE v2 did not have JIT support
The last of those is why this wasn't caught earlier, per pcre2jit(3):
"unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
for validity. In the interests of speed, these checks do not
happen on the JIT fast path, and if invalid data is passed, the
result is undefined."
I.e. the subject being matched against our pattern was invalid, but we
were lucky and getting away with it on the JIT path, but the non-JIT
one is stricter.
This patch does nothing to fix that, instead we sneak in support for
fixed patterns starting with "(*NO_JIT)", this disables the PCRE v2
jit with implicit fixed-string matching for testing, see
pcre2syntax(3) the syntax.
This is technically a change in behavior, but it's so obscure that I
figured it was OK. We'd previously consider this an invalid regular
expression as regcomp() would die on it, now we feed it to the PCRE v2
fixed-string path. I thought this was better than introducing yet
another GIT_TEST_* environment variable.
We're also relying on a behavior of PCRE v2 that technically could
change, but I think the test coverage is worth dipping our toe into
some somewhat undefined behavior.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-07-26 17:08:16 +02:00
|
|
|
'
|
|
|
|
|
grep/pcre2: better support invalid UTF-8 haystacks
Improve the support for invalid UTF-8 haystacks given a non-ASCII
needle when using the PCREv2 backend.
This is a more complete fix for a bug I started to fix in
870eea8166 (grep: do not enter PCRE2_UTF mode on fixed matching,
2019-07-26), now that PCREv2 has the PCRE2_MATCH_INVALID_UTF mode we
can make use of it.
This fixes the sort of case described in 8a5999838e (grep: stess test
PCRE v2 on invalid UTF-8 data, 2019-07-26), i.e.:
- The subject string is non-ASCII (e.g. "ævar")
- We're under a is_utf8_locale(), e.g. "en_US.UTF-8", not "C"
- We are using --ignore-case, or we're a non-fixed pattern
If those conditions were satisfied and we matched found non-valid
UTF-8 data PCREv2 might bark on it, in practice this only happened
under the JIT backend (turned on by default on most platforms).
Ultimately this fixes a "regression" in b65abcafc7 ("grep: use PCRE v2
for optimized fixed-string search", 2019-07-01), I'm putting that in
scare-quotes because before then we wouldn't properly support these
complex case-folding, locale etc. cases either, it just broke in
different ways.
There was a bug related to this the PCRE2_NO_START_OPTIMIZE flag fixed
in PCREv2 10.36. It can be worked around by setting the
PCRE2_NO_START_OPTIMIZE flag. Let's do that in those cases, and add
tests for the bug.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-24 18:28:13 +01:00
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i' '
|
|
|
|
git grep -hi "Æ" invalid-0x80 >actual &&
|
|
|
|
test_cmp expected actual &&
|
|
|
|
git grep -hi "(*NO_JIT)Æ" invalid-0x80 >actual &&
|
|
|
|
test_cmp expected actual
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success GETTEXT_LOCALE,LIBPCRE2,PCRE2_MATCH_INVALID_UTF 'PCRE v2: grep non-ASCII from invalid UTF-8 data with -i (PCRE2 bug #2642)' '
|
|
|
|
git grep -hi "Æ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual &&
|
|
|
|
git grep -hi "(*NO_JIT)Æ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual &&
|
|
|
|
|
|
|
|
# Only the case of grepping the ASCII part in a way that
|
|
|
|
# relies on -i fails
|
|
|
|
git grep -hi "aÆ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual &&
|
|
|
|
git grep -hi "(*NO_JIT)aÆ" invalid-0xe5 >actual &&
|
|
|
|
test_cmp invalid-0xe5 actual
|
|
|
|
'
|
|
|
|
|
2016-06-25 07:22:30 +02:00
|
|
|
test_done
|