From b490dc534eedcc9878d4962e6d02baf4217a712f Mon Sep 17 00:00:00 2001
From: Tavian Barnes
Date: Thu, 24 Feb 2022 14:06:05 -0500
Subject: regex: Use the encoding from the current locale

---
Review notes (everything between the "---" line and the first "diff --git"
line is ignored by git-am):

- GB18030 now maps to ONIG_ENCODING_GB18030.  It previously mapped to
  ONIG_ENCODING_BIG5, which is a different encoding.
- bfs_onig_encoding() gained a full doc comment, so the hunk line counts and
  the diffstat below were recomputed.  The "index" blob hashes are still the
  ones from the original revision of this patch.
- The header names on the #include lines and the '&' in "&regex->impl" were
  lost when this patch was extracted and have been restored.  The
  "#include <string.h>" context line is presumed -- confirm against regex.c.
- Indentation was also lost in extraction and is reconstructed here (tabs in
  regex.c, four spaces in tests.sh) -- confirm against the tree before
  applying.

 regex.c                           | 102 +++++++++++++++++++++++++++++++++-----
 tests.sh                          |  12 +++++
 tests/test_regex_invalid_utf8.out |   1 +
 3 files changed, 105 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_regex_invalid_utf8.out

diff --git a/regex.c b/regex.c
index 642e357..0852792 100644
--- a/regex.c
+++ b/regex.c
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #if BFS_WITH_ONIGURUMA
+#	include <langinfo.h>
 #	include <oniguruma.h>
 #else
 #	include <regex.h>
@@ -34,19 +35,95 @@ struct bfs_regex {
 #endif
 };
 
-struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
 #if BFS_WITH_ONIGURUMA
-	static bool onig_initialized = false;
-	if (!onig_initialized) {
-		OnigEncoding encs[] = {ONIG_ENCODING_UTF8};
-		*err = onig_initialize(encs, sizeof(encs)/sizeof(encs[0]));
-		if (*err != ONIG_NORMAL) {
-			return NULL;
-		}
-		onig_initialized = true;
+/**
+ * Get (and initialize) the Oniguruma encoding for the current locale.
+ *
+ * The encoding is chosen by comparing nl_langinfo(CODESET) against a table of
+ * known charmap names, falling back to ASCII when nothing matches.  A
+ * successfully initialized encoding is cached and returned by later calls.
+ *
+ * @param err
+ *         Receives the result of onig_initialize() whenever it is called.
+ * @return
+ *         The initialized encoding, or NULL if onig_initialize() failed.
+ */
+static OnigEncoding bfs_onig_encoding(int *err) {
+	static OnigEncoding enc = NULL;
+	if (enc) {
+		return enc;
 	}
+
+	// Fall back to ASCII by default
+	enc = ONIG_ENCODING_ASCII;
+
+	// Oniguruma has no locale support, so try to guess the right encoding
+	// from the current locale.
+	const char *charmap = nl_langinfo(CODESET);
+	if (charmap) {
+#define BFS_MAP_ENCODING(name, value) \
+		do { \
+			if (strcmp(charmap, name) == 0) { \
+				enc = value; \
+			} \
+		} while (0)
+#define BFS_MAP_ENCODING2(name1, name2, value) \
+		do { \
+			BFS_MAP_ENCODING(name1, value); \
+			BFS_MAP_ENCODING(name2, value); \
+		} while (0)
+
+		// These names were found with locale -m on Linux and FreeBSD
+#define BFS_MAP_ISO_8859(n) \
+		BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n)
+
+		BFS_MAP_ISO_8859(1);
+		BFS_MAP_ISO_8859(2);
+		BFS_MAP_ISO_8859(3);
+		BFS_MAP_ISO_8859(4);
+		BFS_MAP_ISO_8859(5);
+		BFS_MAP_ISO_8859(6);
+		BFS_MAP_ISO_8859(7);
+		BFS_MAP_ISO_8859(8);
+		BFS_MAP_ISO_8859(9);
+		BFS_MAP_ISO_8859(10);
+		BFS_MAP_ISO_8859(11);
+		// BFS_MAP_ISO_8859(12);
+		BFS_MAP_ISO_8859(13);
+		BFS_MAP_ISO_8859(14);
+		BFS_MAP_ISO_8859(15);
+		BFS_MAP_ISO_8859(16);
+
+		BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8);
+
+#define BFS_MAP_EUC(name) \
+		BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name)
+
+		BFS_MAP_EUC(JP);
+		BFS_MAP_EUC(TW);
+		BFS_MAP_EUC(KR);
+		BFS_MAP_EUC(CN);
+
+		BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS);
+
+		// BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8);
+		BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R);
+
+		BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251);
+
+		BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_GB18030);
+	}
+
+	*err = onig_initialize(&enc, 1);
+	if (*err != ONIG_NORMAL) {
+		enc = NULL;
+	}
+
+	return enc;
+}
 #endif
 
+struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) {
 	struct bfs_regex *regex = malloc(sizeof(*regex));
 	if (!regex) {
 #if BFS_WITH_ONIGURUMA
@@ -80,9 +157,14 @@ struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum b
 		options |= ONIG_OPTION_IGNORECASE;
 	}
 
+	OnigEncoding enc = bfs_onig_encoding(err);
+	if (!enc) {
+		goto fail;
+	}
+
 	const unsigned char *uexpr = (const unsigned char *)expr;
 	const unsigned char *end = uexpr + strlen(expr);
-	*err = onig_new(&regex->impl, uexpr, end, options, ONIG_ENCODING_UTF8, syntax, NULL);
+	*err = onig_new(&regex->impl, uexpr, end, options, enc, syntax, NULL);
 	if (*err != ONIG_NORMAL) {
 		goto fail;
 	}
diff --git a/tests.sh b/tests.sh
index d7d9947..9003efd 100755
--- a/tests.sh
+++ b/tests.sh
@@ -625,6 +625,7 @@ gnu_tests=(
     test_regex
     test_regex_parens
     test_regex_error
+    test_regex_invalid_utf8
 
     test_regextype_posix_basic
     test_regextype_posix_extended
@@ -2146,6 +2147,17 @@ function test_regex_error() {
     fail quiet invoke_bfs basic -regex '['
 }
 
+function test_regex_invalid_utf8() {
+    rm -rf scratch/*
+
+    # Incomplete UTF-8 sequences
+    skip_if fail quiet touch scratch/$'\xC3'
+    skip_if fail quiet touch scratch/$'\xE2\x84'
+    skip_if fail quiet touch scratch/$'\xF0\x9F\x92'
+
+    bfs_diff scratch -regex 'scratch/..'
+}
+
 function test_E() {
     cd weirdnames
     bfs_diff -E . -regex '\./(\()'
diff --git a/tests/test_regex_invalid_utf8.out b/tests/test_regex_invalid_utf8.out
new file mode 100644
index 0000000..03f3f58
--- /dev/null
+++ b/tests/test_regex_invalid_utf8.out
@@ -0,0 +1 @@
+scratch/â„
-- 
cgit v1.2.3