diff options
author | Tavian Barnes <tavianator@tavianator.com> | 2022-02-24 14:06:05 -0500 |
---|---|---|
committer | Tavian Barnes <tavianator@tavianator.com> | 2022-02-24 14:28:17 -0500 |
commit | b490dc534eedcc9878d4962e6d02baf4217a712f (patch) | |
tree | 708d96ae84732441bfa4b2e40d48e141e691f8a3 | |
parent | 0633154419955e35470b38aafd49b6f52510333a (diff) | |
download | bfs-b490dc534eedcc9878d4962e6d02baf4217a712f.tar.xz |
regex: Use the encoding from the current locale
-rw-r--r-- | regex.c | 91 | ||||
-rwxr-xr-x | tests.sh | 12 | ||||
-rw-r--r-- | tests/test_regex_invalid_utf8.out | 1 |
3 files changed, 94 insertions, 10 deletions
@@ -21,6 +21,7 @@ #include <string.h> #if BFS_WITH_ONIGURUMA +# include <langinfo.h> # include <oniguruma.h> #else # include <regex.h> @@ -34,19 +35,84 @@ struct bfs_regex { #endif }; -struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) { #if BFS_WITH_ONIGURUMA - static bool onig_initialized = false; - if (!onig_initialized) { - OnigEncoding encs[] = {ONIG_ENCODING_UTF8}; - *err = onig_initialize(encs, sizeof(encs)/sizeof(encs[0])); - if (*err != ONIG_NORMAL) { - return NULL; - } - onig_initialized = true; +/** Get (and initialize) the appropriate encoding for the current locale. */ +static OnigEncoding bfs_onig_encoding(int *err) { + static OnigEncoding enc = NULL; + if (enc) { + return enc; } + + // Fall back to ASCII by default + enc = ONIG_ENCODING_ASCII; + + // Oniguruma has no locale support, so try to guess the right encoding + // from the current locale. + const char *charmap = nl_langinfo(CODESET); + if (charmap) { +#define BFS_MAP_ENCODING(name, value) \ + do { \ + if (strcmp(charmap, name) == 0) { \ + enc = value; \ + } \ + } while (0) +#define BFS_MAP_ENCODING2(name1, name2, value) \ + do { \ + BFS_MAP_ENCODING(name1, value); \ + BFS_MAP_ENCODING(name2, value); \ + } while (0) + + // These names were found with locale -m on Linux and FreeBSD +#define BFS_MAP_ISO_8859(n) \ + BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n) + + BFS_MAP_ISO_8859(1); + BFS_MAP_ISO_8859(2); + BFS_MAP_ISO_8859(3); + BFS_MAP_ISO_8859(4); + BFS_MAP_ISO_8859(5); + BFS_MAP_ISO_8859(6); + BFS_MAP_ISO_8859(7); + BFS_MAP_ISO_8859(8); + BFS_MAP_ISO_8859(9); + BFS_MAP_ISO_8859(10); + BFS_MAP_ISO_8859(11); + // BFS_MAP_ISO_8859(12); + BFS_MAP_ISO_8859(13); + BFS_MAP_ISO_8859(14); + BFS_MAP_ISO_8859(15); + BFS_MAP_ISO_8859(16); + + BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8); + +#define BFS_MAP_EUC(name) \ + BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name) + + 
BFS_MAP_EUC(JP); + BFS_MAP_EUC(TW); + BFS_MAP_EUC(KR); + BFS_MAP_EUC(CN); + + BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS); + + // BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8); + BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R); + + BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251); + + BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_BIG5); + } + + *err = onig_initialize(&enc, 1); + if (*err != ONIG_NORMAL) { + enc = NULL; + } + + return enc; +} #endif +struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum bfs_regcomp_flags flags, int *err) { struct bfs_regex *regex = malloc(sizeof(*regex)); if (!regex) { #if BFS_WITH_ONIGURUMA @@ -80,9 +146,14 @@ struct bfs_regex *bfs_regcomp(const char *expr, enum bfs_regex_type type, enum b options |= ONIG_OPTION_IGNORECASE; } + OnigEncoding enc = bfs_onig_encoding(err); + if (!enc) { + goto fail; + } + const unsigned char *uexpr = (const unsigned char *)expr; const unsigned char *end = uexpr + strlen(expr); - *err = onig_new(&regex->impl, uexpr, end, options, ONIG_ENCODING_UTF8, syntax, NULL); + *err = onig_new(&regex->impl, uexpr, end, options, enc, syntax, NULL); if (*err != ONIG_NORMAL) { goto fail; } @@ -625,6 +625,7 @@ gnu_tests=( test_regex test_regex_parens test_regex_error + test_regex_invalid_utf8 test_regextype_posix_basic test_regextype_posix_extended @@ -2146,6 +2147,17 @@ function test_regex_error() { fail quiet invoke_bfs basic -regex '[' } +function test_regex_invalid_utf8() { + rm -rf scratch/* + + # Incomplete UTF-8 sequences + skip_if fail quiet touch scratch/$'\xC3' + skip_if fail quiet touch scratch/$'\xE2\x84' + skip_if fail quiet touch scratch/$'\xF0\x9F\x92' + + bfs_diff scratch -regex 'scratch/..' +} + function test_E() { cd weirdnames bfs_diff -E . -regex '\./(\()' diff --git a/tests/test_regex_invalid_utf8.out b/tests/test_regex_invalid_utf8.out new file mode 100644 index 0000000..03f3f58 --- /dev/null +++ b/tests/test_regex_invalid_utf8.out @@ -0,0 +1 @@ +scratch/â„ |