diff options
Diffstat (limited to 'xregex.c')
-rw-r--r-- | xregex.c | 301 |
1 files changed, 301 insertions, 0 deletions
diff --git a/xregex.c b/xregex.c new file mode 100644 index 0000000..3c3cf35 --- /dev/null +++ b/xregex.c @@ -0,0 +1,301 @@ +/**************************************************************************** + * bfs * + * Copyright (C) 2022 Tavian Barnes <tavianator@tavianator.com> * + * * + * Permission to use, copy, modify, and/or distribute this software for any * + * purpose with or without fee is hereby granted. * + * * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * + ****************************************************************************/ + +#include "xregex.h" +#include "util.h" +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#if BFS_WITH_ONIGURUMA +# include <langinfo.h> +# include <oniguruma.h> +#else +# include <regex.h> +#endif + +struct bfs_regex { +#if BFS_WITH_ONIGURUMA + unsigned char *pattern; + OnigRegex impl; + int err; + OnigErrorInfo einfo; +#else + regex_t impl; + int err; +#endif +}; + +#if BFS_WITH_ONIGURUMA +/** Get (and initialize) the appropriate encoding for the current locale. */ +static int bfs_onig_encoding(OnigEncoding *penc) { + static OnigEncoding enc = NULL; + if (enc) { + *penc = enc; + return ONIG_NORMAL; + } + + // Fall back to ASCII by default + enc = ONIG_ENCODING_ASCII; + + // Oniguruma has no locale support, so try to guess the right encoding + // from the current locale. + const char *charmap = nl_langinfo(CODESET); + if (charmap) { +#define BFS_MAP_ENCODING(name, value) \ + do { \ + if (strcmp(charmap, name) == 0) { \ + enc = value; \ + } \ + } while (0) +#define BFS_MAP_ENCODING2(name1, name2, value) \ + do { \ + BFS_MAP_ENCODING(name1, value); \ + BFS_MAP_ENCODING(name2, value); \ + } while (0) + + // These names were found with locale -m on Linux and FreeBSD +#define BFS_MAP_ISO_8859(n) \ + BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n) + + BFS_MAP_ISO_8859(1); + BFS_MAP_ISO_8859(2); + BFS_MAP_ISO_8859(3); + BFS_MAP_ISO_8859(4); + BFS_MAP_ISO_8859(5); + BFS_MAP_ISO_8859(6); + BFS_MAP_ISO_8859(7); + BFS_MAP_ISO_8859(8); + BFS_MAP_ISO_8859(9); + BFS_MAP_ISO_8859(10); + BFS_MAP_ISO_8859(11); + // BFS_MAP_ISO_8859(12); + BFS_MAP_ISO_8859(13); + BFS_MAP_ISO_8859(14); + BFS_MAP_ISO_8859(15); + BFS_MAP_ISO_8859(16); + + BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8); + +#define BFS_MAP_EUC(name) \ + BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name) + + BFS_MAP_EUC(JP); + BFS_MAP_EUC(TW); + BFS_MAP_EUC(KR); + BFS_MAP_EUC(CN); + + BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS); + + // BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8); + BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R); + + BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251); + + BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_BIG5); + } + + int ret = onig_initialize(&enc, 1); + if (ret != ONIG_NORMAL) { + enc = NULL; + } + *penc = enc; + return ret; +} +#endif + +int bfs_regcomp(struct bfs_regex **preg, const char *pattern, enum bfs_regex_type type, enum bfs_regcomp_flags flags) { + struct bfs_regex *regex = *preg = malloc(sizeof(*regex)); + if (!regex) { + return -1; + } + +#if BFS_WITH_ONIGURUMA + // onig_error_code_to_str() says + // + // don't call this after the pattern argument of onig_new() is freed + // + // so make a defensive copy. + regex->pattern = (unsigned char *)strdup(pattern); + if (!regex->pattern) { + goto fail; + } + + regex->impl = NULL; + regex->err = ONIG_NORMAL; + + OnigSyntaxType *syntax = NULL; + switch (type) { + case BFS_REGEX_POSIX_BASIC: + syntax = ONIG_SYNTAX_POSIX_BASIC; + break; + case BFS_REGEX_POSIX_EXTENDED: + syntax = ONIG_SYNTAX_POSIX_EXTENDED; + break; + case BFS_REGEX_EMACS: + syntax = ONIG_SYNTAX_EMACS; + break; + case BFS_REGEX_GREP: + syntax = ONIG_SYNTAX_GREP; + break; + } + assert(syntax); + + OnigOptionType options = syntax->options; + if (flags & BFS_REGEX_ICASE) { + options |= ONIG_OPTION_IGNORECASE; + } + + OnigEncoding enc; + regex->err = bfs_onig_encoding(&enc); + if (regex->err != ONIG_NORMAL) { + return -1; + } + + const unsigned char *end = regex->pattern + strlen(pattern); + regex->err = onig_new(®ex->impl, regex->pattern, end, options, enc, syntax, ®ex->einfo); + if (regex->err != ONIG_NORMAL) { + return -1; + } +#else + int cflags = 0; + switch (type) { + case BFS_REGEX_POSIX_BASIC: + break; + case BFS_REGEX_POSIX_EXTENDED: + cflags |= REG_EXTENDED; + break; + default: + errno = EINVAL; + goto fail; + } + + if (flags & BFS_REGEX_ICASE) { + cflags |= REG_ICASE; + } + +#if BFS_HAS_FEATURE(memory_sanitizer, false) + // https://github.com/google/sanitizers/issues/1496 + memset(®ex->impl, 0, sizeof(regex->impl)); +#endif + + regex->err = regcomp(®ex->impl, pattern, cflags); + if (regex->err != 0) { + return -1; + } +#endif + + return 0; + +fail: + free(regex); + *preg = NULL; + return -1; +} + +int bfs_regexec(struct bfs_regex *regex, const char *str, enum bfs_regexec_flags flags) { + size_t len = strlen(str); + +#if BFS_WITH_ONIGURUMA + const unsigned char *ustr = (const unsigned char *)str; + const unsigned char *end = ustr + len; + + // The docs for onig_{match,search}() say + // + // Do not pass invalid byte string in the regex character encoding. + if (!onigenc_is_valid_mbc_string(onig_get_encoding(regex->impl), ustr, end)) { + return 0; + } + + int ret; + if (flags & BFS_REGEX_ANCHOR) { + ret = onig_match(regex->impl, ustr, end, ustr, NULL, ONIG_OPTION_DEFAULT); + } else { + ret = onig_search(regex->impl, ustr, end, ustr, end, NULL, ONIG_OPTION_DEFAULT); + } + + if (ret >= 0) { + if (flags & BFS_REGEX_ANCHOR) { + return (size_t)ret == len; + } else { + return 1; + } + } else if (ret == ONIG_MISMATCH) { + return 0; + } else { + regex->err = ret; + return -1; + } +#else + regmatch_t match = { + .rm_so = 0, + .rm_eo = len, + }; + + int eflags = 0; +#ifdef REG_STARTEND + eflags |= REG_STARTEND; +#endif + + int ret = regexec(®ex->impl, str, 1, &match, eflags); + if (ret == 0) { + if (flags & BFS_REGEX_ANCHOR) { + return match.rm_so == 0 && (size_t)match.rm_eo == len; + } else { + return 1; + } + } else if (ret == REG_NOMATCH) { + return 0; + } else { + regex->err = ret; + return -1; + } +#endif +} + +void bfs_regfree(struct bfs_regex *regex) { + if (regex) { +#if BFS_WITH_ONIGURUMA + onig_free(regex->impl); + free(regex->pattern); +#else + regfree(®ex->impl); +#endif + free(regex); + } +} + +char *bfs_regerror(const struct bfs_regex *regex) { + if (!regex) { + return strdup(strerror(ENOMEM)); + } + +#if BFS_WITH_ONIGURUMA + unsigned char *str = malloc(ONIG_MAX_ERROR_MESSAGE_LEN); + if (str) { + onig_error_code_to_str(str, regex->err, ®ex->einfo); + } + return (char *)str; +#else + size_t len = regerror(regex->err, ®ex->impl, NULL, 0); + char *str = malloc(len); + if (str) { + regerror(regex->err, ®ex->impl, str, len); + } + return str; +#endif +} |