summaryrefslogtreecommitdiffstats
path: root/xregex.c
diff options
context:
space:
mode:
Diffstat (limited to 'xregex.c')
-rw-r--r--xregex.c301
1 files changed, 301 insertions, 0 deletions
diff --git a/xregex.c b/xregex.c
new file mode 100644
index 0000000..3c3cf35
--- /dev/null
+++ b/xregex.c
@@ -0,0 +1,301 @@
+/****************************************************************************
+ * bfs *
+ * Copyright (C) 2022 Tavian Barnes <tavianator@tavianator.com> *
+ * *
+ * Permission to use, copy, modify, and/or distribute this software for any *
+ * purpose with or without fee is hereby granted. *
+ * *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES *
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF *
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR *
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES *
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN *
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF *
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *
+ ****************************************************************************/
+
+#include "xregex.h"
+#include "util.h"
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if BFS_WITH_ONIGURUMA
+# include <langinfo.h>
+# include <oniguruma.h>
+#else
+# include <regex.h>
+#endif
+
+struct bfs_regex {
+#if BFS_WITH_ONIGURUMA
+ unsigned char *pattern;
+ OnigRegex impl;
+ int err;
+ OnigErrorInfo einfo;
+#else
+ regex_t impl;
+ int err;
+#endif
+};
+
+#if BFS_WITH_ONIGURUMA
+/** Get (and initialize) the appropriate encoding for the current locale. */
+static int bfs_onig_encoding(OnigEncoding *penc) {
+ static OnigEncoding enc = NULL;
+ if (enc) {
+ *penc = enc;
+ return ONIG_NORMAL;
+ }
+
+ // Fall back to ASCII by default
+ enc = ONIG_ENCODING_ASCII;
+
+ // Oniguruma has no locale support, so try to guess the right encoding
+ // from the current locale.
+ const char *charmap = nl_langinfo(CODESET);
+ if (charmap) {
+#define BFS_MAP_ENCODING(name, value) \
+ do { \
+ if (strcmp(charmap, name) == 0) { \
+ enc = value; \
+ } \
+ } while (0)
+#define BFS_MAP_ENCODING2(name1, name2, value) \
+ do { \
+ BFS_MAP_ENCODING(name1, value); \
+ BFS_MAP_ENCODING(name2, value); \
+ } while (0)
+
+ // These names were found with locale -m on Linux and FreeBSD
+#define BFS_MAP_ISO_8859(n) \
+ BFS_MAP_ENCODING2("ISO-8859-" #n, "ISO8859-" #n, ONIG_ENCODING_ISO_8859_ ## n)
+
+ BFS_MAP_ISO_8859(1);
+ BFS_MAP_ISO_8859(2);
+ BFS_MAP_ISO_8859(3);
+ BFS_MAP_ISO_8859(4);
+ BFS_MAP_ISO_8859(5);
+ BFS_MAP_ISO_8859(6);
+ BFS_MAP_ISO_8859(7);
+ BFS_MAP_ISO_8859(8);
+ BFS_MAP_ISO_8859(9);
+ BFS_MAP_ISO_8859(10);
+ BFS_MAP_ISO_8859(11);
+ // BFS_MAP_ISO_8859(12);
+ BFS_MAP_ISO_8859(13);
+ BFS_MAP_ISO_8859(14);
+ BFS_MAP_ISO_8859(15);
+ BFS_MAP_ISO_8859(16);
+
+ BFS_MAP_ENCODING("UTF-8", ONIG_ENCODING_UTF8);
+
+#define BFS_MAP_EUC(name) \
+ BFS_MAP_ENCODING2("EUC-" #name, "euc" #name, ONIG_ENCODING_EUC_ ## name)
+
+ BFS_MAP_EUC(JP);
+ BFS_MAP_EUC(TW);
+ BFS_MAP_EUC(KR);
+ BFS_MAP_EUC(CN);
+
+ BFS_MAP_ENCODING2("SHIFT_JIS", "SJIS", ONIG_ENCODING_SJIS);
+
+ // BFS_MAP_ENCODING("KOI-8", ONIG_ENCODING_KOI8);
+ BFS_MAP_ENCODING("KOI8-R", ONIG_ENCODING_KOI8_R);
+
+ BFS_MAP_ENCODING("CP1251", ONIG_ENCODING_CP1251);
+
+ BFS_MAP_ENCODING("GB18030", ONIG_ENCODING_BIG5);
+ }
+
+ int ret = onig_initialize(&enc, 1);
+ if (ret != ONIG_NORMAL) {
+ enc = NULL;
+ }
+ *penc = enc;
+ return ret;
+}
+#endif
+
+int bfs_regcomp(struct bfs_regex **preg, const char *pattern, enum bfs_regex_type type, enum bfs_regcomp_flags flags) {
+ struct bfs_regex *regex = *preg = malloc(sizeof(*regex));
+ if (!regex) {
+ return -1;
+ }
+
+#if BFS_WITH_ONIGURUMA
+ // onig_error_code_to_str() says
+ //
+ // don't call this after the pattern argument of onig_new() is freed
+ //
+ // so make a defensive copy.
+ regex->pattern = (unsigned char *)strdup(pattern);
+ if (!regex->pattern) {
+ goto fail;
+ }
+
+ regex->impl = NULL;
+ regex->err = ONIG_NORMAL;
+
+ OnigSyntaxType *syntax = NULL;
+ switch (type) {
+ case BFS_REGEX_POSIX_BASIC:
+ syntax = ONIG_SYNTAX_POSIX_BASIC;
+ break;
+ case BFS_REGEX_POSIX_EXTENDED:
+ syntax = ONIG_SYNTAX_POSIX_EXTENDED;
+ break;
+ case BFS_REGEX_EMACS:
+ syntax = ONIG_SYNTAX_EMACS;
+ break;
+ case BFS_REGEX_GREP:
+ syntax = ONIG_SYNTAX_GREP;
+ break;
+ }
+ assert(syntax);
+
+ OnigOptionType options = syntax->options;
+ if (flags & BFS_REGEX_ICASE) {
+ options |= ONIG_OPTION_IGNORECASE;
+ }
+
+ OnigEncoding enc;
+ regex->err = bfs_onig_encoding(&enc);
+ if (regex->err != ONIG_NORMAL) {
+ return -1;
+ }
+
+ const unsigned char *end = regex->pattern + strlen(pattern);
+ regex->err = onig_new(&regex->impl, regex->pattern, end, options, enc, syntax, &regex->einfo);
+ if (regex->err != ONIG_NORMAL) {
+ return -1;
+ }
+#else
+ int cflags = 0;
+ switch (type) {
+ case BFS_REGEX_POSIX_BASIC:
+ break;
+ case BFS_REGEX_POSIX_EXTENDED:
+ cflags |= REG_EXTENDED;
+ break;
+ default:
+ errno = EINVAL;
+ goto fail;
+ }
+
+ if (flags & BFS_REGEX_ICASE) {
+ cflags |= REG_ICASE;
+ }
+
+#if BFS_HAS_FEATURE(memory_sanitizer, false)
+ // https://github.com/google/sanitizers/issues/1496
+ memset(&regex->impl, 0, sizeof(regex->impl));
+#endif
+
+ regex->err = regcomp(&regex->impl, pattern, cflags);
+ if (regex->err != 0) {
+ return -1;
+ }
+#endif
+
+ return 0;
+
+fail:
+ free(regex);
+ *preg = NULL;
+ return -1;
+}
+
+int bfs_regexec(struct bfs_regex *regex, const char *str, enum bfs_regexec_flags flags) {
+ size_t len = strlen(str);
+
+#if BFS_WITH_ONIGURUMA
+ const unsigned char *ustr = (const unsigned char *)str;
+ const unsigned char *end = ustr + len;
+
+ // The docs for onig_{match,search}() say
+ //
+ // Do not pass invalid byte string in the regex character encoding.
+ if (!onigenc_is_valid_mbc_string(onig_get_encoding(regex->impl), ustr, end)) {
+ return 0;
+ }
+
+ int ret;
+ if (flags & BFS_REGEX_ANCHOR) {
+ ret = onig_match(regex->impl, ustr, end, ustr, NULL, ONIG_OPTION_DEFAULT);
+ } else {
+ ret = onig_search(regex->impl, ustr, end, ustr, end, NULL, ONIG_OPTION_DEFAULT);
+ }
+
+ if (ret >= 0) {
+ if (flags & BFS_REGEX_ANCHOR) {
+ return (size_t)ret == len;
+ } else {
+ return 1;
+ }
+ } else if (ret == ONIG_MISMATCH) {
+ return 0;
+ } else {
+ regex->err = ret;
+ return -1;
+ }
+#else
+ regmatch_t match = {
+ .rm_so = 0,
+ .rm_eo = len,
+ };
+
+ int eflags = 0;
+#ifdef REG_STARTEND
+ eflags |= REG_STARTEND;
+#endif
+
+ int ret = regexec(&regex->impl, str, 1, &match, eflags);
+ if (ret == 0) {
+ if (flags & BFS_REGEX_ANCHOR) {
+ return match.rm_so == 0 && (size_t)match.rm_eo == len;
+ } else {
+ return 1;
+ }
+ } else if (ret == REG_NOMATCH) {
+ return 0;
+ } else {
+ regex->err = ret;
+ return -1;
+ }
+#endif
+}
+
+void bfs_regfree(struct bfs_regex *regex) {
+ if (regex) {
+#if BFS_WITH_ONIGURUMA
+ onig_free(regex->impl);
+ free(regex->pattern);
+#else
+ regfree(&regex->impl);
+#endif
+ free(regex);
+ }
+}
+
+char *bfs_regerror(const struct bfs_regex *regex) {
+ if (!regex) {
+ return strdup(strerror(ENOMEM));
+ }
+
+#if BFS_WITH_ONIGURUMA
+ unsigned char *str = malloc(ONIG_MAX_ERROR_MESSAGE_LEN);
+ if (str) {
+ onig_error_code_to_str(str, regex->err, &regex->einfo);
+ }
+ return (char *)str;
+#else
+ size_t len = regerror(regex->err, &regex->impl, NULL, 0);
+ char *str = malloc(len);
+ if (str) {
+ regerror(regex->err, &regex->impl, str, len);
+ }
+ return str;
+#endif
+}