From 19c96abe0a1ee56cf206fd5e87defb1fd3e0daa5 Mon Sep 17 00:00:00 2001
From: Tavian Barnes <tavianator@tavianator.com>
Date: Thu, 13 Jul 2023 20:00:11 -0400
Subject: bfstd: Add an ASCII fast path to wordesc()

---
 src/bfstd.c | 159 ++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 100 insertions(+), 59 deletions(-)

(limited to 'src/bfstd.c')

diff --git a/src/bfstd.c b/src/bfstd.c
index 97fa3b3..49c4a70 100644
--- a/src/bfstd.c
+++ b/src/bfstd.c
@@ -6,6 +6,7 @@
 #include "config.h"
 #include "diag.h"
 #include "xregex.h"
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <langinfo.h>
@@ -540,6 +541,25 @@ int xstrtofflags(const char **str, unsigned long long *set, unsigned long long *
 #endif
 }
 
+/** mbrtowc() wrapper. */
+static int xmbrtowc(wchar_t *wc, size_t *i, const char *str, size_t len, mbstate_t *mb) {
+	size_t mblen = mbrtowc(wc, str + *i, len - *i, mb);
+	switch (mblen) {
+	case -1:
+		// Invalid byte sequence
+		*i += 1;
+		memset(mb, 0, sizeof(*mb));
+		return -1;
+	case -2:
+		// Incomplete byte sequence
+		*i += len;
+		return -1;
+	default:
+		*i += mblen;
+		return 0;
+	}
+}
+
 size_t xstrwidth(const char *str) {
 	size_t len = strlen(str);
 	size_t ret = 0;
@@ -547,65 +567,76 @@ size_t xstrwidth(const char *str) {
 	mbstate_t mb;
 	memset(&mb, 0, sizeof(mb));
 
-	while (len > 0) {
+	for (size_t i = 0; i < len;) {
 		wchar_t wc;
-		size_t mblen = mbrtowc(&wc, str, len, &mb);
-		int cwidth;
-		if (mblen == (size_t)-1) {
-			// Invalid byte sequence, assume a single-width '?'
-			mblen = 1;
-			cwidth = 1;
-			memset(&mb, 0, sizeof(mb));
-		} else if (mblen == (size_t)-2) {
-			// Incomplete byte sequence, assume a single-width '?'
-			mblen = len;
-			cwidth = 1;
+		if (xmbrtowc(&wc, &i, str, len, &mb) == 0) {
+			ret += wcwidth(wc);
 		} else {
-			cwidth = wcwidth(wc);
-			if (cwidth < 0) {
-				cwidth = 0;
-			}
+			// Assume a single-width '?'
+			++ret;
 		}
-
-		str += mblen;
-		len -= mblen;
-		ret += cwidth;
 	}
 
 	return ret;
 }
 
+/** Check if a character is printable. */
+static bool xisprint(unsigned char c, enum wesc_flags flags) {
+	if (isprint(c)) {
+		return true;
+	}
+
+	// Technically a literal newline is safe inside single quotes, but $'\n'
+	// is much nicer than '
+	// '
+	if (!(flags & WESC_SHELL) && isspace(c)) {
+		return true;
+	}
+
+	return false;
+}
+
+/** Check if a wide character is printable. */
+static bool xiswprint(wchar_t c, enum wesc_flags flags) {
+	if (iswprint(c)) {
+		return true;
+	}
+
+	if (!(flags & WESC_SHELL) && iswspace(c)) {
+		return true;
+	}
+
+	return false;
+}
+
 /** Get the length of the longest printable prefix of a string. */
 static size_t printable_len(const char *str, size_t len, enum wesc_flags flags) {
+	// Fast path: avoid multibyte checks
+	size_t i;
+	for (i = 0; i < len; ++i) {
+		unsigned char c = str[i];
+		if (!isascii(c)) {
+			break;
+		}
+		if (!xisprint(c, flags)) {
+			return i;
+		}
+	}
+
 	mbstate_t mb;
 	memset(&mb, 0, sizeof(mb));
 
-	const char *cur = str;
-	while (len > 0) {
+	while (i < len) {
 		wchar_t wc;
-		size_t mblen = mbrtowc(&wc, cur, len, &mb);
-		if (mblen == (size_t)-1 || mblen == (size_t)-2) {
+		if (xmbrtowc(&wc, &i, str, len, &mb) != 0) {
 			break;
 		}
-
-		bool safe = iswprint(wc);
-
-		// Technically a literal newline is safe inside single quotes,
-		// but $'\n' is much nicer than '
-		// '
-		if (!(flags & WESC_SHELL) && iswspace(wc)) {
-			safe = true;
-		}
-
-		if (!safe) {
+		if (!xiswprint(wc, flags)) {
 			break;
 		}
-
-		cur += mblen;
-		len -= mblen;
 	}
 
-	return cur - str;
+	return i;
 }
 
 /** Convert a special char into a well-known escape sequence like "\n". */
@@ -639,32 +670,42 @@ static const char *dollar_esc(char c) {
 
 /** $'Quote' a string for the shell. */
 static char *dollar_quote(char *dest, char *end, const char *str, size_t len, enum wesc_flags flags) {
-	static const char *hex[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F"};
-
 	dest = xstpecpy(dest, end, "$'");
 
-	while (len > 0) {
-		size_t plen = printable_len(str, len, flags);
-		size_t elen = strcspn(str, "'\\");
-		size_t min = plen < elen ? plen : elen;
-		dest = xstpencpy(dest, end, str, min);
-		str += min;
-		len -= min;
-		if (len == 0) {
-			break;
+	mbstate_t mb;
+	memset(&mb, 0, sizeof(mb));
+
+	for (size_t i = 0; i < len;) {
+		size_t start = i;
+		bool safe = false;
+
+		wchar_t wc;
+		if (xmbrtowc(&wc, &i, str, len, &mb) == 0) {
+			safe = xiswprint(wc, flags);
 		}
 
-		unsigned char byte = *str;
-		++str;
-		--len;
+		for (size_t j = start; j < i; ++j) {
+			if (str[j] == '\'' || str[j] == '\\') {
+				safe = false;
+				break;
+			}
+		}
 
-		const char *esc = dollar_esc(byte);
-		if (esc) {
-			dest = xstpecpy(dest, end, esc);
+		if (safe) {
+			dest = xstpencpy(dest, end, str + start, i - start);
 		} else {
-			dest = xstpecpy(dest, end, "\\x");
-			dest = xstpecpy(dest, end, hex[byte / 0x10]);
-			dest = xstpecpy(dest, end, hex[byte % 0x10]);
+			for (size_t j = start; j < i; ++j) {
+				unsigned char byte = str[j];
+				const char *esc = dollar_esc(byte);
+				if (esc) {
+					dest = xstpecpy(dest, end, esc);
+				} else {
+					static const char *hex[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F"};
+					dest = xstpecpy(dest, end, "\\x");
+					dest = xstpecpy(dest, end, hex[byte / 0x10]);
+					dest = xstpecpy(dest, end, hex[byte % 0x10]);
+				}
+			}
 		}
 	}
 
-- 
cgit v1.2.3