BASH PATCH REPORT ================= Bash-Release: 5.3 Patch-ID: bash53-008 Bug-Reported-by: Grisha Levit Bug-Reference-ID: <20251022174207.10518-1-grishalevit@gmail.com> Bug-Reference-URL: https://lists.gnu.org/archive/html/bug-bash/2025-10/msg00145.html Bug-Description: Bash tries to consume entire multibyte characters when looking for backslash escapes in $'...' strings, and treats too many characters as potentially beginning a multibyte character in UTF-8 locales. Being more selective about when to call mbrtowc() can lead to optimized string processing and script speedups. This patch also handles the unlikely situation of a locale encoding null wide characters with non-null bytes. Patch (apply with `patch -p0'): *** ../bash-5.3-patched/lib/sh/strtrans.c Fri Oct 13 11:57:46 2023 --- lib/sh/strtrans.c Mon Oct 27 14:30:35 2025 *************** *** 56,60 **** unsigned long v; size_t clen; ! int mb_cur_max; #if defined (HANDLE_MULTIBYTE) wchar_t wc; --- 56,60 ---- unsigned long v; size_t clen; ! size_t mb_cur_max; #if defined (HANDLE_MULTIBYTE) wchar_t wc; *************** *** 64,68 **** return ((char *)0); ! mb_cur_max = MB_CUR_MAX; #if defined (HANDLE_MULTIBYTE) temp = 4*len + 4; --- 64,68 ---- return ((char *)0); ! mb_cur_max = locale_mb_cur_max; #if defined (HANDLE_MULTIBYTE) temp = 4*len + 4; *************** *** 80,87 **** clen = 1; #if defined (HANDLE_MULTIBYTE) ! if ((locale_utf8locale && (c & 0x80)) || ! (locale_utf8locale == 0 && mb_cur_max > 0 && is_basic (c) == 0)) { clen = mbrtowc (&wc, s - 1, mb_cur_max, 0); if (MB_INVALIDCH (clen)) clen = 1; --- 80,91 ---- clen = 1; #if defined (HANDLE_MULTIBYTE) ! /* We read an entire multibyte character at a time if we are in a ! locale where a backslash can possibly appear as part of a ! multibyte character. UTF-8 encodings prohibit this. */ ! if (locale_utf8locale == 0 && mb_cur_max > 1 && is_basic (c) == 0) { clen = mbrtowc (&wc, s - 1, mb_cur_max, 0); + if (MB_NULLWCH (clen)) + break; /* it apparently can happen */ if (MB_INVALIDCH (clen)) clen = 1; *************** *** 228,237 **** char *r, *ret; const char *s; - size_t l, rsize; unsigned char c; size_t clen; int b; - #if defined (HANDLE_MULTIBYTE) wchar_t wc; #endif --- 232,241 ---- char *r, *ret; const char *s; unsigned char c; + #if defined (HANDLE_MULTIBYTE) size_t clen; int b; wchar_t wc; + DECLARE_MBSTATE; #endif *************** *** 239,245 **** return ((char *)0); ! l = strlen (str); ! rsize = 4 * l + 4; ! r = ret = (char *)xmalloc (rsize); *r++ = '$'; --- 243,247 ---- return ((char *)0); ! r = ret = (char *)xmalloc (4 * strlen (str) + 4); *r++ = '$'; *************** *** 248,255 **** for (s = str; c = *s; s++) { - b = 1; /* 1 == add backslash; 0 == no backslash */ - l = 1; - clen = 1; - switch (c) { --- 250,253 ---- *************** *** 267,303 **** default: #if defined (HANDLE_MULTIBYTE) ! b = is_basic (c); ! /* XXX - clen comparison to 0 is dicey */ ! if ((b == 0 && ((clen = mbrtowc (&wc, s, MB_CUR_MAX, 0)) < 0 || MB_INVALIDCH (clen) || iswprint (wc) == 0)) || ! (b == 1 && ISPRINT (c) == 0)) ! #else ! if (ISPRINT (c) == 0) ! #endif { ! *r++ = '\\'; ! *r++ = TOCHAR ((c >> 6) & 07); ! *r++ = TOCHAR ((c >> 3) & 07); ! *r++ = TOCHAR (c & 07); ! continue; } ! l = 0; ! break; ! } ! if (b == 0 && clen == 0) ! break; ! if (l) ! *r++ = '\\'; ! ! if (clen == 1) ! *r++ = c; ! else ! { ! for (b = 0; b < (int)clen; b++) ! *r++ = (unsigned char)s[b]; ! s += clen - 1; /* -1 because of the increment above */ } } *r++ = '\''; *r = '\0'; --- 265,304 ---- default: #if defined (HANDLE_MULTIBYTE) ! if ((locale_utf8locale && (c & 0x80)) || ! (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0)) { ! clen = mbrtowc (&wc, s, locale_mb_cur_max, &state); ! if (MB_NULLWCH (clen)) ! goto quote_end; ! if (MB_INVALIDCH (clen)) ! INITIALIZE_MBSTATE; ! else if (iswprint (wc)) ! { ! for (b = 0; b < (int)clen; b++) ! *r++ = (unsigned char)s[b]; ! s += clen - 1; /* -1 because of the increment above */ ! continue; ! } } ! else ! #endif ! if (ISPRINT (c)) ! { ! *r++ = c; ! continue; ! } ! *r++ = '\\'; ! *r++ = TOCHAR ((c >> 6) & 07); ! *r++ = TOCHAR ((c >> 3) & 07); ! *r++ = TOCHAR (c & 07); ! continue; } + + *r++ = '\\'; + *r++ = c; } + quote_end: *r++ = '\''; *r = '\0'; *************** *** 349,353 **** { #if defined (HANDLE_MULTIBYTE) ! if (is_basic (c) == 0) return (ansic_wshouldquote (s)); #endif --- 350,355 ---- { #if defined (HANDLE_MULTIBYTE) ! if ((locale_utf8locale && (c & 0x80)) || ! (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0)) return (ansic_wshouldquote (s)); #endif *** ../bash-5.3/patchlevel.h 2020-06-22 14:51:03.000000000 -0400 --- patchlevel.h 2020-10-01 11:01:28.000000000 -0400 *************** *** 26,30 **** looks for to find the patch level (for the sccs version string). */ ! #define PATCHLEVEL 7 #endif /* _PATCHLEVEL_H_ */ --- 26,30 ---- looks for to find the patch level (for the sccs version string). */ ! #define PATCHLEVEL 8 #endif /* _PATCHLEVEL_H_ */