From 4ad7d02ffae070ba565fce6bdd4c4ffe12e761ec Mon Sep 17 00:00:00 2001 From: Bram Matthys Date: Mon, 9 Aug 2021 08:45:05 +0200 Subject: [PATCH] Changes to unrl_utf8_make_valid(): 1) No longer impose a static maximum length 2) Caller must provide a work buffer and maximum length 3) Add a strict length check option --- include/h.h | 2 +- src/modules/websocket.c | 4 ++- src/utf8.c | 59 +++++++++++++++++++++++++++-------------- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/include/h.h b/include/h.h index 6006f8b8f..fadcffda4 100644 --- a/include/h.h +++ b/include/h.h @@ -1007,7 +1007,7 @@ extern NameList *find_name_list(NameList *list, char *name); extern NameList *find_name_list_match(NameList *list, char *name); extern int minimum_msec_since_last_run(struct timeval *tv_old, long minimum); extern int unrl_utf8_validate(const char *str, const char **end); -extern char *unrl_utf8_make_valid(const char *str); +extern char *unrl_utf8_make_valid(const char *str, char *outputbuf, size_t outputbuflen, int strict_length_check); extern void utf8_test(void); extern MODVAR int non_utf8_nick_chars_in_use; extern void short_motd(Client *client); diff --git a/src/modules/websocket.c b/src/modules/websocket.c index e5ac9fa58..12198024d 100644 --- a/src/modules/websocket.c +++ b/src/modules/websocket.c @@ -239,6 +239,8 @@ void websocket_mdata_free(ModData *m) */ int websocket_packet_out(Client *from, Client *to, Client *intended_to, char **msg, int *length) { + static char utf8buf[510]; + if (MyConnect(to) && WSU(to) && WSU(to)->handshake_completed) { if (WEBSOCKET_TYPE(to) == WEBSOCKET_TYPE_BINARY) @@ -246,7 +248,7 @@ int websocket_packet_out(Client *from, Client *to, Client *intended_to, char **m else if (WEBSOCKET_TYPE(to) == WEBSOCKET_TYPE_TEXT) { /* Some more conversions are needed */ - char *safe_msg = unrl_utf8_make_valid(*msg); + char *safe_msg = unrl_utf8_make_valid(*msg, utf8buf, sizeof(utf8buf), 1); *msg = safe_msg; *length = *msg ? 
strlen(safe_msg) : 0; websocket_create_packet(WSOP_TEXT, msg, length); diff --git a/src/utf8.c b/src/utf8.c index 701479b5b..9c8388561 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -140,38 +140,51 @@ char *unrl_utf8_find_prev_char (const char *begin, const char *p) } /** Return a valid UTF8 string based on the input. - * @param str The input string, with a maximum of 1024 bytes. - * @retval Returns a valid UTF8 string (which may be sanitized - * or simply the original string if it was OK already) + * @param str The input string + * @param outputbuf The output buffer + * @param outputbuflen Length of the output buffer + * @param strictlen If set to 1 we never return more than + * outputbuflen-1 characters. + * If set to 0, we may do that, if the + * input string was already 100% valid UTF8. + * @retval Returns a valid UTF8 string, either the input buffer + * (if it was already valid UTF8) or the output buffer. + * NULL is returned if either 'str' was NULL or outputbuflen is zero. + * @notes The 'outputbuf' is unused if the string is already valid UTF8. + * So don't rely on it being always set, use the returned string. */ -char *unrl_utf8_make_valid(const char *str) +char *unrl_utf8_make_valid(const char *str, char *outputbuf, size_t outputbuflen, int strictlen) { - static char string[4096]; /* crazy, but lazy, max amplification is x3, so x4 is safe. 
*/ const char *remainder, *invalid; int remaining_bytes, valid_bytes, len; int replaced = 0; /**< UTF8 string needed replacement (was invalid) */ - if (!str) + if (!str || !outputbuflen) return NULL; len = strlen(str); - if (len >= 1024) - abort(); /* better safe than sorry */ - - *string = '\0'; + *outputbuf = '\0'; remainder = str; remaining_bytes = len; while (remaining_bytes != 0) { if (unrl_utf8_validate(remainder, &invalid)) + { + if (!replaced && strictlen) + { + /* Caller wants us to go through the 'replaced' branch */ + strlcpy(outputbuf, str, outputbuflen); + replaced = 1; + } break; + } replaced = 1; valid_bytes = invalid - remainder; - strlncat(string, remainder, sizeof(string), valid_bytes); /*g_string_append_len(string, remainder, valid_bytes);*/ - strlcat(string, "\357\277\275", sizeof(string)); + strlncat(outputbuf, remainder, outputbuflen, valid_bytes); /*g_string_append_len(string, remainder, valid_bytes);*/ + strlcat(outputbuf, "\357\277\275", outputbuflen); remaining_bytes -= valid_bytes + 1; remainder = invalid + 1; @@ -180,21 +193,25 @@ char *unrl_utf8_make_valid(const char *str) if (!replaced) return (char *)str; /* return original string (no changes needed) */ - /* If output size is too much for an IRC message then cut the string at - * the appropriate place (as in: not to cause invalid UTF8 due to - * cutting half-way a byte sequence). + /* If we took up all the space, then backtrack one character and cut + * things off from there. This to ensure that we don't end up with + * invalid UTF8 due to cutting half-way a UTF8 byte sequence. + * NOTE: This may cause us to remove 1 character needlessly at the + * end even though there was still (some) space. So be it. 
*/ - if (strlen(string) >= 510) + if (strlen(outputbuf) == outputbuflen-1) { - char *cut_at = unrl_utf8_find_prev_char(string, string+509); + char *cut_at = unrl_utf8_find_prev_char(outputbuf, outputbuf+outputbuflen-1); if (cut_at) *cut_at = '\0'; } - if (!unrl_utf8_validate(string, NULL)) +#ifdef DEBUGMODE + if (!unrl_utf8_validate(outputbuf, NULL)) abort(); /* this should never happen, it means our conversion resulted in an invalid UTF8 string */ +#endif - return string; + return outputbuf; } /**************** END OF UTF8 HELPER FUNCTIONS *****************/ @@ -206,12 +223,14 @@ void utf8_test(void) char *res; int cnt = 0; char *heapbuf; /* for strict OOB testing with ASan */ + char *workbuf = safe_alloc(500); + size_t workbuflen = 500; while ((fgets(buf, sizeof(buf), stdin))) { stripcrlf(buf); heapbuf = strdup(buf); - res = unrl_utf8_make_valid(heapbuf); + res = unrl_utf8_make_valid(heapbuf, workbuf, workbuflen, 1); if (heapbuf == res) { printf(" %s\n", res);